@@ -98,7 +98,6 @@ def compute_gpt_attention_flops_per_device(self, kwargs: dict) -> float:
 
     return attention_flops / 1e12  # return tflops
 
-
   def compute_qwen3_next_attention_flops_per_device(self, kwargs: dict) -> float:
     """
     Computes the total training TFLOPs per device for a Qwen3-Next model.
@@ -136,7 +135,6 @@ def compute_qwen3_next_attention_flops_per_device(self, kwargs: dict) -> float:
 
     return (full_attn_flops + linear_attn_flops) / 1e12
 
-
   @pytest.mark.cpu_only
   def test_qwen3_next_flops(self):
     """Test Qwen3-Next Flops calculation"""
@@ -148,22 +146,19 @@ def test_qwen3_next_flops(self):
148146 "decoder_block" : "qwen3_next" ,
149147 "gradient_accumulation_steps" : 1 ,
150148 "skip_jax_distributed_system" : True ,
151-
152149 # Core Architectural Parameters
153150 "base_emb_dim" : 2048 ,
154151 "base_num_decoder_layers" : 48 ,
155152 "base_num_query_heads" : 16 ,
156153 "base_num_kv_heads" : 2 ,
157154 "head_dim" : 256 ,
158155 "vocab_size" : 151936 ,
159-
160156 # MoE Parameters
161- "base_mlp_dim" : 512 , # Note: maxtext_utils uses moe_mlp_dim for calculations
157+ "base_mlp_dim" : 512 , # Note: maxtext_utils uses moe_mlp_dim for calculations
162158 "base_moe_mlp_dim" : 512 ,
163159 "num_experts" : 512 ,
164160 "num_experts_per_tok" : 10 ,
165161 "mlp_activations" : ["silu" , "linear" ],
166-
167162 # Qwen3-Next Specific Parameters
168163 "inhomogeneous_layer_cycle_interval" : 4 ,
169164 "gdn_conv_kernel_dim" : 4 ,
@@ -192,9 +187,9 @@ def test_qwen3_next_flops(self):
     num_experts = kwargs["num_experts"]
     num_routed = kwargs["num_experts_per_tok"]
 
-    params_moe_layer = (emb_dim * num_experts) + \
-        (3 * emb_dim * moe_mlp_dim * 1) + \
-        (3 * emb_dim * moe_mlp_dim * num_routed)
+    params_moe_layer = (
+        (emb_dim * num_experts) + (3 * emb_dim * moe_mlp_dim * 1) + (3 * emb_dim * moe_mlp_dim * num_routed)
+    )
 
     # Full Attention Params (per full layer)
     Hq = kwargs["base_num_query_heads"]
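The reworked params_moe_layer expression counts active MoE parameters per layer: a router matrix (emb_dim x num_experts), one shared expert, and num_experts_per_tok routed experts, where each expert holds three emb_dim x moe_mlp_dim matrices for the ["silu", "linear"] gated MLP. A quick numeric check with the config values above (a sketch, not part of the diff):

emb_dim, moe_mlp_dim = 2048, 512
num_experts, num_routed = 512, 10
router = emb_dim * num_experts                   # 1,048,576 routing weights
shared = 3 * emb_dim * moe_mlp_dim * 1           # 3,145,728 (one shared expert)
routed = 3 * emb_dim * moe_mlp_dim * num_routed  # 31,457,280 (10 active routed experts)
params_moe_layer = router + shared + routed      # 35,651,584 active params per MoE layer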
@@ -214,9 +209,9 @@ def test_qwen3_next_flops(self):
     V_dim = Hv_g * Dv_g
 
     # Projections: qkvz (in->2K+2V), ba (in->2Hv), out (V->in)
-    params_gdn_proj = (emb_dim * (2 * K_dim + 2 * V_dim)) + (emb_dim * 2 * Hv_g) + (V_dim * emb_dim)
+    params_gdn_proj = (emb_dim * (2 * K_dim + 2 * V_dim)) + (emb_dim * 2 * Hv_g) + (V_dim * emb_dim)
     # Conv: depthwise on 2K+V
-    params_gdn_conv = (2 * K_dim + V_dim) * K_conv
+    params_gdn_conv = (2 * K_dim + V_dim) * K_conv
 
     params_gdn_layer = params_gdn_proj + params_gdn_conv
 
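The gated DeltaNet accounting above is three dense projections plus a depthwise convolution. A standalone sketch of the same arithmetic; the GDN head counts and head dims (K_dim, V_dim, Hv_g) come from config keys not visible in this hunk, so the arguments here are placeholders rather than the test's actual values:

def gdn_layer_params(emb_dim, k_dim, v_dim, hv_g, k_conv):
  """Active parameters in one gated DeltaNet (linear attention) layer."""
  # qkvz projection (in -> 2K+2V), ba projection (in -> 2*Hv), out projection (V -> in)
  proj = emb_dim * (2 * k_dim + 2 * v_dim) + emb_dim * 2 * hv_g + v_dim * emb_dim
  conv = (2 * k_dim + v_dim) * k_conv  # depthwise conv over the 2K+V channels
  return proj + conv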
@@ -225,9 +220,11 @@ def test_qwen3_next_flops(self):
     num_full = N // kwargs["inhomogeneous_layer_cycle_interval"]
     num_linear = N - num_full
 
-    total_active_params = (vocab * emb_dim) + \
-        (num_full * (params_full_attn + params_moe_layer)) + \
-        (num_linear * (params_gdn_layer + params_moe_layer))
+    total_active_params = (
+        (vocab * emb_dim)
+        + (num_full * (params_full_attn + params_moe_layer))
+        + (num_linear * (params_gdn_layer + params_moe_layer))
+    )
 
     # Weight TFLOPs = 6 * B * S * P
     B = kwargs["per_device_batch_size"]
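The 6 * B * S * P comment is the standard dense-training estimate: each active parameter costs about 2 FLOPs (one multiply-add) per token in the forward pass and about 4 in the backward pass. A sketch of the final step under that assumption:

def weight_tflops(batch_size, seq_len, active_params):
  # 6 FLOPs per active parameter per token: 2 forward + 4 backward
  return 6 * batch_size * seq_len * active_params / 1e12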
@@ -245,7 +242,6 @@ def test_qwen3_next_flops(self):
 
     self.assertFlopsAlmostEqual(calculated_tflops, golden_tflops)
 
-
   @pytest.mark.cpu_only
   def test_llama2_7b_flops(self):
     """Test Llama2 7b Flops calculation with default parameters"""