Onboard DeepSeek MHC feature

RissyRan · RissyRan · commit ee0f01be2afa · 2026-02-02T20:26:09.000Z
diff --git a/src/MaxText/common_types.py b/src/MaxText/common_types.py
@@ -114,3 +114,9 @@ class AttentionType(enum.Enum):
 class ShardMode(enum.Enum):
   AUTO = "auto"  # default
   EXPLICIT = "explicit"
+
+
+class HyperConnectionType(enum.Enum):  # selects how the mHC layer invokes and unpacks its wrapped branch
+  ATTENTION = "attention"  # branch is called with inputs_q/inputs_kv; first of 2 returned values is used
+  MLP_MOE = "mlp_moe"  # branch is called with inputs; first of 3 returned values is used
+  MLP_DENSE = "mlp_dense"  # branch is called with inputs; its single return value is used
diff --git a/src/MaxText/configs/base.yml b/src/MaxText/configs/base.yml
@@ -1051,3 +1051,15 @@ vllm_hf_config_path: ""
 vllm_additional_config: {}
 # When use_jax_splash=True, force the layout of the query tensor to be [..., NUM_HEADS, HEAD_DIM, SEQ_LENGTH]
 force_q_layout: false
+
+################################## DeepSeek Manifold-Constrained Hyper Connections (mHC) ##################################
+# The number of parallel streams in Hyper Connection.
+mhc_expansion_rate: 0
+# The scale for the residual mapping.
+mhc_res_alpha_scale: 0.01
+# The scale for the pre mapping.
+mhc_pre_alpha_scale: 0.01
+# The scale for the post mapping.
+mhc_post_alpha_scale: 0.01
+# The number of iterations for the Sinkhorn-Knopp algorithm.
+sinkhorn_iterations: 20
diff --git a/src/MaxText/configs/models/deepseek-custom.yml b/src/MaxText/configs/models/deepseek-custom.yml
@@ -0,0 +1,64 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Small model config for testing (derived from DeepSeek V3.2 - 671B)
+
+base_emb_dim: 1024             # Reduced from 7168
+base_num_query_heads: 16       # Reduced from 128
+base_num_kv_heads: 16          # Reduced from 128
+base_mlp_dim: 2048             # Reduced from 18432
+base_moe_mlp_dim: 512          # Reduced from 2048
+base_num_decoder_layers: 6     # Reduced from 61
+first_num_dense_layers: 1      # Reduced from 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+num_experts: 16                # Reduced from 256
+num_experts_per_tok: 2         # Reduced from 8
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek"
+# MLA
+attention_type: "mla"
+q_lora_rank: 384               # Reduced from 1536
+kv_lora_rank: 128              # Reduced from 512
+qk_nope_head_dim: 32           # Reduced from 128
+qk_rope_head_dim: 16           # Reduced from 64
+v_head_dim: 128
+# RoPE
+mscale: 1.0
+rope_type: "yarn"
+rope_max_timescale: 10_000
+max_position_embeddings: 4096  # Reduced for local testing
+original_max_position_embeddings: 4096
+rope_factor: 1
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
+# Indexer for DeepSeek Sparse Attention
+use_sparse_indexer: True
+index_n_heads: 16              # Reduced from 64
+index_head_dim: 64             # Reduced from 128
+index_topk: 256                # Reduced from 2048
+# Hyper-connections: mHC enabled
+mhc_expansion_rate: 4
+mhc_res_alpha_scale: 0.01
+mhc_pre_alpha_scale: 0.01
+mhc_post_alpha_scale: 0.01
+sinkhorn_iterations: 20
diff --git a/src/MaxText/configs/types.py b/src/MaxText/configs/types.py
@@ -209,6 +209,7 @@ class ProfilerType(str, Enum):
     "deepseek3-test",
     "deepseek3-tiny",
     "deepseek3.2-671b",
+    "deepseek-custom",
     "kimi-k2-1t",
     "gemma-7b",
     "gemma-2b",
@@ -1048,6 +1049,16 @@ class TrainingLoop(BaseModel):
   init_weights_seed: int = Field(0, description="Seed for model weight initialization.")
 
 
+class ManifoldConstrainedHyperConnections(BaseModel):
+  """Configuration for DeepSeek Manifold-Constrained Hyper Connections (mHC); count fields are range-checked."""
+
+  mhc_expansion_rate: int = Field(0, ge=0, description="The number of parallel streams in Hyper Connection.")
+  mhc_res_alpha_scale: float = Field(0.01, description="The scale for the residual mapping.")
+  mhc_pre_alpha_scale: float = Field(0.01, description="The scale for the pre mapping.")
+  mhc_post_alpha_scale: float = Field(0.01, description="The scale for the post mapping.")
+  sinkhorn_iterations: PositiveInt = Field(20, description="The number of iterations for the Sinkhorn-Knopp algorithm.")
+
+
 class Optimizer(BaseModel):
   """Configuration for the optimizer and learning rate schedule."""
 
@@ -1727,6 +1738,7 @@ class MaxTextConfig(
     # Training, Optimization, and Fine-Tuning
     RematAndOffload,
     TrainingLoop,
+    ManifoldConstrainedHyperConnections,
     Optimizer,
     AdamW,
     Muon,
diff --git a/src/MaxText/layers/mhc.py b/src/MaxText/layers/mhc.py
@@ -0,0 +1,225 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DeepSeek Manifold-Constrained Hyper Connections (mHC) Layer."""
+
+import jax
+from jax.sharding import Mesh
+
+import jax.numpy as jnp
+from flax import nnx
+from typing import Callable
+from MaxText.common_types import Config
+from MaxText.layers.normalizations import RMSNorm
+from MaxText.layers.initializers import nd_dense_init, default_bias_init
+from MaxText.common_types import HyperConnectionType
+
+
+def get_functions(expansion_rate: int):
+  """Builds the `(expand, reduce)` closures for `expansion_rate` parallel streams."""
+
+  def expand(x: jnp.ndarray):
+    # Copy the input `expansion_rate` times along a new leading axis:
+    # (batch, length, dim) -> (streams, batch, length, dim)
+    stream_shape = (expansion_rate, *jnp.shape(x))
+    return jnp.broadcast_to(x, stream_shape)
+
+  def reduce(x: jnp.ndarray):
+    # Sum the leading stream axis away:
+    # (streams, batch, length, dim) -> (batch, length, dim)
+    collapsed = jnp.sum(x, axis=0)
+    return collapsed
+
+  return expand, reduce
+
+
+def sinkhorn(t, iters=20):
+  """Alternately L1-normalizes the rows and columns of `t` for `iters` rounds.
+
+  The input first goes through a softmax over axis -2. All arithmetic runs in
+  float32 and the result is cast back to the dtype `t` arrived with.
+  """
+  out_dtype = t.dtype
+  eps = 1e-12  # floor applied to every normalizer before dividing
+
+  def l1_normalize(mat, axis):
+    # Divide by the (floored) sum taken along `axis`.
+    return mat / jnp.clip(jnp.sum(mat, axis=axis, keepdims=True), min=eps)
+
+  def one_round(_, mat):
+    # Row pass (axis -1) first, then column pass (axis -2).
+    return l1_normalize(l1_normalize(mat, -1), -2)
+
+  # Work in float32 for numerical stability during normalization.
+  # Softmax over axis -2 makes entries positive with every column summing to 1.
+  start = jax.nn.softmax(t.astype(jnp.float32), axis=-2)
+  # lax.fori_loop keeps the iteration JIT-friendly.
+  balanced = jax.lax.fori_loop(0, iters, one_round, start)
+  return balanced.astype(out_dtype)
+
+
+class ManifoldConstrainedHyperConnections(nnx.Module):
+  """Implements Manifold-Constrained Hyper-Connections (mHC).
+
+  Reference: https://arxiv.org/pdf/2512.24880
+
+  Args:
+      config: Configuration object containing hyperparameters.
+      model_mode: String indicating the execution context (not read by this module).
+      dim: The feature dimensionality.
+      mesh: The hardware mesh for sharding.
+      rngs: Random number generation in NNX.
+  """
+
+  def __init__(
+      self,
+      config: Config,
+      model_mode: str,
+      dim: int,
+      mesh: Mesh,
+      rngs: nnx.Rngs,
+  ):
+    self.config = config
+    self.sinkhorn_iterations = config.sinkhorn_iterations
+    self.k = config.mhc_expansion_rate
+    self.dim = dim
+    self.rngs = rngs
+    self.mesh = mesh
+    self.weight_dtype = self.config.weight_dtype
+
+    # Norm layer
+    self.mhc_norm = RMSNorm(
+        num_features=self.dim,
+        dtype=self.config.dtype,
+        weight_dtype=self.weight_dtype,
+        kernel_axes=("norm",),
+        epsilon=self.config.normalization_layer_epsilon,
+        rngs=self.rngs,
+    )
+
+    # Scalers
+    self.mhc_res_alpha_scale = self.config.mhc_res_alpha_scale
+    self.mhc_pre_alpha_scale = self.config.mhc_pre_alpha_scale
+    self.mhc_post_alpha_scale = self.config.mhc_post_alpha_scale
+
+    # Weight matrices
+    scale_init = nd_dense_init(1.0, "fan_in", "normal")
+    in_axis = (0, 1)
+    out_axis = 2
+    weight_sharding_axis_name = (None, "activation_embed", None)
+    self.res_alpha = nnx.Param(
+        scale_init(
+            self.rngs.params(),
+            (self.k, self.dim, self.k * self.k),
+            self.weight_dtype,
+            in_axis=in_axis,
+            out_axis=out_axis,
+        ),
+        sharding=weight_sharding_axis_name,
+    )
+    self.pre_alpha = nnx.Param(
+        scale_init(
+            self.rngs.params(),
+            (self.k, self.dim, self.k),
+            self.weight_dtype,
+            in_axis=in_axis,
+            out_axis=out_axis,
+        ),
+        sharding=weight_sharding_axis_name,
+    )
+    self.post_alpha = nnx.Param(
+        scale_init(
+            self.rngs.params(),
+            (self.k, self.dim, self.k),
+            self.weight_dtype,
+            in_axis=in_axis,
+            out_axis=out_axis,
+        ),
+        sharding=weight_sharding_axis_name,
+    )
+
+    # Biases
+    # Each sharding spec carries one entry per parameter axis: two for res_beta (k, k), one for pre/post_beta (k,).
+    self.res_beta = nnx.Param(
+        default_bias_init(self.rngs.params(), (self.k, self.k), self.weight_dtype),
+        sharding=(None, None),
+    )
+    self.pre_beta = nnx.Param(
+        default_bias_init(self.rngs.params(), (self.k,), self.weight_dtype),
+        sharding=(None,),
+    )
+    self.post_beta = nnx.Param(
+        default_bias_init(self.rngs.params(), (self.k,), self.weight_dtype),
+        sharding=(None,),
+    )
+
+  def res_mapping(self, x: jnp.ndarray):
+    """Helper for the residual mapping: returns a (k, k) stream-mixing matrix normalized by `sinkhorn`."""
+    # Projection (k, b, s, d) @ (k, d, k*k) -> (k*k). NOTE(review): batch and sequence are contracted as well.
+    h_res = jnp.einsum("kbsd,kdm -> m", x, self.res_alpha[...])
+    h_res = jnp.reshape(h_res, (self.k, self.k))
+    intermediate = self.mhc_res_alpha_scale * h_res + self.res_beta[...]
+    return sinkhorn(intermediate, self.sinkhorn_iterations)
+
+  def mapping(self, x: jnp.ndarray, alpha_scale: float, alpha: jnp.ndarray, beta: jnp.ndarray, scale: float):
+    """Helper for pre/post mappings: returns `scale * sigmoid(alpha_scale * proj + beta)`, one weight per stream."""
+    # Projection (k, b, s, d) @ (k, d, k) -> (k). NOTE(review): batch and sequence are contracted as well.
+    h = jnp.einsum("kbsd,kdm -> m", x, alpha)
+    intermediate = alpha_scale * h + beta
+    return scale * jax.nn.sigmoid(intermediate)
+
+  def __call__(
+      self,
+      branch_fn: Callable,
+      x: jnp.ndarray,
+      mhc_type: HyperConnectionType,
+      **kwargs,
+  ) -> jnp.ndarray:
+    """Applying manifold-constrained hyper connection based on callable function.
+
+    Args:
+        branch_fn: The function to be wrapped by the hyper-connection.
+        x: Input tensor of shape `(expansion_rate, batch, seq, dim)`.
+        mhc_type: The variant of the connection to apply.
+        **kwargs: Additional context passed to the branch function.
+
+    Returns:
+        The processed tensor, maintaining the shape of `x`.
+    """
+    # x shape: [expansion_rate, batch, seq, emb]. NOTE(review): step 1 rebinds `x`, so the residual path in
+    # step 5 also consumes the normalized streams rather than the raw input -- confirm this is intended.
+    # 1. RMS normalization
+    x = self.mhc_norm(x)
+
+    # 2. Pre mapping
+    pre_mapping = self.mapping(x, self.mhc_pre_alpha_scale, self.pre_alpha[...], self.pre_beta[...], 1.0)
+    layer_input = jnp.einsum("kbsd,k -> bsd", x, pre_mapping)
+
+    # 3. Attention or MLP
+    if mhc_type == HyperConnectionType.ATTENTION:
+      layer_out, _ = branch_fn(inputs_q=layer_input, inputs_kv=layer_input, **kwargs)
+    elif mhc_type == HyperConnectionType.MLP_DENSE:
+      layer_out = branch_fn(inputs=layer_input, **kwargs)
+    elif mhc_type == HyperConnectionType.MLP_MOE:
+      layer_out, _, _ = branch_fn(inputs=layer_input, **kwargs)
+    else:
+      raise ValueError(f"Unsupported type: {mhc_type}")
+
+    # 4. Post mapping
+    post_mapping = self.mapping(x, self.mhc_post_alpha_scale, self.post_alpha[...], self.post_beta[...], 2.0)
+    post_out = jnp.einsum("bsd,k -> kbsd", layer_out, post_mapping)
+
+    # 5. Residual mapping, res_out shape as [expansion_rate, batch, seq, emb]
+    res_mapping = self.res_mapping(x)
+    res_out = jnp.einsum("kbsd,km -> mbsd", x, res_mapping)
+    return res_out + post_out
diff --git a/tests/unit/mhc_test.py b/tests/unit/mhc_test.py