Commit 0d80d09

fix(training): harden model against NaN/Inf instability in fp16
- ssm.py: Clamp the `delta` projection (max=5.0) to prevent SSM state explosion.
- trainer.py: Extend the post-clip gradient check to detect and skip `Inf` gradients (overflow), not just `NaN`.
- model.py: Add a runtime fail-safe that patches layer outputs with `nan_to_num` if corruption is detected during the forward pass.

This addresses the loss divergence observed at step 11k.
1 parent f01c9a2 commit 0d80d09

3 files changed: 8 additions & 7 deletions

aetheris/model.py

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ def forward(self, input_ids: torch.Tensor, labels: torch.Tensor = None) -> Dict[
         x = self.embedding(input_ids)
         total_aux_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)

-        for layer in self.layers:
+        for i, layer in enumerate(self.layers):
             if self.gradient_checkpointing and self.training:
                 # Checkpoint ALL layers for maximum memory savings
                 if isinstance(layer, SparseMoELayer):
@@ -66,7 +66,7 @@ def moe_forward(module, inp):

             # Add gradient clipping per layer to catch issues early
             if self.training and torch.isnan(x).any():
-                print(f"WARNING: NaN detected in layer output!")
+                print(f"WARNING: NaN detected in layer {i} ({layer.__class__.__name__}) output!")
                 x = torch.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)

         x = self.final_norm(x)
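
As a quick reference for what the fail-safe does (a minimal sketch, using the same replacement values as the patch above):

import torch

# NaN -> 0.0, +inf -> 1.0, -inf -> -1.0; finite values pass through untouched
x = torch.tensor([float("nan"), float("inf"), float("-inf"), 0.5])
print(torch.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0))
# tensor([ 0.0000,  1.0000, -1.0000,  0.5000])

Note that this masks corruption rather than curing it; the ssm.py and trainer.py changes below target the cause.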

aetheris/modules/ssm.py

Lines changed: 2 additions & 2 deletions
@@ -74,8 +74,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_conv = x_conv[:, :, :-2].transpose(1, 2)
         x_conv = self.act(x_conv)

-        # Add small epsilon to prevent numerical issues
-        delta = F.softplus(self.delta_proj(x_conv)) + 1e-4
+        # Add small epsilon to prevent numerical issues and clamp max value
+        delta = torch.clamp(F.softplus(self.delta_proj(x_conv)), max=5.0) + 1e-4
         B_ssm = self.B_proj(x_conv)
         C_ssm = self.C_proj(x_conv)
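
A back-of-envelope illustration of why the clamp matters in fp16 (a toy sketch, not the repo's actual scan; the magnitudes are hypothetical). fp16 overflows above roughly 65504, and in a selective-SSM-style update the input injection scales as delta times B(x) times u, i.e. delta multiplied by two activation-scale terms:

import torch
import torch.nn.functional as F

act = torch.tensor(100.0, dtype=torch.float16)  # hypothetical activation magnitude
raw = torch.tensor(400.0)                       # hypothetical delta_proj pre-activation

unclamped = F.softplus(raw).to(torch.float16)                               # ~400
clamped = (torch.clamp(F.softplus(raw), max=5.0) + 1e-4).to(torch.float16)  # ~5

print(unclamped * act * act)  # 400 * 100 * 100 = 4e6 -> inf in fp16
print(clamped * act * act)    # ~5 * 100 * 100 = 5e4, still representable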

aetheris/trainer/trainer.py

Lines changed: 4 additions & 3 deletions
@@ -72,10 +72,11 @@ def train_epoch(self, train_loader, total_steps, start_step=0, stage_name="Train
         # Gradient clipping
         grad_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)

-        if torch.isnan(grad_norm):
-            print(f"WARNING: NaN gradient at step {global_step}, skipping update")
+        if torch.isnan(grad_norm) or torch.isinf(grad_norm):
+            print(f"WARNING: NaN/Inf gradient at step {global_step}, skipping update")
+        else:
+            self.scaler.step(self.optimizer)

-        self.scaler.step(self.optimizer)
         self.scaler.update()

         global_step += 1
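
One caveat worth flagging (a sketch of the standard torch.cuda.amp recipe with assumed names model, optimizer, scaler, loss; not the repo's trainer): under GradScaler, gradients are still multiplied by the loss scale when clip_grad_norm_ runs unless scaler.unscale_ is called first, so the canonical ordering unscales before clipping, and torch.isfinite covers both NaN and Inf in one check:

scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # gradients are now in true (unscaled) units
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
if torch.isfinite(grad_norm):
    scaler.step(optimizer)  # GradScaler also skips internally on inf/nan grads
scaler.update()             # lowers the loss scale after a skipped step
optimizer.zero_grad(set_to_none=True)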
