
Commit 4808395

fix: enforce float32 in ssm scan for numerical stability
1 parent 4c5d2ca commit 4808395

2 files changed: 48 additions & 20 deletions
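
At a glance, the fix keeps the sequential SSM recurrence in float32 regardless of the dtype the model otherwise runs in: inputs are upcast on entry, the hidden state is accumulated in float32, and the output is cast back at the end. Below is a minimal sketch of that pattern. The diffs further down only show fragments of the repo's loop body, so the update rule used here (the standard selective-scan discretization, h <- exp(dt*A)*h + dt*B*u, y = C.h + D*u) is an assumption, not the repo's exact code:

import torch

# Hedged sketch of the pattern this commit applies (assumed recurrence).
# Shapes inferred from the diff: u, delta: (B_size, L, D_inner);
# A: (D_inner, D_state); B, C: (B_size, L, D_state); D: (D_inner,)
def scan_in_float32(u, delta, A, B, C, D):
    original_dtype = u.dtype
    # Upcast everything the recurrence touches
    u, delta, A, B, C, D = (t.float() for t in (u, delta, A, B, C, D))

    B_size, L, D_inner = u.shape
    D_state = A.shape[-1]
    # State is created and accumulated in float32
    h = torch.zeros(B_size, D_inner, D_state, device=u.device, dtype=torch.float32)

    ys = []
    for l in range(L):
        dt = delta[:, l, :].unsqueeze(-1)                          # (B_size, D_inner, 1)
        h = torch.exp(dt * A) * h + dt * B[:, l, None, :] * u[:, l, :, None]
        ys.append((h * C[:, l, None, :]).sum(-1))                  # (B_size, D_inner)
    y = torch.stack(ys, dim=1) + u * D                             # skip connection
    return y.to(dtype=original_dtype)                              # restore caller's dtype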


aetheris/cli/main.py

Lines changed: 30 additions & 18 deletions
@@ -111,27 +111,39 @@ def generate_command(args):
     history_ids = set(input_ids[0].tolist())
 
     print("-" * 50)
-    print(f"Prompt: {prompt}")
-    print("Generated Continuation:")
-
-    for _ in range(max_new_tokens):
-        # Check if we should use autocast (skip if model uses float32)
-        use_autocast = True
-        if config.torch_dtype == torch.float32:
-            use_autocast = False
-
-        if use_autocast:
-            with torch.amp.autocast('cuda' if device.type == 'cuda' else 'cpu', dtype=model.config.torch_dtype):
+    print(f"Prompt: {prompt}")
+    print("Generated Continuation:")
+
+    for step in range(max_new_tokens):
+        # Check if we should use autocast (skip if model uses float32)
+        use_autocast = True
+        if config.torch_dtype == torch.float32:
+            use_autocast = False
+
+        if use_autocast:
+            with torch.amp.autocast('cuda' if device.type == 'cuda' else 'cpu', dtype=model.config.torch_dtype):
+                outputs = model(generated_ids)
+                logits = outputs['logits']
+                next_token_logits = logits[:, -1, :]
+        else:
             outputs = model(generated_ids)
             logits = outputs['logits']
             next_token_logits = logits[:, -1, :]
-        else:
-            outputs = model(generated_ids)
-            logits = outputs['logits']
-            next_token_logits = logits[:, -1, :]
-
-        # Repetition penalty
-        for token_id in history_ids:
+
+        # --- DEBUG: Print Top Predictions for First Step ---
+        if step == 0:
+            probs = F.softmax(next_token_logits, dim=-1)
+            top_probs, top_indices = torch.topk(probs, 5)
+            print("\n[DEBUG] Step 0 Top-5 Predictions:")
+            for i in range(5):
+                token_idx = top_indices[0, i].item()
+                prob = top_probs[0, i].item()
+                token_str = tokenizer.decode([token_idx])
+                print(f"  {i+1}. '{token_str}' ({prob:.4f})")
+            print("-----------------------------------")
+        # ---------------------------------------------------
+
+        # Repetition penalty
+        for token_id in history_ids:
             if token_id < next_token_logits.size(-1):
                 logit = next_token_logits[0, token_id].item()
                 if logit > 0:
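
The hunk is truncated inside the repetition-penalty loop. For reference, here is a minimal sketch of the standard rule that loop shape suggests (the CTRL-style penalty also used by Hugging Face's generate): positive logits are divided by the penalty and negative ones multiplied, so every previously generated token becomes less likely. The function name and default value are illustrative, not this repo's code:

import torch

def apply_repetition_penalty(next_token_logits: torch.Tensor,
                             history_ids: set,
                             penalty: float = 1.2) -> torch.Tensor:
    # CTRL-style rule: shrink positive logits, amplify negative ones,
    # so seen tokens lose probability mass either way.
    for token_id in history_ids:
        if token_id < next_token_logits.size(-1):  # guard against out-of-vocab ids
            logit = next_token_logits[0, token_id].item()
            if logit > 0:
                next_token_logits[0, token_id] = logit / penalty
            else:
                next_token_logits[0, token_id] = logit * penalty
    return next_token_logits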

aetheris/modules/ssm.py

Lines changed: 18 additions & 2 deletions
@@ -8,10 +8,23 @@ def selective_scan_native(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
     """Memory-efficient scan with reduced intermediate tensors."""
     B_size, L, D_inner = u.shape
     D_state = A.shape[-1]
+
+    # Save original dtype
+    original_dtype = u.dtype
 
     # Use in-place operations where possible
-    h = torch.zeros(B_size, D_inner, D_state, device=u.device, dtype=u.dtype)
+    # FORCE FLOAT32 for state to prevent underflow/overflow in long sequences
+    h = torch.zeros(B_size, D_inner, D_state, device=u.device, dtype=torch.float32)
     ys = []
+
+    # Cast inputs to float32 for the scan
+    # Note: This increases memory usage slightly but is critical for stability
+    u = u.float()
+    delta = delta.float()
+    A = A.float()
+    B = B.float()
+    C = C.float()
+    D = D.float()
 
     for l in range(L):
         dt = delta[:, l, :].unsqueeze(-1)
@@ -28,7 +41,10 @@ def selective_scan_native(u: torch.Tensor, delta: torch.Tensor, A: torch.Tensor,
         ys.append(y_t)
 
     y = torch.stack(ys, dim=1)
-    return y + u * D
+    y = y + u * D
+
+    # Cast back to original dtype
+    return y.to(dtype=original_dtype)
 
 class SSMBlock(nn.Module):
     """Memory-optimized State Space Model with stability improvements."""
