Skip to content

Commit d771e11

Browse files
fix: Reduce LR to 5e-5, fix NaN handling, and track skipped batches in dataset state
1 parent becdbe4 commit d771e11

3 files changed

Lines changed: 23 additions & 7 deletions

File tree

.github/workflows/train.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ jobs:
142142
TOTAL_STEPS: '100000'
143143
GRAD_ACCUM: '4'
144144
BATCH_SIZE: '2'
145+
LEARNING_RATE: '5e-5'
145146
BLOCK_SIZE: '512'
146147
USE_EWC: '1'
147148
GRADIENT_CHECKPOINTING: '1'

meridian/training/trainer.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class TrainingConfig:
4242
total_steps: int = 100_000
4343

4444
# Optimizer
45-
learning_rate: float = 3e-4
45+
learning_rate: float = 5e-5
4646
weight_decay: float = 0.1
4747
max_grad_norm: float = 1.0
4848
warmup_ratio: float = 0.06
@@ -92,6 +92,7 @@ def __init__(
9292
# State
9393
self.global_step = 0
9494
self.run_step = 0
95+
self.processed_batches = 0
9596
self.best_loss = float("inf")
9697

9798
# EWC for continual learning
@@ -193,6 +194,7 @@ def train(self) -> None:
193194
# Get batch
194195
try:
195196
batch = next(data_iter)
197+
self.processed_batches += 1
196198
except StopIteration:
197199
print("[INFO] Dataset exhausted. Ending training.")
198200
break
@@ -327,6 +329,14 @@ def train(self) -> None:
327329

328330
def save_checkpoint(self, path: str) -> None:
329331
"""Save model + optimizer + trainer state."""
332+
# Sanity check for NaN weights
333+
for name, param in self.model.named_parameters():
334+
if torch.isnan(param).any():
335+
print(
336+
f" [CRITICAL] NaN detected in parameter '{name}'. Aborting checkpoint save to protect repo."
337+
)
338+
return
339+
330340
os.makedirs(path, exist_ok=True)
331341
print(f" [SAVE] Checkpoint → {path}")
332342

train.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -218,12 +218,17 @@ def main():
218218
tokenizer.save_pretrained(checkpoint_path)
219219

220220
# Update dataset state
221-
batches_this_run = (
222-
(trainer.global_step - initial_global_step)
223-
* train_config.batch_size
224-
* train_config.gradient_accumulation_steps
225-
)
226-
new_processed = processed_items + batches_this_run
221+
# Use actual processed batches from trainer (includes skipped ones)
222+
if hasattr(trainer, "processed_batches"):
223+
batches_processed = trainer.processed_batches
224+
else:
225+
# Fallback for backward compatibility
226+
batches_processed = (
227+
trainer.global_step - initial_global_step
228+
) * train_config.gradient_accumulation_steps
229+
230+
items_processed = batches_processed * train_config.batch_size
231+
new_processed = processed_items + items_processed
227232

228233
for sp in [state_path, os.path.join(checkpoint_path, "dataset_state.json")]:
229234
with open(sp, "w") as f:

0 commit comments

Comments (0)