 import threading
 from concurrent.futures import wait
 from pathlib import Path
-from typing import Any, Dict, List, Optional, cast
+from typing import Any, Dict, List, cast

 import torch
 import torch.distributed as dist
 ...
     set_model_state_dict,
     set_optimizer_state_dict,
 )
-from torch.distributed.device_mesh import DeviceMesh
 from torch.nn.utils.clip_grad import _no_grad
 from torch.utils._foreach_utils import (
     _device_has_foreach_support,
     ...
 )

 from xtuner.v1.config import FSDPConfig, OptimConfig
 from xtuner.v1.data_proto.sequence_context import SequenceContext
-from xtuner.v1.float8.float8_handler import Float8Handler
 from xtuner.v1.model.base import BaseModel, ModelItem, XTunerBaseModelConfig
 from xtuner.v1.model.utils import ModelForwardExtraLogInfo
 from xtuner.v1.module.router import NoAuxRouterConfig
@@ -138,7 +136,6 @@ class TrainEngine:
     model: BaseModel
     optimizer: torch.optim.Optimizer
     scheduler: torch.optim.lr_scheduler.LRScheduler
-    float8_handler: Optional[Float8Handler]

     def __init__(
         self,
@@ -168,19 +165,10 @@ def build_model(self) -> BaseModel:
         with torch.device("meta"):
             model = self.model_cfg.build()

-        self.float8_handler = None
-        if self.model_cfg.float8_cfg is not None and self.model_cfg.float8_cfg.enable_float8:
-            self.float8_handler = Float8Handler(
-                scaling_granularity_gemm=self.model_cfg.float8_cfg.scaling_granularity_gemm,
-                scaling_granularity_grouped_gemm=self.model_cfg.float8_cfg.scaling_granularity_grouped_gemm,
-            )
         model = model.fully_shard(self.fsdp_cfg)

         if dist.get_rank() == 0:
             logger.info(model)
-
-        if self.float8_handler:
-            self.float8_handler.build_reduce_mesh(model, cast(DeviceMesh, model.fsdp_mesh))
         return model

     def build_optimizer(self, optim_cfg: OptimConfig) -> torch.optim.Optimizer:
@@ -200,18 +188,13 @@ def grad_accumulation_steps(self, data_batches_len: int):
         intra_layer_micro_batch = self.intra_layer_micro_batch
         return data_batches_len // intra_layer_micro_batch

-    # this method can be called outside, e.g., at the beginning of compute_actor_logprobs or compute_ref_logprobs during rl training
-    def maybe_precompute_float8_dynamic_scale_for_fsdp(self):
-        if self.float8_handler is not None:
-            self.float8_handler.precompute_float8_dynamic_scale_for_fsdp(self.model)
-
     def train_step(self, data_batches: list[ModelItem]) -> tuple[LossLog, OtherLog]:
         """Perform a training step with the given data batches and mesh.

         Args:
             data_batches (List[Dict]): The input data batches for the training step.
         """
-        self.maybe_precompute_float8_dynamic_scale_for_fsdp()
+        self._maybe_precompute_float8_dynamic_scale_for_fsdp()

         loss_log: LossLog = {}  # type: ignore[typeddict-item]
         other_log: OtherLog = {}  # type: ignore[typeddict-item]
@@ -523,3 +506,8 @@ def put_optimizer_to_device(self, device: torch.device | str):
                     state[key] = val.to(device, non_blocking=True)
         DEVICE_MODULE.synchronize()
         return
+
+    def _maybe_precompute_float8_dynamic_scale_for_fsdp(self):
+        for model in self.model.modules():
+            if isinstance(model, BaseModel) and model.float8_handler is not None:
+                model.float8_handler.precompute_float8_dynamic_scale_for_fsdp(model)
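
The relocated helper in the last hunk is the heart of this change: instead of the engine constructing and owning a single Float8Handler, each BaseModel now carries its own (possibly None) float8_handler, and the engine discovers the handlers at train_step time by walking self.model.modules(). That traversal also covers composed models in which several BaseModel instances are nested under one root module. The sketch below illustrates only that discovery pattern; TinyModel, DummyFloat8Handler, and Engine are illustrative stand-ins, not xtuner APIs.

from torch import nn


class DummyFloat8Handler:
    """Stand-in for the real Float8Handler; it only records the call."""

    def precompute_float8_dynamic_scale_for_fsdp(self, model: nn.Module) -> None:
        print(f"precomputing fp8 dynamic scales for {type(model).__name__}")


class TinyModel(nn.Module):
    """Stand-in for BaseModel: a module that may own a float8 handler."""

    def __init__(self, enable_float8: bool = False):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.float8_handler = DummyFloat8Handler() if enable_float8 else None


class Engine:
    def __init__(self, model: nn.Module):
        self.model = model

    def _maybe_precompute_float8_dynamic_scale_for_fsdp(self) -> None:
        # nn.Module.modules() yields the root module first and then every
        # submodule, so both a bare TinyModel and one nested inside a larger
        # composite are found by the same traversal.
        for module in self.model.modules():
            if isinstance(module, TinyModel) and module.float8_handler is not None:
                module.float8_handler.precompute_float8_dynamic_scale_for_fsdp(module)


# A composite with one fp8-enabled branch and one plain branch: only the
# fp8-enabled submodule triggers a precompute call.
composed = nn.ModuleDict({"actor": TinyModel(enable_float8=True), "ref": TinyModel()})
Engine(composed)._maybe_precompute_float8_dynamic_scale_for_fsdp()

One behavioural note: the removed comment shows the old public method was also called from outside the engine (e.g., at the beginning of compute_actor_logprobs or compute_ref_logprobs during RL training); the underscore-prefixed replacement signals that the precompute is now an internal detail of train_step, so any remaining external callers would have to reach the per-model handlers directly.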