diff --git a/xtuner/v1/engine/train_engine.py b/xtuner/v1/engine/train_engine.py index 184ee35db..9032332b8 100644 --- a/xtuner/v1/engine/train_engine.py +++ b/xtuner/v1/engine/train_engine.py @@ -318,11 +318,12 @@ def async_save_hf( save_dtype: torch.dtype = torch.bfloat16, cleanup_hf_dirs: Sequence[str | Path] = (), ) -> AsyncHFSaveHandle: - return self.model.async_save_hf( - hf_dir=hf_dir, - save_dtype=save_dtype, - cleanup_hf_dirs=cleanup_hf_dirs, - ) + with profile_time_and_memory(f"[Async saving HF to {hf_dir} launch cost]"): + return self.model.async_save_hf( + hf_dir=hf_dir, + save_dtype=save_dtype, + cleanup_hf_dirs=cleanup_hf_dirs, + ) def wait_async_hf(self, handle: AsyncHFSaveHandle | None = None) -> Path | None: return self.model.wait_async_hf(handle) diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py index cad387a99..336ebfebf 100644 --- a/xtuner/v1/model/base.py +++ b/xtuner/v1/model/base.py @@ -762,6 +762,7 @@ def _run_async_hf_writer( cleanup_done_path: Path, rank: int, ) -> None: + log_rank0.info(f"[Async saving HF to {tmp_hf_dir} writer] started") try: set_async_save_process_qos() self._cleanup_async_hf_dirs_before_write( @@ -775,7 +776,9 @@ def _run_async_hf_writer( weight_map=weight_map, status_path=status_path, ) + log_rank0.info(f"[Async saving HF to {tmp_hf_dir} writer] finished") except Exception as exc: + log_rank0.error(f"[Async saving HF to {tmp_hf_dir} writer] failed: {exc}") status = {"rank": rank, "ok": False, "error": str(exc), "weight_map": {}} with status_path.open("w") as f: f.write(json.dumps(status, indent=2)) diff --git a/xtuner/v1/model/compose/base.py b/xtuner/v1/model/compose/base.py index 68c9957ba..b970aaad5 100644 --- a/xtuner/v1/model/compose/base.py +++ b/xtuner/v1/model/compose/base.py @@ -258,6 +258,7 @@ def _run_async_hf_compose_writer( cleanup_done_path: Path, rank: int, ) -> None: + log_rank0.info(f"[Async saving HF to {tmp_hf_dir} writer] started") try: set_async_save_process_qos() self._cleanup_async_hf_dirs_before_write( @@ -270,7 +271,9 @@ def _run_async_hf_compose_writer( status = {"rank": rank, "ok": True, "error": "", "weight_map": merged_weight_map} with status_path.open("w") as f: f.write(json.dumps(status, indent=2)) + log_rank0.info(f"[Async saving HF to {tmp_hf_dir} writer] finished") except Exception as exc: + log_rank0.error(f"[Async saving HF to {tmp_hf_dir} writer] failed: {exc}") status = {"rank": rank, "ok": False, "error": str(exc), "weight_map": {}} with status_path.open("w") as f: f.write(json.dumps(status, indent=2))