diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index c8dd0cca535..8119f27b4b5 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -242,12 +242,6 @@ def graph_optimize_and_warm_up_model(self) -> None: ): self.model_runner.capture_model_prefill_and_mixed() - # Capture CUDAGraph for decode phase (all modes) - self.model_runner.capture_model() - - # Block-wise CUDA graph capture (independent loop) - self.model_runner.capture_block_wise_graphs() - # Deterministic mode: reset RNG and share_inputs after warmup. # Warmup _dummy_run() calls consume CUDA RNG state and leave stale # data (infer_seed, stop_flags, seq_lens, etc.) in share_inputs. @@ -257,6 +251,12 @@ def graph_optimize_and_warm_up_model(self) -> None: set_random_seed(self.fd_config.model_config.seed) self.model_runner.share_inputs.reset_share_inputs() + # Capture CUDAGraph for decode phase (all modes) + self.model_runner.capture_model() + + # Block-wise CUDA graph capture (independent loop) + self.model_runner.capture_block_wise_graphs() + def check_health(self) -> bool: """ """ return True