1717import asyncio
1818import os
1919import sys
20+ import tempfile
2021import threading
2122import time
2223import types
@@ -39,7 +40,11 @@ def enable_torch_proxy(scope=None):
3940 paddle .compat = _PaddleCompat ()
4041
4142from fastdeploy .engine .args_utils import EngineArgs
42- from fastdeploy .engine .common_engine import EngineService
43+ from fastdeploy .engine .common_engine import (
44+ EngineService ,
45+ _format_worker_launch_failure_message ,
46+ _read_latest_worker_traceback ,
47+ )
4348from fastdeploy .engine .request import (
4449 ControlRequest ,
4550 ControlResponse ,
@@ -3504,3 +3509,90 @@ def _fake_sleep(s):
35043509 # At least one sleep call was made, confirming the inner function executed
35053510 self .assertGreaterEqual (call_count [0 ], 1 )
35063511 self ._detach_finalizer (eng )
3512+
3513+
class TestWorkerTracebackFunctions(unittest.TestCase):
    """Tests for _read_latest_worker_traceback and _format_worker_launch_failure_message.

    Every test works inside a throwaway temporary directory that stands in
    for the worker log directory, so no real log files are touched.
    """

    @staticmethod
    def _write_worker_log(log_dir, name, content):
        """Create ``name`` inside ``log_dir`` holding ``content``; return its path."""
        log_path = os.path.join(log_dir, name)
        with open(log_path, "w", encoding="utf-8") as fp:
            fp.write(content)
        return log_path

    def test_read_latest_worker_traceback_finds_traceback(self):
        """A traceback present in a workerlog file is returned to the caller."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Some normal log output\n "
                "Traceback (most recent call last):\n "
                ' File "worker_process.py", line 1, in <module>\n '
                " run_worker_proc()\n "
                "ValueError: The total number of blocks cannot be less than zero.\n ",
            )

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNotNone(result)
            self.assertIn("Traceback (most recent call last):", result)
            self.assertIn("ValueError:", result)

    def test_read_latest_worker_traceback_returns_none_when_no_traceback(self):
        """None is returned when the workerlog file holds no traceback."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Normal log output without any errors\n ",
            )

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_when_no_files(self):
        """None is returned when the directory has no workerlog files."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_for_nonexistent_dir(self):
        """None is returned when the log directory does not exist."""
        result = _read_latest_worker_traceback("/nonexistent/path")
        self.assertIsNone(result)

    def test_read_latest_worker_traceback_picks_latest_file(self):
        """With several workerlog files, the newest file's traceback is reported.

        NOTE(review): this assumes "latest" is decided by file modification
        time -- confirm against the helper's implementation.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            # Older file.
            old_log = self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Traceback (most recent call last):\n OldError: old error\n ",
            )

            # Backdate the older file explicitly instead of sleeping: a short
            # sleep does not guarantee distinct timestamps on filesystems with
            # coarse timestamp resolution, and it slows the suite down.
            stale_time = os.path.getmtime(old_log) - 60
            os.utime(old_log, (stale_time, stale_time))

            # Newer file, written afterwards so it is also the most recently
            # created one.
            self._write_worker_log(
                temp_dir,
                "workerlog.1",
                "Traceback (most recent call last):\n NewError: new error\n ",
            )

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNotNone(result)
            self.assertIn("NewError", result)

    def test_format_worker_launch_failure_message_with_traceback(self):
        """The failure message embeds the traceback found in the worker log."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Traceback (most recent call last):\n "
                "ValueError: Test error message\n ",
            )

            result = _format_worker_launch_failure_message(temp_dir)
            self.assertIn("Failed to launch worker processes", result)
            self.assertIn("workerlog.*", result)
            self.assertIn("Traceback (most recent call last):", result)
            self.assertIn("ValueError: Test error message", result)

    def test_format_worker_launch_failure_message_without_traceback(self):
        """Without any worker log, the message carries no traceback section."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _format_worker_launch_failure_message(temp_dir)
            self.assertIn("Failed to launch worker processes", result)
            self.assertIn("workerlog.*", result)
            self.assertNotIn("Traceback", result)
0 commit comments