1717import asyncio
1818import os
1919import sys
20+ import tempfile
2021import threading
2122import time
2223import types
@@ -39,7 +40,11 @@ def enable_torch_proxy(scope=None):
3940 paddle .compat = _PaddleCompat ()
4041
4142from fastdeploy .engine .args_utils import EngineArgs
42- from fastdeploy .engine .common_engine import EngineService
43+ from fastdeploy .engine .common_engine import (
44+ EngineService ,
45+ _format_worker_launch_failure_message ,
46+ _read_latest_worker_traceback ,
47+ )
4348from fastdeploy .engine .request import (
4449 ControlRequest ,
4550 ControlResponse ,
@@ -3722,3 +3727,87 @@ def fake_time():
37223727
37233728 eng .resource_manager .recycle_abort_task .assert_called_with ("req-1_0" )
37243729 self ._detach_finalizer (eng )
3730+
3731+
class TestWorkerTracebackFunctions(unittest.TestCase):
    """Tests for _read_latest_worker_traceback and _format_worker_launch_failure_message."""

    @staticmethod
    def _write_worker_log(log_dir, name, content):
        """Write ``content`` to ``log_dir/name`` as UTF-8 and return the file path."""
        path = os.path.join(log_dir, name)
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(content)
        return path

    def test_read_latest_worker_traceback_finds_traceback(self):
        """A traceback present in a workerlog file is returned."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Some normal log output\n"
                "Traceback (most recent call last):\n"
                '  File "worker_process.py", line 1, in <module>\n'
                "    run_worker_proc()\n"
                "ValueError: The total number of blocks cannot be less than zero.\n",
            )

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNotNone(result)
            self.assertIn("Traceback (most recent call last):", result)
            self.assertIn("ValueError:", result)

    def test_read_latest_worker_traceback_returns_none_when_no_traceback(self):
        """None is returned when the workerlog file contains no traceback."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(temp_dir, "workerlog.0", "Normal log output without any errors\n")

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_when_no_files(self):
        """None is returned when the directory holds no workerlog files."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNone(result)

    def test_read_latest_worker_traceback_returns_none_for_nonexistent_dir(self):
        """None is returned when the log directory does not exist."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # A child of a fresh temp dir is guaranteed not to exist, unlike a
            # hard-coded absolute path that depends on the host filesystem.
            missing_dir = os.path.join(temp_dir, "does-not-exist")
            result = _read_latest_worker_traceback(missing_dir)
            self.assertIsNone(result)

    def test_read_latest_worker_traceback_picks_latest_file(self):
        """With several workerlog files, the traceback of the newest one is returned."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create the older file.
            old_log = self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Traceback (most recent call last):\nOldError: old error\n",
            )

            # Backdate the older file explicitly instead of sleeping: a short sleep
            # does not guarantee distinct timestamps on filesystems with coarse
            # mtime granularity, which makes the ordering flaky.
            # NOTE(review): assumes "latest" is decided by file modification time --
            # confirm against _read_latest_worker_traceback.
            old_stat = os.stat(old_log)
            os.utime(old_log, (old_stat.st_atime - 60, old_stat.st_mtime - 60))

            # Create the newer file.
            self._write_worker_log(
                temp_dir,
                "workerlog.1",
                "Traceback (most recent call last):\nNewError: new error\n",
            )

            result = _read_latest_worker_traceback(temp_dir)
            self.assertIsNotNone(result)
            self.assertIn("NewError", result)

    def test_format_worker_launch_failure_message_with_traceback(self):
        """The failure message embeds the traceback found in the workerlog file."""
        with tempfile.TemporaryDirectory() as temp_dir:
            self._write_worker_log(
                temp_dir,
                "workerlog.0",
                "Traceback (most recent call last):\nValueError: Test error message\n",
            )

            result = _format_worker_launch_failure_message(temp_dir)
            self.assertIn("Failed to launch worker processes", result)
            self.assertIn("workerlog.*", result)
            self.assertIn("Traceback (most recent call last):", result)
            self.assertIn("ValueError: Test error message", result)

    def test_format_worker_launch_failure_message_without_traceback(self):
        """The failure message omits any traceback when none is available."""
        with tempfile.TemporaryDirectory() as temp_dir:
            result = _format_worker_launch_failure_message(temp_dir)
            self.assertIn("Failed to launch worker processes", result)
            self.assertIn("workerlog.*", result)
            self.assertNotIn("Traceback", result)
0 commit comments