From 190eb1899f308b8aeaa484346432fbf07412d8d2 Mon Sep 17 00:00:00 2001 From: EmmonsCurse <1577972691@qq.com> Date: Mon, 30 Mar 2026 20:15:38 +0800 Subject: [PATCH 1/5] [CI] Optimize test execution with single-GPU parallelism and log collection --- .github/workflows/_unit_test_coverage.yml | 11 + scripts/coverage_run.sh | 377 +++++++++++++++++----- 2 files changed, 303 insertions(+), 85 deletions(-) diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 8b2d0272bc8..98f6db79283 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -47,6 +47,7 @@ jobs: outputs: all_cov_file_url: ${{ steps.cov_upload.outputs.all_cov_file_url }} unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }} + unittest_logs_url: ${{ steps.cov_upload.outputs.unittest_logs_url }} diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }} steps: - name: Code Prepare @@ -309,6 +310,15 @@ jobs: echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV fi + # Only upload logs when tests failed + unittest_logs_archive="unittest_logs.tar.gz" + if [ "$HAS_FAILED_TESTS" = true ]; then + python ${push_file} ${unittest_logs_archive} ${target_path}/UnitTestResult + UNIT_TEST_LOGS_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_logs_archive} + echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" >> $GITHUB_OUTPUT + echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" >> $GITHUB_ENV + fi + if [[ "$IS_PR" != "true" ]]; then full_cov_file="full_coverage_report.txt" full_cov_csv="full_coverage_report.csv" @@ -345,6 +355,7 @@ jobs: if [ -f "${filename}" ];then echo "Failed test cases:" cat "${filename}" + echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" fi exit "$TEST_EXIT_CODE" fi diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index cf2ead344b1..cfdd118a1d5 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -7,113 +7,320 @@ run_path=$( realpath "$DIR/../") export COVERAGE_FILE=${COVERAGE_FILE:-$DIR/../coveragedata/.coverage} export COVERAGE_RCFILE=${COVERAGE_RCFILE:-$DIR/../scripts/.coveragerc} +# ============================================================ +# Classify tests into one of the following categories +# - multi_gpu: requires multiple GPUs / ports (run sequentially) +# - single_gpu: independent tests (can run in parallel) +# ============================================================ +classify_tests() { + local test_file=$1 + # Rule 1: distributed tests (explicit multi-GPU launch) + if [[ "$test_file" =~ tests/distributed/.*test_.*\.py ]]; then + echo "multi_gpu" + return + fi + + # Rule 2: e2e tests (usually involve service / ports) + if [[ "$test_file" =~ tests/e2e/.*test_.*\.py ]]; then + echo "multi_gpu" + return + fi + + # Rule 3: model loader tests (allocate multiple GPUs) + if [[ "$test_file" =~ tests/model_loader/.*test_.*\.py ]]; then + echo "multi_gpu" + return + fi + + # Rule 4: check file content for tensor_parallel_size=[234] or --tensor-parallel-size [234] + # or CUDA_VISIBLE_DEVICES="0,1" + # or PORT environment variables + if [ -f "$test_file" ]; then + if grep -q '"tensor_parallel_size".*[1234]\|--tensor-parallel-size.*[1234]\|tensor_parallel_size.*=[1234]\|CUDA_VISIBLE_DEVICES.*0.*1\|paddle\.distributed\.launch.*--gpus.*0.*1\|FD_API_PORT\|FLASK_PORT\|FD_ENGINE_QUEUE_PORT\|FD_METRICS_PORT\|FD_CACHE_QUEUE_PORT\|FD_ROUTER_PORT\|FD_CONNECTOR_PORT\|FD_RDMA_PORT' "$test_file" 2>/dev/null; then + 
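+            # Note: this grep is deliberately broad. The class [1234] also
+            # matches tensor_parallel_size=1, and any mention of an FD_*_PORT
+            # variable is enough to route a file to the sequential lane; a
+            # false positive here only costs parallelism, never correctness.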
echo "multi_gpu" + return + fi + fi + + # ========== Single-GPU tests (no port required, can run in parallel) ========== + echo "single_gpu" +} + +# ============================================================ +# Run Test With Logging +# ============================================================ +run_test_with_logging() { + local test_file=$1 + local log_prefix=$2 + local status + + echo "Running: $test_file" + + # Create isolated log directory for this test to avoid race conditions + # Format: unittest_logs///log + local test_rel_path="${test_file#tests/}" + local test_dir=$(dirname "$test_rel_path") + local test_name=$(basename "$test_file" .py) + local isolated_log_dir="${run_path}/unittest_logs/${test_dir}/${test_name}/log" + mkdir -p "$isolated_log_dir" + + # Set FD_LOG_DIR to isolate logs for each test + export FD_LOG_DIR="$isolated_log_dir" + + # Run test + timeout 600 python -m coverage run -m pytest -c ${PYTEST_INI} "$test_file" -vv -s + status=$? + + if [ "$status" -ne 0 ]; then + echo "$test_file" >> "$log_prefix" + echo "" + echo "==================== Test Failed: $test_file ====================" + + # Use isolated log directory for this test + if [ -d "$isolated_log_dir" ]; then + echo + echo ">>>> Processing log directory: ${isolated_log_dir}" + + # workerlog + worker_logs=("${isolated_log_dir}"/workerlog.0) + + if [ -f "${worker_logs[0]}" ]; then + for worker_log in "${worker_logs[@]}"; do + [ -f "${worker_log}" ] || continue + echo "---------------- ${worker_log} (last 100 lines) ----------------" + tail -n 100 "${worker_log}" || true + echo "---------------------------------------------------------------" + done + fi + + echo ">>> grep error in ${isolated_log_dir}" + grep -Rni --color=auto "error" "${isolated_log_dir}" || true + fi + + echo "=======================================================" + fi + + # Clean up port-related processes + if [ -n "$FD_CACHE_QUEUE_PORT" ]; then + ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true + fi + if [ -n "$FD_ENGINE_QUEUE_PORT" ]; then + ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true + fi + + # Unset FD_LOG_DIR to avoid affecting next test + unset FD_LOG_DIR + return $status +} + +# ============================================================ +# Run a shard of tests on a dedicated GPU +# - one shard = one process = one GPU +# ============================================================ +run_shard() { + local shard_name=$1 + local gpu_id=$2 + shift 2 + local tests=("$@") + + echo "====================================" + echo "Starting shard '${shard_name}' on GPU ${gpu_id}" + echo "Tests count: ${#tests[@]}" + echo "====================================" + + # Set GPU + export CUDA_VISIBLE_DEVICES="$gpu_id" + export COVERAGE_FILE="${DIR}/../coveragedata/.coverage.${shard_name}" + + # Failed log filename (no path, directly in project root) + local failed_log="${shard_name}_failed.txt" + rm -f "$failed_log" + > "$failed_log" + + local success_count=0 + local failed_count=0 + + for file in "${tests[@]}"; do + echo "[${shard_name}] Running: $file" + + run_test_with_logging "$file" "$failed_log" + local status=$? 
+ + if [ "$status" -eq 0 ]; then + success_count=$((success_count + 1)) + else + failed_count=$((failed_count + 1)) + fi + done + + unset COVERAGE_FILE + + echo "====================================" + echo "Shard '${shard_name}' completed" + echo "Successful: $success_count" + echo "Failed: $failed_count" + echo "====================================" + + unset CUDA_VISIBLE_DEVICES + + return $failed_count +} + +# ============================================================ +# Main Flow +# ============================================================ failed_tests_file="failed_tests.log" > "$failed_tests_file" +echo "====================================" +echo "Coverage Test Execution with Parallel Single-GPU Tests" +echo "====================================" -################################## -# Run pytest, one file at a time -# Use pytest's --collect-only output to extract the actual test file paths (e.g., tests/.../test_*.py). -# Note: pytest may output lines like "ERROR tests/xxx/test_xxx.py::test_xxx ..." on collection failure, -# to avoid treating prefixes like "ERROR"/"FAILED"/"collecting" as filenames, -# we only keep the "tests/.../test_*.py" portion and discard everything else. -TEST_FILES=$( - python -m pytest --collect-only -q -c "${PYTEST_INI}" "${tests_path}" --rootdir="${run_path}" --disable-warnings 2>&1 \ +# ============================================================ +# Step 1: Collect & classify tests +# ============================================================ +echo "Step 1: Collecting and classifying tests" + +ALL_TEST_FILES=$( + python -m pytest --collect-only -q -c "${PYTEST_INI}" "${tests_path}" --rootdir="${run_path}" --disable-warnings 2>&1 \ | grep -E 'tests/.+\/test_.*\.py' \ | sed -E 's@.*(tests/[^: ]*test_[^: ]*\.py).*@\1@' \ | sort -u ) +if [ -z "$ALL_TEST_FILES" ]; then + echo "ERROR: No test files found!" + exit 1 +fi + +MULTI_GPU_TESTS=() +SINGLE_GPU_TESTS=() + +TOTAL_TESTS=0 +for file in $ALL_TEST_FILES; do + TOTAL_TESTS=$((TOTAL_TESTS + 1)) + test_type=$(classify_tests "$file") + + case "$test_type" in + "multi_gpu") + MULTI_GPU_TESTS+=("$file") + ;; + "single_gpu") + SINGLE_GPU_TESTS+=("$file") + ;; + esac +done + +echo "Multi-GPU tests: ${#MULTI_GPU_TESTS[@]}" +echo "Single-GPU tests: ${#SINGLE_GPU_TESTS[@]}" +echo "Total tests: $TOTAL_TESTS" -failed_pytest=0 -success_pytest=0 +# ============================================================ +# Step 2: Run multi-GPU tests (sequential) +# ============================================================ +echo "Step 2: Running multi-GPU tests" -# nullglob: if no match, the pattern expands to nothing -shopt -s nullglob +if [ ${#MULTI_GPU_TESTS[@]} -gt 0 ]; then + export CUDA_VISIBLE_DEVICES="0,1" -for file in $TEST_FILES; do - echo "Running pytest file: $file" - # Clean up previous logs - rm -rf "${run_path}"/log* || true - for f in "${run_path}"/*.log; do - [[ "$(basename "$f")" != "${failed_tests_file}" ]] && rm -f "$f" + for file in "${MULTI_GPU_TESTS[@]}"; do + run_test_with_logging "$file" "$failed_tests_file" done - # Run pytest with coverage for the current file - # Set timeout to 600 seconds to avoid infinite loop - timeout 600 python -m coverage run -m pytest -c ${PYTEST_INI} "$file" -vv -s - status=$? - if [ "$status" -ne 0 ]; then - echo "$file" >> "$failed_tests_file" - failed_pytest=$((failed_pytest+1)) + unset CUDA_VISIBLE_DEVICES +else + echo "No multi-GPU tests to run." 
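+    # Either way, multi-GPU tests never overlap: each may claim both GPUs and
+    # bind FD_*_PORT service ports, so concurrent runs could collide. Their
+    # failures append directly to failed_tests.log.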
+fi - echo "" - echo "==================== Dumping Logs ====================" - - for log_dir in "${run_path}"/log*; do - if [ -d "${log_dir}" ]; then - echo - echo ">>>> Processing log directory: ${log_dir}" - - # print all workerlog.0 - worker_logs=("${log_dir}"/workerlog.0) - if [ "${#worker_logs[@]}" -gt 0 ]; then - for worker_log in "${worker_logs[@]}"; do - if [ -f "${worker_log}" ]; then - echo "---------------- ${worker_log} (last 100 lines) ----------------" - tail -n 100 "${worker_log}" || true - echo "---------------------------------------------------------------" - fi - done - else - echo "No workerlog.0 found in ${log_dir}" - fi - - echo ">>> grep error in ${log_dir}" - grep -Rni --color=auto "error" "${log_dir}" || true - fi - done - - # print all server logs - server_logs=("${run_path}"/*.log) - if [ "${#server_logs[@]}" -gt 0 ]; then - for server_log in "${server_logs[@]}"; do - # skip failed_tests_file - [[ "$(basename "$server_log")" == "$failed_tests_file" ]] && continue - if [ -f "${server_log}" ]; then - echo - echo "---------------- ${server_log} (last 100 lines) ----------------" - tail -n 100 "${server_log}" || true - echo "---------------------------------------------------------------" - fi - done - else - echo "No *.log files found" +# ============================================================ +# Step 3: Run single-GPU tests (parallel shards) +# ============================================================ +echo "Step 3: Running single-GPU tests in parallel" + +if [ ${#SINGLE_GPU_TESTS[@]} -gt 0 ]; then + # Split single-GPU tests into 2 shards (1 per GPU) + TOTAL=${#SINGLE_GPU_TESTS[@]} + HALF=$(( TOTAL / 2 )) + + SHARD_1=("${SINGLE_GPU_TESTS[@]:0:$HALF}") + SHARD_2=("${SINGLE_GPU_TESTS[@]:$HALF}") + + echo "Shard 1: ${#SHARD_1[@]} tests on GPU 0" + echo "Shard 2: ${#SHARD_2[@]} tests on GPU 1" + + # Run in parallel (1 process per GPU) + run_shard "shard1" 0 "${SHARD_1[@]}" & + PID1=$! + run_shard "shard2" 1 "${SHARD_2[@]}" & + PID2=$! + + # Wait for all shards to complete + wait $PID1 + EXIT_CODE1=$? + wait $PID2 + EXIT_CODE2=$? + + # Merge shard failed logs to main failed log + for shard in shard1 shard2; do + if [ -f "${shard}_failed.txt" ]; then + cat "${shard}_failed.txt" >> "$failed_tests_file" + rm -f "${shard}_failed.txt" fi + done - echo "======================================================" - else - success_pytest=$((success_pytest+1)) - fi - ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 - ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 -done -shopt -u nullglob + echo "" + echo "====================================" + echo "Parallel execution completed" + echo "Shard 1 exit code: $EXIT_CODE1" + echo "Shard 2 exit code: $EXIT_CODE2" + echo "====================================" +else + echo "No single-GPU tests to run." 
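+    # The shard split is a plain halving: with TOTAL=7, HALF=3, shard1 takes
+    # tests [0,3) on GPU 0 and shard2 the remaining 4 on GPU 1. Each shard
+    # writes its own .coverage.shard{1,2} data file, which downstream tooling
+    # can merge (e.g. `coverage combine`).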
+fi + +# ============================================================ +# Step 4: Summary +# ============================================================ +echo "Step 4: Summary" + +# Count failed tests +if [ -f "$failed_tests_file" ]; then + failed_count=$(wc -l < "$failed_tests_file" | tr -d ' ') +else + failed_count=0 +fi + +success_count=$((TOTAL_TESTS - failed_count)) + +echo "Pytest total: $TOTAL_TESTS" +echo "Pytest successful: $success_count" +echo "Pytest failed: $failed_count" -################################## -# Summary -################################## echo "====================================" -echo "Pytest total: $((failed_pytest + success_pytest))" -echo "Pytest successful: $success_pytest" -echo "Pytest failed: $failed_pytest" +# Exit with error and package logs if there were failures +if [ "$failed_count" -ne 0 ]; then + echo "Failed test cases are listed in $failed_tests_file" + cat "$failed_tests_file" + + # Only package logs when there are failures + echo "====================================" + echo "Step 5: Packaging logs (only on failure)" + echo "====================================" + + if [ -d "${run_path}/unittest_logs" ]; then + tar -czf "${run_path}/unittest_logs.tar.gz" -C "${run_path}" unittest_logs + echo "unittest_logs packaged to: ${run_path}/unittest_logs.tar.gz" + ls -lh "${run_path}/unittest_logs.tar.gz" + else + echo "No unittest_logs directory found." + fi + + echo "====================================" -if [ "$failed_pytest" -ne 0 ]; then - echo "Failed test cases are listed in $failed_tests_file" - cat "$failed_tests_file" - exit 8 + exit 8 fi echo "All tests passed!" +exit 0 From 98573dca9fcc48a20510d47b387682d4cf70c3fb Mon Sep 17 00:00:00 2001 From: EmmonsCurse <1577972691@qq.com> Date: Mon, 30 Mar 2026 21:02:09 +0800 Subject: [PATCH 2/5] remove export CUDA_VISIBLE_DEVICES --- scripts/coverage_run.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index cfdd118a1d5..56b55bab587 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -222,13 +222,9 @@ echo "Total tests: $TOTAL_TESTS" echo "Step 2: Running multi-GPU tests" if [ ${#MULTI_GPU_TESTS[@]} -gt 0 ]; then - export CUDA_VISIBLE_DEVICES="0,1" - for file in "${MULTI_GPU_TESTS[@]}"; do run_test_with_logging "$file" "$failed_tests_file" done - - unset CUDA_VISIBLE_DEVICES else echo "No multi-GPU tests to run." 
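# The blanket export above is redundant: each multi-GPU e2e test builds its
# own subprocess env (e.g. env_prefill["CUDA_VISIBLE_DEVICES"] = "0"), and an
# exported value would leak into whatever runs after this step.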
fi From 5e7ce492c5168b972c186c805620fae930e87079 Mon Sep 17 00:00:00 2001 From: EmmonsCurse <1577972691@qq.com> Date: Tue, 31 Mar 2026 16:13:58 +0800 Subject: [PATCH 3/5] fix path error --- .github/workflows/_unit_test_coverage.yml | 2 +- scripts/coverage_run.sh | 59 ++++++++++++++----- tests/e2e/test_EB_Lite_serving.py | 3 +- tests/e2e/test_EB_VL_Lite_serving.py | 3 +- tests/e2e/test_Qwen2-7B-Instruct_serving.py | 3 +- tests/e2e/test_Qwen2_5_VL_serving.py | 3 +- .../openai/test_multi_api_server.py | 3 +- 7 files changed, 54 insertions(+), 22 deletions(-) diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 98f6db79283..75aef3e937a 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -355,7 +355,7 @@ jobs: if [ -f "${filename}" ];then echo "Failed test cases:" cat "${filename}" - echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" + echo "unittest_logs_url=${unittest_logs_url}" fi exit "$TEST_EXIT_CODE" fi diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 56b55bab587..78174156ec6 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -54,7 +54,7 @@ run_test_with_logging() { local log_prefix=$2 local status - echo "Running: $test_file" + echo "Running pytest file: $test_file" # Create isolated log directory for this test to avoid race conditions # Format: unittest_logs///log @@ -97,9 +97,36 @@ run_test_with_logging() { grep -Rni --color=auto "error" "${isolated_log_dir}" || true fi + # print all server logs + server_logs=("${run_path}"/*.log) + if [ "${#server_logs[@]}" -gt 0 ]; then + for server_log in "${server_logs[@]}"; do + # skip failed_tests_file + [[ "$(basename "$server_log")" == "$failed_tests_file" ]] && continue + if [ -f "${server_log}" ]; then + echo + echo "---------------- ${server_log} (last 100 lines) ----------------" + tail -n 100 "${server_log}" || true + echo "---------------------------------------------------------------" + fi + done + else + echo "No *.log files found" + fi + echo "=======================================================" fi + # if passed, remove the isolated log directory and server logs + if [ "$status" -eq 0 ]; then + rm -rf "${isolated_log_dir}" || true + # Clean up server logs in run_path on pass + for f in "${run_path}"/*.log; do + [[ "$(basename "$f")" != "${failed_tests_file}" ]] && rm -f "$f" || true + done + fi + + # Clean up port-related processes if [ -n "$FD_CACHE_QUEUE_PORT" ]; then ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true @@ -297,25 +324,25 @@ echo "====================================" # Exit with error and package logs if there were failures if [ "$failed_count" -ne 0 ]; then - echo "Failed test cases are listed in $failed_tests_file" - cat "$failed_tests_file" + echo "Failed test cases are listed in $failed_tests_file" + cat "$failed_tests_file" - # Only package logs when there are failures - echo "====================================" - echo "Step 5: Packaging logs (only on failure)" - echo "====================================" + # Only package logs when there are failures + echo "====================================" + echo "Step 5: Packaging logs (only on failure)" + echo "====================================" - if [ -d "${run_path}/unittest_logs" ]; then - tar -czf "${run_path}/unittest_logs.tar.gz" -C "${run_path}" unittest_logs - echo "unittest_logs packaged to: ${run_path}/unittest_logs.tar.gz" - ls -lh 
"${run_path}/unittest_logs.tar.gz" - else - echo "No unittest_logs directory found." - fi + if [ -d "${run_path}/unittest_logs" ]; then + tar -czf "${run_path}/unittest_logs.tar.gz" -C "${run_path}" unittest_logs + echo "Logs packaged to: ${run_path}/unittest_logs.tar.gz" + ls -lh "${run_path}/unittest_logs.tar.gz" + else + echo "No unittest_logs directory found." + fi - echo "====================================" + echo "====================================" - exit 8 + exit 8 fi echo "All tests passed!" diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 9d36fa672a9..6c84306927c 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -1389,7 +1389,8 @@ def test_streaming_chat_finish_reason(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" + log_dir = os.getenv("FD_LOG_DIR", "log") + log_file = os.path.join(log_dir, "config.log") baseline = 31446 if not os.path.exists(log_file): diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index 57356143737..778e192b40a 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -734,7 +734,8 @@ def test_chat_with_response_max_tokens(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" + log_dir = os.getenv("FD_LOG_DIR", "log") + log_file = os.path.join(log_dir, "config.log") baseline = 40000 if not os.path.exists(log_file): diff --git a/tests/e2e/test_Qwen2-7B-Instruct_serving.py b/tests/e2e/test_Qwen2-7B-Instruct_serving.py index dabcb1231ce..d539f4d76f9 100644 --- a/tests/e2e/test_Qwen2-7B-Instruct_serving.py +++ b/tests/e2e/test_Qwen2-7B-Instruct_serving.py @@ -612,7 +612,8 @@ def test_streaming(openai_client, capsys): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" + log_dir = os.getenv("FD_LOG_DIR", "log") + log_file = os.path.join(log_dir, "config.log") baseline = 32562 if not os.path.exists(log_file): diff --git a/tests/e2e/test_Qwen2_5_VL_serving.py b/tests/e2e/test_Qwen2_5_VL_serving.py index 37362e75322..d2ae7d3562c 100644 --- a/tests/e2e/test_Qwen2_5_VL_serving.py +++ b/tests/e2e/test_Qwen2_5_VL_serving.py @@ -430,7 +430,8 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过15%""" - log_file = "./log/config.log" + log_dir = os.getenv("FD_LOG_DIR", "log") + log_file = os.path.join(log_dir, "config.log") baseline = 30000 if not os.path.exists(log_file): diff --git a/tests/entrypoints/openai/test_multi_api_server.py b/tests/entrypoints/openai/test_multi_api_server.py index 58aa07cbf45..a333ee6de0c 100644 --- a/tests/entrypoints/openai/test_multi_api_server.py +++ b/tests/entrypoints/openai/test_multi_api_server.py @@ -99,7 +99,8 @@ def test_start_servers_success(self, mock_is_port_available, mock_popen): # Verify environment variables are set correctly first_call_kwargs = mock_popen.call_args_list[0][1] self.assertIn("env", first_call_kwargs) - self.assertEqual(first_call_kwargs["env"]["FD_LOG_DIR"], "log/log_0") + log_dir = os.getenv("FD_LOG_DIR", "log") + self.assertEqual(first_call_kwargs["env"]["FD_LOG_DIR"], os.path.join(log_dir, "log_0")) @patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available") def 
test_check_param_success(self, mock_is_port_available): From 0e4606dbda39ffab713d957e7fa13313efbe5abd Mon Sep 17 00:00:00 2001 From: EmmonsCurse <1577972691@qq.com> Date: Tue, 31 Mar 2026 19:48:27 +0800 Subject: [PATCH 4/5] fix log_* path and debug --- .../batch_invariant_ops.py | 16 ++++---- tests/e2e/test_EB_Lite_serving.py | 5 ++- tests/e2e/test_ernie_03b_pd_router_v1_ipc.py | 10 +++-- ...rnie_03b_pd_router_v1_rdma_global_cache.py | 37 ++++++++++++------- .../test_ernie_03b_pd_router_v1_rdma_tp1.py | 8 ++-- .../test_ernie_03b_pd_router_v1_rdma_tp2.py | 8 ++-- tests/e2e/test_ernie_03b_router.py | 10 +++-- 7 files changed, 58 insertions(+), 36 deletions(-) diff --git a/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py b/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py index c0df764c07c..fc632dad838 100644 --- a/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py +++ b/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py @@ -690,13 +690,15 @@ def addmm_batch_invariant( return result -def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1, out=None) -> paddle.Tensor: - result = log_softmax(input=x, axis=axis) - # Handle out parameter if provided - if out is not None: - out.copy_(result) - return out - return result +# def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1, out=None) -> paddle.Tensor: +# result = log_softmax(input=x, axis=axis) +# # Handle out parameter if provided +# if out is not None: +# out.copy_(result) +# return out +# return result +def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1) -> paddle.Tensor: + return log_softmax(input=x, axis=axis) def mean_batch_invariant( diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 6c84306927c..5a484cb21d1 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -1389,8 +1389,9 @@ def test_streaming_chat_finish_reason(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_dir = os.getenv("FD_LOG_DIR", "log") - log_file = os.path.join(log_dir, "config.log") + log_file = "./log/config.log" + # log_dir = os.getenv("FD_LOG_DIR", "log") + # log_file = os.path.join(log_dir, "config.log") baseline = 31446 if not os.path.exists(log_file): diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py b/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py index 1918d093ddb..85e4075a6e4 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py @@ -81,10 +81,12 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") + base_log_dir = os.getenv("FD_LOG_DIR", "log") + # router print("start router...") env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" + env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router") router_log_path = "router.log" router_cmd = [ @@ -110,7 +112,7 @@ def setup_and_run_server(): env_prefill = os.environ.copy() env_prefill["CUDA_VISIBLE_DEVICES"] = "0" env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "1" - env_prefill["FD_LOG_DIR"] = "log_prefill" + env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill") prefill_log_path = "prefill.log" prefill_cmd = [ sys.executable, @@ -160,7 +162,7 @@ def setup_and_run_server(): env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "1" 
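# base_log_dir falls back to "log" when FD_LOG_DIR is unset; under the CI
# runner it is unittest_logs/<dir>/<test>/log, so the join below lands the
# decode server's logs inside the per-test tree (e.g. .../log/log_decode),
# where the failure-only packaging step can pick them up.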
- env_decode["FD_LOG_DIR"] = "log_decode" + env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode") decode_log_path = "decode.log" decode_cmd = [ sys.executable, @@ -416,4 +418,4 @@ def test_non_chat_usage_non_stream(api_url): total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" + assert usage["total_tokens"] != total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py index bf5ec6dd1b6..723248d341e 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py @@ -78,6 +78,16 @@ def wait_for_mooncake_master(host: str = "127.0.0.1", port: int = FD_MOONCAKE_MA return False +def prepare_log_dir(name): + """ + Prepare log directory for test. + """ + base = os.getenv("FD_LOG_DIR", "log") + path = os.path.join(base, name) + os.makedirs(path, exist_ok=True) + return path + + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -136,8 +146,8 @@ def setup_and_run_server(): ) env_master = os.environ.copy() - env_master["FD_LOG_DIR"] = "log_master" - os.makedirs("log_master", exist_ok=True) + master_log_dir = prepare_log_dir("log_master") + env_master["FD_LOG_DIR"] = master_log_dir master_cmd = [ "mooncake_master", @@ -147,7 +157,7 @@ def setup_and_run_server(): f"--http_metadata_server_port={FD_MOONCAKE_METADATA_PORT}", ] - with open("log_master/nohup", "w") as logfile: + with open(os.path.join(master_log_dir, "nohup"), "w") as logfile: process_master = subprocess.Popen( master_cmd, stdout=logfile, @@ -160,7 +170,7 @@ def setup_and_run_server(): if not wait_for_mooncake_master(port=FD_MOONCAKE_MASTER_PORT, timeout=30): print("[ERROR] Mooncake Master failed to start") # Print mooncake master log for debugging - master_log_path = "log_master/nohup" + master_log_path = os.path.join(master_log_dir, "nohup") if os.path.exists(master_log_path): print(f"\n===== Mooncake Master Log ({master_log_path}) =====") with open(master_log_path, "r") as f: @@ -175,9 +185,9 @@ def setup_and_run_server(): # ======================== Start Router ======================== print("start router...") env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" - os.makedirs("log_router", exist_ok=True) - router_log_path = "log_router/nohup.log" + router_log_dir = prepare_log_dir("log_router") + env_router["FD_LOG_DIR"] = router_log_dir + router_log_path = os.path.join(router_log_dir, "nohup") router_cmd = [ sys.executable, @@ -201,13 +211,14 @@ def setup_and_run_server(): print("start prefill...") env_prefill = os.environ.copy() env_prefill["CUDA_VISIBLE_DEVICES"] = "0" - env_prefill["FD_LOG_DIR"] = "log_prefill" - os.makedirs("log_prefill", exist_ok=True) + prefill_log_dir = prepare_log_dir("log_prefill") + env_prefill["FD_LOG_DIR"] = prefill_log_dir + # Mooncake environment variables for prefill for k, v in mooncake_env.items(): env_prefill[k] = v - prefill_log_path = "log_prefill/nohup.log" + prefill_log_path = os.path.join(prefill_log_dir, "nohup") prefill_cmd = [ sys.executable, "-m", @@ -254,13 +265,13 @@ def setup_and_run_server(): print("start 
decode...") env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_decode["FD_LOG_DIR"] = "log_decode" - os.makedirs("log_decode", exist_ok=True) + decode_log_dir = prepare_log_dir("log_decode") + env_decode["FD_LOG_DIR"] = decode_log_dir # Mooncake environment variables for decode for k, v in mooncake_env.items(): env_decode[k] = v - decode_log_path = "log_decode/nohup.log" + decode_log_path = os.path.join(decode_log_dir, "nohup.log") decode_cmd = [ sys.executable, "-m", diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp1.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp1.py index 96a86ece7d3..dd4aa5722b4 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp1.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp1.py @@ -84,6 +84,8 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") + base_log_dir = os.getenv("FD_LOG_DIR", "log") + # get rdma nics current_dir = os.path.dirname(os.path.abspath(__file__)) shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh") @@ -94,7 +96,7 @@ def setup_and_run_server(): # router print("start router...") env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" + env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router") router_log_path = "router.log" router_cmd = [ @@ -119,7 +121,7 @@ def setup_and_run_server(): print("start prefill...") env_prefill = os.environ.copy() env_prefill["CUDA_VISIBLE_DEVICES"] = "0" - env_prefill["FD_LOG_DIR"] = "log_prefill" + env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill") # env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics prefill_log_path = "prefill.log" @@ -166,7 +168,7 @@ def setup_and_run_server(): print("start decode...") env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_decode["FD_LOG_DIR"] = "log_decode" + env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode") # env_decode["KVCACHE_RDMA_NICS"] = rdma_nics decode_log_path = "decode.log" diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py index a59b78349df..0bbc8186a54 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py @@ -86,6 +86,8 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") + base_log_dir = os.getenv("FD_LOG_DIR", "log") + # get rdma nics current_dir = os.path.dirname(os.path.abspath(__file__)) shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh") @@ -96,7 +98,7 @@ def setup_and_run_server(): # router print("start router...") env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" + env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router") router_log_path = "router.log" router_cmd = [ @@ -121,7 +123,7 @@ def setup_and_run_server(): print("start prefill...") env_prefill = os.environ.copy() env_prefill["CUDA_VISIBLE_DEVICES"] = "0,1" - env_prefill["FD_LOG_DIR"] = "log_prefill" + env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill") # env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics prefill_log_path = "prefill.log" @@ -170,7 +172,7 @@ def setup_and_run_server(): print("start decode...") env_decode = os.environ.copy() env_decode["CUDA_VISIBLE_DEVICES"] = "1" - env_decode["FD_LOG_DIR"] = "log_decode" + env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode") # env_decode["KVCACHE_RDMA_NICS"] = rdma_nics decode_log_path = "decode.log" diff 
--git a/tests/e2e/test_ernie_03b_router.py b/tests/e2e/test_ernie_03b_router.py index 6ddd7c976f3..17d9b05dafe 100644 --- a/tests/e2e/test_ernie_03b_router.py +++ b/tests/e2e/test_ernie_03b_router.py @@ -97,10 +97,12 @@ def setup_and_run_server(): model_path = "baidu/ERNIE-4.5-0.3B-Paddle" print(f"model_path: {model_path}") + base_log_dir = os.getenv("FD_LOG_DIR", "log") + # router print("start router...") env_router = os.environ.copy() - env_router["FD_LOG_DIR"] = "log_router" + env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router") router_log_path = "router.log" router_cmd = [ @@ -121,10 +123,10 @@ def setup_and_run_server(): ) # server0 - print("start server0...") + print("start server 0...") env_server_0 = os.environ.copy() env_server_0["CUDA_VISIBLE_DEVICES"] = "0" - env_server_0["FD_LOG_DIR"] = "log_server_0" + env_server_0["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_server_0") env_server_0["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT) log_path = "server_0.log" cmd = [ @@ -169,7 +171,7 @@ def setup_and_run_server(): env_server_1 = os.environ.copy() env_server_1["CUDA_VISIBLE_DEVICES"] = "1" env_server_1["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1) - env_server_1["FD_LOG_DIR"] = "log_server_1" + env_server_1["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_server_1") log_path = "server_1.log" cmd = [ sys.executable, From f1d22d67ceb3a1f4fe3e9f5ae0f53a2ff544f66f Mon Sep 17 00:00:00 2001 From: EmmonsCurse <1577972691@qq.com> Date: Tue, 31 Mar 2026 21:43:49 +0800 Subject: [PATCH 5/5] [CI] Optimize test execution with single-GPU parallelism and log collection --- .../batch_invariant_ops/batch_invariant_ops.py | 16 +++++++--------- scripts/coverage_run.sh | 13 +++++++++++++ tests/e2e/test_EB_Lite_serving.py | 5 ++--- tests/e2e/test_ernie_03b_pd_router_v1_ipc.py | 2 +- ...t_ernie_03b_pd_router_v1_rdma_global_cache.py | 2 +- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py b/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py index fc632dad838..c0df764c07c 100644 --- a/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py +++ b/fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py @@ -690,15 +690,13 @@ def addmm_batch_invariant( return result -# def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1, out=None) -> paddle.Tensor: -# result = log_softmax(input=x, axis=axis) -# # Handle out parameter if provided -# if out is not None: -# out.copy_(result) -# return out -# return result -def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1) -> paddle.Tensor: - return log_softmax(input=x, axis=axis) +def _log_softmax_batch_invariant(x: paddle.Tensor, axis: int = -1, out=None) -> paddle.Tensor: + result = log_softmax(input=x, axis=axis) + # Handle out parameter if provided + if out is not None: + out.copy_(result) + return out + return result def mean_batch_invariant( diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 78174156ec6..3907d5055d5 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -327,6 +327,19 @@ if [ "$failed_count" -ne 0 ]; then echo "Failed test cases are listed in $failed_tests_file" cat "$failed_tests_file" + # clean the empty directories + if [ -d "${run_path}/unittest_logs" ]; then + echo "Cleaning empty directories..." 
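+        # Sweep to a fixpoint: compare directory counts before and after each
+        # pass and stop once a pass deletes nothing, so directories that only
+        # become empty after their children are removed are still caught.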
+ + # perform multi-round clean until no more empty directories are found + while true; do + before=$(find "${run_path}/unittest_logs" -type d | wc -l) + find "${run_path}/unittest_logs" -mindepth 1 -type d -empty -delete || true + after=$(find "${run_path}/unittest_logs" -type d | wc -l) + [ "$before" -eq "$after" ] && break + done + fi + # Only package logs when there are failures echo "====================================" echo "Step 5: Packaging logs (only on failure)" diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 5a484cb21d1..6c84306927c 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -1389,9 +1389,8 @@ def test_streaming_chat_finish_reason(openai_client): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" - # log_dir = os.getenv("FD_LOG_DIR", "log") - # log_file = os.path.join(log_dir, "config.log") + log_dir = os.getenv("FD_LOG_DIR", "log") + log_file = os.path.join(log_dir, "config.log") baseline = 31446 if not os.path.exists(log_file): diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py b/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py index 85e4075a6e4..55074c97481 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py @@ -418,4 +418,4 @@ def test_non_chat_usage_non_stream(api_url): total_tokens = usage["completion_tokens"] + usage["prompt_tokens"] assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens大于max_tokens" assert payload["metadata"]["min_tokens"] <= usage["completion_tokens"], "completion_tokens小于min_tokens" - assert usage["total_tokens"] != total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" + assert usage["total_tokens"] == total_tokens, "total_tokens不等于prompt_tokens + completion_tokens" diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py index 723248d341e..71ee1607a21 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py @@ -218,7 +218,7 @@ def setup_and_run_server(): for k, v in mooncake_env.items(): env_prefill[k] = v - prefill_log_path = os.path.join(prefill_log_dir, "nohup") + prefill_log_path = os.path.join(prefill_log_dir, "nohup.log") prefill_cmd = [ sys.executable, "-m",