# Origin: ExecuTorch patch D95335059 ("[executorch] show backend test infra
# pass rate of CUDA backend to PyTorch HUD"), which added this file
# (.ci/scripts/generate_test_pass_rate_in_v3_format.py) and a matching
# upload job in .github/workflows/test-backend-cuda.yml.
"""
Parse test report JSON produced by pytest --json-report and generate
v3 format benchmark results for upload to PyTorch HUD.

Metrics produced per suite:
  - pass_rate(%) : percentage of passing tests (skips excluded from denominator)
  - total_pass   : number of passing tests
  - total_fail   : number of failing tests
  - total_skip   : number of skipped tests
"""

import argparse
import json
# NOTE: the original patch also imported `sys`, which was never used anywhere
# in this module; the dead import has been removed.


def parse_test_report(json_path: str) -> dict:
    """
    Parse a test report JSON file and return pass/fail/skip counts.

    The JSON is produced by test_backend.sh via pytest --json-report and has the
    structure used by generate_markdown_summary_json.py:
    { "tests": [ { "metadata": { "subtests": [ { "Result": "Pass"|"Fail"|"Skip", ... } ] } } ] }

    Args:
        json_path: path to the report JSON file on disk.

    Returns:
        dict with integer values under the keys "passes", "fails", "skips".
        Subtest results other than Pass/Fail/Skip are silently ignored,
        matching the original counting behavior.

    Raises:
        KeyError: if the report lacks the expected keys — a malformed report
            should fail loudly rather than upload zeros to the dashboard.
    """
    with open(json_path) as f:
        data = json.load(f)

    # Tally each subtest by its literal "Result" string; unknown strings are
    # deliberately not counted.
    tallies = {"Pass": 0, "Fail": 0, "Skip": 0}
    for test_data in data["tests"]:
        for subtest in test_data["metadata"]["subtests"]:
            result = subtest["Result"]
            if result in tallies:
                tallies[result] += 1

    return {
        "passes": tallies["Pass"],
        "fails": tallies["Fail"],
        "skips": tallies["Skip"],
    }


def _compute_pass_rate(counts: dict) -> float:
    """Return the pass percentage with skips excluded from the denominator.

    Returns 0.0 when nothing passed or failed (avoids division by zero).
    Shared helper: the same formula was previously duplicated inline in
    generate_v3_records and in main().
    """
    attempted = counts["passes"] + counts["fails"]
    return (counts["passes"] / attempted * 100) if attempted > 0 else 0.0


def build_v3_record(
    metric_name: str,
    value: float,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> dict:
    """Build a single v3 format benchmark record.

    Args:
        metric_name: HUD metric name, e.g. "pass_rate(%)".
        value: metric value; stored as a one-element benchmark_values list.
        suite: test suite name (e.g. "models", "operators").
        flow: test flow name (e.g. "cuda").
        git_sha: commit SHA the tests ran against.
        workflow_run_id: GitHub Actions run id (provenance).
        workflow_run_url: GitHub Actions run URL (provenance; may be empty).
        runner_name: CI runner label; recorded with runner type "linux".

    Returns:
        A dict matching the PyTorch HUD v3 benchmark schema.
    """
    return {
        "benchmark": {
            "name": "ExecuTorch",
            "mode": "test",
            "extra_info": {
                "backend": "cuda",
                "suite": suite,
                "flow": flow,
                "git_sha": git_sha,
                "workflow_run_id": workflow_run_id,
                "workflow_run_url": workflow_run_url,
            },
        },
        "model": {
            "name": f"cuda_backend_tests_{suite}",
            "type": "OSS backend test",
            "backend": "cuda",
        },
        "metric": {
            "name": metric_name,
            "benchmark_values": [value],
            # target_value is required by the schema but unused for pass rates.
            "target_value": 0,
            "extra_info": {},
        },
        "runners": [{"name": runner_name, "type": "linux"}],
    }


def generate_v3_records(
    counts: dict,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> list:
    """Generate v3 format records for all metrics.

    Emits four records, in order: pass_rate(%), total_pass, total_fail,
    total_skip. All records share the same suite/flow/provenance metadata.
    """
    # Metadata common to every record produced for this suite.
    common = dict(
        suite=suite,
        flow=flow,
        git_sha=git_sha,
        workflow_run_id=workflow_run_id,
        workflow_run_url=workflow_run_url,
        runner_name=runner_name,
    )

    return [
        build_v3_record("pass_rate(%)", _compute_pass_rate(counts), **common),
        build_v3_record("total_pass", counts["passes"], **common),
        build_v3_record("total_fail", counts["fails"], **common),
        build_v3_record("total_skip", counts["skips"], **common),
    ]
+def main(): + parser = argparse.ArgumentParser( + description="Generate v3 format benchmark results from test report JSON" + ) + parser.add_argument( + "--report-json", + required=True, + help="Path to the test report JSON file", + ) + parser.add_argument( + "--suite", + required=True, + help="Test suite name (e.g. models, operators)", + ) + parser.add_argument( + "--flow", + required=True, + help="Test flow name (e.g. cuda)", + ) + parser.add_argument( + "--git-sha", + required=True, + help="Git commit SHA", + ) + parser.add_argument( + "--workflow-run-id", + required=True, + help="GitHub workflow run ID", + ) + parser.add_argument( + "--workflow-run-url", + default="", + help="GitHub workflow run URL", + ) + parser.add_argument( + "--runner-name", + default="linux.g5.4xlarge.nvidia.gpu", + help="CI runner name", + ) + parser.add_argument( + "--output-v3", + required=True, + help="Path to write v3 format JSON output", + ) + args = parser.parse_args() + + counts = parse_test_report(args.report_json) + + total_excluding_skips = counts["passes"] + counts["fails"] + pass_rate = ( + (counts["passes"] / total_excluding_skips * 100) + if total_excluding_skips > 0 + else 0.0 + ) + + print(f"Suite: {args.suite}") + print(f" Pass: {counts['passes']}, Fail: {counts['fails']}, Skip: {counts['skips']}") + print(f" Pass rate: {pass_rate:.2f}%") + + records = generate_v3_records( + counts=counts, + suite=args.suite, + flow=args.flow, + git_sha=args.git_sha, + workflow_run_id=args.workflow_run_id, + workflow_run_url=args.workflow_run_url, + runner_name=args.runner_name, + ) + + with open(args.output_v3, "w") as f: + json.dump(records, f, indent=2) + + print(f"Wrote {len(records)} v3 records to {args.output_v3}") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml index ac5c9b97c43..a91e20611a2 100644 --- a/.github/workflows/test-backend-cuda.yml +++ b/.github/workflows/test-backend-cuda.yml @@ 
-12,6 +12,7 @@ on: paths: - .github/workflows/test-backend-cuda.yml - .github/workflows/_test_backend.yml + - .ci/scripts/generate_test_pass_rate_in_v3_format.py workflow_dispatch: concurrency: @@ -28,3 +29,62 @@ jobs: timeout: 120 run-linux: true runner-linux: linux.g5.4xlarge.nvidia.gpu + + upload-test-results: + needs: test-cuda + if: always() + runs-on: ubuntu-22.04 + environment: upload-benchmark-results + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + with: + submodules: false + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Download test report artifacts + uses: actions/download-artifact@v4 + with: + pattern: test-report-* + path: downloaded-reports/ + + - name: Generate v3 benchmark results + shell: bash + run: | + set -eux + mkdir -p benchmark-results/v3 + + for SUITE in models operators; do + REPORT="downloaded-reports/test-report-cuda-${SUITE}/test-report-cuda-${SUITE}.json" + if [ -f "$REPORT" ]; then + echo "Processing report for suite: $SUITE" + python .ci/scripts/generate_test_pass_rate_in_v3_format.py \ + --report-json "$REPORT" \ + --suite "$SUITE" \ + --flow cuda \ + --git-sha "${{ github.sha }}" \ + --workflow-run-id "${{ github.run_id }}" \ + --workflow-run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --runner-name "linux.g5.4xlarge.nvidia.gpu" \ + --output-v3 "benchmark-results/v3/cuda-test-${SUITE}.json" + else + echo "Warning: Report not found for suite $SUITE at $REPORT" + fi + done + + echo "V3 results prepared:" + ls -lah benchmark-results/v3/ || echo "No v3 results generated" + + - name: Upload test pass rate to dashboard + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + with: + benchmark-results-dir: benchmark-results/v3 + dry-run: false + schema-version: v3 + github-token: ${{ secrets.GITHUB_TOKEN }}