Skip to content

Commit 5a5cc6f

Browse files
committed
[executorch] show backend test infra pass rate of CUDA backend to PyTorch HUD
Add infrastructure to track CUDA backend test pass rates on PyTorch HUD dashboard over time Differential Revision: [D95335059](https://our.internmc.facebook.com/intern/diff/D95335059/) ghstack-source-id: 347850731 Pull Request resolved: #17874
1 parent 28e2186 commit 5a5cc6f

2 files changed

Lines changed: 254 additions & 0 deletions

File tree

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
"""
2+
Parse test report JSON produced by pytest --json-report and generate
3+
v3 format benchmark results for upload to PyTorch HUD.
4+
5+
Metrics produced per suite:
6+
- pass_rate(%) : percentage of passing tests (skips excluded from denominator)
7+
- total_pass : number of passing tests
8+
- total_fail : number of failing tests
9+
- total_skip : number of skipped tests
10+
"""
11+
12+
import argparse
13+
import json
14+
import sys
15+
16+
17+
def parse_test_report(json_path: str) -> dict:
    """
    Parse a test report JSON file and return pass/fail/skip counts.

    The JSON is produced by test_backend.sh via pytest --json-report and has the
    structure used by generate_markdown_summary_json.py:
    { "tests": [ { "metadata": { "subtests": [ { "Result": "Pass"|"Fail"|"Skip", ... } ] } } ] }

    Args:
        json_path: Path to the test report JSON file.

    Returns:
        Dict with integer counts under the keys "passes", "fails", "skips".
    """
    with open(json_path) as f:
        data = json.load(f)

    # Tally subtest results. Use .get() with empty defaults so a partially
    # written or schema-drifted report yields zero counts instead of crashing
    # the CI upload job with a KeyError. Unknown Result values are ignored.
    tally = {"Pass": 0, "Fail": 0, "Skip": 0}
    for test_data in data.get("tests", []):
        for subtest in test_data.get("metadata", {}).get("subtests", []):
            result = subtest.get("Result")
            if result in tally:
                tally[result] += 1

    return {"passes": tally["Pass"], "fails": tally["Fail"], "skips": tally["Skip"]}
43+
44+
45+
def build_v3_record(
    metric_name: str,
    value: float,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> dict:
    """Build one benchmark record in the PyTorch HUD v3 schema.

    Each record reports a single metric (pass rate or a raw count) for one
    CUDA backend test suite, tagged with enough CI metadata (git SHA,
    workflow run id/url, runner) to locate the originating run.
    """
    benchmark_info = {
        "name": "ExecuTorch",
        "mode": "test",
        "extra_info": {
            "backend": "cuda",
            "suite": suite,
            "flow": flow,
            "git_sha": git_sha,
            "workflow_run_id": workflow_run_id,
            "workflow_run_url": workflow_run_url,
        },
    }
    model_info = {
        "name": f"cuda_backend_tests_{suite}",
        "type": "OSS backend test",
        "backend": "cuda",
    }
    metric_info = {
        "name": metric_name,
        "benchmark_values": [value],
        "target_value": 0,
        "extra_info": {},
    }
    return {
        "benchmark": benchmark_info,
        "model": model_info,
        "metric": metric_info,
        "runners": [{"name": runner_name, "type": "linux"}],
    }
82+
83+
84+
def generate_v3_records(
    counts: dict,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> list:
    """Generate the full set of v3 records (pass rate plus raw counts).

    Skipped tests are excluded from the pass-rate denominator; a suite with
    no passes or fails reports a 0.0% pass rate rather than dividing by zero.
    """
    denominator = counts["passes"] + counts["fails"]
    pass_rate = (counts["passes"] / denominator * 100) if denominator > 0 else 0.0

    # CI metadata shared by every record for this suite.
    shared = {
        "suite": suite,
        "flow": flow,
        "git_sha": git_sha,
        "workflow_run_id": workflow_run_id,
        "workflow_run_url": workflow_run_url,
        "runner_name": runner_name,
    }

    metrics = [
        ("pass_rate(%)", pass_rate),
        ("total_pass", counts["passes"]),
        ("total_fail", counts["fails"]),
        ("total_skip", counts["skips"]),
    ]
    return [build_v3_record(name, value, **shared) for name, value in metrics]
116+
117+
118+
def main():
    """CLI entry point: turn a pytest JSON report into v3 benchmark records.

    Reads the report, prints a short human-readable summary to stdout, and
    writes the v3-format records as JSON to --output-v3.
    """
    parser = argparse.ArgumentParser(
        description="Generate v3 format benchmark results from test report JSON"
    )
    parser.add_argument(
        "--report-json", required=True, help="Path to the test report JSON file"
    )
    parser.add_argument(
        "--suite", required=True, help="Test suite name (e.g. models, operators)"
    )
    parser.add_argument("--flow", required=True, help="Test flow name (e.g. cuda)")
    parser.add_argument("--git-sha", required=True, help="Git commit SHA")
    parser.add_argument(
        "--workflow-run-id", required=True, help="GitHub workflow run ID"
    )
    parser.add_argument(
        "--workflow-run-url", default="", help="GitHub workflow run URL"
    )
    parser.add_argument(
        "--runner-name",
        default="linux.g5.4xlarge.nvidia.gpu",
        help="CI runner name",
    )
    parser.add_argument(
        "--output-v3", required=True, help="Path to write v3 format JSON output"
    )
    args = parser.parse_args()

    counts = parse_test_report(args.report_json)

    # Same pass-rate definition as generate_v3_records: skips excluded from
    # the denominator, 0.0 when nothing ran.
    denominator = counts["passes"] + counts["fails"]
    pass_rate = (counts["passes"] / denominator * 100) if denominator > 0 else 0.0

    print(f"Suite: {args.suite}")
    print(f" Pass: {counts['passes']}, Fail: {counts['fails']}, Skip: {counts['skips']}")
    print(f" Pass rate: {pass_rate:.2f}%")

    records = generate_v3_records(
        counts=counts,
        suite=args.suite,
        flow=args.flow,
        git_sha=args.git_sha,
        workflow_run_id=args.workflow_run_id,
        workflow_run_url=args.workflow_run_url,
        runner_name=args.runner_name,
    )

    with open(args.output_v3, "w") as f:
        json.dump(records, f, indent=2)

    print(f"Wrote {len(records)} v3 records to {args.output_v3}")


if __name__ == "__main__":
    main()

.github/workflows/test-backend-cuda.yml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ on:
1212
paths:
1313
- .github/workflows/test-backend-cuda.yml
1414
- .github/workflows/_test_backend.yml
15+
- .ci/scripts/generate_test_pass_rate_in_v3_format.py
1516
workflow_dispatch:
1617

1718
concurrency:
@@ -28,3 +29,62 @@ jobs:
2829
timeout: 120
2930
run-linux: true
3031
runner-linux: linux.g5.4xlarge.nvidia.gpu
32+
33+
upload-test-results:
34+
needs: test-cuda
35+
if: always()
36+
runs-on: ubuntu-22.04
37+
environment: upload-benchmark-results
38+
permissions:
39+
id-token: write
40+
contents: read
41+
steps:
42+
- uses: actions/checkout@v3
43+
with:
44+
submodules: false
45+
46+
- name: Setup Python
47+
uses: actions/setup-python@v4
48+
with:
49+
python-version: '3.10'
50+
51+
- name: Download test report artifacts
52+
uses: actions/download-artifact@v4
53+
with:
54+
pattern: test-report-*
55+
path: downloaded-reports/
56+
57+
- name: Generate v3 benchmark results
58+
shell: bash
59+
run: |
60+
set -eux
61+
mkdir -p benchmark-results/v3
62+
63+
for SUITE in models operators; do
64+
REPORT="downloaded-reports/test-report-cuda-${SUITE}/test-report-cuda-${SUITE}.json"
65+
if [ -f "$REPORT" ]; then
66+
echo "Processing report for suite: $SUITE"
67+
python .ci/scripts/generate_test_pass_rate_in_v3_format.py \
68+
--report-json "$REPORT" \
69+
--suite "$SUITE" \
70+
--flow cuda \
71+
--git-sha "${{ github.sha }}" \
72+
--workflow-run-id "${{ github.run_id }}" \
73+
--workflow-run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
74+
--runner-name "linux.g5.4xlarge.nvidia.gpu" \
75+
--output-v3 "benchmark-results/v3/cuda-test-${SUITE}.json"
76+
else
77+
echo "Warning: Report not found for suite $SUITE at $REPORT"
78+
fi
79+
done
80+
81+
echo "V3 results prepared:"
82+
ls -lah benchmark-results/v3/ || echo "No v3 results generated"
83+
84+
- name: Upload test pass rate to dashboard
85+
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
86+
with:
87+
benchmark-results-dir: benchmark-results/v3
88+
dry-run: false
89+
schema-version: v3
90+
github-token: ${{ secrets.GITHUB_TOKEN }}

0 commit comments

Comments
 (0)