Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions .ci/scripts/generate_test_pass_rate_in_v3_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""
Parse test report JSON produced by pytest --json-report and generate
v3 format benchmark results for upload to PyTorch HUD.

Metrics produced per suite:
- pass_rate(%) : percentage of passing tests (skips excluded from denominator)
- total_pass : number of passing tests
- total_fail : number of failing tests
- total_skip : number of skipped tests
"""

import argparse
import json
import sys

Check warning on line 14 in .ci/scripts/generate_test_pass_rate_in_v3_format.py

View workflow job for this annotation

GitHub Actions / lintrunner

FLAKE8 F401

'sys' imported but unused See https://www.flake8rules.com/rules/F401.html.


def parse_test_report(json_path: str) -> dict:
    """
    Parse a test report JSON file and return pass/fail/skip counts.

    The JSON is produced by test_backend.sh via pytest --json-report and has the
    structure used by generate_markdown_summary_json.py:
    { "tests": [ { "metadata": { "subtests": [ { "Result": "Pass"|"Fail"|"Skip", ... } ] } } ] }

    Args:
        json_path: Path to the test report JSON file.

    Returns:
        A dict with integer counts under the keys "passes", "fails", "skips".
        Result strings other than Pass/Fail/Skip are ignored, and entries
        missing "metadata"/"subtests" are skipped rather than crashing, so a
        partially written report still yields counts for the upload job.
    """
    with open(json_path) as f:
        data = json.load(f)

    counts = {"passes": 0, "fails": 0, "skips": 0}
    # Map the report's result strings onto our counter keys.
    key_for_result = {"Pass": "passes", "Fail": "fails", "Skip": "skips"}

    for test_data in data.get("tests", []):
        for subtest in test_data.get("metadata", {}).get("subtests", []):
            key = key_for_result.get(subtest.get("Result"))
            if key is not None:
                counts[key] += 1

    return counts


def build_v3_record(
    metric_name: str,
    value: float,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> dict:
    """Build a single benchmark record in the PyTorch HUD v3 schema.

    Args:
        metric_name: Name of the metric (e.g. "pass_rate(%)").
        value: The single benchmark value reported for the metric.
        suite: Test suite name (e.g. "models").
        flow: Test flow name (e.g. "cuda").
        git_sha: Git commit SHA the tests ran against.
        workflow_run_id: GitHub Actions run ID.
        workflow_run_url: GitHub Actions run URL.
        runner_name: CI runner label.

    Returns:
        One v3-format record dict with "benchmark", "model", "metric",
        and "runners" sections.
    """
    benchmark_section = {
        "name": "ExecuTorch",
        "mode": "test",
        "extra_info": {
            "backend": "cuda",
            "suite": suite,
            "flow": flow,
            "git_sha": git_sha,
            "workflow_run_id": workflow_run_id,
            "workflow_run_url": workflow_run_url,
        },
    }
    model_section = {
        "name": f"cuda_backend_tests_{suite}",
        "type": "OSS backend test",
        "backend": "cuda",
    }
    metric_section = {
        "name": metric_name,
        "benchmark_values": [value],
        "target_value": 0,
        "extra_info": {},
    }
    return {
        "benchmark": benchmark_section,
        "model": model_section,
        "metric": metric_section,
        "runners": [{"name": runner_name, "type": "linux"}],
    }


def generate_v3_records(
    counts: dict,
    suite: str,
    flow: str,
    git_sha: str,
    workflow_run_id: str,
    workflow_run_url: str,
    runner_name: str,
) -> list:
    """Generate v3 format records for all metrics of one suite.

    Args:
        counts: Mapping with integer counts under "passes", "fails", "skips",
            as returned by parse_test_report().
        suite: Test suite name.
        flow: Test flow name.
        git_sha: Git commit SHA the tests ran against.
        workflow_run_id: GitHub Actions run ID.
        workflow_run_url: GitHub Actions run URL.
        runner_name: CI runner label.

    Returns:
        Four v3 records: pass_rate(%), total_pass, total_fail, total_skip.
    """
    # Skips are excluded from the denominator; guard against an empty suite.
    total_excluding_skips = counts["passes"] + counts["fails"]
    pass_rate = (
        (counts["passes"] / total_excluding_skips * 100)
        if total_excluding_skips > 0
        else 0.0
    )

    # Fields shared by every record (dict literal per flake8 C408).
    common = {
        "suite": suite,
        "flow": flow,
        "git_sha": git_sha,
        "workflow_run_id": workflow_run_id,
        "workflow_run_url": workflow_run_url,
        "runner_name": runner_name,
    }

    return [
        build_v3_record("pass_rate(%)", pass_rate, **common),
        build_v3_record("total_pass", counts["passes"], **common),
        build_v3_record("total_fail", counts["fails"], **common),
        build_v3_record("total_skip", counts["skips"], **common),
    ]


def main():
    """CLI entry point: parse a test report JSON and write v3 benchmark records."""
    parser = argparse.ArgumentParser(
        description="Generate v3 format benchmark results from test report JSON"
    )
    # Required string flags, registered in the order they appear in --help.
    required_flags = [
        ("--report-json", "Path to the test report JSON file"),
        ("--suite", "Test suite name (e.g. models, operators)"),
        ("--flow", "Test flow name (e.g. cuda)"),
        ("--git-sha", "Git commit SHA"),
        ("--workflow-run-id", "GitHub workflow run ID"),
    ]
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument(
        "--workflow-run-url",
        default="",
        help="GitHub workflow run URL",
    )
    parser.add_argument(
        "--runner-name",
        default="linux.g5.4xlarge.nvidia.gpu",
        help="CI runner name",
    )
    parser.add_argument(
        "--output-v3",
        required=True,
        help="Path to write v3 format JSON output",
    )
    args = parser.parse_args()

    counts = parse_test_report(args.report_json)

    # Recompute the pass rate for the console summary (skips excluded from
    # the denominator, mirroring generate_v3_records).
    denominator = counts["passes"] + counts["fails"]
    pass_rate = counts["passes"] / denominator * 100 if denominator > 0 else 0.0

    print(f"Suite: {args.suite}")
    print(
        f" Pass: {counts['passes']}, Fail: {counts['fails']}, Skip: {counts['skips']}"
    )
    print(f" Pass rate: {pass_rate:.2f}%")

    records = generate_v3_records(
        counts=counts,
        suite=args.suite,
        flow=args.flow,
        git_sha=args.git_sha,
        workflow_run_id=args.workflow_run_id,
        workflow_run_url=args.workflow_run_url,
        runner_name=args.runner_name,
    )

    # Persist the records for the upload-benchmark-results action.
    with open(args.output_v3, "w") as f:
        json.dump(records, f, indent=2)

    print(f"Wrote {len(records)} v3 records to {args.output_v3}")


if __name__ == "__main__":
    main()
68 changes: 68 additions & 0 deletions .github/workflows/test-backend-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ on:
paths:
- .github/workflows/test-backend-cuda.yml
- .ci/scripts/test_backend.sh
- .github/workflows/_test_backend.yml
- .ci/scripts/generate_test_pass_rate_in_v3_format.py
workflow_dispatch:

concurrency:
Expand Down Expand Up @@ -39,3 +41,69 @@ jobs:
set -eux

source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "cuda" "${RUNNER_ARTIFACT_DIR}"

  # Aggregate per-suite pytest JSON reports from the test-cuda matrix and
  # upload pass-rate metrics to the PyTorch HUD benchmark database.
  upload-test-results:
    needs: test-cuda
    # Run even when some test shards fail so partial results still upload.
    if: always()
    runs-on: ubuntu-22.04
    environment: upload-benchmark-results
    permissions:
      # id-token: write is required for the AWS OIDC role assumption below.
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false

      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Pull every per-suite report artifact produced by the test matrix.
      - name: Download test report artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: test-report-*
          path: downloaded-reports/

      - name: Generate v3 benchmark results
        shell: bash
        run: |
          set -eux
          mkdir -p benchmark-results/v3

          # One v3 JSON file per suite; missing reports are tolerated so a
          # single failed shard does not block uploading the others.
          for SUITE in models operators; do
            REPORT="downloaded-reports/test-report-cuda-${SUITE}/test-report-cuda-${SUITE}.json"
            if [ -f "$REPORT" ]; then
              echo "Processing report for suite: $SUITE"
              python .ci/scripts/generate_test_pass_rate_in_v3_format.py \
                --report-json "$REPORT" \
                --suite "$SUITE" \
                --flow cuda \
                --git-sha "${{ github.sha }}" \
                --workflow-run-id "${{ github.run_id }}" \
                --workflow-run-url "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
                --runner-name "linux.g5.4xlarge.nvidia.gpu" \
                --output-v3 "benchmark-results/v3/cuda-test-${SUITE}.json"
            else
              echo "Warning: Report not found for suite $SUITE at $REPORT"
            fi
          done

          echo "V3 results prepared:"
          ls -lah benchmark-results/v3/ || echo "No v3 results generated"

      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Upload test pass rate to dashboard
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v3
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}
Loading