Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Makes live API calls to evaluate the system (evals). Typically triggered
# manually after changes to code that affects LLM behavior.
# Supports two eval markers:
# "dev_eval" -> frequent dev-dataset evals
# "held_out" -> pre-release hidden held-out evals
#
name: Evals

# Least privilege: the job in this workflow only checks out the repository,
# sets up the environment, and runs pytest. No step posts pull-request or
# issue comments, so GITHUB_TOKEN is limited to read-only repository access.
permissions:
  contents: read

# Manual trigger only (workflow_dispatch); both inputs are optional.
on:
  workflow_dispatch:
    inputs:
      # Pytest marker selecting the eval suite; forwarded to `pytest -m`.
      marker:
        description: 'Which evals to run.'
        type: choice
        options: [dev_eval, held_out]
        default: dev_eval
        required: false
      # Forwarded to `pytest -k` only when non-empty.
      test_filter:
        description: 'Optional pytest -k expression to run a subset.'
        default: ''
        required: false

jobs:
  # Single job: check out the repo, set up the `pdev` pixi environment, and
  # run the eval tests selected by the workflow inputs.
  gated-tests:
    name: Run gated tests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # Full history plus tags.
          # NOTE(review): presumably needed for VCS-derived package
          # versioning during the reinstall below - confirm.
          fetch-depth: 0
          fetch-tags: true
          # No later step pushes to or fetches from the remote, so do not
          # keep the GITHUB_TOKEN credentials around for subsequent git
          # commands (the following steps execute project test code).
          persist-credentials: false

      - uses: prefix-dev/setup-pixi@1b2de7f3351f171c8b4dfeb558c639cb58ed4ec0  # v0.9.5
        with:
          pixi-version: v0.68.1
          # Install strictly from the lock file; read the cache but never
          # write to it from this workflow.
          locked: true
          cache: true
          cache-write: false
          environments: pdev

      - name: Run gated tests
        env:
          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
          AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
          AZURE_OPENAI_VERSION: ${{ vars.AZURE_OPENAI_VERSION }}
          # Workflow inputs are handed over as environment variables rather
          # than interpolated into the script, so user-supplied text is
          # never spliced into the shell source.
          MARKER: ${{ inputs.marker }}
          TEST_FILTER: ${{ inputs.test_filter }}
        run: |
          # Reinstall the INFRA-COMPASS package into the pdev environment.
          pixi reinstall -e pdev INFRA-COMPASS

          # Build the pytest argument list as an array so values containing
          # spaces (e.g. a -k expression) stay single arguments.
          args=(-rapP -vv -s --log-cli-level=INFO -m "${MARKER}")
          if [ -n "${TEST_FILTER}" ]; then
            args+=(-k "${TEST_FILTER}")
          fi
          args+=(tests/python/evals)

          echo "Running: pytest ${args[*]}"
          pixi run -e pdev pytest "${args[@]}"