From aa15225d2b1aaa702283801e0f7ee7ad3c764107 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Fri, 22 May 2026 19:16:29 -0500 Subject: [PATCH 01/10] Add manually-triggered workflow for expensive (live-LLM) tests --- .github/workflows/expensive-tests.yml | 63 +++++++++++++++++++++++++++ pixi.toml | 2 + pyproject.toml | 5 ++- 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/expensive-tests.yml diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/expensive-tests.yml new file mode 100644 index 000000000..fe61cfd9b --- /dev/null +++ b/.github/workflows/expensive-tests.yml @@ -0,0 +1,63 @@ +name: Expensive tests (manual) + +# Manually-triggered only. These tests make live, billable LLM calls +# (e.g. the date-extraction accuracy test) and are deselected from every +# other workflow via the `expensive` pytest marker. Run from the Actions +# tab via "Run workflow". + +permissions: + contents: read + +on: + workflow_dispatch: + inputs: + test_filter: + description: >- + Optional pytest -k expression to run a subset (e.g. "Bartow"). + Leave blank to run all expensive tests. + required: false + default: "" + model: + description: Azure deployment name to exercise. + required: false + default: "compassop-gpt-5.4" + +jobs: + expensive-tests: + name: Run expensive (live-LLM) tests + runs-on: ubuntu-latest + steps: + - name: Checkout Repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + fetch-tags: true + + - uses: prefix-dev/setup-pixi@1b2de7f3351f171c8b4dfeb558c639cb58ed4ec0 # v0.9.5 + with: + pixi-version: v0.62.2 + locked: true + cache: true + cache-write: false + environments: pdev + + - name: Run expensive tests + env: + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} + COMPASS_DATE_TEST_MODEL: ${{ github.event.inputs.model }} + run: | + if [ -z "${AZURE_OPENAI_API_KEY}" ] || [ -z "${AZURE_OPENAI_ENDPOINT}" ]; then + echo "::error::AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT secrets are not set." + echo "Add them in repo Settings -> Secrets and variables -> Actions." + exit 1 + fi + pixi reinstall -e pdev INFRA-COMPASS + if [ -n "${{ github.event.inputs.test_filter }}" ]; then + pixi run -e pdev pytest -rapP -vv -s --log-cli-level=INFO \ + -m expensive -k "${{ github.event.inputs.test_filter }}" \ + tests/python/integration + else + pixi run -e pdev tests-expensive + fi diff --git a/pixi.toml b/pixi.toml index b7a303ec2..262d7e447 100644 --- a/pixi.toml +++ b/pixi.toml @@ -28,6 +28,8 @@ format = "ruff format ./compass" tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python" tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit" tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration" +# Opt-in tests that make live, billable LLM calls (deselected by default). +tests-expensive = "pytest -rapP -vv -s --log-cli-level=INFO -m expensive tests/python/integration" [feature.python-doc.tasks] python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }} diff --git a/pyproject.toml b/pyproject.toml index 893237c2d..88086c6eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -269,9 +269,12 @@ omit = [ [tool.pytest.ini_options] -addopts = "--disable-warnings" +addopts = '--disable-warnings -m "not expensive"' asyncio_mode="auto" asyncio_default_fixture_loop_scope="function" +markers = [ + "expensive: opt-in tests that are deselected by default (e.g. tests that make live, billable LLM calls). Run explicitly with `-m expensive`.", +] testpaths = [ "tests/python/unit", "tests/python/integration", From 8187df31dac6d5a1da2596d4a2e1d05139265aed Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 11:45:53 -0500 Subject: [PATCH 02/10] Make gated-test workflow generic (marker/path/env inputs) --- .github/workflows/expensive-tests.yml | 86 ++++++++++++++++++--------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/expensive-tests.yml index fe61cfd9b..a01a59879 100644 --- a/.github/workflows/expensive-tests.yml +++ b/.github/workflows/expensive-tests.yml @@ -1,9 +1,18 @@ -name: Expensive tests (manual) +name: Gated tests (manual) -# Manually-triggered only. These tests make live, billable LLM calls -# (e.g. the date-extraction accuracy test) and are deselected from every -# other workflow via the `expensive` pytest marker. Run from the Actions -# tab via "Run workflow". +# Manually-triggered runner for tests that are deselected from normal CI +# (e.g. tests that make live, billable LLM calls). Run from the Actions tab +# via "Run workflow". +# +# This workflow is intentionally generic: it runs whatever pytest marker / +# path / filter you pass as inputs. To add a new gated test later, just mark +# it with a pytest marker on your branch and dispatch this workflow against +# that branch (the "Use workflow from" dropdown) -- no need to re-merge the +# workflow to the default branch. +# +# NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from +# the DEFAULT branch, so this file must be on the default branch once. After +# that, the job body and the tests it runs come from the selected branch. permissions: contents: read @@ -11,20 +20,34 @@ permissions: on: workflow_dispatch: inputs: - test_filter: + marker: description: >- - Optional pytest -k expression to run a subset (e.g. "Bartow"). - Leave blank to run all expensive tests. + pytest -m marker expression selecting the gated tests to run + (e.g. "expensive", or "expensive and not slow"). + required: false + default: "expensive" + test_path: + description: Path (dir or file) to collect tests from. + required: false + default: "tests/python/integration" + test_filter: + description: Optional pytest -k expression to run a subset. required: false default: "" + pixi_env: + description: pixi environment to run the tests in. + required: false + default: "pdev" model: - description: Azure deployment name to exercise. + description: >- + Optional model/deployment name, exposed as COMPASS_DATE_TEST_MODEL + for tests that read it. required: false - default: "compassop-gpt-5.4" + default: "" jobs: - expensive-tests: - name: Run expensive (live-LLM) tests + gated-tests: + name: Run gated tests runs-on: ubuntu-latest steps: - name: Checkout Repo @@ -35,29 +58,36 @@ jobs: - uses: prefix-dev/setup-pixi@1b2de7f3351f171c8b4dfeb558c639cb58ed4ec0 # v0.9.5 with: - pixi-version: v0.62.2 + pixi-version: v0.68.1 locked: true cache: true cache-write: false - environments: pdev + environments: ${{ inputs.pixi_env }} - - name: Run expensive tests + - name: Run gated tests env: + # Explicit, well-known creds for the current live-LLM tests. AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} - COMPASS_DATE_TEST_MODEL: ${{ github.event.inputs.model }} + COMPASS_DATE_TEST_MODEL: ${{ inputs.model }} + # Passthrough of all repo secrets as JSON, so a future gated test + # needing a different secret can read it without editing this file. + # Parse in-test, e.g.: + # json.loads(os.environ["ALL_SECRETS"])["MY_SECRET"] + ALL_SECRETS: ${{ toJSON(secrets) }} + MARKER: ${{ inputs.marker }} + TEST_PATH: ${{ inputs.test_path }} + TEST_FILTER: ${{ inputs.test_filter }} + PIXI_ENV: ${{ inputs.pixi_env }} run: | - if [ -z "${AZURE_OPENAI_API_KEY}" ] || [ -z "${AZURE_OPENAI_ENDPOINT}" ]; then - echo "::error::AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT secrets are not set." - echo "Add them in repo Settings -> Secrets and variables -> Actions." - exit 1 - fi - pixi reinstall -e pdev INFRA-COMPASS - if [ -n "${{ github.event.inputs.test_filter }}" ]; then - pixi run -e pdev pytest -rapP -vv -s --log-cli-level=INFO \ - -m expensive -k "${{ github.event.inputs.test_filter }}" \ - tests/python/integration - else - pixi run -e pdev tests-expensive + pixi reinstall -e "${PIXI_ENV}" INFRA-COMPASS + + args=(-rapP -vv -s --log-cli-level=INFO -m "${MARKER}") + if [ -n "${TEST_FILTER}" ]; then + args+=(-k "${TEST_FILTER}") fi + args+=("${TEST_PATH}") + + echo "Running: pytest ${args[*]}" + pixi run -e "${PIXI_ENV}" pytest "${args[@]}" From 7fab5aee4ce4d4f365e3d973444bc8b690ab874b Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 14:56:59 -0500 Subject: [PATCH 03/10] Rename gating to eval markers (eval/dev_eval/held_out) --- .../{expensive-tests.yml => evals.yml} | 17 +++++++++++------ pixi.toml | 5 +++-- pyproject.toml | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) rename .github/workflows/{expensive-tests.yml => evals.yml} (85%) diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/evals.yml similarity index 85% rename from .github/workflows/expensive-tests.yml rename to .github/workflows/evals.yml index a01a59879..e7de60a62 100644 --- a/.github/workflows/expensive-tests.yml +++ b/.github/workflows/evals.yml @@ -1,9 +1,14 @@ -name: Gated tests (manual) +name: Evals (manual) -# Manually-triggered runner for tests that are deselected from normal CI -# (e.g. tests that make live, billable LLM calls). Run from the Actions tab +# Manually-triggered runner for evals and other tests deselected from normal +# CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab # via "Run workflow". # +# Common uses: +# marker: "eval" -> all evals +# marker: "dev_eval" -> frequent dev-dataset evals +# marker: "held_out" -> pre-release hidden held-out evals +# # This workflow is intentionally generic: it runs whatever pytest marker / # path / filter you pass as inputs. To add a new gated test later, just mark # it with a pytest marker on your branch and dispatch this workflow against @@ -22,10 +27,10 @@ on: inputs: marker: description: >- - pytest -m marker expression selecting the gated tests to run - (e.g. "expensive", or "expensive and not slow"). + pytest -m marker expression selecting the evals to run + (e.g. "eval", "dev_eval", or "held_out"). required: false - default: "expensive" + default: "eval" test_path: description: Path (dir or file) to collect tests from. required: false diff --git a/pixi.toml b/pixi.toml index 262d7e447..d0be84f79 100644 --- a/pixi.toml +++ b/pixi.toml @@ -28,8 +28,9 @@ format = "ruff format ./compass" tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python" tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit" tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration" -# Opt-in tests that make live, billable LLM calls (deselected by default). -tests-expensive = "pytest -rapP -vv -s --log-cli-level=INFO -m expensive tests/python/integration" +# Opt-in accuracy/quality evals (deselected by default; often live LLM calls). +# Runs all evals; narrow with e.g. `pixi run -e pdev evals -m dev_eval`. +evals = "pytest -rapP -vv -s --log-cli-level=INFO -m eval tests/python/integration" [feature.python-doc.tasks] python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }} diff --git a/pyproject.toml b/pyproject.toml index 88086c6eb..4c3740d72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -269,11 +269,13 @@ omit = [ [tool.pytest.ini_options] -addopts = '--disable-warnings -m "not expensive"' +addopts = '--disable-warnings -m "not eval"' asyncio_mode="auto" asyncio_default_fixture_loop_scope="function" markers = [ - "expensive: opt-in tests that are deselected by default (e.g. tests that make live, billable LLM calls). Run explicitly with `-m expensive`.", + "eval: accuracy/quality evals, deselected by default (often make live, billable LLM calls). Run explicitly with `-m eval`.", + "dev_eval: evals run frequently during development against the in-repo dev dataset. Implies `eval`.", + "held_out: evals run before a release against the hidden held-out dataset. Implies `eval`.", ] testpaths = [ "tests/python/unit", From 3f42bdec342409e6925903f7395aff88754a2060 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 15:11:28 -0500 Subject: [PATCH 04/10] Rename eval workflow to manual-eval; note future triggers --- .github/workflows/{evals.yml => manual-eval.yml} | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) rename .github/workflows/{evals.yml => manual-eval.yml} (92%) diff --git a/.github/workflows/evals.yml b/.github/workflows/manual-eval.yml similarity index 92% rename from .github/workflows/evals.yml rename to .github/workflows/manual-eval.yml index e7de60a62..4c0523026 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/manual-eval.yml @@ -1,4 +1,4 @@ -name: Evals (manual) +name: Manual eval # Manually-triggered runner for evals and other tests deselected from normal # CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab @@ -15,6 +15,11 @@ name: Evals (manual) # that branch (the "Use workflow from" dropdown) -- no need to re-merge the # workflow to the default branch. # +# Triggers: manual only (workflow_dispatch). Other triggers (scheduled +# dev_eval, PR-label opt-in, etc.) can be added to the `on:` block later; the +# run step already keys off inputs, and event-specific behavior can branch on +# `github.event_name`. +# # NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from # the DEFAULT branch, so this file must be on the default branch once. After # that, the job body and the tests it runs come from the selected branch. From 09a63dce2902cfdf355154761bb9ac2c1f473ea8 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 15:13:14 -0500 Subject: [PATCH 05/10] Drop toJSON(secrets) passthrough; pass only needed secrets --- .github/workflows/manual-eval.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/manual-eval.yml b/.github/workflows/manual-eval.yml index 4c0523026..15276d9e4 100644 --- a/.github/workflows/manual-eval.yml +++ b/.github/workflows/manual-eval.yml @@ -76,16 +76,14 @@ jobs: - name: Run gated tests env: - # Explicit, well-known creds for the current live-LLM tests. + # Pass ONLY the secrets the evals need (least privilege). When a new + # gated test needs a different secret, add it here explicitly -- do + # not pass `toJSON(secrets)`, which exposes every org/repo secret to + # the runner (flagged by CodeQL's excessive-secrets-exposure rule). AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} COMPASS_DATE_TEST_MODEL: ${{ inputs.model }} - # Passthrough of all repo secrets as JSON, so a future gated test - # needing a different secret can read it without editing this file. - # Parse in-test, e.g.: - # json.loads(os.environ["ALL_SECRETS"])["MY_SECRET"] - ALL_SECRETS: ${{ toJSON(secrets) }} MARKER: ${{ inputs.marker }} TEST_PATH: ${{ inputs.test_path }} TEST_FILTER: ${{ inputs.test_filter }} From ba8b8b529ec7518eaeef871f579ab775f6c5f35f Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 15:34:07 -0500 Subject: [PATCH 06/10] Keep only workflow YAML in this PR; markers/task land with eval test --- pixi.toml | 3 --- pyproject.toml | 7 +------ 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/pixi.toml b/pixi.toml index d0be84f79..b7a303ec2 100644 --- a/pixi.toml +++ b/pixi.toml @@ -28,9 +28,6 @@ format = "ruff format ./compass" tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python" tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit" tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration" -# Opt-in accuracy/quality evals (deselected by default; often live LLM calls). -# Runs all evals; narrow with e.g. `pixi run -e pdev evals -m dev_eval`. -evals = "pytest -rapP -vv -s --log-cli-level=INFO -m eval tests/python/integration" [feature.python-doc.tasks] python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }} diff --git a/pyproject.toml b/pyproject.toml index 4c3740d72..893237c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -269,14 +269,9 @@ omit = [ [tool.pytest.ini_options] -addopts = '--disable-warnings -m "not eval"' +addopts = "--disable-warnings" asyncio_mode="auto" asyncio_default_fixture_loop_scope="function" -markers = [ - "eval: accuracy/quality evals, deselected by default (often make live, billable LLM calls). Run explicitly with `-m eval`.", - "dev_eval: evals run frequently during development against the in-repo dev dataset. Implies `eval`.", - "held_out: evals run before a release against the hidden held-out dataset. Implies `eval`.", -] testpaths = [ "tests/python/unit", "tests/python/integration", From c8dcce3a0e384463577a8da56a42c5cc4ff3dd15 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 15:53:28 -0500 Subject: [PATCH 07/10] Rename to evals.yml; restrict markers to dev_eval/held_out; drop model input --- .../workflows/{manual-eval.yml => evals.yml} | 47 +++++-------------- 1 file changed, 12 insertions(+), 35 deletions(-) rename .github/workflows/{manual-eval.yml => evals.yml} (56%) diff --git a/.github/workflows/manual-eval.yml b/.github/workflows/evals.yml similarity index 56% rename from .github/workflows/manual-eval.yml rename to .github/workflows/evals.yml index 15276d9e4..a4d4b5030 100644 --- a/.github/workflows/manual-eval.yml +++ b/.github/workflows/evals.yml @@ -1,28 +1,10 @@ -name: Manual eval +name: Evals -# Manually-triggered runner for evals and other tests deselected from normal -# CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab -# via "Run workflow". -# -# Common uses: -# marker: "eval" -> all evals -# marker: "dev_eval" -> frequent dev-dataset evals -# marker: "held_out" -> pre-release hidden held-out evals -# -# This workflow is intentionally generic: it runs whatever pytest marker / -# path / filter you pass as inputs. To add a new gated test later, just mark -# it with a pytest marker on your branch and dispatch this workflow against -# that branch (the "Use workflow from" dropdown) -- no need to re-merge the -# workflow to the default branch. -# -# Triggers: manual only (workflow_dispatch). Other triggers (scheduled -# dev_eval, PR-label opt-in, etc.) can be added to the `on:` block later; the -# run step already keys off inputs, and event-specific behavior can branch on -# `github.event_name`. -# -# NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from -# the DEFAULT branch, so this file must be on the default branch once. After -# that, the job body and the tests it runs come from the selected branch. +# Do live API call to perform system evaluation (Evals). Typically manually +# triggered after changes to the part of code that affects the LLM behavior. +# Supports 2 kinds of eval markers: +# "dev_eval" -> frequent dev-dataset evals +# "held_out" -> pre-release hidden held-out evals permissions: contents: read @@ -31,11 +13,13 @@ on: workflow_dispatch: inputs: marker: - description: >- - pytest -m marker expression selecting the evals to run - (e.g. "eval", "dev_eval", or "held_out"). + description: Which evals to run. required: false - default: "eval" + type: choice + default: "dev_eval" + options: + - "dev_eval" + - "held_out" test_path: description: Path (dir or file) to collect tests from. required: false @@ -48,12 +32,6 @@ on: description: pixi environment to run the tests in. required: false default: "pdev" - model: - description: >- - Optional model/deployment name, exposed as COMPASS_DATE_TEST_MODEL - for tests that read it. - required: false - default: "" jobs: gated-tests: @@ -83,7 +61,6 @@ jobs: AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} - COMPASS_DATE_TEST_MODEL: ${{ inputs.model }} MARKER: ${{ inputs.marker }} TEST_PATH: ${{ inputs.test_path }} TEST_FILTER: ${{ inputs.test_filter }} From ffab163ef1677b43e544ee9f8e143719c4367bb2 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 16:02:49 -0500 Subject: [PATCH 08/10] Simplify evals workflow: drop test_path/pixi_env inputs --- .github/workflows/evals.yml | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index a4d4b5030..d13c73371 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -5,6 +5,9 @@ name: Evals # Supports 2 kinds of eval markers: # "dev_eval" -> frequent dev-dataset evals # "held_out" -> pre-release hidden held-out evals +# +# `workflow_dispatch` already requires repo write access to trigger, which is +# the (intentional, mild) control over who can kick off these billable runs. permissions: contents: read @@ -20,18 +23,10 @@ on: options: - "dev_eval" - "held_out" - test_path: - description: Path (dir or file) to collect tests from. - required: false - default: "tests/python/integration" test_filter: description: Optional pytest -k expression to run a subset. required: false default: "" - pixi_env: - description: pixi environment to run the tests in. - required: false - default: "pdev" jobs: gated-tests: @@ -50,7 +45,7 @@ jobs: locked: true cache: true cache-write: false - environments: ${{ inputs.pixi_env }} + environments: pdev - name: Run gated tests env: @@ -62,17 +57,15 @@ jobs: AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} MARKER: ${{ inputs.marker }} - TEST_PATH: ${{ inputs.test_path }} TEST_FILTER: ${{ inputs.test_filter }} - PIXI_ENV: ${{ inputs.pixi_env }} run: | - pixi reinstall -e "${PIXI_ENV}" INFRA-COMPASS + pixi reinstall -e pdev INFRA-COMPASS args=(-rapP -vv -s --log-cli-level=INFO -m "${MARKER}") if [ -n "${TEST_FILTER}" ]; then args+=(-k "${TEST_FILTER}") fi - args+=("${TEST_PATH}") + args+=(tests/python/evals) echo "Running: pytest ${args[*]}" - pixi run -e "${PIXI_ENV}" pytest "${args[@]}" + pixi run -e pdev pytest "${args[@]}" From 6b996664527e811f2e906bbba3cc06068d4d427b Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 16:05:52 -0500 Subject: [PATCH 09/10] Add pull-requests/issues write perms for future PR-comment jobs --- .github/workflows/evals.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index d13c73371..c6a89a964 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -1,16 +1,15 @@ -name: Evals - # Do live API call to perform system evaluation (Evals). Typically manually # triggered after changes to the part of code that affects the LLM behavior. # Supports 2 kinds of eval markers: # "dev_eval" -> frequent dev-dataset evals # "held_out" -> pre-release hidden held-out evals # -# `workflow_dispatch` already requires repo write access to trigger, which is -# the (intentional, mild) control over who can kick off these billable runs. +name: Evals permissions: contents: read + pull-requests: write + issues: write on: workflow_dispatch: @@ -49,10 +48,6 @@ jobs: - name: Run gated tests env: - # Pass ONLY the secrets the evals need (least privilege). When a new - # gated test needs a different secret, add it here explicitly -- do - # not pass `toJSON(secrets)`, which exposes every org/repo secret to - # the runner (flagged by CodeQL's excessive-secrets-exposure rule). AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} From 7ffb02311276c41b40a5030ff0ac400fc62bd8e8 Mon Sep 17 00:00:00 2001 From: Rajendra Adhikari Date: Mon, 25 May 2026 16:11:19 -0500 Subject: [PATCH 10/10] Read Azure endpoint/version from repo vars; keep API key as secret --- .github/workflows/evals.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index c6a89a964..304cbf6c8 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -49,8 +49,8 @@ jobs: - name: Run gated tests env: AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} - AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }} + AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_VERSION: ${{ vars.AZURE_OPENAI_VERSION }} MARKER: ${{ inputs.marker }} TEST_FILTER: ${{ inputs.test_filter }} run: |