From aa15225d2b1aaa702283801e0f7ee7ad3c764107 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Fri, 22 May 2026 19:16:29 -0500
Subject: [PATCH 01/10] Add manually-triggered workflow for expensive
 (live-LLM) tests

---
 .github/workflows/expensive-tests.yml | 63 +++++++++++++++++++++++++++
 pixi.toml                             |  2 +
 pyproject.toml                        |  5 ++-
 3 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/expensive-tests.yml

diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/expensive-tests.yml
new file mode 100644
index 000000000..fe61cfd9b
--- /dev/null
+++ b/.github/workflows/expensive-tests.yml
@@ -0,0 +1,63 @@
+name: Expensive tests (manual)
+
+# Manually-triggered only. These tests make live, billable LLM calls
+# (e.g. the date-extraction accuracy test) and are deselected from every
+# other workflow via the `expensive` pytest marker. Run from the Actions
+# tab via "Run workflow".
+
+permissions:
+  contents: read
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_filter:
+        description: >-
+          Optional pytest -k expression to run a subset (e.g. "Bartow").
+          Leave blank to run all expensive tests.
+        required: false
+        default: ""
+      model:
+        description: Azure deployment name to exercise.
+        required: false
+        default: "compassop-gpt-5.4"
+
+jobs:
+  expensive-tests:
+    name: Run expensive (live-LLM) tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+
+      - uses: prefix-dev/setup-pixi@1b2de7f3351f171c8b4dfeb558c639cb58ed4ec0 # v0.9.5
+        with:
+          pixi-version: v0.62.2
+          locked: true
+          cache: true
+          cache-write: false
+          environments: pdev
+
+      - name: Run expensive tests
+        env:
+          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
+          AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
+          AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
+          COMPASS_DATE_TEST_MODEL: ${{ github.event.inputs.model }}
+        run: |
+          if [ -z "${AZURE_OPENAI_API_KEY}" ] || [ -z "${AZURE_OPENAI_ENDPOINT}" ]; then
+            echo "::error::AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT secrets are not set."
+            echo "Add them in repo Settings -> Secrets and variables -> Actions."
+            exit 1
+          fi
+          pixi reinstall -e pdev INFRA-COMPASS
+          if [ -n "${{ github.event.inputs.test_filter }}" ]; then
+            pixi run -e pdev pytest -rapP -vv -s --log-cli-level=INFO \
+              -m expensive -k "${{ github.event.inputs.test_filter }}" \
+              tests/python/integration
+          else
+            pixi run -e pdev tests-expensive
+          fi
diff --git a/pixi.toml b/pixi.toml
index b7a303ec2..262d7e447 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -28,6 +28,8 @@ format = "ruff format ./compass"
 tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python"
 tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit"
 tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration"
+# Opt-in tests that make live, billable LLM calls (deselected by default).
+tests-expensive = "pytest -rapP -vv -s --log-cli-level=INFO -m expensive tests/python/integration"
 
 [feature.python-doc.tasks]
 python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }}
diff --git a/pyproject.toml b/pyproject.toml
index 893237c2d..88086c6eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -269,9 +269,12 @@ omit = [
 
 
 [tool.pytest.ini_options]
-addopts = "--disable-warnings"
+addopts = '--disable-warnings -m "not expensive"'
 asyncio_mode="auto"
 asyncio_default_fixture_loop_scope="function"
+markers = [
+  "expensive: opt-in tests that are deselected by default (e.g. tests that make live, billable LLM calls). Run explicitly with `-m expensive`.",
+]
 testpaths = [
   "tests/python/unit",
   "tests/python/integration",

From 8187df31dac6d5a1da2596d4a2e1d05139265aed Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 11:45:53 -0500
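Subject: [PATCH 02/10] Make gated-test workflow generic (marker/path/env
 inputs)

Inputs are now handed to the run step through env vars and a bash args
array instead of being interpolated into the script with `${{ ... }}`.

This commit also:
- bumps the pinned pixi version from v0.62.2 to v0.68.1,
- adds an ALL_SECRETS env var populated from toJSON(secrets),
- drops the up-front check that the Azure OpenAI secrets are set.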

---
 .github/workflows/expensive-tests.yml | 86 ++++++++++++++++++---------
 1 file changed, 58 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/expensive-tests.yml
index fe61cfd9b..a01a59879 100644
--- a/.github/workflows/expensive-tests.yml
+++ b/.github/workflows/expensive-tests.yml
@@ -1,9 +1,18 @@
-name: Expensive tests (manual)
+name: Gated tests (manual)
 
-# Manually-triggered only. These tests make live, billable LLM calls
-# (e.g. the date-extraction accuracy test) and are deselected from every
-# other workflow via the `expensive` pytest marker. Run from the Actions
-# tab via "Run workflow".
+# Manually-triggered runner for tests that are deselected from normal CI
+# (e.g. tests that make live, billable LLM calls). Run from the Actions tab
+# via "Run workflow".
+#
+# This workflow is intentionally generic: it runs whatever pytest marker /
+# path / filter you pass as inputs. To add a new gated test later, just mark
+# it with a pytest marker on your branch and dispatch this workflow against
+# that branch (the "Use workflow from" dropdown) -- no need to re-merge the
+# workflow to the default branch.
+#
+# NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from
+# the DEFAULT branch, so this file must be on the default branch once. After
+# that, the job body and the tests it runs come from the selected branch.
 
 permissions:
   contents: read
@@ -11,20 +20,34 @@ permissions:
 on:
   workflow_dispatch:
     inputs:
-      test_filter:
+      marker:
         description: >-
-          Optional pytest -k expression to run a subset (e.g. "Bartow").
-          Leave blank to run all expensive tests.
+          pytest -m marker expression selecting the gated tests to run
+          (e.g. "expensive", or "expensive and not slow").
+        required: false
+        default: "expensive"
+      test_path:
+        description: Path (dir or file) to collect tests from.
+        required: false
+        default: "tests/python/integration"
+      test_filter:
+        description: Optional pytest -k expression to run a subset.
         required: false
         default: ""
+      pixi_env:
+        description: pixi environment to run the tests in.
+        required: false
+        default: "pdev"
       model:
-        description: Azure deployment name to exercise.
+        description: >-
+          Optional model/deployment name, exposed as COMPASS_DATE_TEST_MODEL
+          for tests that read it.
         required: false
-        default: "compassop-gpt-5.4"
+        default: ""
 
 jobs:
-  expensive-tests:
-    name: Run expensive (live-LLM) tests
+  gated-tests:
+    name: Run gated tests
     runs-on: ubuntu-latest
     steps:
       - name: Checkout Repo
@@ -35,29 +58,36 @@ jobs:
 
       - uses: prefix-dev/setup-pixi@1b2de7f3351f171c8b4dfeb558c639cb58ed4ec0 # v0.9.5
         with:
-          pixi-version: v0.62.2
+          pixi-version: v0.68.1
           locked: true
           cache: true
           cache-write: false
-          environments: pdev
+          environments: ${{ inputs.pixi_env }}
 
-      - name: Run expensive tests
+      - name: Run gated tests
         env:
+          # Explicit, well-known creds for the current live-LLM tests.
           AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
           AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
           AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
-          COMPASS_DATE_TEST_MODEL: ${{ github.event.inputs.model }}
+          COMPASS_DATE_TEST_MODEL: ${{ inputs.model }}
+          # Passthrough of all repo secrets as JSON, so a future gated test
+          # needing a different secret can read it without editing this file.
+          # Parse in-test, e.g.:
+          #   json.loads(os.environ["ALL_SECRETS"])["MY_SECRET"]
+          ALL_SECRETS: ${{ toJSON(secrets) }}
+          MARKER: ${{ inputs.marker }}
+          TEST_PATH: ${{ inputs.test_path }}
+          TEST_FILTER: ${{ inputs.test_filter }}
+          PIXI_ENV: ${{ inputs.pixi_env }}
         run: |
-          if [ -z "${AZURE_OPENAI_API_KEY}" ] || [ -z "${AZURE_OPENAI_ENDPOINT}" ]; then
-            echo "::error::AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT secrets are not set."
-            echo "Add them in repo Settings -> Secrets and variables -> Actions."
-            exit 1
-          fi
-          pixi reinstall -e pdev INFRA-COMPASS
-          if [ -n "${{ github.event.inputs.test_filter }}" ]; then
-            pixi run -e pdev pytest -rapP -vv -s --log-cli-level=INFO \
-              -m expensive -k "${{ github.event.inputs.test_filter }}" \
-              tests/python/integration
-          else
-            pixi run -e pdev tests-expensive
+          pixi reinstall -e "${PIXI_ENV}" INFRA-COMPASS
+
+          args=(-rapP -vv -s --log-cli-level=INFO -m "${MARKER}")
+          if [ -n "${TEST_FILTER}" ]; then
+            args+=(-k "${TEST_FILTER}")
           fi
+          args+=("${TEST_PATH}")
+
+          echo "Running: pytest ${args[*]}"
+          pixi run -e "${PIXI_ENV}" pytest "${args[@]}"

From 7fab5aee4ce4d4f365e3d973444bc8b690ab874b Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 14:56:59 -0500
Subject: [PATCH 03/10] Rename gating to eval markers (eval/dev_eval/held_out)

---
 .../{expensive-tests.yml => evals.yml}          | 17 +++++++++++------
 pixi.toml                                       |  5 +++--
 pyproject.toml                                  |  6 ++++--
 3 files changed, 18 insertions(+), 10 deletions(-)
 rename .github/workflows/{expensive-tests.yml => evals.yml} (85%)

diff --git a/.github/workflows/expensive-tests.yml b/.github/workflows/evals.yml
similarity index 85%
rename from .github/workflows/expensive-tests.yml
rename to .github/workflows/evals.yml
index a01a59879..e7de60a62 100644
--- a/.github/workflows/expensive-tests.yml
+++ b/.github/workflows/evals.yml
@@ -1,9 +1,14 @@
-name: Gated tests (manual)
+name: Evals (manual)
 
-# Manually-triggered runner for tests that are deselected from normal CI
-# (e.g. tests that make live, billable LLM calls). Run from the Actions tab
+# Manually-triggered runner for evals and other tests deselected from normal
+# CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab
 # via "Run workflow".
 #
+# Common uses:
+#   marker: "eval"               -> all evals
+#   marker: "dev_eval"           -> frequent dev-dataset evals
+#   marker: "held_out"           -> pre-release hidden held-out evals
+#
 # This workflow is intentionally generic: it runs whatever pytest marker /
 # path / filter you pass as inputs. To add a new gated test later, just mark
 # it with a pytest marker on your branch and dispatch this workflow against
@@ -22,10 +27,10 @@ on:
     inputs:
       marker:
         description: >-
-          pytest -m marker expression selecting the gated tests to run
-          (e.g. "expensive", or "expensive and not slow").
+          pytest -m marker expression selecting the evals to run
+          (e.g. "eval", "dev_eval", or "held_out").
         required: false
-        default: "expensive"
+        default: "eval"
       test_path:
         description: Path (dir or file) to collect tests from.
         required: false
diff --git a/pixi.toml b/pixi.toml
index 262d7e447..d0be84f79 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -28,8 +28,9 @@ format = "ruff format ./compass"
 tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python"
 tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit"
 tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration"
-# Opt-in tests that make live, billable LLM calls (deselected by default).
-tests-expensive = "pytest -rapP -vv -s --log-cli-level=INFO -m expensive tests/python/integration"
+# Opt-in accuracy/quality evals (deselected by default; often live LLM calls).
+# Runs all evals; narrow with e.g. `pixi run -e pdev evals -m dev_eval`.
+evals = "pytest -rapP -vv -s --log-cli-level=INFO -m eval tests/python/integration"
 
 [feature.python-doc.tasks]
 python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }}
diff --git a/pyproject.toml b/pyproject.toml
index 88086c6eb..4c3740d72 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -269,11 +269,13 @@ omit = [
 
 
 [tool.pytest.ini_options]
-addopts = '--disable-warnings -m "not expensive"'
+addopts = '--disable-warnings -m "not eval"'
 asyncio_mode="auto"
 asyncio_default_fixture_loop_scope="function"
 markers = [
-  "expensive: opt-in tests that are deselected by default (e.g. tests that make live, billable LLM calls). Run explicitly with `-m expensive`.",
+  "eval: accuracy/quality evals, deselected by default (often make live, billable LLM calls). Run explicitly with `-m eval`.",
+  "dev_eval: evals run frequently during development against the in-repo dev dataset. Implies `eval`.",
+  "held_out: evals run before a release against the hidden held-out dataset. Implies `eval`.",
 ]
 testpaths = [
   "tests/python/unit",

From 3f42bdec342409e6925903f7395aff88754a2060 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 15:11:28 -0500
Subject: [PATCH 04/10] Rename eval workflow to manual-eval; note future
 triggers

---
 .github/workflows/{evals.yml => manual-eval.yml} | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
 rename .github/workflows/{evals.yml => manual-eval.yml} (92%)

diff --git a/.github/workflows/evals.yml b/.github/workflows/manual-eval.yml
similarity index 92%
rename from .github/workflows/evals.yml
rename to .github/workflows/manual-eval.yml
index e7de60a62..4c0523026 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/manual-eval.yml
@@ -1,4 +1,4 @@
-name: Evals (manual)
+name: Manual eval
 
 # Manually-triggered runner for evals and other tests deselected from normal
 # CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab
@@ -15,6 +15,11 @@ name: Evals (manual)
 # that branch (the "Use workflow from" dropdown) -- no need to re-merge the
 # workflow to the default branch.
 #
+# Triggers: manual only (workflow_dispatch). Other triggers (scheduled
+# dev_eval, PR-label opt-in, etc.) can be added to the `on:` block later; the
+# run step already keys off inputs, and event-specific behavior can branch on
+# `github.event_name`.
+#
 # NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from
 # the DEFAULT branch, so this file must be on the default branch once. After
 # that, the job body and the tests it runs come from the selected branch.

From 09a63dce2902cfdf355154761bb9ac2c1f473ea8 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 15:13:14 -0500
Subject: [PATCH 05/10] Drop toJSON(secrets) passthrough; pass only needed
 secrets

---
 .github/workflows/manual-eval.yml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/manual-eval.yml b/.github/workflows/manual-eval.yml
index 4c0523026..15276d9e4 100644
--- a/.github/workflows/manual-eval.yml
+++ b/.github/workflows/manual-eval.yml
@@ -76,16 +76,14 @@ jobs:
 
       - name: Run gated tests
         env:
-          # Explicit, well-known creds for the current live-LLM tests.
+          # Pass ONLY the secrets the evals need (least privilege). When a new
+          # gated test needs a different secret, add it here explicitly -- do
+          # not pass `toJSON(secrets)`, which exposes every org/repo secret to
+          # the runner (flagged by CodeQL's excessive-secrets-exposure rule).
           AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
           AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
           AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
           COMPASS_DATE_TEST_MODEL: ${{ inputs.model }}
-          # Passthrough of all repo secrets as JSON, so a future gated test
-          # needing a different secret can read it without editing this file.
-          # Parse in-test, e.g.:
-          #   json.loads(os.environ["ALL_SECRETS"])["MY_SECRET"]
-          ALL_SECRETS: ${{ toJSON(secrets) }}
           MARKER: ${{ inputs.marker }}
           TEST_PATH: ${{ inputs.test_path }}
           TEST_FILTER: ${{ inputs.test_filter }}

From ba8b8b529ec7518eaeef871f579ab775f6c5f35f Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 15:34:07 -0500
Subject: [PATCH 06/10] Keep only workflow YAML in this PR; markers/task land
 with eval test

---
 pixi.toml      | 3 ---
 pyproject.toml | 7 +------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/pixi.toml b/pixi.toml
index d0be84f79..b7a303ec2 100644
--- a/pixi.toml
+++ b/pixi.toml
@@ -28,9 +28,6 @@ format = "ruff format ./compass"
 tests-p = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python"
 tests-u = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=30 tests/python/unit"
 tests-i = "pytest --durations=20 -rapP -vv --cov=compass --cov-report=html --cov-branch --cov-report=xml:coverage.xml --cov-fail-under=10 tests/python/integration"
-# Opt-in accuracy/quality evals (deselected by default; often live LLM calls).
-# Runs all evals; narrow with e.g. `pixi run -e pdev evals -m dev_eval`.
-evals = "pytest -rapP -vv -s --log-cli-level=INFO -m eval tests/python/integration"
 
 [feature.python-doc.tasks]
 python-docs = { cmd = "make clean html", cwd = "docs", env = { SPHINXOPTS = "--fail-on-warning --keep-going --nitpicky" }}
diff --git a/pyproject.toml b/pyproject.toml
index 4c3740d72..893237c2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -269,14 +269,9 @@ omit = [
 
 
 [tool.pytest.ini_options]
-addopts = '--disable-warnings -m "not eval"'
+addopts = "--disable-warnings"
 asyncio_mode="auto"
 asyncio_default_fixture_loop_scope="function"
-markers = [
-  "eval: accuracy/quality evals, deselected by default (often make live, billable LLM calls). Run explicitly with `-m eval`.",
-  "dev_eval: evals run frequently during development against the in-repo dev dataset. Implies `eval`.",
-  "held_out: evals run before a release against the hidden held-out dataset. Implies `eval`.",
-]
 testpaths = [
   "tests/python/unit",
   "tests/python/integration",

From c8dcce3a0e384463577a8da56a42c5cc4ff3dd15 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 15:53:28 -0500
Subject: [PATCH 07/10] Rename to evals.yml; restrict markers to
 dev_eval/held_out; drop model input

---
 .../workflows/{manual-eval.yml => evals.yml}  | 47 +++++--------------
 1 file changed, 12 insertions(+), 35 deletions(-)
 rename .github/workflows/{manual-eval.yml => evals.yml} (56%)

diff --git a/.github/workflows/manual-eval.yml b/.github/workflows/evals.yml
similarity index 56%
rename from .github/workflows/manual-eval.yml
rename to .github/workflows/evals.yml
index 15276d9e4..a4d4b5030 100644
--- a/.github/workflows/manual-eval.yml
+++ b/.github/workflows/evals.yml
@@ -1,28 +1,10 @@
-name: Manual eval
+name: Evals
 
-# Manually-triggered runner for evals and other tests deselected from normal
-# CI (e.g. tests that make live, billable LLM calls). Run from the Actions tab
-# via "Run workflow".
-#
-# Common uses:
-#   marker: "eval"               -> all evals
-#   marker: "dev_eval"           -> frequent dev-dataset evals
-#   marker: "held_out"           -> pre-release hidden held-out evals
-#
-# This workflow is intentionally generic: it runs whatever pytest marker /
-# path / filter you pass as inputs. To add a new gated test later, just mark
-# it with a pytest marker on your branch and dispatch this workflow against
-# that branch (the "Use workflow from" dropdown) -- no need to re-merge the
-# workflow to the default branch.
-#
-# Triggers: manual only (workflow_dispatch). Other triggers (scheduled
-# dev_eval, PR-label opt-in, etc.) can be added to the `on:` block later; the
-# run step already keys off inputs, and event-specific behavior can branch on
-# `github.event_name`.
-#
-# NOTE: GitHub reads the trigger/inputs of a `workflow_dispatch` workflow from
-# the DEFAULT branch, so this file must be on the default branch once. After
-# that, the job body and the tests it runs come from the selected branch.
+# Do live API call to perform system evaluation (Evals). Typically manually
+# triggered after changes to the part of code that affects the LLM behavior.
+# Supports 2 kinds of eval markers:
+#   "dev_eval"   -> frequent dev-dataset evals
+#   "held_out"   -> pre-release hidden held-out evals
 
 permissions:
   contents: read
@@ -31,11 +13,13 @@ on:
   workflow_dispatch:
     inputs:
       marker:
-        description: >-
-          pytest -m marker expression selecting the evals to run
-          (e.g. "eval", "dev_eval", or "held_out").
+        description: Which evals to run.
         required: false
-        default: "eval"
+        type: choice
+        default: "dev_eval"
+        options:
+          - "dev_eval"
+          - "held_out"
       test_path:
         description: Path (dir or file) to collect tests from.
         required: false
@@ -48,12 +32,6 @@ on:
         description: pixi environment to run the tests in.
         required: false
         default: "pdev"
-      model:
-        description: >-
-          Optional model/deployment name, exposed as COMPASS_DATE_TEST_MODEL
-          for tests that read it.
-        required: false
-        default: ""
 
 jobs:
   gated-tests:
@@ -83,7 +61,6 @@ jobs:
           AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
           AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
           AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
-          COMPASS_DATE_TEST_MODEL: ${{ inputs.model }}
           MARKER: ${{ inputs.marker }}
           TEST_PATH: ${{ inputs.test_path }}
           TEST_FILTER: ${{ inputs.test_filter }}

From ffab163ef1677b43e544ee9f8e143719c4367bb2 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 16:02:49 -0500
Subject: [PATCH 08/10] Simplify evals workflow: drop test_path/pixi_env inputs

---
 .github/workflows/evals.yml | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index a4d4b5030..d13c73371 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -5,6 +5,9 @@ name: Evals
 # Supports 2 kinds of eval markers:
 #   "dev_eval"   -> frequent dev-dataset evals
 #   "held_out"   -> pre-release hidden held-out evals
+#
+# `workflow_dispatch` already requires repo write access to trigger, which is
+# the (intentional, mild) control over who can kick off these billable runs.
 
 permissions:
   contents: read
@@ -20,18 +23,10 @@ on:
         options:
           - "dev_eval"
           - "held_out"
-      test_path:
-        description: Path (dir or file) to collect tests from.
-        required: false
-        default: "tests/python/integration"
       test_filter:
         description: Optional pytest -k expression to run a subset.
         required: false
         default: ""
-      pixi_env:
-        description: pixi environment to run the tests in.
-        required: false
-        default: "pdev"
 
 jobs:
   gated-tests:
@@ -50,7 +45,7 @@ jobs:
           locked: true
           cache: true
           cache-write: false
-          environments: ${{ inputs.pixi_env }}
+          environments: pdev
 
       - name: Run gated tests
         env:
@@ -62,17 +57,15 @@ jobs:
           AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
           AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
           MARKER: ${{ inputs.marker }}
-          TEST_PATH: ${{ inputs.test_path }}
           TEST_FILTER: ${{ inputs.test_filter }}
-          PIXI_ENV: ${{ inputs.pixi_env }}
         run: |
-          pixi reinstall -e "${PIXI_ENV}" INFRA-COMPASS
+          pixi reinstall -e pdev INFRA-COMPASS
 
           args=(-rapP -vv -s --log-cli-level=INFO -m "${MARKER}")
           if [ -n "${TEST_FILTER}" ]; then
             args+=(-k "${TEST_FILTER}")
           fi
-          args+=("${TEST_PATH}")
+          args+=(tests/python/evals)
 
           echo "Running: pytest ${args[*]}"
-          pixi run -e "${PIXI_ENV}" pytest "${args[@]}"
+          pixi run -e pdev pytest "${args[@]}"

From 6b996664527e811f2e906bbba3cc06068d4d427b Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 16:05:52 -0500
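Subject: [PATCH 09/10] Add pull-requests/issues write perms for future
 PR-comment jobs

Also moves `name:` below the header comment and removes the comments
that described workflow_dispatch access control and least-privilege
secret passing.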

---
 .github/workflows/evals.yml | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index d13c73371..c6a89a964 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -1,16 +1,15 @@
-name: Evals
-
 # Do live API call to perform system evaluation (Evals). Typically manually
 # triggered after changes to the part of code that affects the LLM behavior.
 # Supports 2 kinds of eval markers:
 #   "dev_eval"   -> frequent dev-dataset evals
 #   "held_out"   -> pre-release hidden held-out evals
 #
-# `workflow_dispatch` already requires repo write access to trigger, which is
-# the (intentional, mild) control over who can kick off these billable runs.
+name: Evals
 
 permissions:
   contents: read
+  # Least privilege: nothing here writes to PRs/issues yet. Add
+  # `pull-requests: write` / `issues: write` on the job that needs them.
 
 on:
   workflow_dispatch:
@@ -49,10 +48,6 @@ jobs:
 
       - name: Run gated tests
         env:
-          # Pass ONLY the secrets the evals need (least privilege). When a new
-          # gated test needs a different secret, add it here explicitly -- do
-          # not pass `toJSON(secrets)`, which exposes every org/repo secret to
-          # the runner (flagged by CodeQL's excessive-secrets-exposure rule).
           AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
           AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
           AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}

From 7ffb02311276c41b40a5030ff0ac400fc62bd8e8 Mon Sep 17 00:00:00 2001
From: Rajendra Adhikari <rajendraadhikari.ee@gmail.com>
Date: Mon, 25 May 2026 16:11:19 -0500
Subject: [PATCH 10/10] Read Azure endpoint/version from repo vars; keep API
 key as secret

---
 .github/workflows/evals.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index c6a89a964..304cbf6c8 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -49,8 +49,8 @@ jobs:
       - name: Run gated tests
         env:
           AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
-          AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
-          AZURE_OPENAI_VERSION: ${{ secrets.AZURE_OPENAI_VERSION }}
+          AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
+          AZURE_OPENAI_VERSION: ${{ vars.AZURE_OPENAI_VERSION }}
           MARKER: ${{ inputs.marker }}
           TEST_FILTER: ${{ inputs.test_filter }}
         run: |