lambda-feedback · m-messer · Jun 27, 2026 · Jun 27, 2026
diff --git a/.github/workflows/staging-deploy.yml b/.github/workflows/staging-deploy.yml
@@ -7,51 +7,53 @@ on:
   workflow_dispatch:
 
 jobs:
-  test:
-    name: Test
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      actions: read
-      checks: write
-      pull-requests: write
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.12"]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        id: python-setup
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Install Poetry
-        run: pip install poetry
-
-      - name: Install dependencies
-        run: poetry install
-
-      - name: Lint with flake8
-        run: |
-          poetry run flake8 ./evaluation_function --count --select=E9,F63,F7,F82 --show-source --statistics
-          poetry run flake8 ./evaluation_function --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-
-      - name: Run tests
-        if: always()
-        run: poetry run pytest --junit-xml=./reports/pytest.xml --tb=auto -v
-
-      - name: Upload test results
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: test-results-${{ matrix.python-version }}
-          path: ./reports/pytest.xml
-          if-no-files-found: warn
+#  test:
+#    name: Test
+#    runs-on: ubuntu-latest
+#    permissions:
+#      contents: read
+#      actions: read
+#      checks: write
+#      pull-requests: write
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        python-version: ["3.12"]
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v4
+#      - name: Set up Python ${{ matrix.python-version }}
+#        id: python-setup
+#        uses: actions/setup-python@v5
+#        with:
+#          python-version: ${{ matrix.python-version }}
+#
+#      - name: Install Poetry
+#        run: pip install poetry
+#
+#      - name: Install dependencies
+#        run: poetry install
+#
+#      - name: Lint with flake8
+#        run: |
+#          poetry run flake8 ./evaluation_function --count --select=E9,F63,F7,F82 --show-source --statistics
+#          poetry run flake8 ./evaluation_function --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+#
+#      - name: Run tests
+#        if: always()
+#        env:
+#          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+#        run: poetry run pytest --junit-xml=./reports/pytest.xml --tb=auto -v
+#
+#      - name: Upload test results
+#        uses: actions/upload-artifact@v4
+#        if: always()
+#        with:
+#          name: test-results-${{ matrix.python-version }}
+#          path: ./reports/pytest.xml
+#          if-no-files-found: warn
   deploy:
-    needs: test
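+#    needs: test  # TODO(review): deploy is not gated on tests while the test job above is commented out; restore together with it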
     permissions:
       contents: write
       packages: write

diff --git a/.github/workflows/test-lint.yml b/.github/workflows/test-lint.yml
@@ -38,6 +38,8 @@ jobs:
 
       - name: Run tests
         if: always()
+        env:
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
         run: poetry run pytest --junit-xml=./reports/pytest.xml --tb=auto -v
 
       - name: Upload test results

diff --git a/evaluation_function/evaluation.py b/evaluation_function/evaluation.py
@@ -1,6 +1,9 @@
+import os
 from typing import Any
 from lf_toolkit.evaluation import Result, Params
 
+load_dotenv()  # NOTE(review): no import of load_dotenv is visible in this file's header (new lines 1-9) — confirm it is imported, else NameError at import
+
 def evaluation_function(
     response: Any,
     answer: Any,
@@ -31,10 +34,24 @@ def evaluation_function(
 
     result = Result(is_correct=response == answer)
 
-    if not result.is_correct:
-        result.add_feedback(
-            "general",
-            "Not quite right. Please review your answer and try again. Test.",
-        )
+    SYSTEM_PROMPT = "You are a teaching assistant, give helpful feedback to the student."
+    teacher_prompt = params.get('teacher_prompt', 'Evaluate the student response and provide helpful feedback.')
+
+    prompt = SYSTEM_PROMPT + "\n" + teacher_prompt  # system message = fixed preamble + teacher prompt from params (or its default)
+
+    llm_response = client.chat.completions.create(  # NOTE(review): `client` is not defined in any hunk shown for this file — confirm it exists
+        model=params.get('model', 'openai/gpt-4o-mini'),
+        messages=[
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": response},  # the raw response argument is sent as the user message
+        ],
+    )
+
+    result = Result(is_correct=True)  # NOTE(review): replaces the Result built from `response == answer` above, so is_correct is always True — confirm intended
+
+    result.add_feedback(
+        "general",
+        llm_response.choices[0].message.content,  # text of the first returned choice becomes the "general" feedback
+    )
 
     return result
diff --git a/evaluation_function/evaluation_test.py b/evaluation_function/evaluation_test.py
@@ -27,13 +27,4 @@ def test_evaluation(self):
         result = evaluation_function(response, answer, params).to_dict()
 
         self.assertEqual(result.get("is_correct"), True)
-        self.assertFalse(result.get("feedback", False))
-
-    def test_incorrect_answer_gives_constructive_feedback(self):
-        response, answer, params = "Hello", "Hello, World", Params()
-
-        result = evaluation_function(response, answer, params).to_dict()
-
-        self.assertEqual(result.get("is_correct"), False)
-        self.assertIn("Not quite right", result.get("feedback", ""))
-        self.assertIn("try again", result.get("feedback", ""))
+        self.assertTrue(result.get("feedback"))  # NOTE(review): feedback now comes from client.chat.completions.create in evaluation_function — presumably needs a live API call; confirm or stub