Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/lightspeed_evaluation/core/llm/deepeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

import litellm
from deepeval.models import LiteLLMModel
from tenacity import stop_after_attempt

from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
# every litellm.completion() call
# Note: Forbidden keys are rejected at LLMParametersConfig load time

# Override DeepEval's hardcoded retry logic with user configuration
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
# We must patch the retry decorators after import but before instantiation
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)

self._patch_deepeval_retries(num_retries)

Comment thread
xmican10 marked this conversation as resolved.
self.llm_model = LiteLLMModel(
model=self.model_name,
timeout=self.llm_params.get("timeout"),
num_retries=self.llm_params.get("num_retries"),
**self.llm_params.get("parameters", {}),
)

print(f"✅ DeepEval LLM Manager: {self.model_name}")

def _patch_deepeval_retries(self, max_retries: int) -> None:
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.

DeepEval's @retry decorators capture MAX_RETRIES at import time.
We patch the 'stop' attribute on each retry decorator to use our value.
"""
# Patch the stop condition on all retry-decorated methods
for method_name in [
"generate",
"a_generate",
"generate_raw_response",
"a_generate_raw_response",
"generate_samples",
]:
method = getattr(LiteLLMModel, method_name)
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
Comment on lines +81 to +82
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't like this but I can't figure out a better solution :-(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bug_test_geval_score_mismatch.yaml was alredy removed, the tests do cover the case :-)

max_retries
Comment thread
xmican10 marked this conversation as resolved.
)

logger.info(
"Patched DeepEval retry logic: max_retries=%d",
max_retries,
)

def setup_ssl_verify(self) -> None:
"""Setup SSL verification based on LLM parameters.

Expand Down
22 changes: 20 additions & 2 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,30 @@ def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
metric.measure(test_case)
self.llm_manager.flush_deepevals_pending_tasks()

score = metric.score
reason = (
metric.reason
if hasattr(metric, "reason") and metric.reason
else f"Score: {metric.score:.2f}"
else f"Score: {score:.2f}" if score is not None else "No score returned"
)
return metric.score, reason

# CRITICAL: Warn if score is None (indicates evaluation failure)
# Without this warning, silent conversion to 0.0 masks bugs like:
# - Rate limiting (429 errors)
# - LLM judge returning malformed JSON that fails parsing
# - Timeout errors from LLM provider
# - API quota/credits exhausted
if score is None:
logger.warning(
"%s metric returned None score; defaulting to 0.0. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
metric.__class__.__name__,
reason,
)
score = 0.0

return score, reason

def evaluate(
self,
Expand Down
36 changes: 34 additions & 2 deletions src/lightspeed_evaluation/core/metrics/geval.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,12 +298,31 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# Without this warning, silent conversion to 0.0 masks bugs like:
# - Rate limiting (429 errors after all retries exhausted)
# - LLM judge returning malformed JSON that fails parsing
# - Timeout errors from LLM provider
# - API quota/credits exhausted
# This makes debugging nearly impossible as failed evaluations
# appear as low scores (0.0) instead of errors.
if score is None:
logger.warning(
"GEval turn-level metric returned None score; defaulting to 0.0. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)
score = 0.0

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
Expand Down Expand Up @@ -411,12 +430,25 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# See turn-level evaluation for detailed explanation of why this matters
if score is None:
logger.warning(
"GEval conversation-level metric returned None score; defaulting to 0.0. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)
score = 0.0

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
Expand Down
61 changes: 61 additions & 0 deletions tests/unit/core/llm/test_deepeval_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Unit tests for DeepEval LLM Manager."""

import logging

import pytest
from pytest_mock import MockerFixture

Expand Down Expand Up @@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None:
DeepEvalLLMManager("gpt-4", {})

assert mock_litellm.drop_params is True

def test_patch_deepeval_retries_called_with_configured_value(
self, mocker: MockerFixture
) -> None:
"""Test that _patch_deepeval_retries patches LiteLLMModel retry logic."""
# Mock LiteLLMModel with methods that have retry decorators
mock_generate = mocker.Mock()
mock_generate.retry = mocker.Mock()
mock_a_generate = mocker.Mock()
mock_a_generate.retry = mocker.Mock()
mock_generate_raw = mocker.Mock()
mock_generate_raw.retry = mocker.Mock()
mock_a_generate_raw = mocker.Mock()
mock_a_generate_raw.retry = mocker.Mock()
mock_generate_samples = mocker.Mock()
mock_generate_samples.retry = mocker.Mock()

mock_litellm_class = mocker.patch(
"lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
)
mock_litellm_class.generate = mock_generate
mock_litellm_class.a_generate = mock_a_generate
mock_litellm_class.generate_raw_response = mock_generate_raw
mock_litellm_class.a_generate_raw_response = mock_a_generate_raw
mock_litellm_class.generate_samples = mock_generate_samples

# Mock stop_after_attempt to return a mock stop condition
mock_stop_condition = mocker.Mock()
mock_stop_after_attempt = mocker.patch(
"lightspeed_evaluation.core.llm.deepeval.stop_after_attempt",
return_value=mock_stop_condition,
)

params = {"num_retries": 5}
DeepEvalLLMManager("gpt-4", params)

# Verify stop_after_attempt was called 5 times (once per method) with the configured retries
assert mock_stop_after_attempt.call_count == 5
mock_stop_after_attempt.assert_called_with(5)

# Verify each method's retry.stop was set to the new stop condition
assert mock_generate.retry.stop == mock_stop_condition
assert mock_a_generate.retry.stop == mock_stop_condition
assert mock_generate_raw.retry.stop == mock_stop_condition
assert mock_a_generate_raw.retry.stop == mock_stop_condition
assert mock_generate_samples.retry.stop == mock_stop_condition

def test_patch_deepeval_retries_logs_operation(
self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture
) -> None:
"""Test that retry patching logs the max_retries value."""
mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")

params = {"num_retries": 7}

with caplog.at_level(logging.INFO):
DeepEvalLLMManager("gpt-4", params)

assert "Patched DeepEval retry logic: max_retries=7" in caplog.text
66 changes: 66 additions & 0 deletions tests/unit/core/metrics/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
"""Pytest configuration and fixtures for metrics tests."""

import sys
from typing import Any

import pytest
from pytest_mock import MockerFixture

from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics
from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig


Expand Down Expand Up @@ -154,3 +156,67 @@ def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture:
return_value=mock_scorer_instance,
)
return mock_scorer_instance


@pytest.fixture
def mock_llm_manager(mocker: MockerFixture) -> Any:
"""Create a mock LLMManager for DeepEval tests."""
mock_manager = mocker.MagicMock()
mock_config = mocker.MagicMock()
mock_config.cache_enabled = False
mock_manager.get_config.return_value = mock_config
mock_manager.get_model_name.return_value = "gpt-4"
mock_manager.get_llm_params.return_value = {"num_retries": 3}
return mock_manager


@pytest.fixture
def mock_metric_manager(mocker: MockerFixture) -> Any:
"""Create a mock MetricManager."""
return mocker.MagicMock()


@pytest.fixture
def mock_deepeval_llm_manager(mocker: MockerFixture) -> Any:
"""Create a mock DeepEvalLLMManager."""
mock_manager = mocker.MagicMock()
mock_llm = mocker.MagicMock()
mock_manager.get_llm.return_value = mock_llm
mock_manager.flush_deepevals_pending_tasks.return_value = None
return mock_manager


@pytest.fixture
def mock_conv_data(mocker: MockerFixture) -> Any:
"""Create mock conversation data with turns."""
turn1 = mocker.MagicMock()
turn1.query = "What is AI?"
turn1.response = "AI stands for Artificial Intelligence."

turn2 = mocker.MagicMock()
turn2.query = "Can you explain more?"
turn2.response = "AI is the simulation of human intelligence by machines."

conv_data = mocker.MagicMock()
conv_data.turns = [turn1, turn2]
return conv_data


@pytest.fixture
def deepeval_metrics(
mock_llm_manager: Any,
mock_metric_manager: Any,
mock_deepeval_llm_manager: Any,
mocker: MockerFixture,
) -> DeepEvalMetrics:
"""Create DeepEvalMetrics instance with mocked dependencies."""
mocker.patch(
"lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager",
return_value=mock_deepeval_llm_manager,
)
mocker.patch("lightspeed_evaluation.core.metrics.deepeval.GEvalHandler")

return DeepEvalMetrics(
llm_manager=mock_llm_manager,
metric_manager=mock_metric_manager,
)
Loading
Loading