diff --git a/src/lightspeed_evaluation/core/llm/deepeval.py b/src/lightspeed_evaluation/core/llm/deepeval.py index 3503f930..d6090047 100644 --- a/src/lightspeed_evaluation/core/llm/deepeval.py +++ b/src/lightspeed_evaluation/core/llm/deepeval.py @@ -10,7 +10,9 @@ import litellm from deepeval.models import LiteLLMModel +from tenacity import stop_after_attempt +from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl logger = logging.getLogger(__name__) @@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]): # LiteLLMModel stores **kwargs in self.kwargs and merges them into # every litellm.completion() call # Note: Forbidden keys are rejected at LLMParametersConfig load time + + # Override DeepEval's hardcoded retry logic with user configuration + # DeepEval uses @retry decorators that capture MAX_RETRIES at import time + # We must patch the retry decorators after import but before instantiation + num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES) + + self._patch_deepeval_retries(num_retries) + self.llm_model = LiteLLMModel( model=self.model_name, timeout=self.llm_params.get("timeout"), - num_retries=self.llm_params.get("num_retries"), **self.llm_params.get("parameters", {}), ) print(f"✅ DeepEval LLM Manager: {self.model_name}") + def _patch_deepeval_retries(self, max_retries: int) -> None: + """Monkey-patch DeepEval's retry decorators to use configured max_retries. + + DeepEval's @retry decorators capture MAX_RETRIES at import time. + We patch the 'stop' attribute on each retry decorator to use our value. + """ + # Patch the stop condition on all retry-decorated methods + for method_name in [ + "generate", + "a_generate", + "generate_raw_response", + "a_generate_raw_response", + "generate_samples", + ]: + method = getattr(LiteLLMModel, method_name) + method.retry.stop = stop_after_attempt( # pylint: disable=no-member + max_retries + ) + + logger.info( + "Patched DeepEval retry logic: max_retries=%d", + max_retries, + ) + def setup_ssl_verify(self) -> None: """Setup SSL verification based on LLM parameters. diff --git a/src/lightspeed_evaluation/core/metrics/deepeval.py b/src/lightspeed_evaluation/core/metrics/deepeval.py index a7aa1697..e66852db 100644 --- a/src/lightspeed_evaluation/core/metrics/deepeval.py +++ b/src/lightspeed_evaluation/core/metrics/deepeval.py @@ -92,12 +92,30 @@ def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]: metric.measure(test_case) self.llm_manager.flush_deepevals_pending_tasks() + score = metric.score reason = ( metric.reason if hasattr(metric, "reason") and metric.reason - else f"Score: {metric.score:.2f}" + else f"Score: {score:.2f}" if score is not None else "No score returned" ) - return metric.score, reason + + # CRITICAL: Warn if score is None (indicates evaluation failure) + # Without this warning, silent conversion to 0.0 masks bugs like: + # - Rate limiting (429 errors) + # - LLM judge returning malformed JSON that fails parsing + # - Timeout errors from LLM provider + # - API quota/credits exhausted + if score is None: + logger.warning( + "%s metric returned None score; defaulting to 0.0. " + "This typically indicates LLM judge failure (rate limiting, timeout, " + "invalid JSON response, or quota exhausted). Reason: %s", + metric.__class__.__name__, + reason, + ) + score = 0.0 + + return score, reason def evaluate( self, diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index 3b2b8d48..f668a322 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -298,12 +298,31 @@ def _evaluate_turn( # pylint: disable=R0913,R0917 metric.measure(test_case) self.deepeval_llm_manager.flush_deepevals_pending_tasks() - score = metric.score if metric.score is not None else 0.0 + # Extract score and reason + score = metric.score reason = ( str(metric.reason) if hasattr(metric, "reason") and metric.reason else "No reason provided" ) + + # CRITICAL: Warn if score is None (indicates evaluation failure) + # Without this warning, silent conversion to 0.0 masks bugs like: + # - Rate limiting (429 errors after all retries exhausted) + # - LLM judge returning malformed JSON that fails parsing + # - Timeout errors from LLM provider + # - API quota/credits exhausted + # This makes debugging nearly impossible as failed evaluations + # appear as low scores (0.0) instead of errors. + if score is None: + logger.warning( + "GEval turn-level metric returned None score; defaulting to 0.0. " + "This typically indicates LLM judge failure (rate limiting, timeout, " + "invalid JSON response, or quota exhausted). Reason: %s", + reason, + ) + score = 0.0 + return score, reason except Exception as e: # pylint: disable=W0718 logger.error( @@ -411,12 +430,25 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914 metric.measure(test_case) self.deepeval_llm_manager.flush_deepevals_pending_tasks() - score = metric.score if metric.score is not None else 0.0 + # Extract score and reason + score = metric.score reason = ( str(metric.reason) if hasattr(metric, "reason") and metric.reason else "No reason provided" ) + + # CRITICAL: Warn if score is None (indicates evaluation failure) + # See turn-level evaluation for detailed explanation of why this matters + if score is None: + logger.warning( + "GEval conversation-level metric returned None score; defaulting to 0.0. " + "This typically indicates LLM judge failure (rate limiting, timeout, " + "invalid JSON response, or quota exhausted). Reason: %s", + reason, + ) + score = 0.0 + return score, reason except Exception as e: # pylint: disable=W0718 logger.error( diff --git a/tests/unit/core/llm/test_deepeval_manager.py b/tests/unit/core/llm/test_deepeval_manager.py index 53253d22..8cbb8094 100644 --- a/tests/unit/core/llm/test_deepeval_manager.py +++ b/tests/unit/core/llm/test_deepeval_manager.py @@ -1,5 +1,7 @@ """Unit tests for DeepEval LLM Manager.""" +import logging + import pytest from pytest_mock import MockerFixture @@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None: DeepEvalLLMManager("gpt-4", {}) assert mock_litellm.drop_params is True + + def test_patch_deepeval_retries_called_with_configured_value( + self, mocker: MockerFixture + ) -> None: + """Test that _patch_deepeval_retries patches LiteLLMModel retry logic.""" + # Mock LiteLLMModel with methods that have retry decorators + mock_generate = mocker.Mock() + mock_generate.retry = mocker.Mock() + mock_a_generate = mocker.Mock() + mock_a_generate.retry = mocker.Mock() + mock_generate_raw = mocker.Mock() + mock_generate_raw.retry = mocker.Mock() + mock_a_generate_raw = mocker.Mock() + mock_a_generate_raw.retry = mocker.Mock() + mock_generate_samples = mocker.Mock() + mock_generate_samples.retry = mocker.Mock() + + mock_litellm_class = mocker.patch( + "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel" + ) + mock_litellm_class.generate = mock_generate + mock_litellm_class.a_generate = mock_a_generate + mock_litellm_class.generate_raw_response = mock_generate_raw + mock_litellm_class.a_generate_raw_response = mock_a_generate_raw + mock_litellm_class.generate_samples = mock_generate_samples + + # Mock stop_after_attempt to return a mock stop condition + mock_stop_condition = mocker.Mock() + mock_stop_after_attempt = mocker.patch( + "lightspeed_evaluation.core.llm.deepeval.stop_after_attempt", + return_value=mock_stop_condition, + ) + + params = {"num_retries": 5} + DeepEvalLLMManager("gpt-4", params) + + # Verify stop_after_attempt was called 5 times (once per method) with the configured retries + assert mock_stop_after_attempt.call_count == 5 + mock_stop_after_attempt.assert_called_with(5) + + # Verify each method's retry.stop was set to the new stop condition + assert mock_generate.retry.stop == mock_stop_condition + assert mock_a_generate.retry.stop == mock_stop_condition + assert mock_generate_raw.retry.stop == mock_stop_condition + assert mock_a_generate_raw.retry.stop == mock_stop_condition + assert mock_generate_samples.retry.stop == mock_stop_condition + + def test_patch_deepeval_retries_logs_operation( + self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture + ) -> None: + """Test that retry patching logs the max_retries value.""" + mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel") + + params = {"num_retries": 7} + + with caplog.at_level(logging.INFO): + DeepEvalLLMManager("gpt-4", params) + + assert "Patched DeepEval retry logic: max_retries=7" in caplog.text diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py index e7bc39e0..b73e3fe6 100644 --- a/tests/unit/core/metrics/conftest.py +++ b/tests/unit/core/metrics/conftest.py @@ -3,11 +3,13 @@ """Pytest configuration and fixtures for metrics tests.""" import sys +from typing import Any import pytest from pytest_mock import MockerFixture from lightspeed_evaluation.core.metrics.nlp import NLPMetrics +from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig @@ -154,3 +156,67 @@ def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture: return_value=mock_scorer_instance, ) return mock_scorer_instance + + +@pytest.fixture +def mock_llm_manager(mocker: MockerFixture) -> Any: + """Create a mock LLMManager for DeepEval tests.""" + mock_manager = mocker.MagicMock() + mock_config = mocker.MagicMock() + mock_config.cache_enabled = False + mock_manager.get_config.return_value = mock_config + mock_manager.get_model_name.return_value = "gpt-4" + mock_manager.get_llm_params.return_value = {"num_retries": 3} + return mock_manager + + +@pytest.fixture +def mock_metric_manager(mocker: MockerFixture) -> Any: + """Create a mock MetricManager.""" + return mocker.MagicMock() + + +@pytest.fixture +def mock_deepeval_llm_manager(mocker: MockerFixture) -> Any: + """Create a mock DeepEvalLLMManager.""" + mock_manager = mocker.MagicMock() + mock_llm = mocker.MagicMock() + mock_manager.get_llm.return_value = mock_llm + mock_manager.flush_deepevals_pending_tasks.return_value = None + return mock_manager + + +@pytest.fixture +def mock_conv_data(mocker: MockerFixture) -> Any: + """Create mock conversation data with turns.""" + turn1 = mocker.MagicMock() + turn1.query = "What is AI?" + turn1.response = "AI stands for Artificial Intelligence." + + turn2 = mocker.MagicMock() + turn2.query = "Can you explain more?" + turn2.response = "AI is the simulation of human intelligence by machines." + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1, turn2] + return conv_data + + +@pytest.fixture +def deepeval_metrics( + mock_llm_manager: Any, + mock_metric_manager: Any, + mock_deepeval_llm_manager: Any, + mocker: MockerFixture, +) -> DeepEvalMetrics: + """Create DeepEvalMetrics instance with mocked dependencies.""" + mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager", + return_value=mock_deepeval_llm_manager, + ) + mocker.patch("lightspeed_evaluation.core.metrics.deepeval.GEvalHandler") + + return DeepEvalMetrics( + llm_manager=mock_llm_manager, + metric_manager=mock_metric_manager, + ) diff --git a/tests/unit/core/metrics/test_deepeval.py b/tests/unit/core/metrics/test_deepeval.py new file mode 100644 index 00000000..a8bf5f17 --- /dev/null +++ b/tests/unit/core/metrics/test_deepeval.py @@ -0,0 +1,238 @@ +# pylint: disable=protected-access, unused-variable +"""Unit tests for DeepEval metrics handler.""" + +import logging +from typing import Any + +import pytest +from pytest_mock import MockerFixture + +from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics +from lightspeed_evaluation.core.models import EvaluationScope + + +class TestDeepEvalMetrics: + """Tests for DeepEvalMetrics class.""" + + def test_evaluate_metric_none_score_returns_zero_with_warning( + self, + deepeval_metrics: DeepEvalMetrics, + mocker: MockerFixture, + caplog: pytest.LogCaptureFixture, + ) -> None: + """Test that None score from metric is converted to 0.0 with warning.""" + # Create a mock metric that returns None score + mock_metric = mocker.MagicMock() + mock_metric.score = None + mock_metric.reason = "LLM judge failed to parse response" + mock_metric.__class__.__name__ = "TestMetric" + + # Create a mock test case + mock_test_case = mocker.MagicMock() + + with caplog.at_level(logging.WARNING): + score, reason = deepeval_metrics._evaluate_metric( + mock_metric, mock_test_case + ) + + # Verify score is converted to 0.0 + assert score == 0.0 + assert reason == "LLM judge failed to parse response" + + # Verify warning was logged + assert "TestMetric metric returned None score" in caplog.text + assert "defaulting to 0.0" in caplog.text + assert "rate limiting, timeout" in caplog.text + + def test_evaluate_metric_with_valid_score( + self, deepeval_metrics: DeepEvalMetrics, mocker: MockerFixture + ) -> None: + """Test that valid score is returned as-is.""" + mock_metric = mocker.MagicMock() + mock_metric.score = 0.85 + mock_metric.reason = "Good response quality" + + mock_test_case = mocker.MagicMock() + + score, reason = deepeval_metrics._evaluate_metric(mock_metric, mock_test_case) + + assert score == 0.85 + assert reason == "Good response quality" + + def test_evaluate_metric_without_reason( + self, deepeval_metrics: DeepEvalMetrics, mocker: MockerFixture + ) -> None: + """Test that metric without reason gets default reason.""" + mock_metric = mocker.MagicMock() + mock_metric.score = 0.75 + # Simulate metric without reason attribute + del mock_metric.reason + + mock_test_case = mocker.MagicMock() + + score, reason = deepeval_metrics._evaluate_metric(mock_metric, mock_test_case) + + assert score == 0.75 + assert "Score: 0.75" in reason + + def test_evaluate_metric_none_score_without_reason( + self, deepeval_metrics: DeepEvalMetrics, mocker: MockerFixture + ) -> None: + """Test None score without reason gets appropriate default message.""" + mock_metric = mocker.MagicMock() + mock_metric.score = None + # Simulate metric without reason + del mock_metric.reason + + mock_test_case = mocker.MagicMock() + + score, reason = deepeval_metrics._evaluate_metric(mock_metric, mock_test_case) + + assert score == 0.0 + assert reason == "No score returned" + + def test_initialization_with_cache( + self, + mock_llm_manager: Any, + mock_metric_manager: Any, + mocker: MockerFixture, + ) -> None: + """Test initialization when cache is enabled.""" + mock_config = mocker.MagicMock() + mock_config.cache_enabled = True + mock_config.cache_dir = "/tmp/test_cache" + mock_llm_manager.get_config.return_value = mock_config + + mock_litellm = mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.litellm" + ) + mock_litellm.cache = None + mock_cache_class = mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.Cache" + ) + mocker.patch("lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager") + mocker.patch("lightspeed_evaluation.core.metrics.deepeval.GEvalHandler") + + DeepEvalMetrics( + llm_manager=mock_llm_manager, metric_manager=mock_metric_manager + ) + + mock_cache_class.assert_called_once() + + def test_evaluate_conversation_completeness( + self, + deepeval_metrics: DeepEvalMetrics, + mock_conv_data: Any, + mocker: MockerFixture, + ) -> None: + """Test conversation completeness evaluation.""" + mock_metric_class = mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.ConversationCompletenessMetric" + ) + mocker.patch.object(deepeval_metrics, "_build_conversational_test_case") + mocker.patch.object( + deepeval_metrics, "_evaluate_metric", return_value=(0.85, "Complete") + ) + + score, reason = deepeval_metrics._evaluate_conversation_completeness( + conv_data=mock_conv_data, + _turn_idx=None, + _turn_data=None, + is_conversation=True, + ) + + assert score == 0.85 + mock_metric_class.assert_called_once() + + def test_evaluate_conversation_relevancy( + self, + deepeval_metrics: DeepEvalMetrics, + mock_conv_data: Any, + mocker: MockerFixture, + ) -> None: + """Test conversation relevancy evaluation.""" + mock_metric_class = mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.TurnRelevancyMetric" + ) + mocker.patch.object(deepeval_metrics, "_build_conversational_test_case") + mocker.patch.object( + deepeval_metrics, "_evaluate_metric", return_value=(0.90, "Relevant") + ) + + score, reason = deepeval_metrics._evaluate_conversation_relevancy( + conv_data=mock_conv_data, + _turn_idx=None, + _turn_data=None, + is_conversation=True, + ) + + assert score == 0.90 + mock_metric_class.assert_called_once() + + def test_evaluate_knowledge_retention( + self, + deepeval_metrics: DeepEvalMetrics, + mock_conv_data: Any, + mocker: MockerFixture, + ) -> None: + """Test knowledge retention evaluation.""" + mock_metric_class = mocker.patch( + "lightspeed_evaluation.core.metrics.deepeval.KnowledgeRetentionMetric" + ) + mocker.patch.object(deepeval_metrics, "_build_conversational_test_case") + mocker.patch.object( + deepeval_metrics, "_evaluate_metric", return_value=(0.75, "Retained") + ) + + score, reason = deepeval_metrics._evaluate_knowledge_retention( + conv_data=mock_conv_data, + _turn_idx=None, + _turn_data=None, + is_conversation=True, + ) + + assert score == 0.75 + mock_metric_class.assert_called_once() + + def test_evaluate_standard_metric_exception_handling( + self, + deepeval_metrics: DeepEvalMetrics, + mock_conv_data: Any, + mocker: MockerFixture, + ) -> None: + """Test evaluate handles exceptions from standard metrics.""" + # Set up the exception to be raised + deepeval_metrics.supported_metrics["conversation_completeness"] = ( + mocker.MagicMock(side_effect=ValueError("Test error")) + ) + + scope = EvaluationScope(turn_idx=None, turn_data=None, is_conversation=True) + score, reason = deepeval_metrics.evaluate( + "conversation_completeness", mock_conv_data, scope + ) + + assert score is None + assert "evaluation failed" in reason + assert "Test error" in reason + + def test_evaluate_routes_to_geval( + self, + deepeval_metrics: DeepEvalMetrics, + mock_conv_data: Any, + mocker: MockerFixture, + ) -> None: + """Test evaluate routes to GEval handler for custom metrics.""" + mock_geval = mocker.MagicMock() + mock_geval.evaluate.return_value = (0.92, "Custom") + deepeval_metrics.geval_handler = mock_geval + + scope = EvaluationScope(turn_idx=None, turn_data=None, is_conversation=True) + deepeval_metrics.evaluate("geval:custom_metric", mock_conv_data, scope) + + mock_geval.evaluate.assert_called_once_with( + metric_name="custom_metric", + conv_data=mock_conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 0ab50d40..21395fe1 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -21,6 +21,8 @@ def mock_llm_manager(self, mocker: MockerFixture) -> Any: mock_manager = mocker.MagicMock() mock_llm = mocker.MagicMock() mock_manager.get_llm.return_value = mock_llm + # Mock llm_params to return valid num_retries (needed for retry logic) + mock_manager.llm_params = {"num_retries": 3} return mock_manager @pytest.fixture