From c3379ec86d9b8d63206b1fbe9f3806b6810124ed Mon Sep 17 00:00:00 2001 From: Scott Florentino Date: Thu, 26 Mar 2026 22:15:00 -0700 Subject: [PATCH] refactor: extract LLM call logging into shared evaluator_utils helper Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/uipath/pyproject.toml | 2 +- .../uipath/eval/evaluators/evaluator_utils.py | 92 ++++++++++++ .../legacy_llm_as_judge_evaluator.py | 5 +- .../evaluators/legacy_trajectory_evaluator.py | 5 +- .../eval/evaluators/llm_as_judge_evaluator.py | 82 +---------- .../tests/evaluators/test_evaluator_utils.py | 134 ++++++++++++++++++ packages/uipath/uv.lock | 4 +- 7 files changed, 239 insertions(+), 85 deletions(-) create mode 100644 packages/uipath/src/uipath/eval/evaluators/evaluator_utils.py create mode 100644 packages/uipath/tests/evaluators/test_evaluator_utils.py diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 880d6395e..09c14069c 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.10.34" +version = "2.10.35" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/src/uipath/eval/evaluators/evaluator_utils.py b/packages/uipath/src/uipath/eval/evaluators/evaluator_utils.py new file mode 100644 index 000000000..f41808487 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/evaluator_utils.py @@ -0,0 +1,92 @@ +"""Internal utilities shared across evaluators.""" + +import copy +import json +import logging +from collections.abc import Callable +from typing import Any + +from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory + +logger = logging.getLogger(__name__) + + +async def _call_llm_with_logging( + llm_service: Callable[..., Any], + request_data: dict[str, Any], + model: str, +) -> Any: + """Call the LLM service with detailed request/response logging and error handling. 
+ + Args: + llm_service: The LLM chat completions callable + request_data: The request payload to send + model: The model name (for logging) + + Returns: + The raw LLM response + + Raises: + UiPathEvaluationError: If the LLM call fails + """ + # Log the request details + logger.info( + f"🤖 Calling LLM evaluator with model: {model} (using function calling)" + ) + logger.debug(f"Request data: model={model}, tool_choice=required") + + # Log full request body for debugging + request_body_for_log = copy.deepcopy(request_data) + if "tool_choice" in request_body_for_log: + request_body_for_log["tool_choice"] = request_body_for_log[ + "tool_choice" + ].model_dump() + if "tools" in request_body_for_log: + request_body_for_log["tools"] = [ + t.model_dump() for t in request_body_for_log["tools"] + ] + logger.info(f"📤 Full request body:\n{json.dumps(request_body_for_log, indent=2)}") + + try: + response = await llm_service(**request_data) + except Exception as e: + logger.error("=" * 80) + logger.error("❌ LLM REQUEST FAILED") + logger.error("=" * 80) + logger.error(f"Model: {model}") + logger.error("API Endpoint: Normalized API (/llm/api/chat/completions)") + logger.error(f"Error Type: {type(e).__name__}") + logger.error(f"Error Message: {str(e)}") + + if hasattr(e, "response"): + logger.error( + f"HTTP Status Code: {e.response.status_code if hasattr(e.response, 'status_code') else 'N/A'}" + ) + try: + error_body = ( + e.response.json() + if hasattr(e.response, "json") + else str(e.response.content) + ) + logger.error( + f"Response Body: {json.dumps(error_body, indent=2) if isinstance(error_body, dict) else error_body}" + ) + except Exception: + logger.error( + f"Response Body: {str(e.response.content) if hasattr(e.response, 'content') else 'N/A'}" + ) + + logger.error(f"Request Details: model={model}, tool_choice=required") + logger.error("=" * 80) + + raise UiPathEvaluationError( + code="FAILED_TO_GET_LLM_RESPONSE", + title="Failed to get LLM response", + detail=f"Model: {model}, Error: {type(e).__name__}: {str(e)}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e + + logger.info(f"✅ LLM response received successfully from {model}") + logger.debug(f"Response: {response}") + + return response diff --git a/packages/uipath/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py index 5b9a8ee1c..ff1fb60dd 100644 --- a/packages/uipath/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -26,6 +26,7 @@ LegacyEvaluationCriteria, LegacyEvaluatorConfig, ) +from .evaluator_utils import _call_llm_with_logging from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response logger = logging.getLogger(__name__) @@ -211,5 +212,7 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: } assert self.llm, "LLM should be initialized before calling this method." 
- response = await self.llm.chat_completions(**request_data) + response = await _call_llm_with_logging( + self.llm.chat_completions, request_data, model + ) return extract_tool_call_response(response, model) diff --git a/packages/uipath/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py index 24d56560b..ca7c7fddd 100644 --- a/packages/uipath/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py @@ -27,6 +27,7 @@ LegacyEvaluationCriteria, LegacyEvaluatorConfig, ) +from .evaluator_utils import _call_llm_with_logging from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response logger = logging.getLogger(__name__) @@ -181,5 +182,7 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: "tool_choice": tool_choice, } - response = await self.llm.chat_completions(**request_data) + response = await _call_llm_with_logging( + self.llm.chat_completions, request_data, model + ) return extract_tool_call_response(response, model) diff --git a/packages/uipath/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/llm_as_judge_evaluator.py index c67212548..c4d202796 100644 --- a/packages/uipath/src/uipath/eval/evaluators/llm_as_judge_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/llm_as_judge_evaluator.py @@ -1,7 +1,5 @@ """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" -import copy -import json import logging from abc import abstractmethod from collections.abc import Callable @@ -31,6 +29,7 @@ BaseEvaluatorConfig, BaseEvaluatorJustification, ) +from .evaluator_utils import _call_llm_with_logging T = TypeVar("T", bound=BaseEvaluationCriteria) @@ -278,84 +277,7 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: category=UiPathEvaluationErrorCategory.SYSTEM, ) - # Log the request details (exclude non-JSON-serializable objects) - logger.info( - f"🤖 Calling LLM evaluator with model: {model} (using function calling)" - ) - max_tokens_str = ( - str(max_tokens_value) if max_tokens_value is not None else "unset" - ) - logger.debug( - f"Request data: model={model}, max_tokens={max_tokens_str}, temperature={self.evaluator_config.temperature}, tool_choice=required" - ) - - # Log full request body for debugging - request_body_for_log = copy.deepcopy(request_data) - # Convert tool_choice to dict for logging - if "tool_choice" in request_body_for_log: - request_body_for_log["tool_choice"] = request_body_for_log[ - "tool_choice" - ].model_dump() - # Convert tools to dict for logging - if "tools" in request_body_for_log: - request_body_for_log["tools"] = [ - t.model_dump() for t in request_body_for_log["tools"] - ] - logger.info( - f"📤 Full request body:\n{json.dumps(request_body_for_log, indent=2)}" - ) - - try: - response = await self.llm_service(**request_data) - except Exception as e: - # Enhanced error logging with details - logger.error("=" * 80) - logger.error("❌ LLM REQUEST FAILED") - logger.error("=" * 80) - logger.error(f"Model: {model}") - logger.error("API Endpoint: Normalized API (/llm/api/chat/completions)") - logger.error(f"Error Type: {type(e).__name__}") - logger.error(f"Error Message: {str(e)}") - - # Try to extract HTTP error details if available - if hasattr(e, "response"): - logger.error( - f"HTTP Status Code: {e.response.status_code if hasattr(e.response, 'status_code') else 
'N/A'}" - ) - try: - error_body = ( - e.response.json() - if hasattr(e.response, "json") - else str(e.response.content) - ) - logger.error( - f"Response Body: {json.dumps(error_body, indent=2) if isinstance(error_body, dict) else error_body}" - ) - except Exception: - logger.error( - f"Response Body: {str(e.response.content) if hasattr(e.response, 'content') else 'N/A'}" - ) - - max_tokens_str = ( - str(self.evaluator_config.max_tokens) - if self.evaluator_config.max_tokens is not None - else "unset" - ) - logger.error( - f"Request Details: model={model}, max_tokens={max_tokens_str}, temperature={self.evaluator_config.temperature}, tool_choice=required" - ) - logger.error("=" * 80) - - raise UiPathEvaluationError( - code="FAILED_TO_GET_LLM_RESPONSE", - title="Failed to get LLM response", - detail=f"Model: {model}, Error: {type(e).__name__}: {str(e)}", - category=UiPathEvaluationErrorCategory.SYSTEM, - ) from e - - # Log successful response - logger.info(f"✅ LLM response received successfully from {model}") - logger.debug(f"Response: {response}") + response = await _call_llm_with_logging(self.llm_service, request_data, model) # Extract structured output from tool call return self._extract_tool_call_response(response, model) diff --git a/packages/uipath/tests/evaluators/test_evaluator_utils.py b/packages/uipath/tests/evaluators/test_evaluator_utils.py new file mode 100644 index 000000000..e8fd8fd5c --- /dev/null +++ b/packages/uipath/tests/evaluators/test_evaluator_utils.py @@ -0,0 +1,134 @@ +"""Tests for evaluator_utils._call_llm_with_logging helper.""" + +import logging +from typing import Any +from unittest.mock import MagicMock + +import pytest + +from uipath.eval.evaluators.evaluator_utils import _call_llm_with_logging +from uipath.eval.models.models import UiPathEvaluationError + +LOGGER_NAME = "uipath.eval.evaluators.evaluator_utils" + + +def _make_request_data() -> dict[str, Any]: + """Create minimal request_data for tests.""" + return { + "model": "gpt-4o", + "messages": [{"role": "user", "content": "test"}], + "tools": [], + "tool_choice": MagicMock(model_dump=lambda: {"type": "required"}), + } + + +class TestCallLlmWithLogging: + """Tests for _call_llm_with_logging.""" + + @pytest.mark.asyncio + async def test_success_returns_response(self) -> None: + """Test that a successful LLM call returns the response unchanged.""" + expected_response = MagicMock() + + async def mock_llm_service(**kwargs: Any) -> Any: + return expected_response + + result = await _call_llm_with_logging( + mock_llm_service, _make_request_data(), "gpt-4o" + ) + assert result is expected_response + + @pytest.mark.asyncio + async def test_passes_request_data_to_llm_service(self) -> None: + """Test that request_data kwargs are forwarded to the LLM service.""" + captured_kwargs: dict[str, Any] = {} + + async def mock_llm_service(**kwargs: Any) -> Any: + captured_kwargs.update(kwargs) + return MagicMock() + + request_data = _make_request_data() + request_data["temperature"] = 0.5 + + await _call_llm_with_logging(mock_llm_service, request_data, "gpt-4o") + assert captured_kwargs["model"] == "gpt-4o" + assert captured_kwargs["temperature"] == 0.5 + + @pytest.mark.asyncio + async def test_plain_exception_wraps_in_evaluation_error(self) -> None: + """Test that a plain exception is wrapped in UiPathEvaluationError.""" + + async def mock_llm_service(**kwargs: Any) -> Any: + raise RuntimeError("connection refused") + + with pytest.raises(UiPathEvaluationError) as exc_info: + await _call_llm_with_logging( + 
mock_llm_service, _make_request_data(), "gpt-4o" + ) + + error = exc_info.value + assert error.error_info.code == "Python.FAILED_TO_GET_LLM_RESPONSE" + assert "gpt-4o" in error.error_info.detail + assert "RuntimeError" in error.error_info.detail + assert "connection refused" in error.error_info.detail + assert isinstance(error.__cause__, RuntimeError) + + @pytest.mark.asyncio + async def test_http_error_includes_status_in_logs(self) -> None: + """Test that an exception with .response logs HTTP status code and body.""" + mock_response = MagicMock() + mock_response.status_code = 429 + mock_response.json.return_value = {"error": "rate limited"} + + async def mock_llm_service(**kwargs: Any) -> Any: + exc = Exception("Too Many Requests") + exc.response = mock_response # type: ignore[attr-defined] + raise exc + + logger = logging.getLogger(LOGGER_NAME) + logged_messages: list[str] = [] + handler = logging.Handler() + handler.emit = lambda record: logged_messages.append(record.getMessage()) # type: ignore[assignment] + logger.addHandler(handler) + + try: + with pytest.raises(UiPathEvaluationError): + await _call_llm_with_logging( + mock_llm_service, _make_request_data(), "gpt-4o" + ) + finally: + logger.removeHandler(handler) + + all_logs = "\n".join(logged_messages) + assert "429" in all_logs + assert "rate limited" in all_logs + + @pytest.mark.asyncio + async def test_http_error_json_parse_failure_falls_back_to_content(self) -> None: + """Test fallback to .content when .json() raises.""" + mock_response = MagicMock() + mock_response.status_code = 500 + mock_response.json.side_effect = ValueError("not json") + mock_response.content = b"Internal Server Error" + + async def mock_llm_service(**kwargs: Any) -> Any: + exc = Exception("Server Error") + exc.response = mock_response # type: ignore[attr-defined] + raise exc + + logger = logging.getLogger(LOGGER_NAME) + logged_messages: list[str] = [] + handler = logging.Handler() + handler.emit = lambda record: logged_messages.append(record.getMessage()) # type: ignore[assignment] + logger.addHandler(handler) + + try: + with pytest.raises(UiPathEvaluationError): + await _call_llm_with_logging( + mock_llm_service, _make_request_data(), "gpt-4o" + ) + finally: + logger.removeHandler(handler) + + all_logs = "\n".join(logged_messages) + assert "Internal Server Error" in all_logs diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 845159eef..0856712ca 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11" [[package]] @@ -2543,7 +2543,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.34" +version = "2.10.35" source = { editable = "." } dependencies = [ { name = "applicationinsights" },