From 0834bc57a8e02594ccc7ac49410ff26a70ca50ba Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Mon, 25 May 2026 13:43:07 +0200 Subject: [PATCH] Align lint checks with LSC --- Makefile | 8 ++--- .../core/agent_goal_eval/evaluator.py | 9 +++--- .../core/agent_goal_eval/results.py | 4 +-- pyproject.toml | 17 +++++++++-- script/compare_evaluations.py | 20 ++++++------- script/run_multi_provider_eval.py | 24 ++++++++------- src/generate_answers/generate_answers.py | 3 +- src/lightspeed_evaluation/core/api/client.py | 4 +-- .../core/llm/__init__.py | 5 ++-- src/lightspeed_evaluation/core/llm/custom.py | 4 +-- .../core/llm/litellm_patch.py | 2 +- .../core/metrics/custom/custom.py | 5 ++-- .../core/metrics/custom/tool_eval.py | 3 +- .../core/metrics/geval.py | 1 - .../core/metrics/script.py | 4 +-- .../core/models/__init__.py | 30 +++++++++---------- .../core/models/agents.py | 4 +-- src/lightspeed_evaluation/core/models/data.py | 16 +++++----- .../core/models/quality.py | 3 +- .../core/models/statistics.py | 1 + .../core/models/summary.py | 12 ++++---- .../core/output/data_persistence.py | 4 +-- .../core/output/generator.py | 22 +++++++------- .../core/output/statistics.py | 15 +++++----- .../core/output/visualization.py | 2 +- .../core/script/manager.py | 5 ++-- .../core/storage/__init__.py | 8 ++--- .../core/storage/config.py | 4 +-- .../core/storage/protocol.py | 4 +-- .../core/storage/sql_storage.py | 6 ++-- .../core/system/ssl_certifi.py | 5 ++-- .../core/system/validator.py | 6 ++-- .../pipeline/evaluation/evaluator.py | 10 +++---- .../pipeline/evaluation/judges.py | 3 +- .../pipeline/evaluation/pipeline.py | 9 ++++-- .../runner/evaluation.py | 4 +-- tests/script/test_compare_evaluations.py | 6 ++-- tests/script/test_run_multi_provider_eval.py | 11 +++---- tests/unit/core/api/conftest.py | 2 +- tests/unit/core/api/test_client.py | 11 +++---- tests/unit/core/api/test_client_infer.py | 4 +-- tests/unit/core/api/test_streaming_parser.py | 4 +-- tests/unit/core/config/test_models.py | 1 + tests/unit/core/llm/conftest.py | 15 ++++------ tests/unit/core/llm/test_custom.py | 3 +- tests/unit/core/llm/test_llm_manager.py | 6 ++-- tests/unit/core/llm/test_token_tracker.py | 5 ++-- tests/unit/core/metrics/conftest.py | 2 +- tests/unit/core/metrics/custom/test_custom.py | 1 + .../core/metrics/custom/test_tool_eval.py | 6 ++-- tests/unit/core/metrics/test_geval.py | 2 +- tests/unit/core/models/test_api_additional.py | 4 +-- tests/unit/core/models/test_quality.py | 1 - tests/unit/core/models/test_summary.py | 11 ++++--- tests/unit/core/models/test_system.py | 10 +++---- tests/unit/core/output/conftest.py | 1 + tests/unit/core/output/test_final_coverage.py | 5 ++-- tests/unit/core/output/test_generator.py | 4 +-- tests/unit/core/output/test_statistics.py | 4 +-- tests/unit/core/output/test_statistics_api.py | 2 +- tests/unit/core/script/test_manager.py | 2 +- .../core/script/test_manager_additional.py | 8 ++--- .../storage/test_composite_and_factory.py | 2 +- tests/unit/core/storage/test_protocol.py | 6 ++-- tests/unit/core/storage/test_sql_storage.py | 2 +- tests/unit/core/system/test_loader.py | 4 +-- tests/unit/core/system/test_setup.py | 2 +- tests/unit/core/system/test_ssl_certifi.py | 2 +- tests/unit/core/system/test_validator.py | 3 +- tests/unit/pipeline/evaluation/conftest.py | 9 +++--- .../pipeline/evaluation/test_evaluator.py | 6 ++-- .../pipeline/evaluation/test_processor.py | 11 +------ tests/unit/runner/test_evaluation.py | 14 ++++----- uv.lock | 11 +++++++ 74 files changed, 250 insertions(+), 239 deletions(-) diff --git a/Makefile b/Makefile index 70b93793..cd996ead 100644 --- a/Makefile +++ b/Makefile @@ -23,10 +23,10 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed uv sync --group dev check-types: ## Checks type hints in sources - uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests + uv run mypy src/ lsc_agent_eval/src/ tests black-check: - uv run black src tests script lsc_agent_eval --check + uv run black --check src tests script lsc_agent_eval black-format: uv run black src tests script lsc_agent_eval @@ -115,7 +115,7 @@ shellcheck: ## Run shellcheck pylint: uv run pylint src - uv run pylint --disable=R0801 lsc_agent_eval/src tests + uv run pylint lsc_agent_eval/src tests pyright: uv run pyright src lsc_agent_eval/src tests @@ -127,4 +127,4 @@ ruff: uv run ruff check src tests script lsc_agent_eval bandit: ## Security scanning with Bandit - uv run bandit -r src/lightspeed_evaluation -ll + uv run bandit -c pyproject.toml -r src/lightspeed_evaluation -ll diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py index def4b1d5..42d77c7c 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py @@ -6,7 +6,7 @@ from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT, INTENT_DETECTION_PROMPT from .tool_call_eval import compare_tool_calls -from .utils import create_evaluation_results +from .utils import EvalResultItem, create_evaluation_results if TYPE_CHECKING: from ..utils.api_client import AgentHttpClient @@ -42,12 +42,13 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar """Run multiple evaluations based on configuration.""" try: # Query the agent once - api_input = { + api_input: dict[str, str] = { "query": data_config.eval_query, "provider": agent_provider, "model": agent_model, - "conversation_id": conversation_id, } + if conversation_id is not None: + api_input["conversation_id"] = conversation_id if endpoint_type == "streaming": agent_response = self.agent_client.streaming_query_agent(api_input) @@ -61,7 +62,7 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar tool_calls = agent_response.get("tool_calls", []) # Run all evaluations - evaluation_results = [] + evaluation_results: list[EvalResultItem] = [] for eval_type in data_config.eval_types: try: success = self._evaluate_single_type( diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py index 4b4b3e73..e4de0b1c 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py @@ -2,7 +2,7 @@ import json import logging -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path import pandas as pd @@ -32,7 +32,7 @@ def save_results(self, result_dir: str) -> None: output_dir = Path(result_dir) output_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv" json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json" diff --git a/pyproject.toml b/pyproject.toml index 42039f47..bde7e97e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ dev = [ "pytest-cov>=6.0.0,<=6.2.1", "pytest-mock==3.15.1", "pytest-timeout==2.4.0", + "types-PyYAML>=6.0.0", ] [project.scripts] @@ -84,9 +85,12 @@ line-length = 88 convention = "google" [tool.mypy] -disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"] -ignore_missing_imports = true plugins = ["pydantic.mypy"] +explicit_package_bases = true +disallow_untyped_calls = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +ignore_missing_imports = true [tool.pydantic-mypy] init_forbid_extra = true @@ -94,12 +98,19 @@ init_typed = true warn_required_dynamic_aliases = true [tool.pylint.MASTER] +source-roots = ["src", "script", "tests"] load-plugins = ["pylint_pydantic"] init-hook = "import sys; sys.path.append('.')" +[tool.pylint."MESSAGES CONTROL"] +disable = ["R0801"] + +[tool.pyright] +extraPaths = ["./src"] [tool.ruff] +line-length = 88 [tool.ruff.lint] -extend-select = ["TID251"] +extend-select = ["TID251", "UP006", "UP007", "UP010", "UP017", "UP035", "RUF100", "B009", "B010", "DTZ005", "D202", "I001", "PLR1733"] [tool.ruff.lint.flake8-tidy-imports.banned-api] unittest = { msg = "use pytest instead of unittest" } "unittest.mock" = { msg = "use pytest-mock instead of unittest.mock" } diff --git a/script/compare_evaluations.py b/script/compare_evaluations.py index be8e1699..fc626d91 100755 --- a/script/compare_evaluations.py +++ b/script/compare_evaluations.py @@ -11,7 +11,7 @@ import sys import traceback from pathlib import Path -from typing import Any, Optional, Tuple, Union, cast +from typing import Any, Optional, cast import numpy as np from scipy.stats import chi2_contingency, fisher_exact, mannwhitneyu, ttest_ind @@ -32,7 +32,7 @@ def __init__(self, alpha: float = 0.05): self.logger = logging.getLogger(__name__) def compare_evaluations( - self, summary1_path: Union[str, Path], summary2_path: Union[str, Path] + self, summary1_path: str | Path, summary2_path: str | Path ) -> dict[str, Any]: """Compare two evaluation summary files and return statistical significance results. @@ -92,7 +92,7 @@ def compare_evaluations( return comparison_results - def _load_summary(self, path: Union[str, Path]) -> dict[str, Any]: + def _load_summary(self, path: str | Path) -> dict[str, Any]: """Load evaluation summary from JSON file.""" path = Path(path) if not path.exists(): @@ -313,9 +313,9 @@ def _compare_single_metric( # Determine overall statistical significance comparison["statistical_significance"] = self._determine_overall_significance( - comparison["score_comparison"], - comparison["pass_rate_comparison"], - comparison["confidence_interval_test"], + cast(Optional[dict[str, Any]], comparison["score_comparison"]), + cast(Optional[dict[str, Any]], comparison["pass_rate_comparison"]), + cast(Optional[dict[str, Any]], comparison["confidence_interval_test"]), ) return comparison @@ -381,7 +381,7 @@ def _compare_score_distributions( # T-test (assumes normal distribution) if len(scores1_array) > 1 and len(scores2_array) > 1: ttest_result = ttest_ind(scores1_array, scores2_array) - t_stat, t_pvalue = cast(Tuple[float, float], ttest_result) + t_stat, t_pvalue = cast(tuple[float, float], ttest_result) comparison["tests"]["t_test"] = { "statistic": t_stat, "p_value": t_pvalue, @@ -394,7 +394,7 @@ def _compare_score_distributions( mw_result = mannwhitneyu( scores1_array, scores2_array, alternative="two-sided" ) - u_stat, u_pvalue = cast(Tuple[float, float], mw_result) + u_stat, u_pvalue = cast(tuple[float, float], mw_result) comparison["tests"]["mann_whitney_u"] = { "statistic": u_stat, "p_value": u_pvalue, @@ -567,7 +567,7 @@ def _perform_chi_square_test( try: chi2_result = chi2_contingency(contingency_table) chi2_stat, chi2_pvalue, dof, _ = cast( - Tuple[float, float, int, Any], chi2_result + tuple[float, float, int, Any], chi2_result ) comparison["tests"]["chi_square"] = { "statistic": float(chi2_stat), @@ -602,7 +602,7 @@ def _perform_fisher_exact_test( ) fisher_result = fisher_exact(contingency_table) - odds_ratio, fisher_pvalue = cast(Tuple[float, float], fisher_result) + odds_ratio, fisher_pvalue = cast(tuple[float, float], fisher_result) comparison["tests"]["fisher_exact"] = { "odds_ratio": float(odds_ratio), "p_value": float(fisher_pvalue), diff --git a/script/run_multi_provider_eval.py b/script/run_multi_provider_eval.py index 34a5d471..afdef545 100755 --- a/script/run_multi_provider_eval.py +++ b/script/run_multi_provider_eval.py @@ -13,21 +13,23 @@ import argparse import copy import json -import re import logging import multiprocessing import os +import re import sys import tempfile import traceback from concurrent.futures import ProcessPoolExecutor, as_completed -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path from typing import Any, Optional -from lightspeed_evaluation.runner.evaluation import run_evaluation + import numpy as np import yaml +from lightspeed_evaluation.runner.evaluation import run_evaluation + # Configure logging logging.basicConfig( level=logging.INFO, @@ -70,7 +72,7 @@ def _run_evaluation_worker( ) worker_logger = logging.getLogger(__name__) - start_time = datetime.now() + start_time = datetime.now(UTC) temp_config_path: Optional[Path] = None # Sanitize names for filesystem @@ -166,7 +168,7 @@ def _run_evaluation_worker( ) # Record end time and duration - end_time = datetime.now() + end_time = datetime.now(UTC) result["end_time"] = end_time.isoformat() result["duration_seconds"] = (end_time - start_time).total_seconds() @@ -436,7 +438,7 @@ def _run_single_evaluation( Returns: Dictionary containing evaluation results and metadata """ - start_time = datetime.now() + start_time = datetime.now(UTC) temp_config_path: Optional[Path] = None # Sanitize names for filesystem and enforce confinement under output_base @@ -508,7 +510,7 @@ def _run_single_evaluation( logger.warning(f"Failed to delete temp config: {temp_config_path}") # Record end time and duration - end_time = datetime.now() + end_time = datetime.now(UTC) result["end_time"] = end_time.isoformat() result["duration_seconds"] = (end_time - start_time).total_seconds() @@ -611,8 +613,8 @@ def _run_parallel_evaluations(self, configs: list[dict[str, Any]]) -> None: "provider_id": config["provider_id"], "model": config["model"], "output_dir": "", - "start_time": datetime.now().isoformat(), - "end_time": datetime.now().isoformat(), + "start_time": datetime.now(UTC).isoformat(), + "end_time": datetime.now(UTC).isoformat(), "duration_seconds": 0, "success": False, "error": f"Worker process failed: {str(e)}", @@ -634,7 +636,7 @@ def generate_summary(self) -> dict[str, Any]: failed = total - successful summary = { - "timestamp": datetime.now().isoformat(), + "timestamp": datetime.now(UTC).isoformat(), "total_evaluations": total, "successful": successful, "failed": failed, @@ -1156,7 +1158,7 @@ def save_model_comparison(self) -> Path: analysis_data = { "total_models": len(self.model_stats), "output_base": str(self.output_base), - "timestamp": datetime.now().isoformat(), + "timestamp": datetime.now(UTC).isoformat(), "rankings": [ { "rank": rank, diff --git a/src/generate_answers/generate_answers.py b/src/generate_answers/generate_answers.py index 5c7a0271..1f164d26 100644 --- a/src/generate_answers/generate_answers.py +++ b/src/generate_answers/generate_answers.py @@ -4,10 +4,11 @@ import logging import os import sys +from collections.abc import Callable from concurrent.futures import ThreadPoolExecutor from functools import partial from pathlib import Path -from typing import Callable, cast +from typing import cast import click import pandas as pd diff --git a/src/lightspeed_evaluation/core/api/client.py b/src/lightspeed_evaluation/core/api/client.py index e9e30a8e..9f28d045 100644 --- a/src/lightspeed_evaluation/core/api/client.py +++ b/src/lightspeed_evaluation/core/api/client.py @@ -4,7 +4,7 @@ import json import logging import os -from typing import Any, Optional, Union, cast +from typing import Any, Optional, cast import httpx from diskcache import Cache @@ -52,7 +52,7 @@ class APIClient: def __init__( self, - config: Union[APIConfig, HttpApiAgentConfig], + config: APIConfig | HttpApiAgentConfig, ): """Initialize the client with configuration.""" self.config = config diff --git a/src/lightspeed_evaluation/core/llm/__init__.py b/src/lightspeed_evaluation/core/llm/__init__.py index 578a62a7..59980cdf 100644 --- a/src/lightspeed_evaluation/core/llm/__init__.py +++ b/src/lightspeed_evaluation/core/llm/__init__.py @@ -3,17 +3,16 @@ from typing import TYPE_CHECKING # Apply litellm patching globally before any litellm usage in this package -import lightspeed_evaluation.core.llm.litellm_patch # noqa: F401 - +import lightspeed_evaluation.core.llm.litellm_patch from lightspeed_evaluation.core.system.lazy_import import create_lazy_getattr if TYPE_CHECKING: # ruff: noqa: F401 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM - from lightspeed_evaluation.core.llm.token_tracker import TokenTracker from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager from lightspeed_evaluation.core.llm.manager import LLMManager from lightspeed_evaluation.core.llm.ragas import RagasLLMManager + from lightspeed_evaluation.core.llm.token_tracker import TokenTracker from lightspeed_evaluation.core.models import LLMConfig from lightspeed_evaluation.core.system.env_validator import validate_provider_env from lightspeed_evaluation.core.system.exceptions import LLMError diff --git a/src/lightspeed_evaluation/core/llm/custom.py b/src/lightspeed_evaluation/core/llm/custom.py index 89309dfb..ae64e682 100644 --- a/src/lightspeed_evaluation/core/llm/custom.py +++ b/src/lightspeed_evaluation/core/llm/custom.py @@ -1,7 +1,7 @@ """Base Custom LLM class for evaluation framework.""" import logging -from typing import Any, Union +from typing import Any import litellm from litellm.exceptions import InternalServerError @@ -37,7 +37,7 @@ def call( n: int = 1, return_single: bool = True, **kwargs: Any, - ) -> Union[str, list[str]]: + ) -> str | list[str]: """Make LLM call and return response(s). Args: diff --git a/src/lightspeed_evaluation/core/llm/litellm_patch.py b/src/lightspeed_evaluation/core/llm/litellm_patch.py index 9eb13a70..690faf26 100644 --- a/src/lightspeed_evaluation/core/llm/litellm_patch.py +++ b/src/lightspeed_evaluation/core/llm/litellm_patch.py @@ -34,8 +34,8 @@ # pylint: disable=wrong-import-position from lightspeed_evaluation.core.llm.token_tracker import ( # noqa: E402 - track_judge_tokens, track_embedding_tokens, + track_judge_tokens, ) logger = logging.getLogger(__name__) diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py index a3cd69a4..b4be309a 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/custom.py +++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py @@ -5,17 +5,16 @@ from lightspeed_evaluation.core.llm.custom import BaseCustomLLM from lightspeed_evaluation.core.llm.manager import LLMManager +from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords from lightspeed_evaluation.core.metrics.custom.prompts import ( ANSWER_CORRECTNESS_PROMPT, INTENT_EVALUATION_PROMPT, ) -from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls +from lightspeed_evaluation.core.metrics.manager import MetricLevel from lightspeed_evaluation.core.models import EvaluationScope, TurnData from lightspeed_evaluation.core.system.exceptions import LLMError -from lightspeed_evaluation.core.metrics.manager import MetricLevel - if TYPE_CHECKING: from lightspeed_evaluation.core.metrics.manager import MetricManager diff --git a/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py b/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py index 306b912f..3f99ffce 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py +++ b/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py @@ -2,7 +2,8 @@ import logging import re -from typing import Any, Callable +from collections.abc import Callable +from typing import Any logger = logging.getLogger(__name__) diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py index 3b2b8d48..30db5995 100644 --- a/src/lightspeed_evaluation/core/metrics/geval.py +++ b/src/lightspeed_evaluation/core/metrics/geval.py @@ -20,7 +20,6 @@ from deepeval.metrics import GEval from deepeval.metrics.g_eval import Rubric from deepeval.test_case import LLMTestCase, LLMTestCaseParams - from pydantic import ValidationError from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager diff --git a/src/lightspeed_evaluation/core/metrics/script.py b/src/lightspeed_evaluation/core/metrics/script.py index 7bf9252e..b8aa03e6 100644 --- a/src/lightspeed_evaluation/core/metrics/script.py +++ b/src/lightspeed_evaluation/core/metrics/script.py @@ -2,7 +2,7 @@ import logging from pathlib import Path -from typing import Any, Optional, Union +from typing import Any, Optional from lightspeed_evaluation.core.models import EvaluationScope from lightspeed_evaluation.core.script import ( @@ -38,7 +38,7 @@ def evaluate( return None, f"Unsupported script metric: {metric_name}" def _evaluate_verify_script( - self, script_path: Optional[Union[str, Path]] + self, script_path: Optional[str | Path] ) -> tuple[Optional[float], str]: """Evaluate verify script.""" if not script_path: diff --git a/src/lightspeed_evaluation/core/models/__init__.py b/src/lightspeed_evaluation/core/models/__init__.py index 588478c8..0d665f2c 100644 --- a/src/lightspeed_evaluation/core/models/__init__.py +++ b/src/lightspeed_evaluation/core/models/__init__.py @@ -21,14 +21,6 @@ MetricResult, TurnData, ) -from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin -from lightspeed_evaluation.core.models.system import ( - APIConfig, - CoreConfig, - LoggingConfig, - SystemConfig, - VisualizationConfig, -) from lightspeed_evaluation.core.models.llm import ( EmbeddingConfig, GEvalConfig, @@ -37,17 +29,25 @@ LLMConfig, LLMPoolConfig, ) +from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin from lightspeed_evaluation.core.models.statistics import ( - NumericStats, - ScoreStatistics, - OverallStats, - MetricStats, - ConversationStats, - TagStats, - StreamingStats, AgentTokenUsage, ConfidenceInterval, + ConversationStats, DetailedStats, + MetricStats, + NumericStats, + OverallStats, + ScoreStatistics, + StreamingStats, + TagStats, +) +from lightspeed_evaluation.core.models.system import ( + APIConfig, + CoreConfig, + LoggingConfig, + SystemConfig, + VisualizationConfig, ) __all__ = [ diff --git a/src/lightspeed_evaluation/core/models/agents.py b/src/lightspeed_evaluation/core/models/agents.py index 7ad20129..d21e8206 100644 --- a/src/lightspeed_evaluation/core/models/agents.py +++ b/src/lightspeed_evaluation/core/models/agents.py @@ -1,7 +1,7 @@ """Agent configuration models for the evaluation framework.""" import os -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Optional from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -136,7 +136,7 @@ class HttpApiAgentConfig(HttpApiBaseFields): # Discriminated union of all agent config types; extend by adding new # config classes to support additional agent types. -AgentDefinition = Union[HttpApiAgentConfig] +AgentDefinition = HttpApiAgentConfig class AgentDefaultConfig(BaseModel): diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index aeabbdc0..da4a686d 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -2,7 +2,7 @@ import logging from pathlib import Path -from typing import Any, Optional, Union +from typing import Any, Optional from pydantic import BaseModel, ConfigDict, Field, field_validator @@ -62,7 +62,7 @@ class TurnData(StreamingMetricsMixin): default=None, description="Expected keywords for keyword evaluation (list of alternatives)", ) - expected_response: Optional[Union[str, list[str]]] = Field( + expected_response: Optional[str | list[str]] = Field( default=None, description="Expected response or list of responses for comparison", ) @@ -100,7 +100,7 @@ class TurnData(StreamingMetricsMixin): ) # Script execution support - verify_script: Optional[Union[str, Path]] = Field( + verify_script: Optional[str | Path] = Field( default=None, description="Path to verify script for script-based evaluation" ) @@ -126,8 +126,8 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]: @field_validator("expected_response") @classmethod def validate_expected_response( - cls, v: Optional[Union[str, list[str]]] - ) -> Optional[Union[str, list[str]]]: + cls, v: Optional[str | list[str]] + ) -> Optional[str | list[str]]: """Validate expected response when provided.""" if v is None: return None @@ -400,11 +400,11 @@ class EvaluationData(BaseModel): ) # Script execution support - setup_script: Optional[Union[str, Path]] = Field( + setup_script: Optional[str | Path] = Field( default=None, description="Path to setup script to run before conversation starts", ) - cleanup_script: Optional[Union[str, Path]] = Field( + cleanup_script: Optional[str | Path] = Field( default=None, description="Path to cleanup script to run after conversation ends", ) @@ -549,7 +549,7 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin): contexts: Optional[str] = Field( default=None, description="Contexts formatted as string" ) - expected_response: Optional[Union[str, list[str]]] = Field( + expected_response: Optional[str | list[str]] = Field( default=None, description="Expected response or list of responses for comparison", ) diff --git a/src/lightspeed_evaluation/core/models/quality.py b/src/lightspeed_evaluation/core/models/quality.py index 5ce1cdfc..657fbe4f 100644 --- a/src/lightspeed_evaluation/core/models/quality.py +++ b/src/lightspeed_evaluation/core/models/quality.py @@ -10,13 +10,12 @@ from pydantic import BaseModel, Field from lightspeed_evaluation.core.models.statistics import ( + AgentTokenStats, MetricStats, NumericStats, ScoreStatistics, - AgentTokenStats, ) - logger = logging.getLogger(__name__) diff --git a/src/lightspeed_evaluation/core/models/statistics.py b/src/lightspeed_evaluation/core/models/statistics.py index 7f144e9c..67317f17 100644 --- a/src/lightspeed_evaluation/core/models/statistics.py +++ b/src/lightspeed_evaluation/core/models/statistics.py @@ -1,6 +1,7 @@ """Pydantic models for evaluation statistics.""" from typing import Optional + from pydantic import BaseModel, Field diff --git a/src/lightspeed_evaluation/core/models/summary.py b/src/lightspeed_evaluation/core/models/summary.py index 30a70f64..d092b1ef 100644 --- a/src/lightspeed_evaluation/core/models/summary.py +++ b/src/lightspeed_evaluation/core/models/summary.py @@ -1,6 +1,6 @@ """Evaluation summary models for structured results.""" -from datetime import datetime +from datetime import UTC, datetime from typing import Optional from pydantic import BaseModel, Field @@ -11,21 +11,21 @@ ) from lightspeed_evaluation.core.models.statistics import ( AgentTokenUsage, - NumericStats, ConversationStats, MetricStats, + NumericStats, OverallStats, StreamingStats, TagStats, ) from lightspeed_evaluation.core.output.statistics import ( - compute_agent_token_usage, compute_agent_latency_stats, + compute_agent_token_usage, + compute_conversation_stats, + compute_metric_stats, compute_overall_stats, compute_streaming_stats, compute_tag_stats, - compute_metric_stats, - compute_conversation_stats, ) @@ -83,7 +83,7 @@ def from_results( Returns: A fully populated EvaluationSummary instance. """ - timestamp = datetime.now().isoformat() + timestamp = datetime.now(UTC).isoformat() # Compute overall stats overall = compute_overall_stats(results) diff --git a/src/lightspeed_evaluation/core/output/data_persistence.py b/src/lightspeed_evaluation/core/output/data_persistence.py index 1f79030a..9d9841cd 100644 --- a/src/lightspeed_evaluation/core/output/data_persistence.py +++ b/src/lightspeed_evaluation/core/output/data_persistence.py @@ -1,6 +1,6 @@ """Simple data persistence utilities for evaluation framework.""" -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path from typing import Optional @@ -27,7 +27,7 @@ def save_evaluation_data( output_path.mkdir(parents=True, exist_ok=True) # Create amended data file with timestamp in output directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") amended_data_path = ( output_path / f"{original_path.stem}_amended_{timestamp}{original_path.suffix}" diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py index 8b56884b..6c89a81f 100644 --- a/src/lightspeed_evaluation/core/output/generator.py +++ b/src/lightspeed_evaluation/core/output/generator.py @@ -3,7 +3,7 @@ import csv import json import logging -from datetime import datetime +from datetime import UTC, datetime from pathlib import Path from typing import Any, Optional @@ -17,9 +17,7 @@ SUPPORTED_OUTPUT_TYPES, ) from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult -from lightspeed_evaluation.core.models.summary import ( - EvaluationSummary, -) +from lightspeed_evaluation.core.models.quality import QualityReport from lightspeed_evaluation.core.models.statistics import ( AgentTokenStats, ConversationStats, @@ -29,9 +27,11 @@ StreamingStats, TagStats, ) -from lightspeed_evaluation.core.models.quality import QualityReport -from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config +from lightspeed_evaluation.core.models.summary import ( + EvaluationSummary, +) from lightspeed_evaluation.core.output.visualization import GraphGenerator +from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config logger = logging.getLogger(__name__) @@ -105,7 +105,7 @@ def generate_reports( ) # Prepare timestamped base filename - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") base_filename = f"{self.base_filename}_{timestamp}" # Get enabled outputs from system config @@ -154,7 +154,7 @@ def save( target_dir = Path(output_dir) if output_dir else self.output_dir target_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") base_filename = f"{self.base_filename}_{timestamp}" generated_files: list[Path] = [] @@ -373,7 +373,7 @@ def _generate_quality_score_report( quality_score_file = out / f"{base_filename}_quality_report.json" output = { - "timestamp": datetime.now().isoformat(), + "timestamp": datetime.now(UTC).isoformat(), "quality_score": quality_report.quality_score, "quality_metrics": { metric_id: { @@ -449,7 +449,9 @@ def _generate_text_summary_from_model( f.write("LSC Evaluation Framework - Summary Report\n") f.write("=" * 50 + "\n\n") - f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write( + f"Generated: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S %Z')}\n" + ) f.write(f"Total Evaluations: {len(summary.results)}\n\n") # Overall statistics diff --git a/src/lightspeed_evaluation/core/output/statistics.py b/src/lightspeed_evaluation/core/output/statistics.py index bf467491..ad4c35af 100644 --- a/src/lightspeed_evaluation/core/output/statistics.py +++ b/src/lightspeed_evaluation/core/output/statistics.py @@ -10,19 +10,18 @@ EvaluationData, EvaluationResult, ) - from lightspeed_evaluation.core.models.statistics import ( + AgentTokenStats, + AgentTokenUsage, + ConfidenceInterval, + ConversationStats, + DetailedStats, + MetricStats, NumericStats, + OverallStats, ScoreStatistics, StreamingStats, - AgentTokenUsage, - AgentTokenStats, - OverallStats, - MetricStats, - ConversationStats, TagStats, - ConfidenceInterval, - DetailedStats, ) diff --git a/src/lightspeed_evaluation/core/output/visualization.py b/src/lightspeed_evaluation/core/output/visualization.py index f3304894..5260e403 100644 --- a/src/lightspeed_evaluation/core/output/visualization.py +++ b/src/lightspeed_evaluation/core/output/visualization.py @@ -16,8 +16,8 @@ ) from lightspeed_evaluation.core.models import EvaluationResult from lightspeed_evaluation.core.output.statistics import ( - compute_overall_stats, compute_detailed_stats, + compute_overall_stats, ) CHART_COLORS = { diff --git a/src/lightspeed_evaluation/core/script/manager.py b/src/lightspeed_evaluation/core/script/manager.py index 3fe8bc14..00617006 100644 --- a/src/lightspeed_evaluation/core/script/manager.py +++ b/src/lightspeed_evaluation/core/script/manager.py @@ -4,7 +4,6 @@ import os import subprocess from pathlib import Path -from typing import Union from lightspeed_evaluation.core.system.exceptions import ScriptExecutionError @@ -25,7 +24,7 @@ def __init__(self, timeout: int = 300): """ self.timeout = timeout - def run_script(self, script_path: Union[str, Path]) -> bool: + def run_script(self, script_path: str | Path) -> bool: """Execute a script and return success status. Args: @@ -57,7 +56,7 @@ def run_script(self, script_path: Union[str, Path]) -> bool: f"Unexpected error running script {script_path}: {e}", str(script_path) ) from e - def _prepare_script_path(self, script_path: Union[str, Path]) -> Path: + def _prepare_script_path(self, script_path: str | Path) -> Path: """Prepare and resolve script path.""" if isinstance(script_path, str): script_path = Path(script_path) diff --git a/src/lightspeed_evaluation/core/storage/__init__.py b/src/lightspeed_evaluation/core/storage/__init__.py index f4ac128f..596a893d 100644 --- a/src/lightspeed_evaluation/core/storage/__init__.py +++ b/src/lightspeed_evaluation/core/storage/__init__.py @@ -25,15 +25,15 @@ backend.close() """ +from lightspeed_evaluation.core.storage.composite_storage import ( + CompositeStorageBackend, + NoOpStorageBackend, +) from lightspeed_evaluation.core.storage.config import ( DatabaseBackendConfig, FileBackendConfig, StorageBackendConfig, ) -from lightspeed_evaluation.core.storage.composite_storage import ( - CompositeStorageBackend, - NoOpStorageBackend, -) from lightspeed_evaluation.core.storage.factory import ( create_database_backend, create_pipeline_storage_backend, diff --git a/src/lightspeed_evaluation/core/storage/config.py b/src/lightspeed_evaluation/core/storage/config.py index e8b84e40..c005b98b 100644 --- a/src/lightspeed_evaluation/core/storage/config.py +++ b/src/lightspeed_evaluation/core/storage/config.py @@ -3,7 +3,7 @@ Defines Pydantic models for file and database storage configuration. """ -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, Literal, Optional from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -128,6 +128,6 @@ def validate_connection_fields(self) -> "DatabaseBackendConfig": # Discriminated union for polymorphic storage configuration StorageBackendConfig = Annotated[ - Union[FileBackendConfig, DatabaseBackendConfig], + FileBackendConfig | DatabaseBackendConfig, Field(discriminator="type"), ] diff --git a/src/lightspeed_evaluation/core/storage/protocol.py b/src/lightspeed_evaluation/core/storage/protocol.py index 093fa3b0..857125c1 100644 --- a/src/lightspeed_evaluation/core/storage/protocol.py +++ b/src/lightspeed_evaluation/core/storage/protocol.py @@ -6,7 +6,7 @@ """ from dataclasses import dataclass, field -from datetime import datetime, timezone +from datetime import UTC, datetime from typing import Optional, Protocol from uuid import uuid4 @@ -25,7 +25,7 @@ class RunInfo: run_id: str = field(default_factory=lambda: str(uuid4())) name: str = "" - started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + started_at: datetime = field(default_factory=lambda: datetime.now(UTC)) class BaseStorageBackend(Protocol): diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py index 1cb545a8..1f28b5fe 100644 --- a/src/lightspeed_evaluation/core/storage/sql_storage.py +++ b/src/lightspeed_evaluation/core/storage/sql_storage.py @@ -6,7 +6,7 @@ import json import logging -from datetime import datetime, timezone +from datetime import UTC, datetime from typing import Any, Optional from sqlalchemy import ( @@ -23,8 +23,8 @@ from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker from lightspeed_evaluation.core.models import EvaluationResult -from lightspeed_evaluation.core.system.exceptions import StorageError from lightspeed_evaluation.core.storage.protocol import BaseStorageBackend, RunInfo +from lightspeed_evaluation.core.system.exceptions import StorageError logger = logging.getLogger(__name__) @@ -307,7 +307,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB: return EvaluationResultDB( run_id=self._run_info.run_id, - timestamp=datetime.now(timezone.utc), + timestamp=datetime.now(UTC), conversation_group_id=result.conversation_group_id, tag=result.tag, turn_id=result.turn_id, diff --git a/src/lightspeed_evaluation/core/system/ssl_certifi.py b/src/lightspeed_evaluation/core/system/ssl_certifi.py index 82293532..1a7034a3 100644 --- a/src/lightspeed_evaluation/core/system/ssl_certifi.py +++ b/src/lightspeed_evaluation/core/system/ssl_certifi.py @@ -2,9 +2,10 @@ import atexit import os -from typing import Any -from pathlib import Path import tempfile +from pathlib import Path +from typing import Any + import certifi diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py index 4c797897..43a4ed6d 100644 --- a/src/lightspeed_evaluation/core/system/validator.py +++ b/src/lightspeed_evaluation/core/system/validator.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional import yaml from pydantic import ValidationError @@ -86,7 +86,7 @@ def format_pydantic_error(error: ValidationError) -> str: return "; ".join(errors) -def _is_field_empty(value: Optional[Union[str, list, dict]]) -> bool: +def _is_field_empty(value: Optional[str | list | dict]) -> bool: """Return True if value is considered empty for required-field validation.""" if value is None: return True @@ -489,7 +489,7 @@ def _validate_scripts(self, evaluation_data: list[EvaluationData]) -> None: def _validate_single_script( self, - script_file: Optional[Union[str, Path]], + script_file: Optional[str | Path], script_type: str, context: str, ) -> Optional[Path]: diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py index f091266b..5f60be17 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py @@ -5,9 +5,13 @@ import time from typing import Any, Optional +from lightspeed_evaluation.core.constants import ( + DEFAULT_METRIC_THRESHOLD, + NON_LLM_FRAMEWORKS, +) from lightspeed_evaluation.core.embedding.manager import EmbeddingManager -from lightspeed_evaluation.core.llm.token_tracker import TokenTracker from lightspeed_evaluation.core.llm.manager import LLMManager +from lightspeed_evaluation.core.llm.token_tracker import TokenTracker from lightspeed_evaluation.core.metrics.custom import CustomMetrics from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager @@ -30,10 +34,6 @@ METRIC_REQUIREMENTS, check_metric_required_data, ) -from lightspeed_evaluation.core.constants import ( - DEFAULT_METRIC_THRESHOLD, - NON_LLM_FRAMEWORKS, -) from lightspeed_evaluation.pipeline.evaluation.judges import JudgeOrchestrator logger = logging.getLogger(__name__) diff --git a/src/lightspeed_evaluation/pipeline/evaluation/judges.py b/src/lightspeed_evaluation/pipeline/evaluation/judges.py index 4147a865..01e118e9 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/judges.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/judges.py @@ -1,8 +1,9 @@ """Judge orchestration module - handles multi-judge evaluation and aggregation.""" import logging +from collections.abc import Callable from statistics import mean -from typing import Any, Callable, Optional +from typing import Any, Optional from lightspeed_evaluation.core.constants import DEFAULT_METRIC_THRESHOLD from lightspeed_evaluation.core.llm.manager import LLMManager diff --git a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py index 98f10d29..b9b8c42d 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py @@ -3,7 +3,8 @@ import asyncio import concurrent.futures import logging -from typing import Any, Optional +from collections.abc import Callable, Coroutine +from typing import Any, Optional, cast import litellm import tqdm @@ -272,8 +273,10 @@ def close(self) -> None: if cache is not None: try: # Use getattr to call untyped third-party method - disconnect = getattr(cache, "disconnect") + disconnect = cast( + Callable[[], Coroutine[Any, Any, object]], cache.disconnect + ) asyncio.run(disconnect()) - except (AttributeError, RuntimeError, OSError): + except (AttributeError, RuntimeError, OSError, TypeError): logger.debug("litellm cache disconnect raised; ignoring") litellm.cache = None diff --git a/src/lightspeed_evaluation/runner/evaluation.py b/src/lightspeed_evaluation/runner/evaluation.py index 972c6c98..4903487b 100644 --- a/src/lightspeed_evaluation/runner/evaluation.py +++ b/src/lightspeed_evaluation/runner/evaluation.py @@ -8,10 +8,10 @@ from typing import Optional from lightspeed_evaluation.core.models import ( + AgentTokenUsage, LLMPoolConfig, - SystemConfig, OverallStats, - AgentTokenUsage, + SystemConfig, ) # Import only lightweight modules at top level diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py index ae94474f..334ae4a9 100755 --- a/tests/script/test_compare_evaluations.py +++ b/tests/script/test_compare_evaluations.py @@ -4,12 +4,12 @@ """Pytest tests to verify the compare_evaluations.py script works correctly.""" import json -import tempfile import subprocess import sys +import tempfile from pathlib import Path - from typing import Any + import pytest from script.compare_evaluations import EvaluationComparison @@ -86,7 +86,6 @@ def test_basic_comparison( def test_invalid_arguments(script_path: Path) -> None: """Test error handling for invalid arguments.""" - # Test with only one file result = subprocess.run( [sys.executable, str(script_path), "file1.json"], @@ -116,7 +115,6 @@ def test_invalid_arguments(script_path: Path) -> None: def test_nonexistent_files(script_path: Path) -> None: """Test error handling for nonexistent files.""" - result = subprocess.run( [sys.executable, str(script_path), "nonexistent1.json", "nonexistent2.json"], capture_output=True, diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py index 0b271c62..6158e85c 100644 --- a/tests/script/test_run_multi_provider_eval.py +++ b/tests/script/test_run_multi_provider_eval.py @@ -4,16 +4,16 @@ """Pytest tests for run_multi_provider_eval.py script.""" import json -from pathlib import Path -from typing import Any -import tempfile as temp_module import logging import multiprocessing import shutil +import tempfile as temp_module +from pathlib import Path +from typing import Any import pytest -from pytest_mock import MockerFixture import yaml +from pytest_mock import MockerFixture from script.run_multi_provider_eval import MultiProviderEvaluationRunner @@ -161,7 +161,6 @@ def test_resource_warning_high_thread_count( caplog: pytest.LogCaptureFixture, ) -> None: """Test warning is logged when total threads is very high.""" - # Create system config with high max_threads system_config = { "core": {"max_threads": 100}, @@ -198,7 +197,6 @@ def test_no_resource_warning_reasonable_config( caplog: pytest.LogCaptureFixture, ) -> None: """Test no warning with reasonable thread count.""" - # Calculate safe thread count based on actual CPU count cpu_count = multiprocessing.cpu_count() # Use values that keep total threads <= cpu_count * 2 @@ -393,7 +391,6 @@ def test_temp_config_cleanup_on_yaml_dump_failure( mocker: MockerFixture, ) -> None: """Test that temp file is cleaned up when yaml.dump() fails.""" - # Track the temp file path that gets created created_temp_path = None original_named_temp_file = temp_module.NamedTemporaryFile diff --git a/tests/unit/core/api/conftest.py b/tests/unit/core/api/conftest.py index f6ed6901..7cdad8d7 100644 --- a/tests/unit/core/api/conftest.py +++ b/tests/unit/core/api/conftest.py @@ -3,8 +3,8 @@ from typing import Any import pytest - from pytest_mock import MockerFixture + from lightspeed_evaluation.core.models import APIConfig diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py index eb0bb3ad..ed688213 100644 --- a/tests/unit/core/api/test_client.py +++ b/tests/unit/core/api/test_client.py @@ -3,14 +3,15 @@ """Unit tests for core API client module.""" from pathlib import Path -import pytest + import httpx -from pytest_mock import MockerFixture +import pytest from pydantic import ValidationError +from pytest_mock import MockerFixture +from lightspeed_evaluation.core.api.client import APIClient, _is_retryable_server_error from lightspeed_evaluation.core.models import APIConfig, APIResponse from lightspeed_evaluation.core.system.exceptions import APIError -from lightspeed_evaluation.core.api.client import APIClient, _is_retryable_server_error class TestAPIClient: @@ -18,7 +19,6 @@ class TestAPIClient: def test_initialization_unsupported_endpoint_type(self) -> None: """Test initialization fails with unsupported endpoint type.""" - # Pydantic will validate the endpoint_type, so this should raise ValidationError with pytest.raises(ValidationError, match="Endpoint type must be one of"): APIConfig( @@ -150,7 +150,6 @@ def test_query_timeout_error( self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture ) -> None: """Test query handling timeout.""" - mock_client = mocker.Mock() mock_client.post.side_effect = httpx.TimeoutException("Timeout") mock_client.headers = {} @@ -231,7 +230,6 @@ def test_handle_response_errors_non_200( self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture ) -> None: """Test _handle_response_errors with non-200 status.""" - mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config_query_endpoint) @@ -488,7 +486,6 @@ def test_standard_endpoint_initialization( self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture ) -> None: """Test initialization with standard (non-streaming) endpoint.""" - mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config_query_endpoint) diff --git a/tests/unit/core/api/test_client_infer.py b/tests/unit/core/api/test_client_infer.py index e99a5537..8fd931bf 100644 --- a/tests/unit/core/api/test_client_infer.py +++ b/tests/unit/core/api/test_client_infer.py @@ -2,13 +2,13 @@ """Unit tests for APIClient /infer endpoint support.""" -import pytest import httpx +import pytest from pytest_mock import MockerFixture +from lightspeed_evaluation.core.api.client import APIClient from lightspeed_evaluation.core.models import APIConfig, APIResponse from lightspeed_evaluation.core.system.exceptions import APIError -from lightspeed_evaluation.core.api.client import APIClient class TestInferEndpoint: diff --git a/tests/unit/core/api/test_streaming_parser.py b/tests/unit/core/api/test_streaming_parser.py index b6a0d8e6..f90dc746 100644 --- a/tests/unit/core/api/test_streaming_parser.py +++ b/tests/unit/core/api/test_streaming_parser.py @@ -5,10 +5,10 @@ import pytest from lightspeed_evaluation.core.api.streaming_parser import ( - parse_streaming_response, + _format_tool_sequences, _parse_sse_line, _parse_tool_call, - _format_tool_sequences, + parse_streaming_response, ) diff --git a/tests/unit/core/config/test_models.py b/tests/unit/core/config/test_models.py index 3ea18d0f..7d13ae70 100644 --- a/tests/unit/core/config/test_models.py +++ b/tests/unit/core/config/test_models.py @@ -2,6 +2,7 @@ import pytest from pydantic import ValidationError + from lightspeed_evaluation.core.models import ( CoreConfig, EvaluationData, diff --git a/tests/unit/core/llm/conftest.py b/tests/unit/core/llm/conftest.py index 779ef21e..e67cd4b2 100644 --- a/tests/unit/core/llm/conftest.py +++ b/tests/unit/core/llm/conftest.py @@ -1,6 +1,7 @@ """Pytest configuration and fixtures for llm tests.""" -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import pytest from pytest_mock import MockerFixture @@ -57,10 +58,8 @@ def _create_response( mock_response.usage.prompt_tokens = prompt_tokens mock_response.usage.completion_tokens = completion_tokens - setattr( - mock_response, - "_hidden_params", - {"cache_hit": cache_hit} if cache_hit else {}, + mock_response.configure_mock( + _hidden_params={"cache_hit": cache_hit} if cache_hit else {} ) return mock_response @@ -97,10 +96,8 @@ def _create_response( prompt_tokens=prompt_tokens, spec=["prompt_tokens"] ) - setattr( - mock_response, - "_hidden_params", - {"cache_hit": cache_hit} if cache_hit else {}, + mock_response.configure_mock( + _hidden_params={"cache_hit": cache_hit} if cache_hit else {} ) return mock_response diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py index 697abd2e..27d1025c 100644 --- a/tests/unit/core/llm/test_custom.py +++ b/tests/unit/core/llm/test_custom.py @@ -2,7 +2,8 @@ """Unit tests for custom LLM classes.""" -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import pytest from pytest_mock import MockerFixture diff --git a/tests/unit/core/llm/test_llm_manager.py b/tests/unit/core/llm/test_llm_manager.py index 53852484..1f896328 100644 --- a/tests/unit/core/llm/test_llm_manager.py +++ b/tests/unit/core/llm/test_llm_manager.py @@ -5,18 +5,18 @@ import pytest from pytest_mock import MockerFixture +from lightspeed_evaluation.core.llm.manager import LLMManager from lightspeed_evaluation.core.models import ( + JudgePanelConfig, LLMConfig, - SystemConfig, LLMPoolConfig, - JudgePanelConfig, + SystemConfig, ) from lightspeed_evaluation.core.models.llm import ( LLMDefaultsConfig, LLMParametersConfig, LLMProviderConfig, ) -from lightspeed_evaluation.core.llm.manager import LLMManager class TestLLMManager: diff --git a/tests/unit/core/llm/test_token_tracker.py b/tests/unit/core/llm/test_token_tracker.py index 4478a414..6a1ed798 100644 --- a/tests/unit/core/llm/test_token_tracker.py +++ b/tests/unit/core/llm/test_token_tracker.py @@ -1,11 +1,12 @@ """Unit tests for TokenTracker and integration with litellm patch.""" import threading -from typing import Any, Callable +from collections.abc import Callable +from typing import Any +import litellm import pytest from pytest_mock import MockerFixture -import litellm # Simulate litellm completion call through patch from lightspeed_evaluation.core.llm import litellm_patch diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py index e7bc39e0..536f1309 100644 --- a/tests/unit/core/metrics/conftest.py +++ b/tests/unit/core/metrics/conftest.py @@ -8,7 +8,7 @@ from pytest_mock import MockerFixture from lightspeed_evaluation.core.metrics.nlp import NLPMetrics -from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig +from lightspeed_evaluation.core.models import EvaluationScope, SystemConfig, TurnData @pytest.fixture diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py index d6bccf52..51a002c7 100644 --- a/tests/unit/core/metrics/custom/test_custom.py +++ b/tests/unit/core/metrics/custom/test_custom.py @@ -1,6 +1,7 @@ """Tests for custom metrics module.""" from pytest_mock import MockerFixture + from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics from lightspeed_evaluation.core.metrics.manager import MetricLevel from lightspeed_evaluation.core.models import EvaluationScope, TurnData diff --git a/tests/unit/core/metrics/custom/test_tool_eval.py b/tests/unit/core/metrics/custom/test_tool_eval.py index 6ff890c2..f4d63f3b 100644 --- a/tests/unit/core/metrics/custom/test_tool_eval.py +++ b/tests/unit/core/metrics/custom/test_tool_eval.py @@ -3,12 +3,12 @@ from typing import Any from lightspeed_evaluation.core.metrics.custom.tool_eval import ( - evaluate_tool_calls, - compare_tool_calls, - _compare_tool_call_sequence, _compare_single_tool_call, _compare_tool_arguments, + _compare_tool_call_sequence, _compare_tool_result, + compare_tool_calls, + evaluate_tool_calls, ) diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 0ab50d40..29725c08 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -4,9 +4,9 @@ from typing import Any import pytest -from pytest_mock import MockerFixture from deepeval.metrics.g_eval import Rubric from deepeval.test_case import LLMTestCaseParams +from pytest_mock import MockerFixture from lightspeed_evaluation.core.metrics.geval import GEvalHandler from lightspeed_evaluation.core.metrics.manager import MetricLevel diff --git a/tests/unit/core/models/test_api_additional.py b/tests/unit/core/models/test_api_additional.py index cde8a1b5..90b64fdc 100644 --- a/tests/unit/core/models/test_api_additional.py +++ b/tests/unit/core/models/test_api_additional.py @@ -4,10 +4,10 @@ from pydantic import ValidationError from lightspeed_evaluation.core.models.api import ( - RAGChunk, - AttachmentData, APIRequest, APIResponse, + AttachmentData, + RAGChunk, ) diff --git a/tests/unit/core/models/test_quality.py b/tests/unit/core/models/test_quality.py index d87ee295..df01baf9 100644 --- a/tests/unit/core/models/test_quality.py +++ b/tests/unit/core/models/test_quality.py @@ -19,7 +19,6 @@ def test_quality_report_creation_happy_path( api_latency_summary: NumericStats, ) -> None: """Test QualityReport creation with valid metrics.""" - # Define quality score metrics (subset of all metrics) quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] diff --git a/tests/unit/core/models/test_summary.py b/tests/unit/core/models/test_summary.py index c84a6f01..ffa9cd3f 100644 --- a/tests/unit/core/models/test_summary.py +++ b/tests/unit/core/models/test_summary.py @@ -4,19 +4,18 @@ from pytest_mock import MockerFixture -from lightspeed_evaluation.core.models.data import ( - EvaluationData, - EvaluationResult, - TurnData, -) from lightspeed_evaluation.core.models import ( ConfidenceInterval, OverallStats, ScoreStatistics, ) +from lightspeed_evaluation.core.models.data import ( + EvaluationData, + EvaluationResult, + TurnData, +) from lightspeed_evaluation.core.models.summary import EvaluationSummary - _RESULT_DEFAULTS: dict[str, Any] = { "conversation_group_id": "conv1", "tag": "eval", diff --git a/tests/unit/core/models/test_system.py b/tests/unit/core/models/test_system.py index 6e5bf344..95cf09eb 100644 --- a/tests/unit/core/models/test_system.py +++ b/tests/unit/core/models/test_system.py @@ -10,23 +10,19 @@ from lightspeed_evaluation.core.models import ( APIConfig, + CoreConfig, EmbeddingConfig, JudgePanelConfig, LLMConfig, LLMPoolConfig, SystemConfig, VisualizationConfig, - CoreConfig, ) from lightspeed_evaluation.core.models.agents import ( AgentDefaultConfig, AgentsConfig, HttpApiAgentConfig, ) -from lightspeed_evaluation.core.models.system import ( - LoggingConfig, - QualityScoreConfig, -) from lightspeed_evaluation.core.models.llm import ( GEvalConfig, GEvalRubricConfig, @@ -34,6 +30,10 @@ LLMParametersConfig, LLMProviderConfig, ) +from lightspeed_evaluation.core.models.system import ( + LoggingConfig, + QualityScoreConfig, +) from lightspeed_evaluation.core.storage import FileBackendConfig from lightspeed_evaluation.core.system.exceptions import ConfigurationError diff --git a/tests/unit/core/output/conftest.py b/tests/unit/core/output/conftest.py index 46cba616..1732f1d0 100644 --- a/tests/unit/core/output/conftest.py +++ b/tests/unit/core/output/conftest.py @@ -2,6 +2,7 @@ import pytest from pytest_mock import MockerFixture + from lightspeed_evaluation.core.models import EvaluationResult from lightspeed_evaluation.core.storage import FileBackendConfig diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py index dd769612..98a69d3e 100644 --- a/tests/unit/core/output/test_final_coverage.py +++ b/tests/unit/core/output/test_final_coverage.py @@ -5,6 +5,7 @@ from pathlib import Path from pytest_mock import MockerFixture + from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationResult, @@ -14,11 +15,11 @@ from lightspeed_evaluation.core.models.summary import EvaluationSummary from lightspeed_evaluation.core.output.generator import OutputHandler from lightspeed_evaluation.core.output.statistics import ( - compute_overall_stats, compute_detailed_stats, + compute_overall_stats, ) -from lightspeed_evaluation.core.system.validator import DataValidator from lightspeed_evaluation.core.storage import FileBackendConfig +from lightspeed_evaluation.core.system.validator import DataValidator class TestStatisticsEdgeCases: diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index c8aa29a3..15ecde58 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -2,15 +2,15 @@ """Unit tests for output generator.""" +import csv as csv_module import json from pathlib import Path -import csv as csv_module from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationResult -from lightspeed_evaluation.core.models.summary import EvaluationSummary from lightspeed_evaluation.core.models.quality import QualityReport +from lightspeed_evaluation.core.models.summary import EvaluationSummary from lightspeed_evaluation.core.output.generator import OutputHandler from lightspeed_evaluation.core.storage import FileBackendConfig diff --git a/tests/unit/core/output/test_statistics.py b/tests/unit/core/output/test_statistics.py index 0d32eef7..8a50b792 100644 --- a/tests/unit/core/output/test_statistics.py +++ b/tests/unit/core/output/test_statistics.py @@ -1,16 +1,16 @@ """Unit tests for core statistics module.""" -import pytest import pandas as pd +import pytest from lightspeed_evaluation.core.models.data import ( EvaluationResult, ) from lightspeed_evaluation.core.models.statistics import OverallStats from lightspeed_evaluation.core.output.statistics import ( - compute_score_statistics, bootstrap_intervals, compute_overall_stats, + compute_score_statistics, ) diff --git a/tests/unit/core/output/test_statistics_api.py b/tests/unit/core/output/test_statistics_api.py index 86a82dfe..ed47f25b 100644 --- a/tests/unit/core/output/test_statistics_api.py +++ b/tests/unit/core/output/test_statistics_api.py @@ -4,8 +4,8 @@ from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult, TurnData from lightspeed_evaluation.core.output.statistics import ( - compute_field_numeric_stats_from_evaluation_data, compute_agent_token_usage, + compute_field_numeric_stats_from_evaluation_data, compute_overall_stats, ) diff --git a/tests/unit/core/script/test_manager.py b/tests/unit/core/script/test_manager.py index 33b72350..3e8e31ba 100644 --- a/tests/unit/core/script/test_manager.py +++ b/tests/unit/core/script/test_manager.py @@ -1,8 +1,8 @@ """Unit tests for core script manager module.""" +import os import tempfile from pathlib import Path -import os import pytest diff --git a/tests/unit/core/script/test_manager_additional.py b/tests/unit/core/script/test_manager_additional.py index 642e2e42..6088b891 100644 --- a/tests/unit/core/script/test_manager_additional.py +++ b/tests/unit/core/script/test_manager_additional.py @@ -1,8 +1,9 @@ """Additional tests for script manager to increase coverage.""" -from pathlib import Path -import subprocess import logging +import subprocess +from pathlib import Path + import pytest from pytest_mock import MockFixture @@ -98,7 +99,6 @@ def test_script_output_logging( self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture ) -> None: """Test that script output is logged.""" - caplog.set_level(logging.DEBUG) script = tmp_path / "test_script.sh" @@ -123,7 +123,6 @@ def test_script_stderr_logging_on_failure( self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture ) -> None: """Test that stderr is logged as error on failure.""" - caplog.set_level(logging.ERROR) script = tmp_path / "test_script.sh" @@ -147,7 +146,6 @@ def test_script_stderr_logging_on_success( self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture ) -> None: """Test that stderr is logged as debug on success.""" - caplog.set_level(logging.DEBUG) script = tmp_path / "test_script.sh" diff --git a/tests/unit/core/storage/test_composite_and_factory.py b/tests/unit/core/storage/test_composite_and_factory.py index 02dc959d..2bb80d3b 100644 --- a/tests/unit/core/storage/test_composite_and_factory.py +++ b/tests/unit/core/storage/test_composite_and_factory.py @@ -2,7 +2,7 @@ import pytest -from lightspeed_evaluation.core.models import LLMConfig, SystemConfig, EvaluationResult +from lightspeed_evaluation.core.models import EvaluationResult, LLMConfig, SystemConfig from lightspeed_evaluation.core.storage import ( BaseStorageBackend, CompositeStorageBackend, diff --git a/tests/unit/core/storage/test_protocol.py b/tests/unit/core/storage/test_protocol.py index cf6e59ea..f9f1919d 100644 --- a/tests/unit/core/storage/test_protocol.py +++ b/tests/unit/core/storage/test_protocol.py @@ -1,6 +1,6 @@ """Unit tests for storage protocol and RunInfo.""" -from datetime import datetime, timezone +from datetime import UTC, datetime from lightspeed_evaluation.core.storage import RunInfo @@ -22,9 +22,9 @@ def test_with_name(self) -> None: def test_sets_timestamp(self) -> None: """Test that RunInfo sets a timestamp.""" - before = datetime.now(timezone.utc) + before = datetime.now(UTC) run_info = RunInfo() - after = datetime.now(timezone.utc) + after = datetime.now(UTC) assert before <= run_info.started_at <= after diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py index d79b698a..3f24adfa 100644 --- a/tests/unit/core/storage/test_sql_storage.py +++ b/tests/unit/core/storage/test_sql_storage.py @@ -4,7 +4,7 @@ import os import tempfile -from typing import Generator +from collections.abc import Generator import pytest from pytest_mock import MockerFixture diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py index 86799bcd..a11152c5 100644 --- a/tests/unit/core/system/test_loader.py +++ b/tests/unit/core/system/test_loader.py @@ -6,10 +6,10 @@ import pytest from pytest_mock import MockerFixture -from lightspeed_evaluation.core.system.exceptions import ConfigurationError -from lightspeed_evaluation.core.system.loader import ConfigLoader from lightspeed_evaluation.core.models import SystemConfig from lightspeed_evaluation.core.storage import get_file_config +from lightspeed_evaluation.core.system.exceptions import ConfigurationError +from lightspeed_evaluation.core.system.loader import ConfigLoader class TestConfigLoader: diff --git a/tests/unit/core/system/test_setup.py b/tests/unit/core/system/test_setup.py index d0f1fe25..47cecdae 100644 --- a/tests/unit/core/system/test_setup.py +++ b/tests/unit/core/system/test_setup.py @@ -4,8 +4,8 @@ import os import pytest -from pytest_mock import MockerFixture from _pytest.capture import CaptureFixture +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import LoggingConfig from lightspeed_evaluation.core.system.setup import ( diff --git a/tests/unit/core/system/test_ssl_certifi.py b/tests/unit/core/system/test_ssl_certifi.py index d92bf73c..67ee1347 100644 --- a/tests/unit/core/system/test_ssl_certifi.py +++ b/tests/unit/core/system/test_ssl_certifi.py @@ -5,10 +5,10 @@ from pytest_mock import MockerFixture from lightspeed_evaluation.core.system.ssl_certifi import ( + _get_unique_ssl_cert_paths, create_ssl_certifi_bundle, get_ssl_cert_files_paths_from_system_yaml, get_system_ssl_cert_file, - _get_unique_ssl_cert_paths, ) diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index d98e3573..27afb075 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -6,9 +6,8 @@ from pathlib import Path import pytest -from pytest_mock import MockerFixture - from pydantic import ValidationError +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationData, SystemConfig, TurnData from lightspeed_evaluation.core.system.exceptions import DataValidationError diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py index 6c167d05..b4f192d6 100644 --- a/tests/unit/pipeline/evaluation/conftest.py +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -7,23 +7,24 @@ import pytest from pytest_mock import MockerFixture +from lightspeed_evaluation.core.metrics.manager import MetricManager from lightspeed_evaluation.core.models import ( EvaluationData, + EvaluationRequest, + EvaluationResult, SystemConfig, TurnData, ) from lightspeed_evaluation.core.models.agents import AgentsConfig +from lightspeed_evaluation.core.script import ScriptExecutionManager from lightspeed_evaluation.core.storage import FileBackendConfig from lightspeed_evaluation.core.system.loader import ConfigLoader -from lightspeed_evaluation.core.metrics.manager import MetricManager -from lightspeed_evaluation.core.script import ScriptExecutionManager -from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRequest from lightspeed_evaluation.pipeline.evaluation.driver import AgentDriver from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator from lightspeed_evaluation.pipeline.evaluation.processor import ( - ProcessorComponents, ConversationProcessor, + ProcessorComponents, ) diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 82b63555..4c07d0e0 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -8,6 +8,7 @@ from pytest_mock import MockerFixture from lightspeed_evaluation.core.llm.token_tracker import TokenTracker +from lightspeed_evaluation.core.metrics.manager import MetricManager from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationRequest, @@ -15,10 +16,9 @@ MetricResult, TurnData, ) -from lightspeed_evaluation.core.system.loader import ConfigLoader -from lightspeed_evaluation.core.system.exceptions import EvaluationError -from lightspeed_evaluation.core.metrics.manager import MetricManager from lightspeed_evaluation.core.script import ScriptExecutionManager +from lightspeed_evaluation.core.system.exceptions import EvaluationError +from lightspeed_evaluation.core.system.loader import ConfigLoader from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py index c4a18688..30946930 100644 --- a/tests/unit/pipeline/evaluation/test_processor.py +++ b/tests/unit/pipeline/evaluation/test_processor.py @@ -3,7 +3,7 @@ """Unit tests for ConversationProcessor.""" import logging -from typing import Callable +from collections.abc import Callable import pytest from _pytest.logging import LogCaptureFixture @@ -108,7 +108,6 @@ def test_process_conversation_conversation_metrics( mocker: MockerFixture, ) -> None: """Test processing with conversation-level metrics.""" - turn1 = TurnData(turn_id="turn1", query="Q", response="R") conv_data = EvaluationData( conversation_group_id="conv1", @@ -154,7 +153,6 @@ def test_process_conversation_with_setup_script_success( mocker: MockerFixture, ) -> None: """Test processing with successful setup script.""" - sample_conv_data.setup_script = "setup.sh" mock_agent_driver.enabled = True mock_agent_driver.execute_turn.return_value = (None, "conv_123") @@ -221,7 +219,6 @@ def test_process_conversation_with_cleanup_script( mocker: MockerFixture, ) -> None: """Test cleanup script is always called.""" - sample_conv_data.cleanup_script = "cleanup.sh" mock_agent_driver.enabled = True mock_agent_driver.execute_turn.return_value = (None, "conv_123") @@ -267,7 +264,6 @@ def test_process_conversation_with_agent_execution( mocker: MockerFixture, ) -> None: """Test agent execution during turn processing.""" - mock_agent_driver.enabled = True mock_agent_driver.execute_turn.return_value = (None, "conv_123") @@ -346,7 +342,6 @@ def test_evaluate_turn( mocker: MockerFixture, ) -> None: """Test _evaluate_turn method.""" - mock_result = EvaluationResult( conversation_group_id="conv1", turn_id="turn1", @@ -376,7 +371,6 @@ def test_evaluate_conversation( mocker: MockerFixture, ) -> None: """Test _evaluate_conversation method.""" - mock_result = EvaluationResult( conversation_group_id="conv1", turn_id=None, @@ -499,7 +493,6 @@ def test_evaluate_turn_with_invalid_metric( caplog: LogCaptureFixture, ) -> None: """Test _evaluate_turn with an invalid metric - creates ERROR result and logs error.""" - turn_data = TurnData( turn_id="1", query="What is Python?", @@ -541,7 +534,6 @@ def test_evaluate_turn_with_all_invalid_metrics( caplog: LogCaptureFixture, ) -> None: """Test _evaluate_turn with all metrics invalid - returns ERROR results.""" - turn_data = TurnData( turn_id="1", query="What is Python?", @@ -579,7 +571,6 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics( caplog: LogCaptureFixture, ) -> None: """Test _evaluate_turn with mix of valid and invalid metrics.""" - turn_data = TurnData( turn_id="1", query="What is Python?", diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py index baf756a8..31fa6821 100644 --- a/tests/unit/runner/test_evaluation.py +++ b/tests/unit/runner/test_evaluation.py @@ -9,17 +9,17 @@ import pytest from pytest_mock import MockerFixture -from lightspeed_evaluation.core.models.statistics import OverallStats -from lightspeed_evaluation.core.models.system import ( - APIConfig, - SystemConfig, -) from lightspeed_evaluation.core.models.llm import ( - LLMDefaultsConfig, - LLMProviderConfig, EmbeddingConfig, LLMConfig, + LLMDefaultsConfig, LLMPoolConfig, + LLMProviderConfig, +) +from lightspeed_evaluation.core.models.statistics import OverallStats +from lightspeed_evaluation.core.models.system import ( + APIConfig, + SystemConfig, ) from lightspeed_evaluation.core.system.exceptions import ( DataValidationError, diff --git a/uv.lock b/uv.lock index f0a56647..13974ac6 100644 --- a/uv.lock +++ b/uv.lock @@ -1628,6 +1628,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "types-pyyaml" }, ] [package.metadata] @@ -1672,6 +1673,7 @@ dev = [ { name = "pytest-mock", specifier = "==3.15.1" }, { name = "pytest-timeout", specifier = "==2.4.0" }, { name = "ruff", specifier = ">=0.9.0,<=0.12.11" }, + { name = "types-pyyaml", specifier = ">=6.0.0" }, ] [[package]] @@ -3913,6 +3915,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/91/9b286ab899c008c2cb05e8be99814807e7fbbd33f0c0c960470826e5ac82/typer-0.23.1-py3-none-any.whl", hash = "sha256:3291ad0d3c701cbf522012faccfbb29352ff16ad262db2139e6b01f15781f14e", size = 56813, upload-time = "2026-02-13T10:04:32.008Z" }, ] +[[package]] +name = "types-pyyaml" +version = "6.0.12.20260518" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/83/4a1afc3fbfcf5b8d46fc390cd95ed6b0dc9010a265f4e9f46314efffa37a/types_pyyaml-6.0.12.20260518.tar.gz", hash = "sha256:d917f83fb38462550338c1297faedd860b3ec83912b96b1e3d73255f7473e466", size = 17850, upload-time = "2026-05-18T06:01:58.675Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/a2/c01db32be2ae7d6a1689972f3c492b149ee4e164b12fdfd9f64b50888215/types_pyyaml-6.0.12.20260518-py3-none-any.whl", hash = "sha256:d2150f75a231c9fe9c7463bd29487d93e60bac90400287351384bc2284eba7cd", size = 20312, upload-time = "2026-05-18T06:01:57.368Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0"