From 0834bc57a8e02594ccc7ac49410ff26a70ca50ba Mon Sep 17 00:00:00 2001
From: Eva Micankova <emicanko@redhat.com>
Date: Mon, 25 May 2026 13:43:07 +0200
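Subject: [PATCH] Align lint checks with LSC

Move lint and type-check options from Makefile command lines into
pyproject.toml (mypy strictness flags, pylint R0801 disable, pyright
extraPaths, bandit -c pyproject.toml), add types-PyYAML to the dev
dependencies, and widen the ruff rule set (UP006, UP007, UP010, UP017,
UP035, RUF100, B009, B010, DTZ005, D202, I001, PLR1733). The remaining
changes are the fixes those checks require: sorted imports, builtin
generics and `X | Y` unions, collections.abc.Callable, and explicit
type annotations.

Behaviour changes to be aware of:
- datetime.now() becomes datetime.now(UTC), so timestamps written to
  reports and used in output file names switch from local time to UTC.
  datetime.UTC requires Python 3.11 or newer.
- The agent goal evaluator no longer puts "conversation_id" into the
  API input when it is None; previously the key was sent with a None
  value.
- mypy no longer sets disable_error_code (union-attr, return-value,
  arg-type, import-untyped), so those errors are now reported.
- The pylint R0801 (duplicate-code) disable moves from the second
  Makefile pylint invocation into pyproject.toml, so it now also
  applies to the `pylint src` run.
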
---
 Makefile                                      |  8 ++---
 .../core/agent_goal_eval/evaluator.py         |  9 +++---
 .../core/agent_goal_eval/results.py           |  4 +--
 pyproject.toml                                | 17 +++++++++--
 script/compare_evaluations.py                 | 20 ++++++-------
 script/run_multi_provider_eval.py             | 24 ++++++++-------
 src/generate_answers/generate_answers.py      |  3 +-
 src/lightspeed_evaluation/core/api/client.py  |  4 +--
 .../core/llm/__init__.py                      |  5 ++--
 src/lightspeed_evaluation/core/llm/custom.py  |  4 +--
 .../core/llm/litellm_patch.py                 |  2 +-
 .../core/metrics/custom/custom.py             |  5 ++--
 .../core/metrics/custom/tool_eval.py          |  3 +-
 .../core/metrics/geval.py                     |  1 -
 .../core/metrics/script.py                    |  4 +--
 .../core/models/__init__.py                   | 30 +++++++++----------
 .../core/models/agents.py                     |  4 +--
 src/lightspeed_evaluation/core/models/data.py | 16 +++++-----
 .../core/models/quality.py                    |  3 +-
 .../core/models/statistics.py                 |  1 +
 .../core/models/summary.py                    | 12 ++++----
 .../core/output/data_persistence.py           |  4 +--
 .../core/output/generator.py                  | 22 +++++++-------
 .../core/output/statistics.py                 | 15 +++++-----
 .../core/output/visualization.py              |  2 +-
 .../core/script/manager.py                    |  5 ++--
 .../core/storage/__init__.py                  |  8 ++---
 .../core/storage/config.py                    |  4 +--
 .../core/storage/protocol.py                  |  4 +--
 .../core/storage/sql_storage.py               |  6 ++--
 .../core/system/ssl_certifi.py                |  5 ++--
 .../core/system/validator.py                  |  6 ++--
 .../pipeline/evaluation/evaluator.py          | 10 +++----
 .../pipeline/evaluation/judges.py             |  3 +-
 .../pipeline/evaluation/pipeline.py           |  9 ++++--
 .../runner/evaluation.py                      |  4 +--
 tests/script/test_compare_evaluations.py      |  6 ++--
 tests/script/test_run_multi_provider_eval.py  | 11 +++----
 tests/unit/core/api/conftest.py               |  2 +-
 tests/unit/core/api/test_client.py            | 11 +++----
 tests/unit/core/api/test_client_infer.py      |  4 +--
 tests/unit/core/api/test_streaming_parser.py  |  4 +--
 tests/unit/core/config/test_models.py         |  1 +
 tests/unit/core/llm/conftest.py               | 15 ++++------
 tests/unit/core/llm/test_custom.py            |  3 +-
 tests/unit/core/llm/test_llm_manager.py       |  6 ++--
 tests/unit/core/llm/test_token_tracker.py     |  5 ++--
 tests/unit/core/metrics/conftest.py           |  2 +-
 tests/unit/core/metrics/custom/test_custom.py |  1 +
 .../core/metrics/custom/test_tool_eval.py     |  6 ++--
 tests/unit/core/metrics/test_geval.py         |  2 +-
 tests/unit/core/models/test_api_additional.py |  4 +--
 tests/unit/core/models/test_quality.py        |  1 -
 tests/unit/core/models/test_summary.py        | 11 ++++---
 tests/unit/core/models/test_system.py         | 10 +++----
 tests/unit/core/output/conftest.py            |  1 +
 tests/unit/core/output/test_final_coverage.py |  5 ++--
 tests/unit/core/output/test_generator.py      |  4 +--
 tests/unit/core/output/test_statistics.py     |  4 +--
 tests/unit/core/output/test_statistics_api.py |  2 +-
 tests/unit/core/script/test_manager.py        |  2 +-
 .../core/script/test_manager_additional.py    |  8 ++---
 .../storage/test_composite_and_factory.py     |  2 +-
 tests/unit/core/storage/test_protocol.py      |  6 ++--
 tests/unit/core/storage/test_sql_storage.py   |  2 +-
 tests/unit/core/system/test_loader.py         |  4 +--
 tests/unit/core/system/test_setup.py          |  2 +-
 tests/unit/core/system/test_ssl_certifi.py    |  2 +-
 tests/unit/core/system/test_validator.py      |  3 +-
 tests/unit/pipeline/evaluation/conftest.py    |  9 +++---
 .../pipeline/evaluation/test_evaluator.py     |  6 ++--
 .../pipeline/evaluation/test_processor.py     | 11 +------
 tests/unit/runner/test_evaluation.py          | 14 ++++-----
 uv.lock                                       | 11 +++++++
 74 files changed, 250 insertions(+), 239 deletions(-)

diff --git a/Makefile b/Makefile
index 70b93793..cd996ead 100644
--- a/Makefile
+++ b/Makefile
@@ -23,10 +23,10 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
 	uv sync --group dev
 
 check-types: ## Checks type hints in sources
-	uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
+	uv run mypy src/ lsc_agent_eval/src/ tests
 
 black-check:
-	uv run black src tests script lsc_agent_eval --check
+	uv run black --check src tests script lsc_agent_eval
 
 black-format:
 	uv run black src tests script lsc_agent_eval
@@ -115,7 +115,7 @@ shellcheck: ## Run shellcheck
 
 pylint:
 	uv run pylint src
-	uv run pylint --disable=R0801 lsc_agent_eval/src tests
+	uv run pylint lsc_agent_eval/src tests
 
 pyright:
 	uv run pyright src lsc_agent_eval/src tests
@@ -127,4 +127,4 @@ ruff:
 	uv run ruff check src tests script lsc_agent_eval
 
 bandit: ## Security scanning with Bandit
-	uv run bandit -r src/lightspeed_evaluation -ll
+	uv run bandit -c pyproject.toml -r src/lightspeed_evaluation -ll
diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py
index def4b1d5..42d77c7c 100644
--- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py
+++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py
@@ -6,7 +6,7 @@
 from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError
 from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT, INTENT_DETECTION_PROMPT
 from .tool_call_eval import compare_tool_calls
-from .utils import create_evaluation_results
+from .utils import EvalResultItem, create_evaluation_results
 
 if TYPE_CHECKING:
     from ..utils.api_client import AgentHttpClient
@@ -42,12 +42,13 @@ def run_evaluation(  # pylint: disable=too-many-arguments,too-many-positional-ar
         """Run multiple evaluations based on configuration."""
         try:
             # Query the agent once
-            api_input = {
+            api_input: dict[str, str] = {
                 "query": data_config.eval_query,
                 "provider": agent_provider,
                 "model": agent_model,
-                "conversation_id": conversation_id,
             }
+            if conversation_id is not None:
+                api_input["conversation_id"] = conversation_id
 
             if endpoint_type == "streaming":
                 agent_response = self.agent_client.streaming_query_agent(api_input)
@@ -61,7 +62,7 @@ def run_evaluation(  # pylint: disable=too-many-arguments,too-many-positional-ar
             tool_calls = agent_response.get("tool_calls", [])
 
             # Run all evaluations
-            evaluation_results = []
+            evaluation_results: list[EvalResultItem] = []
             for eval_type in data_config.eval_types:
                 try:
                     success = self._evaluate_single_type(
diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py
index 4b4b3e73..e4de0b1c 100644
--- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py
+++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py
@@ -2,7 +2,7 @@
 
 import json
 import logging
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 
 import pandas as pd
@@ -32,7 +32,7 @@ def save_results(self, result_dir: str) -> None:
             output_dir = Path(result_dir)
             output_dir.mkdir(parents=True, exist_ok=True)
 
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
             csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv"
             json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json"
 
diff --git a/pyproject.toml b/pyproject.toml
index 42039f47..bde7e97e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dev = [
     "pytest-cov>=6.0.0,<=6.2.1",
     "pytest-mock==3.15.1",
     "pytest-timeout==2.4.0",
+    "types-PyYAML>=6.0.0",
 ]
 
 [project.scripts]
@@ -84,9 +85,12 @@ line-length = 88
 convention = "google"
 
 [tool.mypy]
-disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"]
-ignore_missing_imports = true
 plugins = ["pydantic.mypy"]
+explicit_package_bases = true
+disallow_untyped_calls = true
+disallow_untyped_defs = true
+disallow_incomplete_defs = true
+ignore_missing_imports = true
 
 [tool.pydantic-mypy]
 init_forbid_extra = true
@@ -94,12 +98,19 @@ init_typed = true
 warn_required_dynamic_aliases = true
 
 [tool.pylint.MASTER]
+source-roots = ["src", "script", "tests"]
 load-plugins = ["pylint_pydantic"]
 init-hook = "import sys; sys.path.append('.')"
+[tool.pylint."MESSAGES CONTROL"]
+disable = ["R0801"]
+
+[tool.pyright]
+extraPaths = ["./src"]
 
 [tool.ruff]
+line-length = 88
 [tool.ruff.lint]
-extend-select = ["TID251"]
+extend-select = ["TID251", "UP006", "UP007", "UP010", "UP017", "UP035", "RUF100", "B009", "B010", "DTZ005", "D202", "I001", "PLR1733"]
 [tool.ruff.lint.flake8-tidy-imports.banned-api]
 unittest = { msg = "use pytest instead of unittest" }
 "unittest.mock" = { msg = "use pytest-mock instead of unittest.mock" }
diff --git a/script/compare_evaluations.py b/script/compare_evaluations.py
index be8e1699..fc626d91 100755
--- a/script/compare_evaluations.py
+++ b/script/compare_evaluations.py
@@ -11,7 +11,7 @@
 import sys
 import traceback
 from pathlib import Path
-from typing import Any, Optional, Tuple, Union, cast
+from typing import Any, Optional, cast
 
 import numpy as np
 from scipy.stats import chi2_contingency, fisher_exact, mannwhitneyu, ttest_ind
@@ -32,7 +32,7 @@ def __init__(self, alpha: float = 0.05):
         self.logger = logging.getLogger(__name__)
 
     def compare_evaluations(
-        self, summary1_path: Union[str, Path], summary2_path: Union[str, Path]
+        self, summary1_path: str | Path, summary2_path: str | Path
     ) -> dict[str, Any]:
         """Compare two evaluation summary files and return statistical significance results.
 
@@ -92,7 +92,7 @@ def compare_evaluations(
 
         return comparison_results
 
-    def _load_summary(self, path: Union[str, Path]) -> dict[str, Any]:
+    def _load_summary(self, path: str | Path) -> dict[str, Any]:
         """Load evaluation summary from JSON file."""
         path = Path(path)
         if not path.exists():
@@ -313,9 +313,9 @@ def _compare_single_metric(
 
         # Determine overall statistical significance
         comparison["statistical_significance"] = self._determine_overall_significance(
-            comparison["score_comparison"],
-            comparison["pass_rate_comparison"],
-            comparison["confidence_interval_test"],
+            cast(Optional[dict[str, Any]], comparison["score_comparison"]),
+            cast(Optional[dict[str, Any]], comparison["pass_rate_comparison"]),
+            cast(Optional[dict[str, Any]], comparison["confidence_interval_test"]),
         )
 
         return comparison
@@ -381,7 +381,7 @@ def _compare_score_distributions(
             # T-test (assumes normal distribution)
             if len(scores1_array) > 1 and len(scores2_array) > 1:
                 ttest_result = ttest_ind(scores1_array, scores2_array)
-                t_stat, t_pvalue = cast(Tuple[float, float], ttest_result)
+                t_stat, t_pvalue = cast(tuple[float, float], ttest_result)
                 comparison["tests"]["t_test"] = {
                     "statistic": t_stat,
                     "p_value": t_pvalue,
@@ -394,7 +394,7 @@ def _compare_score_distributions(
                 mw_result = mannwhitneyu(
                     scores1_array, scores2_array, alternative="two-sided"
                 )
-                u_stat, u_pvalue = cast(Tuple[float, float], mw_result)
+                u_stat, u_pvalue = cast(tuple[float, float], mw_result)
                 comparison["tests"]["mann_whitney_u"] = {
                     "statistic": u_stat,
                     "p_value": u_pvalue,
@@ -567,7 +567,7 @@ def _perform_chi_square_test(
         try:
             chi2_result = chi2_contingency(contingency_table)
             chi2_stat, chi2_pvalue, dof, _ = cast(
-                Tuple[float, float, int, Any], chi2_result
+                tuple[float, float, int, Any], chi2_result
             )
             comparison["tests"]["chi_square"] = {
                 "statistic": float(chi2_stat),
@@ -602,7 +602,7 @@ def _perform_fisher_exact_test(
             )
 
             fisher_result = fisher_exact(contingency_table)
-            odds_ratio, fisher_pvalue = cast(Tuple[float, float], fisher_result)
+            odds_ratio, fisher_pvalue = cast(tuple[float, float], fisher_result)
             comparison["tests"]["fisher_exact"] = {
                 "odds_ratio": float(odds_ratio),
                 "p_value": float(fisher_pvalue),
diff --git a/script/run_multi_provider_eval.py b/script/run_multi_provider_eval.py
index 34a5d471..afdef545 100755
--- a/script/run_multi_provider_eval.py
+++ b/script/run_multi_provider_eval.py
@@ -13,21 +13,23 @@
 import argparse
 import copy
 import json
-import re
 import logging
 import multiprocessing
 import os
+import re
 import sys
 import tempfile
 import traceback
 from concurrent.futures import ProcessPoolExecutor, as_completed
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any, Optional
-from lightspeed_evaluation.runner.evaluation import run_evaluation
+
 import numpy as np
 import yaml
 
+from lightspeed_evaluation.runner.evaluation import run_evaluation
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -70,7 +72,7 @@ def _run_evaluation_worker(
     )
     worker_logger = logging.getLogger(__name__)
 
-    start_time = datetime.now()
+    start_time = datetime.now(UTC)
     temp_config_path: Optional[Path] = None
 
     # Sanitize names for filesystem
@@ -166,7 +168,7 @@ def _run_evaluation_worker(
                 )
 
     # Record end time and duration
-    end_time = datetime.now()
+    end_time = datetime.now(UTC)
     result["end_time"] = end_time.isoformat()
     result["duration_seconds"] = (end_time - start_time).total_seconds()
 
@@ -436,7 +438,7 @@ def _run_single_evaluation(
         Returns:
             Dictionary containing evaluation results and metadata
         """
-        start_time = datetime.now()
+        start_time = datetime.now(UTC)
         temp_config_path: Optional[Path] = None
 
         # Sanitize names for filesystem and enforce confinement under output_base
@@ -508,7 +510,7 @@ def _run_single_evaluation(
                     logger.warning(f"Failed to delete temp config: {temp_config_path}")
 
         # Record end time and duration
-        end_time = datetime.now()
+        end_time = datetime.now(UTC)
         result["end_time"] = end_time.isoformat()
         result["duration_seconds"] = (end_time - start_time).total_seconds()
 
@@ -611,8 +613,8 @@ def _run_parallel_evaluations(self, configs: list[dict[str, Any]]) -> None:
                         "provider_id": config["provider_id"],
                         "model": config["model"],
                         "output_dir": "",
-                        "start_time": datetime.now().isoformat(),
-                        "end_time": datetime.now().isoformat(),
+                        "start_time": datetime.now(UTC).isoformat(),
+                        "end_time": datetime.now(UTC).isoformat(),
                         "duration_seconds": 0,
                         "success": False,
                         "error": f"Worker process failed: {str(e)}",
@@ -634,7 +636,7 @@ def generate_summary(self) -> dict[str, Any]:
         failed = total - successful
 
         summary = {
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(UTC).isoformat(),
             "total_evaluations": total,
             "successful": successful,
             "failed": failed,
@@ -1156,7 +1158,7 @@ def save_model_comparison(self) -> Path:
         analysis_data = {
             "total_models": len(self.model_stats),
             "output_base": str(self.output_base),
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(UTC).isoformat(),
             "rankings": [
                 {
                     "rank": rank,
diff --git a/src/generate_answers/generate_answers.py b/src/generate_answers/generate_answers.py
index 5c7a0271..1f164d26 100644
--- a/src/generate_answers/generate_answers.py
+++ b/src/generate_answers/generate_answers.py
@@ -4,10 +4,11 @@
 import logging
 import os
 import sys
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
-from typing import Callable, cast
+from typing import cast
 
 import click
 import pandas as pd
diff --git a/src/lightspeed_evaluation/core/api/client.py b/src/lightspeed_evaluation/core/api/client.py
index e9e30a8e..9f28d045 100644
--- a/src/lightspeed_evaluation/core/api/client.py
+++ b/src/lightspeed_evaluation/core/api/client.py
@@ -4,7 +4,7 @@
 import json
 import logging
 import os
-from typing import Any, Optional, Union, cast
+from typing import Any, Optional, cast
 
 import httpx
 from diskcache import Cache
@@ -52,7 +52,7 @@ class APIClient:
 
     def __init__(
         self,
-        config: Union[APIConfig, HttpApiAgentConfig],
+        config: APIConfig | HttpApiAgentConfig,
     ):
         """Initialize the client with configuration."""
         self.config = config
diff --git a/src/lightspeed_evaluation/core/llm/__init__.py b/src/lightspeed_evaluation/core/llm/__init__.py
index 578a62a7..59980cdf 100644
--- a/src/lightspeed_evaluation/core/llm/__init__.py
+++ b/src/lightspeed_evaluation/core/llm/__init__.py
@@ -3,17 +3,16 @@
 from typing import TYPE_CHECKING
 
 # Apply litellm patching globally before any litellm usage in this package
-import lightspeed_evaluation.core.llm.litellm_patch  # noqa: F401
-
+import lightspeed_evaluation.core.llm.litellm_patch
 from lightspeed_evaluation.core.system.lazy_import import create_lazy_getattr
 
 if TYPE_CHECKING:
     # ruff: noqa: F401
     from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
-    from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
     from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
     from lightspeed_evaluation.core.llm.manager import LLMManager
     from lightspeed_evaluation.core.llm.ragas import RagasLLMManager
+    from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
     from lightspeed_evaluation.core.models import LLMConfig
     from lightspeed_evaluation.core.system.env_validator import validate_provider_env
     from lightspeed_evaluation.core.system.exceptions import LLMError
diff --git a/src/lightspeed_evaluation/core/llm/custom.py b/src/lightspeed_evaluation/core/llm/custom.py
index 89309dfb..ae64e682 100644
--- a/src/lightspeed_evaluation/core/llm/custom.py
+++ b/src/lightspeed_evaluation/core/llm/custom.py
@@ -1,7 +1,7 @@
 """Base Custom LLM class for evaluation framework."""
 
 import logging
-from typing import Any, Union
+from typing import Any
 
 import litellm
 from litellm.exceptions import InternalServerError
@@ -37,7 +37,7 @@ def call(
         n: int = 1,
         return_single: bool = True,
         **kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> str | list[str]:
         """Make LLM call and return response(s).
 
         Args:
diff --git a/src/lightspeed_evaluation/core/llm/litellm_patch.py b/src/lightspeed_evaluation/core/llm/litellm_patch.py
index 9eb13a70..690faf26 100644
--- a/src/lightspeed_evaluation/core/llm/litellm_patch.py
+++ b/src/lightspeed_evaluation/core/llm/litellm_patch.py
@@ -34,8 +34,8 @@
 
 # pylint: disable=wrong-import-position
 from lightspeed_evaluation.core.llm.token_tracker import (  # noqa: E402
-    track_judge_tokens,
     track_embedding_tokens,
+    track_judge_tokens,
 )
 
 logger = logging.getLogger(__name__)
diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
index a3cd69a4..b4be309a 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/custom.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -5,17 +5,16 @@
 
 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.manager import LLMManager
+from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
 )
-from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
+from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
 from lightspeed_evaluation.core.system.exceptions import LLMError
 
-from lightspeed_evaluation.core.metrics.manager import MetricLevel
-
 if TYPE_CHECKING:
     from lightspeed_evaluation.core.metrics.manager import MetricManager
 
diff --git a/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py b/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py
index 306b912f..3f99ffce 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/tool_eval.py
@@ -2,7 +2,8 @@
 
 import logging
 import re
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/lightspeed_evaluation/core/metrics/geval.py b/src/lightspeed_evaluation/core/metrics/geval.py
index 3b2b8d48..30db5995 100644
--- a/src/lightspeed_evaluation/core/metrics/geval.py
+++ b/src/lightspeed_evaluation/core/metrics/geval.py
@@ -20,7 +20,6 @@
 from deepeval.metrics import GEval
 from deepeval.metrics.g_eval import Rubric
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-
 from pydantic import ValidationError
 
 from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
diff --git a/src/lightspeed_evaluation/core/metrics/script.py b/src/lightspeed_evaluation/core/metrics/script.py
index 7bf9252e..b8aa03e6 100644
--- a/src/lightspeed_evaluation/core/metrics/script.py
+++ b/src/lightspeed_evaluation/core/metrics/script.py
@@ -2,7 +2,7 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 from lightspeed_evaluation.core.models import EvaluationScope
 from lightspeed_evaluation.core.script import (
@@ -38,7 +38,7 @@ def evaluate(
         return None, f"Unsupported script metric: {metric_name}"
 
     def _evaluate_verify_script(
-        self, script_path: Optional[Union[str, Path]]
+        self, script_path: Optional[str | Path]
     ) -> tuple[Optional[float], str]:
         """Evaluate verify script."""
         if not script_path:
diff --git a/src/lightspeed_evaluation/core/models/__init__.py b/src/lightspeed_evaluation/core/models/__init__.py
index 588478c8..0d665f2c 100644
--- a/src/lightspeed_evaluation/core/models/__init__.py
+++ b/src/lightspeed_evaluation/core/models/__init__.py
@@ -21,14 +21,6 @@
     MetricResult,
     TurnData,
 )
-from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin
-from lightspeed_evaluation.core.models.system import (
-    APIConfig,
-    CoreConfig,
-    LoggingConfig,
-    SystemConfig,
-    VisualizationConfig,
-)
 from lightspeed_evaluation.core.models.llm import (
     EmbeddingConfig,
     GEvalConfig,
@@ -37,17 +29,25 @@
     LLMConfig,
     LLMPoolConfig,
 )
+from lightspeed_evaluation.core.models.mixins import StreamingMetricsMixin
 from lightspeed_evaluation.core.models.statistics import (
-    NumericStats,
-    ScoreStatistics,
-    OverallStats,
-    MetricStats,
-    ConversationStats,
-    TagStats,
-    StreamingStats,
     AgentTokenUsage,
     ConfidenceInterval,
+    ConversationStats,
     DetailedStats,
+    MetricStats,
+    NumericStats,
+    OverallStats,
+    ScoreStatistics,
+    StreamingStats,
+    TagStats,
+)
+from lightspeed_evaluation.core.models.system import (
+    APIConfig,
+    CoreConfig,
+    LoggingConfig,
+    SystemConfig,
+    VisualizationConfig,
 )
 
 __all__ = [
diff --git a/src/lightspeed_evaluation/core/models/agents.py b/src/lightspeed_evaluation/core/models/agents.py
index 7ad20129..d21e8206 100644
--- a/src/lightspeed_evaluation/core/models/agents.py
+++ b/src/lightspeed_evaluation/core/models/agents.py
@@ -1,7 +1,7 @@
 """Agent configuration models for the evaluation framework."""
 
 import os
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
@@ -136,7 +136,7 @@ class HttpApiAgentConfig(HttpApiBaseFields):
 
 # Discriminated union of all agent config types; extend by adding new
 # config classes to support additional agent types.
-AgentDefinition = Union[HttpApiAgentConfig]
+AgentDefinition = HttpApiAgentConfig
 
 
 class AgentDefaultConfig(BaseModel):
diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py
index aeabbdc0..da4a686d 100644
--- a/src/lightspeed_evaluation/core/models/data.py
+++ b/src/lightspeed_evaluation/core/models/data.py
@@ -2,7 +2,7 @@
 
 import logging
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
@@ -62,7 +62,7 @@ class TurnData(StreamingMetricsMixin):
         default=None,
         description="Expected keywords for keyword evaluation (list of alternatives)",
     )
-    expected_response: Optional[Union[str, list[str]]] = Field(
+    expected_response: Optional[str | list[str]] = Field(
         default=None,
         description="Expected response or list of responses for comparison",
     )
@@ -100,7 +100,7 @@ class TurnData(StreamingMetricsMixin):
     )
 
     # Script execution support
-    verify_script: Optional[Union[str, Path]] = Field(
+    verify_script: Optional[str | Path] = Field(
         default=None, description="Path to verify script for script-based evaluation"
     )
 
@@ -126,8 +126,8 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
     @field_validator("expected_response")
     @classmethod
     def validate_expected_response(
-        cls, v: Optional[Union[str, list[str]]]
-    ) -> Optional[Union[str, list[str]]]:
+        cls, v: Optional[str | list[str]]
+    ) -> Optional[str | list[str]]:
         """Validate expected response when provided."""
         if v is None:
             return None
@@ -400,11 +400,11 @@ class EvaluationData(BaseModel):
     )
 
     # Script execution support
-    setup_script: Optional[Union[str, Path]] = Field(
+    setup_script: Optional[str | Path] = Field(
         default=None,
         description="Path to setup script to run before conversation starts",
     )
-    cleanup_script: Optional[Union[str, Path]] = Field(
+    cleanup_script: Optional[str | Path] = Field(
         default=None,
         description="Path to cleanup script to run after conversation ends",
     )
@@ -549,7 +549,7 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
     contexts: Optional[str] = Field(
         default=None, description="Contexts formatted as string"
     )
-    expected_response: Optional[Union[str, list[str]]] = Field(
+    expected_response: Optional[str | list[str]] = Field(
         default=None,
         description="Expected response or list of responses for comparison",
     )
diff --git a/src/lightspeed_evaluation/core/models/quality.py b/src/lightspeed_evaluation/core/models/quality.py
index 5ce1cdfc..657fbe4f 100644
--- a/src/lightspeed_evaluation/core/models/quality.py
+++ b/src/lightspeed_evaluation/core/models/quality.py
@@ -10,13 +10,12 @@
 from pydantic import BaseModel, Field
 
 from lightspeed_evaluation.core.models.statistics import (
+    AgentTokenStats,
     MetricStats,
     NumericStats,
     ScoreStatistics,
-    AgentTokenStats,
 )
 
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/src/lightspeed_evaluation/core/models/statistics.py b/src/lightspeed_evaluation/core/models/statistics.py
index 7f144e9c..67317f17 100644
--- a/src/lightspeed_evaluation/core/models/statistics.py
+++ b/src/lightspeed_evaluation/core/models/statistics.py
@@ -1,6 +1,7 @@
 """Pydantic models for evaluation statistics."""
 
 from typing import Optional
+
 from pydantic import BaseModel, Field
 
 
diff --git a/src/lightspeed_evaluation/core/models/summary.py b/src/lightspeed_evaluation/core/models/summary.py
index 30a70f64..d092b1ef 100644
--- a/src/lightspeed_evaluation/core/models/summary.py
+++ b/src/lightspeed_evaluation/core/models/summary.py
@@ -1,6 +1,6 @@
 """Evaluation summary models for structured results."""
 
-from datetime import datetime
+from datetime import UTC, datetime
 from typing import Optional
 
 from pydantic import BaseModel, Field
@@ -11,21 +11,21 @@
 )
 from lightspeed_evaluation.core.models.statistics import (
     AgentTokenUsage,
-    NumericStats,
     ConversationStats,
     MetricStats,
+    NumericStats,
     OverallStats,
     StreamingStats,
     TagStats,
 )
 from lightspeed_evaluation.core.output.statistics import (
-    compute_agent_token_usage,
     compute_agent_latency_stats,
+    compute_agent_token_usage,
+    compute_conversation_stats,
+    compute_metric_stats,
     compute_overall_stats,
     compute_streaming_stats,
     compute_tag_stats,
-    compute_metric_stats,
-    compute_conversation_stats,
 )
 
 
@@ -83,7 +83,7 @@ def from_results(
         Returns:
             A fully populated EvaluationSummary instance.
         """
-        timestamp = datetime.now().isoformat()
+        timestamp = datetime.now(UTC).isoformat()
 
         # Compute overall stats
         overall = compute_overall_stats(results)
diff --git a/src/lightspeed_evaluation/core/output/data_persistence.py b/src/lightspeed_evaluation/core/output/data_persistence.py
index 1f79030a..9d9841cd 100644
--- a/src/lightspeed_evaluation/core/output/data_persistence.py
+++ b/src/lightspeed_evaluation/core/output/data_persistence.py
@@ -1,6 +1,6 @@
 """Simple data persistence utilities for evaluation framework."""
 
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Optional
 
@@ -27,7 +27,7 @@ def save_evaluation_data(
         output_path.mkdir(parents=True, exist_ok=True)
 
         # Create amended data file with timestamp in output directory
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         amended_data_path = (
             output_path
             / f"{original_path.stem}_amended_{timestamp}{original_path.suffix}"
diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py
index 8b56884b..6c89a81f 100644
--- a/src/lightspeed_evaluation/core/output/generator.py
+++ b/src/lightspeed_evaluation/core/output/generator.py
@@ -3,7 +3,7 @@
 import csv
 import json
 import logging
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any, Optional
 
@@ -17,9 +17,7 @@
     SUPPORTED_OUTPUT_TYPES,
 )
 from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult
-from lightspeed_evaluation.core.models.summary import (
-    EvaluationSummary,
-)
+from lightspeed_evaluation.core.models.quality import QualityReport
 from lightspeed_evaluation.core.models.statistics import (
     AgentTokenStats,
     ConversationStats,
@@ -29,9 +27,11 @@
     StreamingStats,
     TagStats,
 )
-from lightspeed_evaluation.core.models.quality import QualityReport
-from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config
+from lightspeed_evaluation.core.models.summary import (
+    EvaluationSummary,
+)
 from lightspeed_evaluation.core.output.visualization import GraphGenerator
+from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config
 
 logger = logging.getLogger(__name__)
 
@@ -105,7 +105,7 @@ def generate_reports(
             )
 
         # Prepare timestamped base filename
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         base_filename = f"{self.base_filename}_{timestamp}"
 
         # Get enabled outputs from system config
@@ -154,7 +154,7 @@ def save(
         target_dir = Path(output_dir) if output_dir else self.output_dir
         target_dir.mkdir(parents=True, exist_ok=True)
 
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         base_filename = f"{self.base_filename}_{timestamp}"
 
         generated_files: list[Path] = []
@@ -373,7 +373,7 @@ def _generate_quality_score_report(
         quality_score_file = out / f"{base_filename}_quality_report.json"
 
         output = {
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(UTC).isoformat(),
             "quality_score": quality_report.quality_score,
             "quality_metrics": {
                 metric_id: {
@@ -449,7 +449,9 @@ def _generate_text_summary_from_model(
             f.write("LSC Evaluation Framework - Summary Report\n")
             f.write("=" * 50 + "\n\n")
 
-            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(
+                f"Generated: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S %Z')}\n"
+            )
             f.write(f"Total Evaluations: {len(summary.results)}\n\n")
 
             # Overall statistics
diff --git a/src/lightspeed_evaluation/core/output/statistics.py b/src/lightspeed_evaluation/core/output/statistics.py
index bf467491..ad4c35af 100644
--- a/src/lightspeed_evaluation/core/output/statistics.py
+++ b/src/lightspeed_evaluation/core/output/statistics.py
@@ -10,19 +10,18 @@
     EvaluationData,
     EvaluationResult,
 )
-
 from lightspeed_evaluation.core.models.statistics import (
+    AgentTokenStats,
+    AgentTokenUsage,
+    ConfidenceInterval,
+    ConversationStats,
+    DetailedStats,
+    MetricStats,
     NumericStats,
+    OverallStats,
     ScoreStatistics,
     StreamingStats,
-    AgentTokenUsage,
-    AgentTokenStats,
-    OverallStats,
-    MetricStats,
-    ConversationStats,
     TagStats,
-    ConfidenceInterval,
-    DetailedStats,
 )
 
 
diff --git a/src/lightspeed_evaluation/core/output/visualization.py b/src/lightspeed_evaluation/core/output/visualization.py
index f3304894..5260e403 100644
--- a/src/lightspeed_evaluation/core/output/visualization.py
+++ b/src/lightspeed_evaluation/core/output/visualization.py
@@ -16,8 +16,8 @@
 )
 from lightspeed_evaluation.core.models import EvaluationResult
 from lightspeed_evaluation.core.output.statistics import (
-    compute_overall_stats,
     compute_detailed_stats,
+    compute_overall_stats,
 )
 
 CHART_COLORS = {
diff --git a/src/lightspeed_evaluation/core/script/manager.py b/src/lightspeed_evaluation/core/script/manager.py
index 3fe8bc14..00617006 100644
--- a/src/lightspeed_evaluation/core/script/manager.py
+++ b/src/lightspeed_evaluation/core/script/manager.py
@@ -4,7 +4,6 @@
 import os
 import subprocess
 from pathlib import Path
-from typing import Union
 
 from lightspeed_evaluation.core.system.exceptions import ScriptExecutionError
 
@@ -25,7 +24,7 @@ def __init__(self, timeout: int = 300):
         """
         self.timeout = timeout
 
-    def run_script(self, script_path: Union[str, Path]) -> bool:
+    def run_script(self, script_path: str | Path) -> bool:
         """Execute a script and return success status.
 
         Args:
@@ -57,7 +56,7 @@ def run_script(self, script_path: Union[str, Path]) -> bool:
                 f"Unexpected error running script {script_path}: {e}", str(script_path)
             ) from e
 
-    def _prepare_script_path(self, script_path: Union[str, Path]) -> Path:
+    def _prepare_script_path(self, script_path: str | Path) -> Path:
         """Prepare and resolve script path."""
         if isinstance(script_path, str):
             script_path = Path(script_path)
diff --git a/src/lightspeed_evaluation/core/storage/__init__.py b/src/lightspeed_evaluation/core/storage/__init__.py
index f4ac128f..596a893d 100644
--- a/src/lightspeed_evaluation/core/storage/__init__.py
+++ b/src/lightspeed_evaluation/core/storage/__init__.py
@@ -25,15 +25,15 @@
     backend.close()
 """
 
+from lightspeed_evaluation.core.storage.composite_storage import (
+    CompositeStorageBackend,
+    NoOpStorageBackend,
+)
 from lightspeed_evaluation.core.storage.config import (
     DatabaseBackendConfig,
     FileBackendConfig,
     StorageBackendConfig,
 )
-from lightspeed_evaluation.core.storage.composite_storage import (
-    CompositeStorageBackend,
-    NoOpStorageBackend,
-)
 from lightspeed_evaluation.core.storage.factory import (
     create_database_backend,
     create_pipeline_storage_backend,
diff --git a/src/lightspeed_evaluation/core/storage/config.py b/src/lightspeed_evaluation/core/storage/config.py
index e8b84e40..c005b98b 100644
--- a/src/lightspeed_evaluation/core/storage/config.py
+++ b/src/lightspeed_evaluation/core/storage/config.py
@@ -3,7 +3,7 @@
 Defines Pydantic models for file and database storage configuration.
 """
 
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, Optional
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
@@ -128,6 +128,6 @@ def validate_connection_fields(self) -> "DatabaseBackendConfig":
 
 # Discriminated union for polymorphic storage configuration
 StorageBackendConfig = Annotated[
-    Union[FileBackendConfig, DatabaseBackendConfig],
+    FileBackendConfig | DatabaseBackendConfig,
     Field(discriminator="type"),
 ]
diff --git a/src/lightspeed_evaluation/core/storage/protocol.py b/src/lightspeed_evaluation/core/storage/protocol.py
index 093fa3b0..857125c1 100644
--- a/src/lightspeed_evaluation/core/storage/protocol.py
+++ b/src/lightspeed_evaluation/core/storage/protocol.py
@@ -6,7 +6,7 @@
 """
 
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import Optional, Protocol
 from uuid import uuid4
 
@@ -25,7 +25,7 @@ class RunInfo:
 
     run_id: str = field(default_factory=lambda: str(uuid4()))
     name: str = ""
-    started_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    started_at: datetime = field(default_factory=lambda: datetime.now(UTC))
 
 
 class BaseStorageBackend(Protocol):
diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py
index 1cb545a8..1f28b5fe 100644
--- a/src/lightspeed_evaluation/core/storage/sql_storage.py
+++ b/src/lightspeed_evaluation/core/storage/sql_storage.py
@@ -6,7 +6,7 @@
 
 import json
 import logging
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from typing import Any, Optional
 
 from sqlalchemy import (
@@ -23,8 +23,8 @@
 from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
 
 from lightspeed_evaluation.core.models import EvaluationResult
-from lightspeed_evaluation.core.system.exceptions import StorageError
 from lightspeed_evaluation.core.storage.protocol import BaseStorageBackend, RunInfo
+from lightspeed_evaluation.core.system.exceptions import StorageError
 
 logger = logging.getLogger(__name__)
 
@@ -307,7 +307,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
 
         return EvaluationResultDB(
             run_id=self._run_info.run_id,
-            timestamp=datetime.now(timezone.utc),
+            timestamp=datetime.now(UTC),
             conversation_group_id=result.conversation_group_id,
             tag=result.tag,
             turn_id=result.turn_id,
diff --git a/src/lightspeed_evaluation/core/system/ssl_certifi.py b/src/lightspeed_evaluation/core/system/ssl_certifi.py
index 82293532..1a7034a3 100644
--- a/src/lightspeed_evaluation/core/system/ssl_certifi.py
+++ b/src/lightspeed_evaluation/core/system/ssl_certifi.py
@@ -2,9 +2,10 @@
 
 import atexit
 import os
-from typing import Any
-from pathlib import Path
 import tempfile
+from pathlib import Path
+from typing import Any
+
 import certifi
 
 
diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py
index 4c797897..43a4ed6d 100644
--- a/src/lightspeed_evaluation/core/system/validator.py
+++ b/src/lightspeed_evaluation/core/system/validator.py
@@ -2,7 +2,7 @@
 
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
 
 import yaml
 from pydantic import ValidationError
@@ -86,7 +86,7 @@ def format_pydantic_error(error: ValidationError) -> str:
     return "; ".join(errors)
 
 
-def _is_field_empty(value: Optional[Union[str, list, dict]]) -> bool:
+def _is_field_empty(value: Optional[str | list | dict]) -> bool:
     """Return True if value is considered empty for required-field validation."""
     if value is None:
         return True
@@ -489,7 +489,7 @@ def _validate_scripts(self, evaluation_data: list[EvaluationData]) -> None:
 
     def _validate_single_script(
         self,
-        script_file: Optional[Union[str, Path]],
+        script_file: Optional[str | Path],
         script_type: str,
         context: str,
     ) -> Optional[Path]:
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
index f091266b..5f60be17 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
@@ -5,9 +5,13 @@
 import time
 from typing import Any, Optional
 
+from lightspeed_evaluation.core.constants import (
+    DEFAULT_METRIC_THRESHOLD,
+    NON_LLM_FRAMEWORKS,
+)
 from lightspeed_evaluation.core.embedding.manager import EmbeddingManager
-from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
 from lightspeed_evaluation.core.llm.manager import LLMManager
+from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
 from lightspeed_evaluation.core.metrics.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager
@@ -30,10 +34,6 @@
     METRIC_REQUIREMENTS,
     check_metric_required_data,
 )
-from lightspeed_evaluation.core.constants import (
-    DEFAULT_METRIC_THRESHOLD,
-    NON_LLM_FRAMEWORKS,
-)
 from lightspeed_evaluation.pipeline.evaluation.judges import JudgeOrchestrator
 
 logger = logging.getLogger(__name__)
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/judges.py b/src/lightspeed_evaluation/pipeline/evaluation/judges.py
index 4147a865..01e118e9 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/judges.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/judges.py
@@ -1,8 +1,9 @@
 """Judge orchestration module - handles multi-judge evaluation and aggregation."""
 
 import logging
+from collections.abc import Callable
 from statistics import mean
-from typing import Any, Callable, Optional
+from typing import Any, Optional
 
 from lightspeed_evaluation.core.constants import DEFAULT_METRIC_THRESHOLD
 from lightspeed_evaluation.core.llm.manager import LLMManager
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py
index 98f10d29..b9b8c42d 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/pipeline.py
@@ -3,7 +3,8 @@
 import asyncio
 import concurrent.futures
 import logging
-from typing import Any, Optional
+from collections.abc import Callable, Coroutine
+from typing import Any, Optional, cast
 
 import litellm
 import tqdm
@@ -272,8 +273,10 @@ def close(self) -> None:
             if cache is not None:
                 try:
-                    # Use getattr to call untyped third-party method
+                    # Cast the untyped third-party method to an async callable type
-                    disconnect = getattr(cache, "disconnect")
+                    disconnect = cast(
+                        Callable[[], Coroutine[Any, Any, object]], cache.disconnect
+                    )
                     asyncio.run(disconnect())
-                except (AttributeError, RuntimeError, OSError):
+                except (AttributeError, RuntimeError, OSError, TypeError):
                     logger.debug("litellm cache disconnect raised; ignoring")
                 litellm.cache = None
diff --git a/src/lightspeed_evaluation/runner/evaluation.py b/src/lightspeed_evaluation/runner/evaluation.py
index 972c6c98..4903487b 100644
--- a/src/lightspeed_evaluation/runner/evaluation.py
+++ b/src/lightspeed_evaluation/runner/evaluation.py
@@ -8,10 +8,10 @@
 from typing import Optional
 
 from lightspeed_evaluation.core.models import (
+    AgentTokenUsage,
     LLMPoolConfig,
-    SystemConfig,
     OverallStats,
-    AgentTokenUsage,
+    SystemConfig,
 )
 
 # Import only lightweight modules at top level
diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py
index ae94474f..334ae4a9 100755
--- a/tests/script/test_compare_evaluations.py
+++ b/tests/script/test_compare_evaluations.py
@@ -4,12 +4,12 @@
 """Pytest tests to verify the compare_evaluations.py script works correctly."""
 
 import json
-import tempfile
 import subprocess
 import sys
+import tempfile
 from pathlib import Path
-
 from typing import Any
+
 import pytest
 
 from script.compare_evaluations import EvaluationComparison
@@ -86,7 +86,6 @@ def test_basic_comparison(
 
 def test_invalid_arguments(script_path: Path) -> None:
     """Test error handling for invalid arguments."""
-
     # Test with only one file
     result = subprocess.run(
         [sys.executable, str(script_path), "file1.json"],
@@ -116,7 +115,6 @@ def test_invalid_arguments(script_path: Path) -> None:
 
 def test_nonexistent_files(script_path: Path) -> None:
     """Test error handling for nonexistent files."""
-
     result = subprocess.run(
         [sys.executable, str(script_path), "nonexistent1.json", "nonexistent2.json"],
         capture_output=True,
diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py
index 0b271c62..6158e85c 100644
--- a/tests/script/test_run_multi_provider_eval.py
+++ b/tests/script/test_run_multi_provider_eval.py
@@ -4,16 +4,16 @@
 """Pytest tests for run_multi_provider_eval.py script."""
 
 import json
-from pathlib import Path
-from typing import Any
-import tempfile as temp_module
 import logging
 import multiprocessing
 import shutil
+import tempfile as temp_module
+from pathlib import Path
+from typing import Any
 
 import pytest
-from pytest_mock import MockerFixture
 import yaml
+from pytest_mock import MockerFixture
 
 from script.run_multi_provider_eval import MultiProviderEvaluationRunner
 
@@ -161,7 +161,6 @@ def test_resource_warning_high_thread_count(
         caplog: pytest.LogCaptureFixture,
     ) -> None:
         """Test warning is logged when total threads is very high."""
-
         # Create system config with high max_threads
         system_config = {
             "core": {"max_threads": 100},
@@ -198,7 +197,6 @@ def test_no_resource_warning_reasonable_config(
         caplog: pytest.LogCaptureFixture,
     ) -> None:
         """Test no warning with reasonable thread count."""
-
         # Calculate safe thread count based on actual CPU count
         cpu_count = multiprocessing.cpu_count()
         # Use values that keep total threads <= cpu_count * 2
@@ -393,7 +391,6 @@ def test_temp_config_cleanup_on_yaml_dump_failure(
         mocker: MockerFixture,
     ) -> None:
         """Test that temp file is cleaned up when yaml.dump() fails."""
-
         # Track the temp file path that gets created
         created_temp_path = None
         original_named_temp_file = temp_module.NamedTemporaryFile
diff --git a/tests/unit/core/api/conftest.py b/tests/unit/core/api/conftest.py
index f6ed6901..7cdad8d7 100644
--- a/tests/unit/core/api/conftest.py
+++ b/tests/unit/core/api/conftest.py
@@ -3,8 +3,8 @@
 from typing import Any
 
 import pytest
-
 from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.models import APIConfig
 
 
diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py
index eb0bb3ad..ed688213 100644
--- a/tests/unit/core/api/test_client.py
+++ b/tests/unit/core/api/test_client.py
@@ -3,14 +3,15 @@
 """Unit tests for core API client module."""
 
 from pathlib import Path
-import pytest
+
 import httpx
-from pytest_mock import MockerFixture
+import pytest
 from pydantic import ValidationError
+from pytest_mock import MockerFixture
 
+from lightspeed_evaluation.core.api.client import APIClient, _is_retryable_server_error
 from lightspeed_evaluation.core.models import APIConfig, APIResponse
 from lightspeed_evaluation.core.system.exceptions import APIError
-from lightspeed_evaluation.core.api.client import APIClient, _is_retryable_server_error
 
 
 class TestAPIClient:
@@ -18,7 +19,6 @@ class TestAPIClient:
 
     def test_initialization_unsupported_endpoint_type(self) -> None:
         """Test initialization fails with unsupported endpoint type."""
-
         # Pydantic will validate the endpoint_type, so this should raise ValidationError
         with pytest.raises(ValidationError, match="Endpoint type must be one of"):
             APIConfig(
@@ -150,7 +150,6 @@ def test_query_timeout_error(
         self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture
     ) -> None:
         """Test query handling timeout."""
-
         mock_client = mocker.Mock()
         mock_client.post.side_effect = httpx.TimeoutException("Timeout")
         mock_client.headers = {}
@@ -231,7 +230,6 @@ def test_handle_response_errors_non_200(
         self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture
     ) -> None:
         """Test _handle_response_errors with non-200 status."""
-
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config_query_endpoint)
@@ -488,7 +486,6 @@ def test_standard_endpoint_initialization(
         self, basic_api_config_query_endpoint: APIConfig, mocker: MockerFixture
     ) -> None:
         """Test initialization with standard (non-streaming) endpoint."""
-
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config_query_endpoint)
diff --git a/tests/unit/core/api/test_client_infer.py b/tests/unit/core/api/test_client_infer.py
index e99a5537..8fd931bf 100644
--- a/tests/unit/core/api/test_client_infer.py
+++ b/tests/unit/core/api/test_client_infer.py
@@ -2,13 +2,13 @@
 
 """Unit tests for APIClient /infer endpoint support."""
 
-import pytest
 import httpx
+import pytest
 from pytest_mock import MockerFixture
 
+from lightspeed_evaluation.core.api.client import APIClient
 from lightspeed_evaluation.core.models import APIConfig, APIResponse
 from lightspeed_evaluation.core.system.exceptions import APIError
-from lightspeed_evaluation.core.api.client import APIClient
 
 
 class TestInferEndpoint:
diff --git a/tests/unit/core/api/test_streaming_parser.py b/tests/unit/core/api/test_streaming_parser.py
index b6a0d8e6..f90dc746 100644
--- a/tests/unit/core/api/test_streaming_parser.py
+++ b/tests/unit/core/api/test_streaming_parser.py
@@ -5,10 +5,10 @@
 import pytest
 
 from lightspeed_evaluation.core.api.streaming_parser import (
-    parse_streaming_response,
+    _format_tool_sequences,
     _parse_sse_line,
     _parse_tool_call,
-    _format_tool_sequences,
+    parse_streaming_response,
 )
 
 
diff --git a/tests/unit/core/config/test_models.py b/tests/unit/core/config/test_models.py
index 3ea18d0f..7d13ae70 100644
--- a/tests/unit/core/config/test_models.py
+++ b/tests/unit/core/config/test_models.py
@@ -2,6 +2,7 @@
 
 import pytest
 from pydantic import ValidationError
+
 from lightspeed_evaluation.core.models import (
     CoreConfig,
     EvaluationData,
diff --git a/tests/unit/core/llm/conftest.py b/tests/unit/core/llm/conftest.py
index 779ef21e..e67cd4b2 100644
--- a/tests/unit/core/llm/conftest.py
+++ b/tests/unit/core/llm/conftest.py
@@ -1,6 +1,7 @@
 """Pytest configuration and fixtures for llm tests."""
 
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 from pytest_mock import MockerFixture
@@ -57,10 +58,8 @@ def _create_response(
         mock_response.usage.prompt_tokens = prompt_tokens
         mock_response.usage.completion_tokens = completion_tokens
 
-        setattr(
-            mock_response,
-            "_hidden_params",
-            {"cache_hit": cache_hit} if cache_hit else {},
+        mock_response.configure_mock(
+            _hidden_params={"cache_hit": cache_hit} if cache_hit else {}
         )
         return mock_response
 
@@ -97,10 +96,8 @@ def _create_response(
             prompt_tokens=prompt_tokens, spec=["prompt_tokens"]
         )
 
-        setattr(
-            mock_response,
-            "_hidden_params",
-            {"cache_hit": cache_hit} if cache_hit else {},
+        mock_response.configure_mock(
+            _hidden_params={"cache_hit": cache_hit} if cache_hit else {}
         )
         return mock_response
 
diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py
index 697abd2e..27d1025c 100644
--- a/tests/unit/core/llm/test_custom.py
+++ b/tests/unit/core/llm/test_custom.py
@@ -2,7 +2,8 @@
 
 """Unit tests for custom LLM classes."""
 
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 from pytest_mock import MockerFixture
diff --git a/tests/unit/core/llm/test_llm_manager.py b/tests/unit/core/llm/test_llm_manager.py
index 53852484..1f896328 100644
--- a/tests/unit/core/llm/test_llm_manager.py
+++ b/tests/unit/core/llm/test_llm_manager.py
@@ -5,18 +5,18 @@
 import pytest
 from pytest_mock import MockerFixture
 
+from lightspeed_evaluation.core.llm.manager import LLMManager
 from lightspeed_evaluation.core.models import (
+    JudgePanelConfig,
     LLMConfig,
-    SystemConfig,
     LLMPoolConfig,
-    JudgePanelConfig,
+    SystemConfig,
 )
 from lightspeed_evaluation.core.models.llm import (
     LLMDefaultsConfig,
     LLMParametersConfig,
     LLMProviderConfig,
 )
-from lightspeed_evaluation.core.llm.manager import LLMManager
 
 
 class TestLLMManager:
diff --git a/tests/unit/core/llm/test_token_tracker.py b/tests/unit/core/llm/test_token_tracker.py
index 4478a414..6a1ed798 100644
--- a/tests/unit/core/llm/test_token_tracker.py
+++ b/tests/unit/core/llm/test_token_tracker.py
@@ -1,11 +1,12 @@
 """Unit tests for TokenTracker and integration with litellm patch."""
 
 import threading
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
+import litellm
 import pytest
 from pytest_mock import MockerFixture
-import litellm
 
 # Simulate litellm completion call through patch
 from lightspeed_evaluation.core.llm import litellm_patch
diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py
index e7bc39e0..536f1309 100644
--- a/tests/unit/core/metrics/conftest.py
+++ b/tests/unit/core/metrics/conftest.py
@@ -8,7 +8,7 @@
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
-from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig
+from lightspeed_evaluation.core.models import EvaluationScope, SystemConfig, TurnData
 
 
 @pytest.fixture
diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py
index d6bccf52..51a002c7 100644
--- a/tests/unit/core/metrics/custom/test_custom.py
+++ b/tests/unit/core/metrics/custom/test_custom.py
@@ -1,6 +1,7 @@
 """Tests for custom metrics module."""
 
 from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
diff --git a/tests/unit/core/metrics/custom/test_tool_eval.py b/tests/unit/core/metrics/custom/test_tool_eval.py
index 6ff890c2..f4d63f3b 100644
--- a/tests/unit/core/metrics/custom/test_tool_eval.py
+++ b/tests/unit/core/metrics/custom/test_tool_eval.py
@@ -3,12 +3,12 @@
 from typing import Any
 
 from lightspeed_evaluation.core.metrics.custom.tool_eval import (
-    evaluate_tool_calls,
-    compare_tool_calls,
-    _compare_tool_call_sequence,
     _compare_single_tool_call,
     _compare_tool_arguments,
+    _compare_tool_call_sequence,
     _compare_tool_result,
+    compare_tool_calls,
+    evaluate_tool_calls,
 )
 
 
diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py
index 0ab50d40..29725c08 100644
--- a/tests/unit/core/metrics/test_geval.py
+++ b/tests/unit/core/metrics/test_geval.py
@@ -4,9 +4,9 @@
 from typing import Any
 
 import pytest
-from pytest_mock import MockerFixture
 from deepeval.metrics.g_eval import Rubric
 from deepeval.test_case import LLMTestCaseParams
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.metrics.geval import GEvalHandler
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
diff --git a/tests/unit/core/models/test_api_additional.py b/tests/unit/core/models/test_api_additional.py
index cde8a1b5..90b64fdc 100644
--- a/tests/unit/core/models/test_api_additional.py
+++ b/tests/unit/core/models/test_api_additional.py
@@ -4,10 +4,10 @@
 from pydantic import ValidationError
 
 from lightspeed_evaluation.core.models.api import (
-    RAGChunk,
-    AttachmentData,
     APIRequest,
     APIResponse,
+    AttachmentData,
+    RAGChunk,
 )
 
 
diff --git a/tests/unit/core/models/test_quality.py b/tests/unit/core/models/test_quality.py
index d87ee295..df01baf9 100644
--- a/tests/unit/core/models/test_quality.py
+++ b/tests/unit/core/models/test_quality.py
@@ -19,7 +19,6 @@ def test_quality_report_creation_happy_path(
         api_latency_summary: NumericStats,
     ) -> None:
         """Test QualityReport creation with valid metrics."""
-
         # Define quality score metrics (subset of all metrics)
         quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"]
 
diff --git a/tests/unit/core/models/test_summary.py b/tests/unit/core/models/test_summary.py
index c84a6f01..ffa9cd3f 100644
--- a/tests/unit/core/models/test_summary.py
+++ b/tests/unit/core/models/test_summary.py
@@ -4,19 +4,18 @@
 
 from pytest_mock import MockerFixture
 
-from lightspeed_evaluation.core.models.data import (
-    EvaluationData,
-    EvaluationResult,
-    TurnData,
-)
 from lightspeed_evaluation.core.models import (
     ConfidenceInterval,
     OverallStats,
     ScoreStatistics,
 )
+from lightspeed_evaluation.core.models.data import (
+    EvaluationData,
+    EvaluationResult,
+    TurnData,
+)
 from lightspeed_evaluation.core.models.summary import EvaluationSummary
 
-
 _RESULT_DEFAULTS: dict[str, Any] = {
     "conversation_group_id": "conv1",
     "tag": "eval",
diff --git a/tests/unit/core/models/test_system.py b/tests/unit/core/models/test_system.py
index 6e5bf344..95cf09eb 100644
--- a/tests/unit/core/models/test_system.py
+++ b/tests/unit/core/models/test_system.py
@@ -10,23 +10,19 @@
 
 from lightspeed_evaluation.core.models import (
     APIConfig,
+    CoreConfig,
     EmbeddingConfig,
     JudgePanelConfig,
     LLMConfig,
     LLMPoolConfig,
     SystemConfig,
     VisualizationConfig,
-    CoreConfig,
 )
 from lightspeed_evaluation.core.models.agents import (
     AgentDefaultConfig,
     AgentsConfig,
     HttpApiAgentConfig,
 )
-from lightspeed_evaluation.core.models.system import (
-    LoggingConfig,
-    QualityScoreConfig,
-)
 from lightspeed_evaluation.core.models.llm import (
     GEvalConfig,
     GEvalRubricConfig,
@@ -34,6 +30,10 @@
     LLMParametersConfig,
     LLMProviderConfig,
 )
+from lightspeed_evaluation.core.models.system import (
+    LoggingConfig,
+    QualityScoreConfig,
+)
 from lightspeed_evaluation.core.storage import FileBackendConfig
 from lightspeed_evaluation.core.system.exceptions import ConfigurationError
 
diff --git a/tests/unit/core/output/conftest.py b/tests/unit/core/output/conftest.py
index 46cba616..1732f1d0 100644
--- a/tests/unit/core/output/conftest.py
+++ b/tests/unit/core/output/conftest.py
@@ -2,6 +2,7 @@
 
 import pytest
 from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.models import EvaluationResult
 from lightspeed_evaluation.core.storage import FileBackendConfig
 
diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py
index dd769612..98a69d3e 100644
--- a/tests/unit/core/output/test_final_coverage.py
+++ b/tests/unit/core/output/test_final_coverage.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 
 from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.models import (
     EvaluationData,
     EvaluationResult,
@@ -14,11 +15,11 @@
 from lightspeed_evaluation.core.models.summary import EvaluationSummary
 from lightspeed_evaluation.core.output.generator import OutputHandler
 from lightspeed_evaluation.core.output.statistics import (
-    compute_overall_stats,
     compute_detailed_stats,
+    compute_overall_stats,
 )
-from lightspeed_evaluation.core.system.validator import DataValidator
 from lightspeed_evaluation.core.storage import FileBackendConfig
+from lightspeed_evaluation.core.system.validator import DataValidator
 
 
 class TestStatisticsEdgeCases:
diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py
index c8aa29a3..15ecde58 100644
--- a/tests/unit/core/output/test_generator.py
+++ b/tests/unit/core/output/test_generator.py
@@ -2,15 +2,15 @@
 
 """Unit tests for output generator."""
 
+import csv as csv_module
 import json
 from pathlib import Path
 
-import csv as csv_module
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.models import EvaluationResult
-from lightspeed_evaluation.core.models.summary import EvaluationSummary
 from lightspeed_evaluation.core.models.quality import QualityReport
+from lightspeed_evaluation.core.models.summary import EvaluationSummary
 from lightspeed_evaluation.core.output.generator import OutputHandler
 from lightspeed_evaluation.core.storage import FileBackendConfig
 
diff --git a/tests/unit/core/output/test_statistics.py b/tests/unit/core/output/test_statistics.py
index 0d32eef7..8a50b792 100644
--- a/tests/unit/core/output/test_statistics.py
+++ b/tests/unit/core/output/test_statistics.py
@@ -1,16 +1,16 @@
 """Unit tests for core statistics module."""
 
-import pytest
 import pandas as pd
+import pytest
 
 from lightspeed_evaluation.core.models.data import (
     EvaluationResult,
 )
 from lightspeed_evaluation.core.models.statistics import OverallStats
 from lightspeed_evaluation.core.output.statistics import (
-    compute_score_statistics,
     bootstrap_intervals,
     compute_overall_stats,
+    compute_score_statistics,
 )
 
 
diff --git a/tests/unit/core/output/test_statistics_api.py b/tests/unit/core/output/test_statistics_api.py
index 86a82dfe..ed47f25b 100644
--- a/tests/unit/core/output/test_statistics_api.py
+++ b/tests/unit/core/output/test_statistics_api.py
@@ -4,8 +4,8 @@
 
 from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult, TurnData
 from lightspeed_evaluation.core.output.statistics import (
-    compute_field_numeric_stats_from_evaluation_data,
     compute_agent_token_usage,
+    compute_field_numeric_stats_from_evaluation_data,
     compute_overall_stats,
 )
 
diff --git a/tests/unit/core/script/test_manager.py b/tests/unit/core/script/test_manager.py
index 33b72350..3e8e31ba 100644
--- a/tests/unit/core/script/test_manager.py
+++ b/tests/unit/core/script/test_manager.py
@@ -1,8 +1,8 @@
 """Unit tests for core script manager module."""
 
+import os
 import tempfile
 from pathlib import Path
-import os
 
 import pytest
 
diff --git a/tests/unit/core/script/test_manager_additional.py b/tests/unit/core/script/test_manager_additional.py
index 642e2e42..6088b891 100644
--- a/tests/unit/core/script/test_manager_additional.py
+++ b/tests/unit/core/script/test_manager_additional.py
@@ -1,8 +1,9 @@
 """Additional tests for script manager to increase coverage."""
 
-from pathlib import Path
-import subprocess
 import logging
+import subprocess
+from pathlib import Path
+
 import pytest
 from pytest_mock import MockFixture
 
@@ -98,7 +99,6 @@ def test_script_output_logging(
         self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture
     ) -> None:
         """Test that script output is logged."""
-
         caplog.set_level(logging.DEBUG)
 
         script = tmp_path / "test_script.sh"
@@ -123,7 +123,6 @@ def test_script_stderr_logging_on_failure(
         self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture
     ) -> None:
         """Test that stderr is logged as error on failure."""
-
         caplog.set_level(logging.ERROR)
 
         script = tmp_path / "test_script.sh"
@@ -147,7 +146,6 @@ def test_script_stderr_logging_on_success(
         self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture
     ) -> None:
         """Test that stderr is logged as debug on success."""
-
         caplog.set_level(logging.DEBUG)
 
         script = tmp_path / "test_script.sh"
diff --git a/tests/unit/core/storage/test_composite_and_factory.py b/tests/unit/core/storage/test_composite_and_factory.py
index 02dc959d..2bb80d3b 100644
--- a/tests/unit/core/storage/test_composite_and_factory.py
+++ b/tests/unit/core/storage/test_composite_and_factory.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from lightspeed_evaluation.core.models import LLMConfig, SystemConfig, EvaluationResult
+from lightspeed_evaluation.core.models import EvaluationResult, LLMConfig, SystemConfig
 from lightspeed_evaluation.core.storage import (
     BaseStorageBackend,
     CompositeStorageBackend,
diff --git a/tests/unit/core/storage/test_protocol.py b/tests/unit/core/storage/test_protocol.py
index cf6e59ea..f9f1919d 100644
--- a/tests/unit/core/storage/test_protocol.py
+++ b/tests/unit/core/storage/test_protocol.py
@@ -1,6 +1,6 @@
 """Unit tests for storage protocol and RunInfo."""
 
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 
 from lightspeed_evaluation.core.storage import RunInfo
 
@@ -22,9 +22,9 @@ def test_with_name(self) -> None:
 
     def test_sets_timestamp(self) -> None:
         """Test that RunInfo sets a timestamp."""
-        before = datetime.now(timezone.utc)
+        before = datetime.now(UTC)
         run_info = RunInfo()
-        after = datetime.now(timezone.utc)
+        after = datetime.now(UTC)
 
         assert before <= run_info.started_at <= after
 
diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py
index d79b698a..3f24adfa 100644
--- a/tests/unit/core/storage/test_sql_storage.py
+++ b/tests/unit/core/storage/test_sql_storage.py
@@ -4,7 +4,7 @@
 
 import os
 import tempfile
-from typing import Generator
+from collections.abc import Generator
 
 import pytest
 from pytest_mock import MockerFixture
diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py
index 86799bcd..a11152c5 100644
--- a/tests/unit/core/system/test_loader.py
+++ b/tests/unit/core/system/test_loader.py
@@ -6,10 +6,10 @@
 import pytest
 from pytest_mock import MockerFixture
 
-from lightspeed_evaluation.core.system.exceptions import ConfigurationError
-from lightspeed_evaluation.core.system.loader import ConfigLoader
 from lightspeed_evaluation.core.models import SystemConfig
 from lightspeed_evaluation.core.storage import get_file_config
+from lightspeed_evaluation.core.system.exceptions import ConfigurationError
+from lightspeed_evaluation.core.system.loader import ConfigLoader
 
 
 class TestConfigLoader:
diff --git a/tests/unit/core/system/test_setup.py b/tests/unit/core/system/test_setup.py
index d0f1fe25..47cecdae 100644
--- a/tests/unit/core/system/test_setup.py
+++ b/tests/unit/core/system/test_setup.py
@@ -4,8 +4,8 @@
 import os
 
 import pytest
-from pytest_mock import MockerFixture
 from _pytest.capture import CaptureFixture
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.models import LoggingConfig
 from lightspeed_evaluation.core.system.setup import (
diff --git a/tests/unit/core/system/test_ssl_certifi.py b/tests/unit/core/system/test_ssl_certifi.py
index d92bf73c..67ee1347 100644
--- a/tests/unit/core/system/test_ssl_certifi.py
+++ b/tests/unit/core/system/test_ssl_certifi.py
@@ -5,10 +5,10 @@
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.system.ssl_certifi import (
+    _get_unique_ssl_cert_paths,
     create_ssl_certifi_bundle,
     get_ssl_cert_files_paths_from_system_yaml,
     get_system_ssl_cert_file,
-    _get_unique_ssl_cert_paths,
 )
 
 
diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py
index d98e3573..27afb075 100644
--- a/tests/unit/core/system/test_validator.py
+++ b/tests/unit/core/system/test_validator.py
@@ -6,9 +6,8 @@
 from pathlib import Path
 
 import pytest
-from pytest_mock import MockerFixture
-
 from pydantic import ValidationError
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.models import EvaluationData, SystemConfig, TurnData
 from lightspeed_evaluation.core.system.exceptions import DataValidationError
diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py
index 6c167d05..b4f192d6 100644
--- a/tests/unit/pipeline/evaluation/conftest.py
+++ b/tests/unit/pipeline/evaluation/conftest.py
@@ -7,23 +7,24 @@
 import pytest
 from pytest_mock import MockerFixture
 
+from lightspeed_evaluation.core.metrics.manager import MetricManager
 from lightspeed_evaluation.core.models import (
     EvaluationData,
+    EvaluationRequest,
+    EvaluationResult,
     SystemConfig,
     TurnData,
 )
 from lightspeed_evaluation.core.models.agents import AgentsConfig
+from lightspeed_evaluation.core.script import ScriptExecutionManager
 from lightspeed_evaluation.core.storage import FileBackendConfig
 from lightspeed_evaluation.core.system.loader import ConfigLoader
-from lightspeed_evaluation.core.metrics.manager import MetricManager
-from lightspeed_evaluation.core.script import ScriptExecutionManager
-from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRequest
 from lightspeed_evaluation.pipeline.evaluation.driver import AgentDriver
 from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler
 from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
 from lightspeed_evaluation.pipeline.evaluation.processor import (
-    ProcessorComponents,
     ConversationProcessor,
+    ProcessorComponents,
 )
 
 
diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py
index 82b63555..4c07d0e0 100644
--- a/tests/unit/pipeline/evaluation/test_evaluator.py
+++ b/tests/unit/pipeline/evaluation/test_evaluator.py
@@ -8,6 +8,7 @@
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
+from lightspeed_evaluation.core.metrics.manager import MetricManager
 from lightspeed_evaluation.core.models import (
     EvaluationData,
     EvaluationRequest,
@@ -15,10 +16,9 @@
     MetricResult,
     TurnData,
 )
-from lightspeed_evaluation.core.system.loader import ConfigLoader
-from lightspeed_evaluation.core.system.exceptions import EvaluationError
-from lightspeed_evaluation.core.metrics.manager import MetricManager
 from lightspeed_evaluation.core.script import ScriptExecutionManager
+from lightspeed_evaluation.core.system.exceptions import EvaluationError
+from lightspeed_evaluation.core.system.loader import ConfigLoader
 from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
 
 
diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py
index c4a18688..30946930 100644
--- a/tests/unit/pipeline/evaluation/test_processor.py
+++ b/tests/unit/pipeline/evaluation/test_processor.py
@@ -3,7 +3,7 @@
 """Unit tests for ConversationProcessor."""
 
 import logging
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 from _pytest.logging import LogCaptureFixture
@@ -108,7 +108,6 @@ def test_process_conversation_conversation_metrics(
         mocker: MockerFixture,
     ) -> None:
         """Test processing with conversation-level metrics."""
-
         turn1 = TurnData(turn_id="turn1", query="Q", response="R")
         conv_data = EvaluationData(
             conversation_group_id="conv1",
@@ -154,7 +153,6 @@ def test_process_conversation_with_setup_script_success(
         mocker: MockerFixture,
     ) -> None:
         """Test processing with successful setup script."""
-
         sample_conv_data.setup_script = "setup.sh"
         mock_agent_driver.enabled = True
         mock_agent_driver.execute_turn.return_value = (None, "conv_123")
@@ -221,7 +219,6 @@ def test_process_conversation_with_cleanup_script(
         mocker: MockerFixture,
     ) -> None:
         """Test cleanup script is always called."""
-
         sample_conv_data.cleanup_script = "cleanup.sh"
         mock_agent_driver.enabled = True
         mock_agent_driver.execute_turn.return_value = (None, "conv_123")
@@ -267,7 +264,6 @@ def test_process_conversation_with_agent_execution(
         mocker: MockerFixture,
     ) -> None:
         """Test agent execution during turn processing."""
-
         mock_agent_driver.enabled = True
         mock_agent_driver.execute_turn.return_value = (None, "conv_123")
 
@@ -346,7 +342,6 @@ def test_evaluate_turn(
         mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_turn method."""
-
         mock_result = EvaluationResult(
             conversation_group_id="conv1",
             turn_id="turn1",
@@ -376,7 +371,6 @@ def test_evaluate_conversation(
         mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_conversation method."""
-
         mock_result = EvaluationResult(
             conversation_group_id="conv1",
             turn_id=None,
@@ -499,7 +493,6 @@ def test_evaluate_turn_with_invalid_metric(
         caplog: LogCaptureFixture,
     ) -> None:
         """Test _evaluate_turn with an invalid metric - creates ERROR result and logs error."""
-
         turn_data = TurnData(
             turn_id="1",
             query="What is Python?",
@@ -541,7 +534,6 @@ def test_evaluate_turn_with_all_invalid_metrics(
         caplog: LogCaptureFixture,
     ) -> None:
         """Test _evaluate_turn with all metrics invalid - returns ERROR results."""
-
         turn_data = TurnData(
             turn_id="1",
             query="What is Python?",
@@ -579,7 +571,6 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics(
         caplog: LogCaptureFixture,
     ) -> None:
         """Test _evaluate_turn with mix of valid and invalid metrics."""
-
         turn_data = TurnData(
             turn_id="1",
             query="What is Python?",
diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py
index baf756a8..31fa6821 100644
--- a/tests/unit/runner/test_evaluation.py
+++ b/tests/unit/runner/test_evaluation.py
@@ -9,17 +9,17 @@
 import pytest
 from pytest_mock import MockerFixture
 
-from lightspeed_evaluation.core.models.statistics import OverallStats
-from lightspeed_evaluation.core.models.system import (
-    APIConfig,
-    SystemConfig,
-)
 from lightspeed_evaluation.core.models.llm import (
-    LLMDefaultsConfig,
-    LLMProviderConfig,
     EmbeddingConfig,
     LLMConfig,
+    LLMDefaultsConfig,
     LLMPoolConfig,
+    LLMProviderConfig,
+)
+from lightspeed_evaluation.core.models.statistics import OverallStats
+from lightspeed_evaluation.core.models.system import (
+    APIConfig,
+    SystemConfig,
 )
 from lightspeed_evaluation.core.system.exceptions import (
     DataValidationError,
diff --git a/uv.lock b/uv.lock
index f0a56647..13974ac6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1628,6 +1628,7 @@ dev = [
     { name = "pytest-mock" },
     { name = "pytest-timeout" },
     { name = "ruff" },
+    { name = "types-pyyaml" },
 ]
 
 [package.metadata]
@@ -1672,6 +1673,7 @@ dev = [
     { name = "pytest-mock", specifier = "==3.15.1" },
     { name = "pytest-timeout", specifier = "==2.4.0" },
     { name = "ruff", specifier = ">=0.9.0,<=0.12.11" },
+    { name = "types-pyyaml", specifier = ">=6.0.0" },
 ]
 
 [[package]]
@@ -3913,6 +3915,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d5/91/9b286ab899c008c2cb05e8be99814807e7fbbd33f0c0c960470826e5ac82/typer-0.23.1-py3-none-any.whl", hash = "sha256:3291ad0d3c701cbf522012faccfbb29352ff16ad262db2139e6b01f15781f14e", size = 56813, upload-time = "2026-02-13T10:04:32.008Z" },
 ]
 
+[[package]]
+name = "types-pyyaml"
+version = "6.0.12.20260518"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b8/83/4a1afc3fbfcf5b8d46fc390cd95ed6b0dc9010a265f4e9f46314efffa37a/types_pyyaml-6.0.12.20260518.tar.gz", hash = "sha256:d917f83fb38462550338c1297faedd860b3ec83912b96b1e3d73255f7473e466", size = 17850, upload-time = "2026-05-18T06:01:58.675Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/a2/c01db32be2ae7d6a1689972f3c492b149ee4e164b12fdfd9f64b50888215/types_pyyaml-6.0.12.20260518-py3-none-any.whl", hash = "sha256:d2150f75a231c9fe9c7463bd29487d93e60bac90400287351384bc2284eba7cd", size = 20312, upload-time = "2026-05-18T06:01:57.368Z" },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"