lightspeed-core · xmican10 · May 25, 2026
diff --git a/Makefile b/Makefile
@@ -23,10 +23,10 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
 	uv sync --group dev
 
 check-types: ## Checks type hints in sources
-	uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
+	uv run mypy src/ lsc_agent_eval/src/ tests
 
 black-check:
-	uv run black src tests script lsc_agent_eval --check
+	uv run black --check src tests script lsc_agent_eval
 
 black-format:
 	uv run black src tests script lsc_agent_eval
@@ -115,7 +115,7 @@ shellcheck: ## Run shellcheck
 
 pylint:
 	uv run pylint src
-	uv run pylint --disable=R0801 lsc_agent_eval/src tests
+	uv run pylint lsc_agent_eval/src tests
 
 pyright:
 	uv run pyright src lsc_agent_eval/src tests
@@ -127,4 +127,4 @@ ruff:
 	uv run ruff check src tests script lsc_agent_eval
 
 bandit: ## Security scanning with Bandit
-	uv run bandit -r src/lightspeed_evaluation -ll
+	uv run bandit -c pyproject.toml -r src/lightspeed_evaluation -ll
diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py
@@ -6,7 +6,7 @@
 from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError
 from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT, INTENT_DETECTION_PROMPT
 from .tool_call_eval import compare_tool_calls
-from .utils import create_evaluation_results
+from .utils import EvalResultItem, create_evaluation_results
 
 if TYPE_CHECKING:
     from ..utils.api_client import AgentHttpClient
@@ -42,12 +42,13 @@ def run_evaluation(  # pylint: disable=too-many-arguments,too-many-positional-ar
         """Run multiple evaluations based on configuration."""
         try:
             # Query the agent once
-            api_input = {
+            api_input: dict[str, str] = {
                 "query": data_config.eval_query,
                 "provider": agent_provider,
                 "model": agent_model,
-                "conversation_id": conversation_id,
             }
+            if conversation_id is not None:
+                api_input["conversation_id"] = conversation_id
 
             if endpoint_type == "streaming":
                 agent_response = self.agent_client.streaming_query_agent(api_input)
@@ -61,7 +62,7 @@ def run_evaluation(  # pylint: disable=too-many-arguments,too-many-positional-ar
             tool_calls = agent_response.get("tool_calls", [])
 
             # Run all evaluations
-            evaluation_results = []
+            evaluation_results: list[EvalResultItem] = []
             for eval_type in data_config.eval_types:
                 try:
                     success = self._evaluate_single_type(

diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py
@@ -2,7 +2,7 @@
 
 import json
 import logging
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 
 import pandas as pd
@@ -32,7 +32,7 @@ def save_results(self, result_dir: str) -> None:
             output_dir = Path(result_dir)
             output_dir.mkdir(parents=True, exist_ok=True)
 
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
             csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv"
             json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json"
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -65,6 +65,7 @@ dev = [
     "pytest-cov>=6.0.0,<=6.2.1",
     "pytest-mock==3.15.1",
     "pytest-timeout==2.4.0",
+    "types-PyYAML>=6.0.0",
 ]
 
 [project.scripts]
@@ -84,22 +85,32 @@ line-length = 88
 convention = "google"
 
 [tool.mypy]
-disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"]
-ignore_missing_imports = true
 plugins = ["pydantic.mypy"]
+explicit_package_bases = true
+disallow_untyped_calls = true
+disallow_untyped_defs = true
+disallow_incomplete_defs = true
+ignore_missing_imports = true
 
 [tool.pydantic-mypy]
 init_forbid_extra = true
 init_typed = true
 warn_required_dynamic_aliases = true
 
 [tool.pylint.MASTER]
+source-roots = ["src", "script", "tests"]
 load-plugins = ["pylint_pydantic"]
 init-hook = "import sys; sys.path.append('.')"
+[tool.pylint."MESSAGES CONTROL"]
+disable = ["R0801"]
+
+[tool.pyright]
+extraPaths = ["./src"]
 
 [tool.ruff]
+line-length = 88
 [tool.ruff.lint]
-extend-select = ["TID251"]
+extend-select = ["TID251", "UP006", "UP007", "UP010", "UP017", "UP035", "RUF100", "B009", "B010", "DTZ005", "D202", "I001", "PLR1733"]
 [tool.ruff.lint.flake8-tidy-imports.banned-api]
 unittest = { msg = "use pytest instead of unittest" }
 "unittest.mock" = { msg = "use pytest-mock instead of unittest.mock" }

diff --git a/script/compare_evaluations.py b/script/compare_evaluations.py
@@ -11,7 +11,7 @@
 import sys
 import traceback
 from pathlib import Path
-from typing import Any, Optional, Tuple, Union, cast
+from typing import Any, Optional, cast
 
 import numpy as np
 from scipy.stats import chi2_contingency, fisher_exact, mannwhitneyu, ttest_ind
@@ -32,7 +32,7 @@ def __init__(self, alpha: float = 0.05):
         self.logger = logging.getLogger(__name__)
 
     def compare_evaluations(
-        self, summary1_path: Union[str, Path], summary2_path: Union[str, Path]
+        self, summary1_path: str | Path, summary2_path: str | Path
     ) -> dict[str, Any]:
         """Compare two evaluation summary files and return statistical significance results.
 
@@ -92,7 +92,7 @@ def compare_evaluations(
 
         return comparison_results
 
-    def _load_summary(self, path: Union[str, Path]) -> dict[str, Any]:
+    def _load_summary(self, path: str | Path) -> dict[str, Any]:
         """Load evaluation summary from JSON file."""
         path = Path(path)
         if not path.exists():
@@ -313,9 +313,9 @@ def _compare_single_metric(
 
         # Determine overall statistical significance
         comparison["statistical_significance"] = self._determine_overall_significance(
-            comparison["score_comparison"],
-            comparison["pass_rate_comparison"],
-            comparison["confidence_interval_test"],
+            cast(Optional[dict[str, Any]], comparison["score_comparison"]),
+            cast(Optional[dict[str, Any]], comparison["pass_rate_comparison"]),
+            cast(Optional[dict[str, Any]], comparison["confidence_interval_test"]),
         )
 
         return comparison
@@ -381,7 +381,7 @@ def _compare_score_distributions(
             # T-test (assumes normal distribution)
             if len(scores1_array) > 1 and len(scores2_array) > 1:
                 ttest_result = ttest_ind(scores1_array, scores2_array)
-                t_stat, t_pvalue = cast(Tuple[float, float], ttest_result)
+                t_stat, t_pvalue = cast(tuple[float, float], ttest_result)
                 comparison["tests"]["t_test"] = {
                     "statistic": t_stat,
                     "p_value": t_pvalue,
@@ -394,7 +394,7 @@ def _compare_score_distributions(
                 mw_result = mannwhitneyu(
                     scores1_array, scores2_array, alternative="two-sided"
                 )
-                u_stat, u_pvalue = cast(Tuple[float, float], mw_result)
+                u_stat, u_pvalue = cast(tuple[float, float], mw_result)
                 comparison["tests"]["mann_whitney_u"] = {
                     "statistic": u_stat,
                     "p_value": u_pvalue,
@@ -567,7 +567,7 @@ def _perform_chi_square_test(
         try:
             chi2_result = chi2_contingency(contingency_table)
             chi2_stat, chi2_pvalue, dof, _ = cast(
-                Tuple[float, float, int, Any], chi2_result
+                tuple[float, float, int, Any], chi2_result
             )
             comparison["tests"]["chi_square"] = {
                 "statistic": float(chi2_stat),
@@ -602,7 +602,7 @@ def _perform_fisher_exact_test(
             )
 
             fisher_result = fisher_exact(contingency_table)
-            odds_ratio, fisher_pvalue = cast(Tuple[float, float], fisher_result)
+            odds_ratio, fisher_pvalue = cast(tuple[float, float], fisher_result)
             comparison["tests"]["fisher_exact"] = {
                 "odds_ratio": float(odds_ratio),
                 "p_value": float(fisher_pvalue),

diff --git a/script/run_multi_provider_eval.py b/script/run_multi_provider_eval.py
@@ -13,21 +13,23 @@
 import argparse
 import copy
 import json
-import re
 import logging
 import multiprocessing
 import os
+import re
 import sys
 import tempfile
 import traceback
 from concurrent.futures import ProcessPoolExecutor, as_completed
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any, Optional
-from lightspeed_evaluation.runner.evaluation import run_evaluation
+
 import numpy as np
 import yaml
 
+from lightspeed_evaluation.runner.evaluation import run_evaluation
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -70,7 +72,7 @@ def _run_evaluation_worker(
     )
     worker_logger = logging.getLogger(__name__)
 
-    start_time = datetime.now()
+    start_time = datetime.now(UTC)
     temp_config_path: Optional[Path] = None
 
     # Sanitize names for filesystem
@@ -166,7 +168,7 @@ def _run_evaluation_worker(
                 )
 
     # Record end time and duration
-    end_time = datetime.now()
+    end_time = datetime.now(UTC)
     result["end_time"] = end_time.isoformat()
     result["duration_seconds"] = (end_time - start_time).total_seconds()
 
@@ -436,7 +438,7 @@ def _run_single_evaluation(
         Returns:
             Dictionary containing evaluation results and metadata
         """
-        start_time = datetime.now()
+        start_time = datetime.now(UTC)
         temp_config_path: Optional[Path] = None
 
         # Sanitize names for filesystem and enforce confinement under output_base
@@ -508,7 +510,7 @@ def _run_single_evaluation(
                     logger.warning(f"Failed to delete temp config: {temp_config_path}")
 
         # Record end time and duration
-        end_time = datetime.now()
+        end_time = datetime.now(UTC)
         result["end_time"] = end_time.isoformat()
         result["duration_seconds"] = (end_time - start_time).total_seconds()
 
@@ -611,8 +613,8 @@ def _run_parallel_evaluations(self, configs: list[dict[str, Any]]) -> None:
                         "provider_id": config["provider_id"],
                         "model": config["model"],
                         "output_dir": "",
-                        "start_time": datetime.now().isoformat(),
-                        "end_time": datetime.now().isoformat(),
+                        "start_time": datetime.now(UTC).isoformat(),
+                        "end_time": datetime.now(UTC).isoformat(),
                         "duration_seconds": 0,
                         "success": False,
                         "error": f"Worker process failed: {str(e)}",
@@ -634,7 +636,7 @@ def generate_summary(self) -> dict[str, Any]:
         failed = total - successful
 
         summary = {
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(UTC).isoformat(),
             "total_evaluations": total,
             "successful": successful,
             "failed": failed,
@@ -1156,7 +1158,7 @@ def save_model_comparison(self) -> Path:
         analysis_data = {
             "total_models": len(self.model_stats),
             "output_base": str(self.output_base),
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(UTC).isoformat(),
             "rankings": [
                 {
                     "rank": rank,

diff --git a/src/generate_answers/generate_answers.py b/src/generate_answers/generate_answers.py
@@ -4,10 +4,11 @@
 import logging
 import os
 import sys
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
-from typing import Callable, cast
+from typing import cast
 
 import click
 import pandas as pd

diff --git a/src/lightspeed_evaluation/core/api/client.py b/src/lightspeed_evaluation/core/api/client.py
@@ -4,7 +4,7 @@
 import json
 import logging
 import os
-from typing import Any, Optional, Union, cast
+from typing import Any, Optional, cast
 
 import httpx
 from diskcache import Cache
@@ -52,7 +52,7 @@ class APIClient:
 
     def __init__(
         self,
-        config: Union[APIConfig, HttpApiAgentConfig],
+        config: APIConfig | HttpApiAgentConfig,
     ):
         """Initialize the client with configuration."""
         self.config = config

diff --git a/src/lightspeed_evaluation/core/llm/__init__.py b/src/lightspeed_evaluation/core/llm/__init__.py
@@ -3,17 +3,16 @@
 from typing import TYPE_CHECKING
 
 # Apply litellm patching globally before any litellm usage in this package
-import lightspeed_evaluation.core.llm.litellm_patch  # noqa: F401
-
+import lightspeed_evaluation.core.llm.litellm_patch
 from lightspeed_evaluation.core.system.lazy_import import create_lazy_getattr
 
 if TYPE_CHECKING:
     # ruff: noqa: F401
     from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
-    from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
     from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
     from lightspeed_evaluation.core.llm.manager import LLMManager
     from lightspeed_evaluation.core.llm.ragas import RagasLLMManager
+    from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
     from lightspeed_evaluation.core.models import LLMConfig
     from lightspeed_evaluation.core.system.env_validator import validate_provider_env
     from lightspeed_evaluation.core.system.exceptions import LLMError

diff --git a/src/lightspeed_evaluation/core/llm/custom.py b/src/lightspeed_evaluation/core/llm/custom.py
@@ -1,7 +1,7 @@
 """Base Custom LLM class for evaluation framework."""
 
 import logging
-from typing import Any, Union
+from typing import Any
 
 import litellm
 from litellm.exceptions import InternalServerError
@@ -37,7 +37,7 @@ def call(
         n: int = 1,
         return_single: bool = True,
         **kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> str | list[str]:
         """Make LLM call and return response(s).
 
         Args:

diff --git a/src/lightspeed_evaluation/core/llm/litellm_patch.py b/src/lightspeed_evaluation/core/llm/litellm_patch.py
@@ -34,8 +34,8 @@
 
 # pylint: disable=wrong-import-position
 from lightspeed_evaluation.core.llm.token_tracker import (  # noqa: E402
-    track_judge_tokens,
     track_embedding_tokens,
+    track_judge_tokens,
 )
 
 logger = logging.getLogger(__name__)

diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -5,17 +5,16 @@
 
 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.manager import LLMManager
+from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
 )
-from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
 from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
+from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
 from lightspeed_evaluation.core.system.exceptions import LLMError
 
-from lightspeed_evaluation.core.metrics.manager import MetricLevel
-
 if TYPE_CHECKING:
     from lightspeed_evaluation.core.metrics.manager import MetricManager