Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
uv sync --group dev

check-types: ## Checks type hints in sources
uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
uv run mypy src/ lsc_agent_eval/src/ tests

black-check:
uv run black src tests script lsc_agent_eval --check
uv run black --check src tests script lsc_agent_eval

black-format:
uv run black src tests script lsc_agent_eval
Expand Down Expand Up @@ -115,7 +115,7 @@ shellcheck: ## Run shellcheck

pylint:
uv run pylint src
uv run pylint --disable=R0801 lsc_agent_eval/src tests
uv run pylint lsc_agent_eval/src tests

pyright:
uv run pyright src lsc_agent_eval/src tests
Expand All @@ -127,4 +127,4 @@ ruff:
uv run ruff check src tests script lsc_agent_eval

bandit: ## Security scanning with Bandit
uv run bandit -r src/lightspeed_evaluation -ll
uv run bandit -c pyproject.toml -r src/lightspeed_evaluation -ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError
from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT, INTENT_DETECTION_PROMPT
from .tool_call_eval import compare_tool_calls
from .utils import create_evaluation_results
from .utils import EvalResultItem, create_evaluation_results

if TYPE_CHECKING:
from ..utils.api_client import AgentHttpClient
Expand Down Expand Up @@ -42,12 +42,13 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar
"""Run multiple evaluations based on configuration."""
try:
# Query the agent once
api_input = {
api_input: dict[str, str] = {
"query": data_config.eval_query,
"provider": agent_provider,
"model": agent_model,
"conversation_id": conversation_id,
}
if conversation_id is not None:
api_input["conversation_id"] = conversation_id

if endpoint_type == "streaming":
agent_response = self.agent_client.streaming_query_agent(api_input)
Expand All @@ -61,7 +62,7 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar
tool_calls = agent_response.get("tool_calls", [])

# Run all evaluations
evaluation_results = []
evaluation_results: list[EvalResultItem] = []
for eval_type in data_config.eval_types:
try:
success = self._evaluate_single_type(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import logging
from datetime import datetime
from datetime import UTC, datetime
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -32,7 +32,7 @@ def save_results(self, result_dir: str) -> None:
output_dir = Path(result_dir)
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv"
json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json"

Expand Down
17 changes: 14 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ dev = [
"pytest-cov>=6.0.0,<=6.2.1",
"pytest-mock==3.15.1",
"pytest-timeout==2.4.0",
"types-PyYAML>=6.0.0",
]

[project.scripts]
Expand All @@ -84,22 +85,32 @@ line-length = 88
convention = "google"

[tool.mypy]
disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"]
ignore_missing_imports = true
plugins = ["pydantic.mypy"]
explicit_package_bases = true
disallow_untyped_calls = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
ignore_missing_imports = true

[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true

[tool.pylint.MASTER]
source-roots = ["src", "script", "tests"]
load-plugins = ["pylint_pydantic"]
init-hook = "import sys; sys.path.append('.')"
[tool.pylint."MESSAGES CONTROL"]
disable = ["R0801"]

[tool.pyright]
extraPaths = ["./src"]

[tool.ruff]
line-length = 88
[tool.ruff.lint]
extend-select = ["TID251"]
extend-select = ["TID251", "UP006", "UP007", "UP010", "UP017", "UP035", "RUF100", "B009", "B010", "DTZ005", "D202", "I001", "PLR1733"]
[tool.ruff.lint.flake8-tidy-imports.banned-api]
unittest = { msg = "use pytest instead of unittest" }
"unittest.mock" = { msg = "use pytest-mock instead of unittest.mock" }
Expand Down
20 changes: 10 additions & 10 deletions script/compare_evaluations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sys
import traceback
from pathlib import Path
from typing import Any, Optional, Tuple, Union, cast
from typing import Any, Optional, cast

import numpy as np
from scipy.stats import chi2_contingency, fisher_exact, mannwhitneyu, ttest_ind
Expand All @@ -32,7 +32,7 @@ def __init__(self, alpha: float = 0.05):
self.logger = logging.getLogger(__name__)

def compare_evaluations(
self, summary1_path: Union[str, Path], summary2_path: Union[str, Path]
self, summary1_path: str | Path, summary2_path: str | Path
) -> dict[str, Any]:
"""Compare two evaluation summary files and return statistical significance results.

Expand Down Expand Up @@ -92,7 +92,7 @@ def compare_evaluations(

return comparison_results

def _load_summary(self, path: Union[str, Path]) -> dict[str, Any]:
def _load_summary(self, path: str | Path) -> dict[str, Any]:
"""Load evaluation summary from JSON file."""
path = Path(path)
if not path.exists():
Expand Down Expand Up @@ -313,9 +313,9 @@ def _compare_single_metric(

# Determine overall statistical significance
comparison["statistical_significance"] = self._determine_overall_significance(
comparison["score_comparison"],
comparison["pass_rate_comparison"],
comparison["confidence_interval_test"],
cast(Optional[dict[str, Any]], comparison["score_comparison"]),
cast(Optional[dict[str, Any]], comparison["pass_rate_comparison"]),
cast(Optional[dict[str, Any]], comparison["confidence_interval_test"]),
)

return comparison
Expand Down Expand Up @@ -381,7 +381,7 @@ def _compare_score_distributions(
# T-test (assumes normal distribution)
if len(scores1_array) > 1 and len(scores2_array) > 1:
ttest_result = ttest_ind(scores1_array, scores2_array)
t_stat, t_pvalue = cast(Tuple[float, float], ttest_result)
t_stat, t_pvalue = cast(tuple[float, float], ttest_result)
comparison["tests"]["t_test"] = {
"statistic": t_stat,
"p_value": t_pvalue,
Expand All @@ -394,7 +394,7 @@ def _compare_score_distributions(
mw_result = mannwhitneyu(
scores1_array, scores2_array, alternative="two-sided"
)
u_stat, u_pvalue = cast(Tuple[float, float], mw_result)
u_stat, u_pvalue = cast(tuple[float, float], mw_result)
comparison["tests"]["mann_whitney_u"] = {
"statistic": u_stat,
"p_value": u_pvalue,
Expand Down Expand Up @@ -567,7 +567,7 @@ def _perform_chi_square_test(
try:
chi2_result = chi2_contingency(contingency_table)
chi2_stat, chi2_pvalue, dof, _ = cast(
Tuple[float, float, int, Any], chi2_result
tuple[float, float, int, Any], chi2_result
)
comparison["tests"]["chi_square"] = {
"statistic": float(chi2_stat),
Expand Down Expand Up @@ -602,7 +602,7 @@ def _perform_fisher_exact_test(
)

fisher_result = fisher_exact(contingency_table)
odds_ratio, fisher_pvalue = cast(Tuple[float, float], fisher_result)
odds_ratio, fisher_pvalue = cast(tuple[float, float], fisher_result)
comparison["tests"]["fisher_exact"] = {
"odds_ratio": float(odds_ratio),
"p_value": float(fisher_pvalue),
Expand Down
24 changes: 13 additions & 11 deletions script/run_multi_provider_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,23 @@
import argparse
import copy
import json
import re
import logging
import multiprocessing
import os
import re
import sys
import tempfile
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Optional
from lightspeed_evaluation.runner.evaluation import run_evaluation

import numpy as np
import yaml

from lightspeed_evaluation.runner.evaluation import run_evaluation

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down Expand Up @@ -70,7 +72,7 @@ def _run_evaluation_worker(
)
worker_logger = logging.getLogger(__name__)

start_time = datetime.now()
start_time = datetime.now(UTC)
temp_config_path: Optional[Path] = None

# Sanitize names for filesystem
Expand Down Expand Up @@ -166,7 +168,7 @@ def _run_evaluation_worker(
)

# Record end time and duration
end_time = datetime.now()
end_time = datetime.now(UTC)
result["end_time"] = end_time.isoformat()
result["duration_seconds"] = (end_time - start_time).total_seconds()

Expand Down Expand Up @@ -436,7 +438,7 @@ def _run_single_evaluation(
Returns:
Dictionary containing evaluation results and metadata
"""
start_time = datetime.now()
start_time = datetime.now(UTC)
temp_config_path: Optional[Path] = None

# Sanitize names for filesystem and enforce confinement under output_base
Expand Down Expand Up @@ -508,7 +510,7 @@ def _run_single_evaluation(
logger.warning(f"Failed to delete temp config: {temp_config_path}")

# Record end time and duration
end_time = datetime.now()
end_time = datetime.now(UTC)
result["end_time"] = end_time.isoformat()
result["duration_seconds"] = (end_time - start_time).total_seconds()

Expand Down Expand Up @@ -611,8 +613,8 @@ def _run_parallel_evaluations(self, configs: list[dict[str, Any]]) -> None:
"provider_id": config["provider_id"],
"model": config["model"],
"output_dir": "",
"start_time": datetime.now().isoformat(),
"end_time": datetime.now().isoformat(),
"start_time": datetime.now(UTC).isoformat(),
"end_time": datetime.now(UTC).isoformat(),
"duration_seconds": 0,
"success": False,
"error": f"Worker process failed: {str(e)}",
Expand All @@ -634,7 +636,7 @@ def generate_summary(self) -> dict[str, Any]:
failed = total - successful

summary = {
"timestamp": datetime.now().isoformat(),
"timestamp": datetime.now(UTC).isoformat(),
"total_evaluations": total,
"successful": successful,
"failed": failed,
Expand Down Expand Up @@ -1156,7 +1158,7 @@ def save_model_comparison(self) -> Path:
analysis_data = {
"total_models": len(self.model_stats),
"output_base": str(self.output_base),
"timestamp": datetime.now().isoformat(),
"timestamp": datetime.now(UTC).isoformat(),
"rankings": [
{
"rank": rank,
Expand Down
3 changes: 2 additions & 1 deletion src/generate_answers/generate_answers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import logging
import os
import sys
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Callable, cast
from typing import cast

import click
import pandas as pd
Expand Down
4 changes: 2 additions & 2 deletions src/lightspeed_evaluation/core/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import logging
import os
from typing import Any, Optional, Union, cast
from typing import Any, Optional, cast

import httpx
from diskcache import Cache
Expand Down Expand Up @@ -52,7 +52,7 @@ class APIClient:

def __init__(
self,
config: Union[APIConfig, HttpApiAgentConfig],
config: APIConfig | HttpApiAgentConfig,
):
"""Initialize the client with configuration."""
self.config = config
Expand Down
5 changes: 2 additions & 3 deletions src/lightspeed_evaluation/core/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,16 @@
from typing import TYPE_CHECKING

# Apply litellm patching globally before any litellm usage in this package
import lightspeed_evaluation.core.llm.litellm_patch # noqa: F401

import lightspeed_evaluation.core.llm.litellm_patch
from lightspeed_evaluation.core.system.lazy_import import create_lazy_getattr

if TYPE_CHECKING:
# ruff: noqa: F401
from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.llm.ragas import RagasLLMManager
from lightspeed_evaluation.core.llm.token_tracker import TokenTracker
from lightspeed_evaluation.core.models import LLMConfig
from lightspeed_evaluation.core.system.env_validator import validate_provider_env
from lightspeed_evaluation.core.system.exceptions import LLMError
Expand Down
4 changes: 2 additions & 2 deletions src/lightspeed_evaluation/core/llm/custom.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Base Custom LLM class for evaluation framework."""

import logging
from typing import Any, Union
from typing import Any

import litellm
from litellm.exceptions import InternalServerError
Expand Down Expand Up @@ -37,7 +37,7 @@ def call(
n: int = 1,
return_single: bool = True,
**kwargs: Any,
) -> Union[str, list[str]]:
) -> str | list[str]:
"""Make LLM call and return response(s).

Args:
Expand Down
2 changes: 1 addition & 1 deletion src/lightspeed_evaluation/core/llm/litellm_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@

# pylint: disable=wrong-import-position
from lightspeed_evaluation.core.llm.token_tracker import ( # noqa: E402
track_judge_tokens,
track_embedding_tokens,
track_judge_tokens,
)

logger = logging.getLogger(__name__)
Expand Down
5 changes: 2 additions & 3 deletions src/lightspeed_evaluation/core/metrics/custom/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,16 @@

from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.metrics.custom.prompts import (
ANSWER_CORRECTNESS_PROMPT,
INTENT_EVALUATION_PROMPT,
)
from lightspeed_evaluation.core.metrics.custom.keywords_eval import evaluate_keywords
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls
from lightspeed_evaluation.core.metrics.manager import MetricLevel
from lightspeed_evaluation.core.models import EvaluationScope, TurnData
from lightspeed_evaluation.core.system.exceptions import LLMError

from lightspeed_evaluation.core.metrics.manager import MetricLevel

if TYPE_CHECKING:
from lightspeed_evaluation.core.metrics.manager import MetricManager

Expand Down
Loading
Loading