Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions livekit-agents/livekit/agents/evals/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ def task_completion_judge(llm: LLM | None = None) -> _TaskCompletionJudge:

Based on First Call Resolution (FCR), the key metric in call centers.
Useful for: customer service, appointment booking, order management.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _TaskCompletionJudge(llm=llm)

Expand All @@ -373,6 +377,10 @@ def handoff_judge(llm: LLM | None = None) -> _HandoffJudge:
Automatically passes if no handoffs occurred.

Useful for: multi-agent systems, transfers to specialists, escalations.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _HandoffJudge(llm=llm)

Expand All @@ -384,6 +392,10 @@ def accuracy_judge(llm: LLM | None = None) -> _LLMJudge:
Catches hallucinations, misquoted data, and contradictions with tool results.

Useful for: healthcare, insurance, finance - where wrong information has consequences.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand All @@ -404,6 +416,10 @@ def tool_use_judge(llm: LLM | None = None) -> _LLMJudge:
Voice agents rely on function calls for lookups, bookings, transfers, etc.

Useful for: any agent with tools - appointment systems, order lookups, CRM integrations.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand All @@ -428,6 +444,10 @@ def safety_judge(llm: LLM | None = None) -> _LLMJudge:
and toxic or harmful language.

Useful for: regulated industries, user-facing agents where compliance and tone matter.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand All @@ -449,6 +469,10 @@ def relevancy_judge(llm: LLM | None = None) -> _LLMJudge:
on the topic, and appropriately redirects off-topic requests.

Useful for: any conversational agent, scoped agents, customer service.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand All @@ -469,6 +493,10 @@ def coherence_judge(llm: LLM | None = None) -> _LLMJudge:
contradictions or confusing jumps between topics.

Useful for: complex explanations, multi-turn conversations, technical support.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand All @@ -489,6 +517,10 @@ def conciseness_judge(llm: LLM | None = None) -> _LLMJudge:
verbosity, repetition, and redundant details.

Useful for: voice agents, chat interfaces, any context where user time matters.

Args:
llm: The LLM instance to use for evaluation. If None, an LLM must be
provided via JudgeGroup; otherwise, a ValueError is raised.
"""
return _LLMJudge(
llm=llm,
Expand Down
Loading