livekit · Zenvila · Jun 3, 2026
diff --git a/livekit-agents/livekit/agents/evals/judge.py b/livekit-agents/livekit/agents/evals/judge.py
@@ -360,6 +360,10 @@ def task_completion_judge(llm: LLM | None = None) -> _TaskCompletionJudge:
 
     Based on First Call Resolution (FCR), the key metric in call centers.
     Useful for: customer service, appointment booking, order management.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _TaskCompletionJudge(llm=llm)
 
@@ -373,6 +377,10 @@ def handoff_judge(llm: LLM | None = None) -> _HandoffJudge:
     Automatically passes if no handoffs occurred.
 
     Useful for: multi-agent systems, transfers to specialists, escalations.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _HandoffJudge(llm=llm)
 
@@ -384,6 +392,10 @@ def accuracy_judge(llm: LLM | None = None) -> _LLMJudge:
     Catches hallucinations, misquoted data, and contradictions with tool results.
 
     Useful for: healthcare, insurance, finance - where wrong information has consequences.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,
@@ -404,6 +416,10 @@ def tool_use_judge(llm: LLM | None = None) -> _LLMJudge:
     Voice agents rely on function calls for lookups, bookings, transfers, etc.
 
     Useful for: any agent with tools - appointment systems, order lookups, CRM integrations.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,
@@ -428,6 +444,10 @@ def safety_judge(llm: LLM | None = None) -> _LLMJudge:
     and toxic or harmful language.
 
     Useful for: regulated industries, user-facing agents where compliance and tone matter.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,
@@ -449,6 +469,10 @@ def relevancy_judge(llm: LLM | None = None) -> _LLMJudge:
     on the topic, and appropriately redirects off-topic requests.
 
     Useful for: any conversational agent, scoped agents, customer service.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,
@@ -469,6 +493,10 @@ def coherence_judge(llm: LLM | None = None) -> _LLMJudge:
     contradictions or confusing jumps between topics.
 
     Useful for: complex explanations, multi-turn conversations, technical support.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,
@@ -489,6 +517,10 @@ def conciseness_judge(llm: LLM | None = None) -> _LLMJudge:
     verbosity, repetition, and redundant details.
 
     Useful for: voice agents, chat interfaces, any context where user time matters.
+
+    Args:
+        llm: The LLM instance to use for evaluation. If None, an LLM must be
+            provided via JudgeGroup; otherwise a ValueError is raised.
     """
     return _LLMJudge(
         llm=llm,