@@ -191,6 +191,124 @@ async def get_run_status(run_id: str) -> dict[str, Any]:
191191 return resp .json ()
192192
193193
@mcp.tool()
async def list_eval_sets() -> list[dict[str, Any]]:
    """List all evaluation sets.

    Returns the available eval sets with their IDs, names, item counts,
    and attached evaluator IDs. Use the returned IDs with run_eval_set.
    """
    # NOTE(review): the docstring above is presumably exposed to MCP clients
    # as the tool description, so its wording is intentionally left as-is.
    await _report_tool_call("list_eval_sets")

    endpoint = _api_url("/eval-sets")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Raise on 4xx/5xx instead of returning an error body as if it were data.
        response.raise_for_status()
    return response.json()
206+
207+
@mcp.tool()
async def get_eval_set(eval_set_id: str) -> dict[str, Any]:
    """Get full details of an evaluation set including all items.

    Args:
        eval_set_id: ID of the eval set (from list_eval_sets).

    Returns the eval set with all items, their inputs, expected outputs,
    and evaluation criteria.
    """
    from urllib.parse import quote

    await _report_tool_call("get_eval_set", {"eval_set_id": eval_set_id})

    # The ID is caller-supplied: percent-encode it (including "/") so it always
    # stays a single path segment and cannot introduce a query or fragment.
    path = f"/eval-sets/{quote(eval_set_id, safe='')}"
    async with httpx.AsyncClient() as client:
        resp = await client.get(_api_url(path), timeout=10)
        # Raise httpx.HTTPStatusError on 4xx/5xx (e.g. unknown eval set ID).
        resp.raise_for_status()
        return resp.json()
223+
224+
@mcp.tool()
async def run_eval_set(
    eval_set_id: str,
    ctx: Context,  # type: ignore[type-arg]
) -> dict[str, Any]:
    """Run an evaluation set against the agent.

    Args:
        eval_set_id: ID of the eval set (from list_eval_sets).

    Starts the eval run and streams progress as each item completes.
    Returns the full run result with per-item scores and overall score.
    """
    from urllib.parse import quote

    await _report_tool_call("run_eval_set", {"eval_set_id": eval_set_id})

    # The ID is caller-supplied: percent-encode it (including "/") so it always
    # stays a single path segment and cannot introduce a query or fragment.
    runs_path = f"/eval-sets/{quote(eval_set_id, safe='')}/runs"

    # Subscribe to the event stream BEFORE creating the run. If the socket were
    # opened only after the POST, events emitted in between (including the
    # completion event of a very short run) would be lost and the loop below
    # would wait for a message that has already been sent.
    async with websockets.connect(_ws_url()) as ws:
        async with httpx.AsyncClient() as client:
            resp = await client.post(_api_url(runs_path), timeout=30)
            resp.raise_for_status()
            run: dict[str, Any] = resp.json()
            run_id = run["id"]

        await ctx.log("info", f"Eval run {run_id} created — streaming progress...")

        async for raw in ws:
            # Ignore frames that are not JSON objects rather than aborting
            # the whole run on one unexpected message.
            try:
                msg = json.loads(raw)
            except ValueError:
                continue
            if not isinstance(msg, dict):
                continue

            # A missing, null, or foreign-run payload is not ours to handle.
            payload = msg.get("payload")
            if not isinstance(payload, dict) or payload.get("run_id") != run_id:
                continue

            msg_type = msg.get("type", "")
            if msg_type == "eval_run.progress":
                completed = payload.get("completed", 0)
                total = payload.get("total", 0)
                item = payload.get("item_result")
                if item:
                    name = item.get("name", "")
                    status = item.get("status", "")
                    score = item.get("overall_score")
                    score_str = f" — score: {score:.0%}" if score is not None else ""
                    await ctx.log(
                        "info", f" [{completed}/{total}] {name}: {status}{score_str}"
                    )
                await ctx.report_progress(progress=completed, total=total)

            elif msg_type == "eval_run.completed":
                overall = payload.get("overall_score")
                if overall is not None:
                    await ctx.log(
                        "info", f"Eval run completed — overall score: {overall:.0%}"
                    )
                break

    # Fetch final run detail (also reached if the socket closes before a
    # completion event arrives; the API then reports whatever state it has).
    async with httpx.AsyncClient() as client:
        resp = await client.get(_api_url(f"/eval-runs/{run_id}"), timeout=10)
        resp.raise_for_status()
        return resp.json()
280+
281+
@mcp.tool()
async def list_eval_runs() -> list[dict[str, Any]]:
    """List all evaluation runs.

    Returns summaries of all eval runs with their status, scores,
    and progress. Use run IDs with get_eval_run for full details.
    """
    # NOTE(review): the docstring above is presumably exposed to MCP clients
    # as the tool description, so its wording is intentionally left as-is.
    await _report_tool_call("list_eval_runs")

    endpoint = _api_url("/eval-runs")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Raise on 4xx/5xx instead of returning an error body as if it were data.
        response.raise_for_status()
    return response.json()
294+
295+
@mcp.tool()
async def get_eval_run(eval_run_id: str) -> dict[str, Any]:
    """Get full details of an evaluation run including per-item results.

    Args:
        eval_run_id: ID of the eval run.

    Returns the run with all item results, scores, justifications, and traces.
    """
    from urllib.parse import quote

    await _report_tool_call("get_eval_run", {"eval_run_id": eval_run_id})

    # The ID is caller-supplied: percent-encode it (including "/") so it always
    # stays a single path segment and cannot introduce a query or fragment.
    path = f"/eval-runs/{quote(eval_run_id, safe='')}"
    async with httpx.AsyncClient() as client:
        resp = await client.get(_api_url(path), timeout=10)
        # Raise httpx.HTTPStatusError on 4xx/5xx (e.g. unknown run ID).
        resp.raise_for_status()
        return resp.json()
310+
311+
def main() -> None:
    """Entry point for the uipath-dev-mcp CLI command."""
    # Serve the registered tools over stdin/stdout.
    transport = "stdio"
    mcp.run(transport=transport)
0 commit comments