Skip to content

Commit be11949

Browse files
authored
Merge pull request #102 from UiPath/feat/evals
feat: evaluation sets UI, MCP tools, and evaluator enhancements
2 parents 3b77ec3 + e0b2743 commit be11949

21 files changed

Lines changed: 1633 additions & 391 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "uipath-dev"
3-
version = "0.0.75"
3+
version = "0.0.76"
44
description = "UiPath Developer Console"
55
readme = { file = "README.md", content-type = "text/markdown" }
66
requires-python = ">=3.11"

src/uipath/dev/mcp/__init__.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,124 @@ async def get_run_status(run_id: str) -> dict[str, Any]:
191191
return resp.json()
192192

193193

194+
@mcp.tool()
async def list_eval_sets() -> list[dict[str, Any]]:
    """List all evaluation sets.

    Returns the available eval sets with their IDs, names, item counts,
    and attached evaluator IDs. Use the returned IDs with run_eval_set.
    """
    # NOTE(review): docstring kept verbatim — presumably the tool decorator
    # exposes it to clients as the tool description; confirm before rewording.
    await _report_tool_call("list_eval_sets")

    endpoint = _api_url("/eval-sets")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Surface non-2xx responses as exceptions instead of returning an error body.
        response.raise_for_status()
        return response.json()
206+
207+
208+
@mcp.tool()
async def get_eval_set(eval_set_id: str) -> dict[str, Any]:
    """Get full details of an evaluation set including all items.

    Args:
        eval_set_id: ID of the eval set (from list_eval_sets).

    Returns the eval set with all items, their inputs, expected outputs,
    and evaluation criteria.
    """
    # NOTE(review): docstring kept verbatim — presumably the tool decorator
    # exposes it to clients as the tool description; confirm before rewording.
    await _report_tool_call("get_eval_set", {"eval_set_id": eval_set_id})

    endpoint = _api_url(f"/eval-sets/{eval_set_id}")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Surface non-2xx responses (e.g. unknown ID) as exceptions.
        response.raise_for_status()
        return response.json()
223+
224+
225+
@mcp.tool()
async def run_eval_set(
    eval_set_id: str,
    ctx: Context,  # type: ignore[type-arg]
) -> dict[str, Any]:
    """Run an evaluation set against the agent.

    Args:
        eval_set_id: ID of the eval set (from list_eval_sets).

    Starts the eval run and streams progress as each item completes.
    Returns the full run result with per-item scores and overall score.
    """
    await _report_tool_call("run_eval_set", {"eval_set_id": eval_set_id})

    # Open the event stream BEFORE creating the run. If the socket were opened
    # only after the POST returned, a run that finishes quickly could emit its
    # "eval_run.completed" event before we start listening, and the loop below
    # would then wait for an event that never arrives. Messages received while
    # the POST is in flight stay queued on the connection until iterated.
    async with websockets.connect(_ws_url()) as ws:
        async with httpx.AsyncClient() as client:
            resp = await client.post(
                _api_url(f"/eval-sets/{eval_set_id}/runs"), timeout=30
            )
            resp.raise_for_status()
            run: dict[str, Any] = resp.json()
        run_id = run["id"]

        await ctx.log("info", f"Eval run {run_id} created — streaming progress...")

        async for raw in ws:
            msg = json.loads(raw)
            msg_type = msg.get("type", "")
            # Tolerate an explicit null payload as well as a missing one.
            payload = msg.get("payload") or {}

            # Events for other runs arrive on the same stream; skip them.
            if payload.get("run_id") != run_id:
                continue

            if msg_type == "eval_run.progress":
                completed = payload.get("completed", 0)
                total = payload.get("total", 0)
                item = payload.get("item_result")
                if item:
                    name = item.get("name", "")
                    status = item.get("status", "")
                    score = item.get("overall_score")
                    score_str = f" — score: {score:.0%}" if score is not None else ""
                    await ctx.log(
                        "info", f"  [{completed}/{total}] {name}: {status}{score_str}"
                    )
                await ctx.report_progress(progress=completed, total=total)

            elif msg_type == "eval_run.completed":
                overall = payload.get("overall_score")
                if overall is not None:
                    await ctx.log(
                        "info", f"Eval run completed — overall score: {overall:.0%}"
                    )
                break

    # Fetch final run detail (also reached if the stream closes before a
    # completion event is seen, in which case the current run state is returned).
    async with httpx.AsyncClient() as client:
        resp = await client.get(_api_url(f"/eval-runs/{run_id}"), timeout=10)
        resp.raise_for_status()
        return resp.json()
280+
281+
282+
@mcp.tool()
async def list_eval_runs() -> list[dict[str, Any]]:
    """List all evaluation runs.

    Returns summaries of all eval runs with their status, scores,
    and progress. Use run IDs with get_eval_run for full details.
    """
    # NOTE(review): docstring kept verbatim — presumably the tool decorator
    # exposes it to clients as the tool description; confirm before rewording.
    await _report_tool_call("list_eval_runs")

    endpoint = _api_url("/eval-runs")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Surface non-2xx responses as exceptions instead of returning an error body.
        response.raise_for_status()
        return response.json()
294+
295+
296+
@mcp.tool()
async def get_eval_run(eval_run_id: str) -> dict[str, Any]:
    """Get full details of an evaluation run including per-item results.

    Args:
        eval_run_id: ID of the eval run.

    Returns the run with all item results, scores, justifications, and traces.
    """
    # NOTE(review): docstring kept verbatim — presumably the tool decorator
    # exposes it to clients as the tool description; confirm before rewording.
    await _report_tool_call("get_eval_run", {"eval_run_id": eval_run_id})

    endpoint = _api_url(f"/eval-runs/{eval_run_id}")
    async with httpx.AsyncClient() as http:
        response = await http.get(endpoint, timeout=10)
        # Surface non-2xx responses (e.g. unknown run ID) as exceptions.
        response.raise_for_status()
        return response.json()
310+
311+
194312
def main() -> None:
    """Run the MCP server for the ``uipath-dev-mcp`` CLI command.

    Starts ``mcp`` with the ``stdio`` transport.
    """
    transport = "stdio"
    mcp.run(transport=transport)

src/uipath/dev/server/frontend/src/api/eval-client.ts

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { EvaluatorInfo, LocalEvaluator, EvalSetSummary, EvalSetDetail, EvalItem, EvalRunSummary, EvalRunDetail } from "../types/eval";
1+
import type { EvaluatorInfo, LocalEvaluator, LlmModel, EvalSetSummary, EvalSetDetail, EvalItem, EvalRunSummary, EvalRunDetail } from "../types/eval";
22

33
const BASE = "/api";
44

@@ -24,6 +24,10 @@ export async function listEvaluators(): Promise<EvaluatorInfo[]> {
2424
return fetchJson(`${BASE}/evaluators`);
2525
}
2626

27+
/**
 * Fetch the list of LLM models from the backend.
 *
 * Requests `${BASE}/llm-models` through `fetchJson` with no request options.
 *
 * @returns The models as returned by the `/llm-models` endpoint.
 */
export async function listLlmModels(): Promise<LlmModel[]> {
  const url = `${BASE}/llm-models`;
  return fetchJson(url);
}
30+
2731
/**
 * Fetch summaries of all evaluation sets.
 *
 * Requests `${BASE}/eval-sets` through `fetchJson` with no request options.
 *
 * @returns The eval set summaries as returned by the `/eval-sets` endpoint.
 */
export async function listEvalSets(): Promise<EvalSetSummary[]> {
  const url = `${BASE}/eval-sets`;
  return fetchJson(url);
}
@@ -55,6 +59,42 @@ export async function addEvalItem(
5559
});
5660
}
5761

62+
/**
 * Update an existing item of an evaluation set.
 *
 * Sends a JSON `PUT` to `/eval-sets/{evalSetId}/items/{itemName}`; both path
 * segments are percent-encoded so IDs and names with reserved characters
 * remain a single segment.
 *
 * @param evalSetId - ID of the eval set that owns the item.
 * @param itemName - Current name of the item to update.
 * @param body - Fields to send; every field is optional.
 * @returns The item as returned by the backend.
 */
export async function updateEvalItem(
  evalSetId: string,
  itemName: string,
  body: {
    name?: string;
    inputs?: Record<string, unknown>;
    expected_output?: unknown;
    expected_behavior?: string;
    simulation_instructions?: string;
  },
): Promise<EvalItem> {
  const setSegment = encodeURIComponent(evalSetId);
  const itemSegment = encodeURIComponent(itemName);
  const url = `${BASE}/eval-sets/${setSegment}/items/${itemSegment}`;
  const serialized = JSON.stringify(body);

  return fetchJson(url, {
    method: "PUT",
    headers: { "Content-Type": "application/json" },
    body: serialized,
  });
}
82+
83+
/**
 * Replace the evaluation criteria attached to a single eval item.
 *
 * Sends a JSON `PATCH` to `/eval-sets/{evalSetId}/items/{itemName}/evaluators`
 * with the criteria wrapped under the `evaluation_criterias` key.
 *
 * @param evalSetId - ID of the eval set that owns the item.
 * @param itemName - Name of the item whose criteria are updated.
 * @param evaluationCriterias - Criteria map sent as `evaluation_criterias`.
 * @returns The item as returned by the backend.
 */
export async function updateEvalItemEvaluators(
  evalSetId: string,
  itemName: string,
  evaluationCriterias: Record<string, unknown>,
): Promise<EvalItem> {
  const setSegment = encodeURIComponent(evalSetId);
  const itemSegment = encodeURIComponent(itemName);
  const url = `${BASE}/eval-sets/${setSegment}/items/${itemSegment}/evaluators`;
  const serialized = JSON.stringify({ evaluation_criterias: evaluationCriterias });

  return fetchJson(url, {
    method: "PATCH",
    headers: { "Content-Type": "application/json" },
    body: serialized,
  });
}
97+
5898
export async function deleteEvalItem(
5999
evalSetId: string,
60100
itemName: string,
@@ -110,6 +150,25 @@ export async function updateEvalSetEvaluators(
110150
});
111151
}
112152

153+
/**
 * Ask the backend to scaffold a new custom evaluator.
 *
 * Sends a JSON `POST` to `/custom-evaluators/scaffold`.
 *
 * @param body - Evaluator `name` and an optional `description`.
 * @returns The scaffold result: file path, filename, class name and evaluator ID.
 */
export async function scaffoldCustomEvaluator(body: {
  name: string;
  description?: string;
}): Promise<{ file_path: string; filename: string; class_name: string; evaluator_id: string }> {
  const url = `${BASE}/custom-evaluators/scaffold`;
  const serialized = JSON.stringify(body);

  return fetchJson(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: serialized,
  });
}
163+
164+
/**
 * Register a custom evaluator file with the backend.
 *
 * Sends a JSON `POST` to `/custom-evaluators/register` with `{ filename }`.
 *
 * @param filename - Name of the evaluator file to register.
 * @returns The registered evaluator ID and the path of its spec.
 */
export async function registerCustomEvaluator(filename: string): Promise<{ evaluator_id: string; spec_path: string }> {
  const url = `${BASE}/custom-evaluators/register`;
  const serialized = JSON.stringify({ filename });

  return fetchJson(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: serialized,
  });
}
171+
113172
export async function updateLocalEvaluator(
114173
id: string,
115174
body: {

0 commit comments

Comments
 (0)