UiPath
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/uipath/dev/mcp/__init__.py‎
Lines changed: 118 additions & 0 deletions b/‎src/uipath/dev/mcp/__init__.py‎
Lines changed: 118 additions & 0 deletions
diff --git a/‎src/uipath/dev/server/frontend/src/api/eval-client.ts‎
Lines changed: 60 additions & 1 deletion b/‎src/uipath/dev/server/frontend/src/api/eval-client.ts‎
Lines changed: 60 additions & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "uipath-dev"
-version = "0.0.75"
+version = "0.0.76"
 description = "UiPath Developer Console"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
 
@@ -191,6 +191,124 @@ async def get_run_status(run_id: str) -> dict[str, Any]:
         return resp.json()
 
 
+@mcp.tool()
+async def list_eval_sets() -> list[dict[str, Any]]:
+    """Enumerate the evaluation sets known to the dev server.
+
+    Each entry describes one eval set: its ID, name, item count and the
+    IDs of the evaluators attached to it. Pass an ID to run_eval_set to
+    execute that set.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
+    """
+    await _report_tool_call("list_eval_sets")
+    endpoint = _api_url("/eval-sets")
+    async with httpx.AsyncClient() as http:
+        response = await http.get(endpoint, timeout=10)
+        # Surface HTTP errors instead of returning an error body as data.
+        response.raise_for_status()
+        return response.json()
+
+
+@mcp.tool()
+async def get_eval_set(eval_set_id: str) -> dict[str, Any]:
+    """Get full details of an evaluation set including all items.
+
+    Args:
+        eval_set_id: ID of the eval set (from list_eval_sets).
+
+    Returns the eval set with all items, their inputs, expected outputs,
+    and evaluation criteria.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
+    """
+    from urllib.parse import quote
+
+    await _report_tool_call("get_eval_set", {"eval_set_id": eval_set_id})
+    # Percent-encode the ID so characters such as "/", "?" or "#" stay inside
+    # this single path segment instead of changing the addressed endpoint.
+    encoded_set_id = quote(eval_set_id, safe="")
+    async with httpx.AsyncClient() as client:
+        resp = await client.get(_api_url(f"/eval-sets/{encoded_set_id}"), timeout=10)
+        resp.raise_for_status()
+        return resp.json()
+
+
+@mcp.tool()
+async def run_eval_set(
+    eval_set_id: str,
+    ctx: Context,  # type: ignore[type-arg]
+) -> dict[str, Any]:
+    """Run an evaluation set against the agent.
+
+    Args:
+        eval_set_id: ID of the eval set (from list_eval_sets).
+
+    Starts the eval run and streams progress as each item completes.
+    Returns the full run result with per-item scores and overall score.
+
+    Raises:
+        httpx.HTTPStatusError: If creating the run or fetching its final
+            detail is answered with a 4xx/5xx status.
+    """
+    from urllib.parse import quote
+
+    await _report_tool_call("run_eval_set", {"eval_set_id": eval_set_id})
+    # Percent-encode the ID so it cannot escape its path segment.
+    encoded_set_id = quote(eval_set_id, safe="")
+
+    # Open the event stream BEFORE creating the run. Frames received while the
+    # POST is in flight are buffered by the connection and replayed by the
+    # loop below, so a fast run cannot finish before we start listening.
+    async with websockets.connect(_ws_url()) as ws:
+        async with httpx.AsyncClient() as client:
+            resp = await client.post(
+                _api_url(f"/eval-sets/{encoded_set_id}/runs"), timeout=30
+            )
+            resp.raise_for_status()
+            run: dict[str, Any] = resp.json()
+            run_id = run["id"]
+
+        await ctx.log("info", f"Eval run {run_id} created — streaming progress...")
+
+        # NOTE(review): the loop ends on "eval_run.completed" or when the socket
+        # closes; if a run can end without either, this waits indefinitely --
+        # confirm against the server's event contract.
+        async for raw in ws:
+            msg = json.loads(raw)
+            if not isinstance(msg, dict):
+                # Ignore frames that are not JSON objects.
+                continue
+            msg_type = msg.get("type", "")
+            payload = msg.get("payload")
+            # Skip malformed payloads (e.g. null) and events of other runs.
+            if not isinstance(payload, dict) or payload.get("run_id") != run_id:
+                continue
+
+            if msg_type == "eval_run.progress":
+                completed = payload.get("completed", 0)
+                total = payload.get("total", 0)
+                item = payload.get("item_result")
+                if item:
+                    name = item.get("name", "")
+                    status = item.get("status", "")
+                    score = item.get("overall_score")
+                    score_str = f" — score: {score:.0%}" if score is not None else ""
+                    await ctx.log(
+                        "info", f"  [{completed}/{total}] {name}: {status}{score_str}"
+                    )
+                await ctx.report_progress(progress=completed, total=total)
+
+            elif msg_type == "eval_run.completed":
+                overall = payload.get("overall_score")
+                if overall is not None:
+                    await ctx.log(
+                        "info", f"Eval run completed — overall score: {overall:.0%}"
+                    )
+                break
+
+    # Fetch final run detail
+    encoded_run_id = quote(str(run_id), safe="")
+    async with httpx.AsyncClient() as client:
+        resp = await client.get(_api_url(f"/eval-runs/{encoded_run_id}"), timeout=10)
+        resp.raise_for_status()
+        return resp.json()
+
+
+@mcp.tool()
+async def list_eval_runs() -> list[dict[str, Any]]:
+    """Enumerate the evaluation runs known to the dev server.
+
+    Each entry summarises one eval run: its status, scores and progress.
+    Pass a run ID to get_eval_run to retrieve the full details.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
+    """
+    await _report_tool_call("list_eval_runs")
+    endpoint = _api_url("/eval-runs")
+    async with httpx.AsyncClient() as http:
+        response = await http.get(endpoint, timeout=10)
+        # Surface HTTP errors instead of returning an error body as data.
+        response.raise_for_status()
+        return response.json()
+
+
+@mcp.tool()
+async def get_eval_run(eval_run_id: str) -> dict[str, Any]:
+    """Get full details of an evaluation run including per-item results.
+
+    Args:
+        eval_run_id: ID of the eval run.
+
+    Returns the run with all item results, scores, justifications, and traces.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
+    """
+    from urllib.parse import quote
+
+    await _report_tool_call("get_eval_run", {"eval_run_id": eval_run_id})
+    # Percent-encode the ID so characters such as "/", "?" or "#" stay inside
+    # this single path segment instead of changing the addressed endpoint.
+    encoded_run_id = quote(eval_run_id, safe="")
+    async with httpx.AsyncClient() as client:
+        resp = await client.get(_api_url(f"/eval-runs/{encoded_run_id}"), timeout=10)
+        resp.raise_for_status()
+        return resp.json()
+
+
 def main() -> None:
     """Entry point for the uipath-dev-mcp CLI command."""
     mcp.run(transport="stdio")
@@ -1,4 +1,4 @@
-import type { EvaluatorInfo, LocalEvaluator, EvalSetSummary, EvalSetDetail, EvalItem, EvalRunSummary, EvalRunDetail } from "../types/eval";
+import type { EvaluatorInfo, LocalEvaluator, LlmModel, EvalSetSummary, EvalSetDetail, EvalItem, EvalRunSummary, EvalRunDetail } from "../types/eval";
 
 const BASE = "/api";
 
@@ -24,6 +24,10 @@ export async function listEvaluators(): Promise<EvaluatorInfo[]> {
   return fetchJson(`${BASE}/evaluators`);
 }
 
+/** Requests `${BASE}/llm-models` and resolves with the parsed response. */
+export async function listLlmModels(): Promise<LlmModel[]> {
+  const endpoint = `${BASE}/llm-models`;
+  return fetchJson(endpoint);
+}
+
 /** Requests `${BASE}/eval-sets` and resolves with the parsed response. */
 export async function listEvalSets(): Promise<EvalSetSummary[]> {
   return fetchJson(`${BASE}/eval-sets`);
 }
@@ -55,6 +59,42 @@ export async function addEvalItem(
   });
 }
 
+/**
+ * Sends a PUT with the given fields for one item of an eval set.
+ *
+ * @param evalSetId - ID of the eval set; URI-encoded before being placed in the path.
+ * @param itemName - Name of the item to update; URI-encoded before being placed in the path.
+ * @param body - Fields to send, serialized as the JSON request body.
+ * @returns The parsed response from the server.
+ */
+export async function updateEvalItem(
+  evalSetId: string,
+  itemName: string,
+  body: {
+    name?: string;
+    inputs?: Record<string, unknown>;
+    expected_output?: unknown;
+    expected_behavior?: string;
+    simulation_instructions?: string;
+  },
+): Promise<EvalItem> {
+  // Encode both dynamic segments so reserved characters cannot alter the route.
+  const setSegment = encodeURIComponent(evalSetId);
+  const itemSegment = encodeURIComponent(itemName);
+  const requestInit = {
+    method: "PUT",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(body),
+  };
+  return fetchJson(`${BASE}/eval-sets/${setSegment}/items/${itemSegment}`, requestInit);
+}
+
+/**
+ * Sends a PATCH carrying the evaluation criteria for one item of an eval set.
+ *
+ * @param evalSetId - ID of the eval set; URI-encoded before being placed in the path.
+ * @param itemName - Name of the item; URI-encoded before being placed in the path.
+ * @param evaluationCriterias - Sent to the server under the `evaluation_criterias` key.
+ * @returns The parsed response from the server.
+ */
+export async function updateEvalItemEvaluators(
+  evalSetId: string,
+  itemName: string,
+  evaluationCriterias: Record<string, unknown>,
+): Promise<EvalItem> {
+  // Encode both dynamic segments so reserved characters cannot alter the route.
+  const setSegment = encodeURIComponent(evalSetId);
+  const itemSegment = encodeURIComponent(itemName);
+  const payload = { evaluation_criterias: evaluationCriterias };
+  const requestInit = {
+    method: "PATCH",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(payload),
+  };
+  return fetchJson(
+    `${BASE}/eval-sets/${setSegment}/items/${itemSegment}/evaluators`,
+    requestInit,
+  );
+}
+
 export async function deleteEvalItem(
   evalSetId: string,
   itemName: string,
@@ -110,6 +150,25 @@ export async function updateEvalSetEvaluators(
   });
 }
 
+/**
+ * POSTs a scaffold request for a custom evaluator.
+ *
+ * @param body - Evaluator name and optional description, sent as the JSON body.
+ * @returns The parsed response, typed as the scaffold's file path, filename,
+ *   class name and evaluator ID.
+ */
+export async function scaffoldCustomEvaluator(body: {
+  name: string;
+  description?: string;
+}): Promise<{ file_path: string; filename: string; class_name: string; evaluator_id: string }> {
+  const endpoint = `${BASE}/custom-evaluators/scaffold`;
+  const requestInit = {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(body),
+  };
+  return fetchJson(endpoint, requestInit);
+}
+
+/**
+ * POSTs a registration request for a custom evaluator file.
+ *
+ * @param filename - Sent to the server under the `filename` key.
+ * @returns The parsed response, typed as the evaluator ID and spec path.
+ */
+export async function registerCustomEvaluator(filename: string): Promise<{ evaluator_id: string; spec_path: string }> {
+  const endpoint = `${BASE}/custom-evaluators/register`;
+  const requestInit = {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ filename }),
+  };
+  return fetchJson(endpoint, requestInit);
+}
+
 export async function updateLocalEvaluator(
   id: string,
   body: {