diff --git a/package.json b/package.json index ee120e2..0ba39b2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@hyperbrowser/agent", - "version": "1.0.8", + "version": "1.0.9", "description": "Hyperbrowsers Web Agent", "author": "", "main": "dist/index.js", diff --git a/src/agent/index.ts b/src/agent/index.ts index 33a5a39..ee8f821 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -12,6 +12,9 @@ import { ActionContext, ActionType, AgentActionDefinition, + ActionCacheOutput, + ActionCacheReplayResult, + RunFromActionCacheParams, endTaskStatuses, Task, TaskOutput, @@ -19,6 +22,7 @@ import { TaskState, TaskStatus, } from "@/types"; +import fs from "fs"; import { CompleteActionDefinition, DEFAULT_ACTIONS, @@ -37,7 +41,13 @@ import { } from "../context-providers/a11y-dom/types"; import { MCPClient } from "./mcp/client"; import { runAgentTask } from "./tools/agent"; -import { HyperPage, HyperVariable } from "../types/agent/types"; +import type { + HyperPage, + HyperVariable, + ActionCacheEntry, + AgentTaskOutput, + PerformOptions, +} from "../types/agent/types"; import { z } from "zod"; import { ErrorEmitter } from "../utils"; import { waitForSettledDOM } from "@/utils/waitForSettledDOM"; @@ -48,6 +58,9 @@ import { markDomSnapshotDirty } from "@/context-providers/a11y-dom/dom-cache"; import { setDebugOptions } from "@/debug/options"; import { initializeRuntimeContext } from "./shared/runtime-context"; import { performAction } from "./actions/shared/perform-action"; +import { createScriptFromActionCache } from "./shared/action-cache-script"; +import { attachCachedActionHelpers } from "./shared/action-cache-exec"; +import { AgentDeps } from "@/types/agent/types"; export class HyperAgent { // aiAction configuration constants @@ -71,6 +84,7 @@ export class HyperAgent { private browserProviderType: T; private actions: Array = [...DEFAULT_ACTIONS]; private cdpActionsEnabled: boolean; + private actionCacheByTaskId: Record = {}; public browser: 
Browser | null = null; public context: BrowserContext | null = null; @@ -248,6 +262,15 @@ export class HyperAgent { delete this._variables[key]; } + public getActionCache(taskId: string): ActionCacheOutput | null { + const cache = this.actionCacheByTaskId[taskId]; + if (!cache) return null; + return { + ...cache, + steps: [...cache.steps], + }; + } + /** * Get all pages in the context * @returns Array of HyperPage objects @@ -352,6 +375,7 @@ export class HyperAgent { throw new HyperagentError(`Task ${taskId} not found`); } return { + id: taskId, getStatus: () => taskState.status, pause: () => { if (taskState.status === TaskStatus.RUNNING) { @@ -432,7 +456,10 @@ export class HyperAgent { taskState, mergedParams ) - .then(() => cleanup()) + .then((result) => { + this.actionCacheByTaskId[taskId] = result.actionCache; + cleanup(); + }) .catch((error: Error) => { cleanup(); // Retrieve the correct state to update @@ -463,7 +490,7 @@ export class HyperAgent { task: string, params?: TaskParams, initPage?: Page - ): Promise { + ): Promise { const taskId = uuidv4(); let activeTaskPage = initPage || (await this.getCurrentPage()); @@ -510,6 +537,7 @@ export class HyperAgent { mergedParams ); this.context?.off("page", onPage); + this.actionCacheByTaskId[taskId] = result.actionCache; return result; } catch (error) { this.context?.off("page", onPage); @@ -518,6 +546,305 @@ export class HyperAgent { } } + public async runFromActionCache( + cache: ActionCacheOutput, + pageOrGetter: Page | (() => Page), + params?: RunFromActionCacheParams + ): Promise { + const replayId = uuidv4(); + const maxXPathRetries = params?.maxXPathRetries ?? 3; + const debug = params?.debug ?? this.debug; + const getPage = () => + typeof pageOrGetter === "function" ? pageOrGetter() : pageOrGetter; + + const stepsResult: ActionCacheReplayResult["steps"] = []; + let replayStatus: TaskStatus.COMPLETED | TaskStatus.FAILED = + TaskStatus.COMPLETED; + + /** + * Type-safe dispatch for HyperPage perform* methods. 
+ * Explicitly routes to the correct method with proper typing. + * + * Methods that require a value argument (second param): type, fill, press, selectOptionFromDropdown, scrollToPercentage + * Methods with only xpath and options: click, hover, check, uncheck, scrollToElement, nextChunk, prevChunk + */ + const dispatchPerformHelper = ( + hp: HyperPage, + method: string, + xpath: string, + value: string | undefined, + options: PerformOptions + ): Promise => { + switch (method) { + case "click": + return hp.performClick(xpath, options); + case "hover": + return hp.performHover(xpath, options); + case "type": + return hp.performType(xpath, value ?? "", options); + case "fill": + return hp.performFill(xpath, value ?? "", options); + case "press": + return hp.performPress(xpath, value ?? "", options); + case "selectOptionFromDropdown": + return hp.performSelectOption(xpath, value ?? "", options); + case "check": + return hp.performCheck(xpath, options); + case "uncheck": + return hp.performUncheck(xpath, options); + case "scrollToElement": + return hp.performScrollToElement(xpath, options); + case "scrollToPercentage": + return hp.performScrollToPercentage(xpath, value ?? 
"", options); + case "nextChunk": + return hp.performNextChunk(xpath, options); + case "prevChunk": + return hp.performPrevChunk(xpath, options); + default: + throw new Error(`Unknown perform helper method: ${method}`); + } + }; + + /** Set of valid method names that can be dispatched */ + const validHelperMethods = new Set([ + "click", + "fill", + "type", + "press", + "selectOptionFromDropdown", + "check", + "uncheck", + "hover", + "scrollToElement", + "scrollToPercentage", + "nextChunk", + "prevChunk", + ]); + + for (const step of [...cache.steps].sort( + (a, b) => a.stepIndex - b.stepIndex + )) { + const page = getPage(); + const hyperPage = page as HyperPage; + let result: TaskOutput; + + if (step.actionType === "goToUrl") { + const url = + (step.arguments && step.arguments[0]) || + (step.actionParams as any)?.url || + ""; + if (!url || typeof url !== "string") { + result = { + taskId: cache.taskId, + status: TaskStatus.FAILED, + steps: [], + output: "Missing URL for goToUrl", + }; + } else { + await hyperPage.goto(url, { waitUntil: "domcontentloaded" }); + await waitForSettledDOM(hyperPage); + markDomSnapshotDirty(hyperPage); + result = { + taskId: cache.taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: `Navigated to ${url}`, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } + } else if (step.actionType === "complete") { + result = { + taskId: cache.taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: "Task Complete", + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } else if (step.actionType === "refreshPage") { + await hyperPage.reload({ waitUntil: "domcontentloaded" }); + await waitForSettledDOM(hyperPage); + markDomSnapshotDirty(hyperPage); + result = { + taskId: cache.taskId, + status: 
TaskStatus.COMPLETED, + steps: [], + output: "Page refreshed", + actionCache: { + taskId: cache.taskId, + createdAt: cache.createdAt, + status: TaskStatus.COMPLETED, + steps: [], + }, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } else if (step.actionType === "wait") { + const durationRaw = + (step.arguments && step.arguments[0]) || + (step.actionParams as any)?.duration; + const durationMs = + typeof durationRaw === "number" + ? durationRaw + : Number.parseInt(String(durationRaw ?? ""), 10); + const waitMs = Number.isFinite(durationMs) ? durationMs : 1000; + await hyperPage.waitForTimeout(waitMs); + result = { + taskId: cache.taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: `Waited ${waitMs}ms`, + actionCache: { + taskId: cache.taskId, + createdAt: cache.createdAt, + status: TaskStatus.COMPLETED, + steps: [], + }, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } else if (step.actionType === "extract") { + try { + const extractResult = await hyperPage.extract(step.instruction); + result = { + taskId: cache.taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: + typeof extractResult === "string" + ? 
extractResult + : JSON.stringify(extractResult), + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } catch (err: any) { + result = { + taskId: cache.taskId, + status: TaskStatus.FAILED, + steps: [], + output: `Extract failed: ${err?.message || String(err)}`, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } + } else if (step.actionType === "analyzePdf") { + result = { + taskId: cache.taskId, + status: TaskStatus.FAILED, + steps: [], + output: "analyzePdf replay is not supported in runFromActionCache.", + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 0, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } else { + const method = step.method; + if (method && validHelperMethods.has(method)) { + const options: PerformOptions = { + performInstruction: step.instruction, + maxSteps: maxXPathRetries, + }; + if (step.frameIndex !== null && step.frameIndex !== undefined) { + options.frameIndex = step.frameIndex; + } + const valueArg = step.arguments?.[0]; + result = await dispatchPerformHelper( + hyperPage, + method, + step.xpath ?? "", + valueArg, + options + ); + } else { + result = await hyperPage.perform(step.instruction); + } + } + + const finalMeta = result.replayStepMeta; + const finalSuccess = result.status === TaskStatus.COMPLETED; + + stepsResult.push({ + stepIndex: step.stepIndex, + actionType: step.actionType, + usedXPath: finalMeta?.usedCachedAction ?? false, + fallbackUsed: finalMeta?.fallbackUsed ?? false, + cachedXPath: finalMeta?.cachedXPath ?? null, + fallbackXPath: finalMeta?.fallbackXPath ?? null, + fallbackElementId: finalMeta?.fallbackElementId ?? null, + retries: finalMeta?.retries ?? 0, + success: finalSuccess, + message: + result.output || + (finalSuccess ? 
"Completed" : "Failed to execute cached action"), + }); + + if (!finalSuccess) { + replayStatus = TaskStatus.FAILED; + break; + } + } + + const replayResult: ActionCacheReplayResult = { + replayId, + sourceTaskId: cache.taskId, + steps: stepsResult, + status: replayStatus, + }; + + if (debug) { + const debugDir = "debug/action-cache"; + fs.mkdirSync(debugDir, { recursive: true }); + fs.writeFileSync( + `${debugDir}/replay-${replayId}.json`, + JSON.stringify(replayResult, null, 2) + ); + } + + return replayResult; + } + /** * Find element with retry logic * Retries element finding with DOM refetch until element is found or max retries reached @@ -766,6 +1093,7 @@ export class HyperAgent { pageOrGetter: Page | (() => Page), _params?: TaskParams ): Promise { + const taskId = uuidv4(); const actionStart = performance.now(); const startTime = new Date().toISOString(); if (this.debug) { @@ -831,7 +1159,8 @@ export class HyperAgent { 400 ); } - let actionXPath: string | undefined; + let actionXPath: string | null = + domState?.xpathMap?.[element.elementId] ?? 
null; // Use shared runtime context const { cdpClient, frameContextManager } = await initializeRuntimeContext( @@ -884,14 +1213,6 @@ export class HyperAgent { confidence: 1, // Implicit confidence for single action }); - if ( - actionOutput.debug && - typeof actionOutput.debug === "object" && - "requestedAction" in actionOutput.debug - ) { - actionXPath = (actionOutput.debug as any).elementMetadata?.xpath; - } - if (!actionOutput.success) { throw new Error(actionOutput.message); } @@ -930,9 +1251,24 @@ export class HyperAgent { logPerf(this.debug, "[Perf][executeSingleAction] total", actionStart); return { + taskId, status: TaskStatus.COMPLETED, steps: [], output: `Successfully executed: ${instruction}`, + actionCache: { + taskId, + createdAt: startTime, + status: TaskStatus.COMPLETED, + steps: [], + }, + replayStepMeta: { + usedCachedAction: false, + fallbackUsed: false, + retries: 1, + cachedXPath: null, + fallbackXPath: actionXPath ?? null, + fallbackElementId: element.elementId ?? null, + }, }; } catch (error) { // If page switched during execution, prioritize that over the error @@ -1139,6 +1475,13 @@ export class HyperAgent { return session; } + public createScriptFromActionCache( + steps: ActionCacheEntry[], + taskId?: string + ): string { + return createScriptFromActionCache({ steps, taskId }); + } + private setupHyperPage(page: Page): HyperPage { const hyperPage = page as HyperPage; @@ -1236,6 +1579,21 @@ export class HyperAgent { return executeSingleActionWithRetry(instruction, params); }; + hyperPage.getActionCache = (taskId: string) => this.getActionCache(taskId); + + hyperPage.runFromActionCache = (cache, params) => + this.runFromActionCache(cache, getActivePage, params); + + const deps: AgentDeps = { + debug: this.debug, + tokenLimit: this.tokenLimit, + llm: this.llm, + mcpClient: this.mcpClient, + variables: Object.values(this._variables), + cdpActionsEnabled: this.cdpActionsEnabled, + }; + attachCachedActionHelpers(deps, hyperPage); + // aiAsync 
tasks run in background, so we just use the current scope start point. // The task itself has internal auto-following logic (from executeTaskAsync implementation). hyperPage.aiAsync = (task: string, params?: TaskParams) => diff --git a/src/agent/shared/action-cache-exec.ts b/src/agent/shared/action-cache-exec.ts new file mode 100644 index 0000000..d8b49ef --- /dev/null +++ b/src/agent/shared/action-cache-exec.ts @@ -0,0 +1,210 @@ +import { AgentDeps, HyperPage, TaskOutput } from "@/types/agent/types"; +import * as cachedRunner from "./run-cached-action"; + +const DEFAULT_MAX_STEPS = 3; + +type PageAction = + | "click" + | "fill" + | "type" + | "press" + | "selectOptionFromDropdown" + | "check" + | "uncheck" + | "hover" + | "scrollToElement" + | "scrollToPercentage" + | "nextChunk" + | "prevChunk"; + +interface PerformOptions { + frameIndex?: number | null; + performInstruction?: string | null; + maxSteps?: number; +} + +function runCachedAction( + agent: AgentDeps, + page: HyperPage, + instruction: string, + method: PageAction, + xpath: string, + args: Array, + options?: PerformOptions +): Promise { + const runInstruction = + options?.performInstruction && options.performInstruction.length > 0 + ? options.performInstruction + : instruction; + const cachedAction = { + actionType: "actElement", + method, + arguments: args, + frameIndex: options?.frameIndex ?? 0, + xpath, + }; + + return cachedRunner.runCachedStep({ + page, + instruction: runInstruction, + cachedAction, + maxSteps: options?.maxSteps ?? DEFAULT_MAX_STEPS, + debug: agent.debug, + tokenLimit: agent.tokenLimit, + llm: agent.llm, + mcpClient: agent.mcpClient, + variables: agent.variables ?? [], + preferScriptBoundingBox: agent.debug, + cdpActionsEnabled: agent.cdpActionsEnabled, + performFallback: options?.performInstruction + ? 
(instr) => page.perform(instr) + : undefined, + }); +} + +export function attachCachedActionHelpers( + agent: AgentDeps, + page: HyperPage +): void { + page.performClick = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Click element", + "click", + xpath, + [], + options + ); + + page.performHover = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Hover element", + "hover", + xpath, + [], + options + ); + + page.performType = (xpath: string, text: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Type text", + "type", + xpath, + [text], + options + ); + + page.performFill = (xpath: string, text: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Fill input", + "fill", + xpath, + [text], + options + ); + + page.performPress = (xpath: string, key: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Press key", + "press", + xpath, + [key], + options + ); + + page.performSelectOption = ( + xpath: string, + option: string, + options?: PerformOptions + ) => + runCachedAction( + agent, + page, + options?.performInstruction || "Select option", + "selectOptionFromDropdown", + xpath, + [option], + options + ); + + page.performCheck = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Check element", + "check", + xpath, + [], + options + ); + + page.performUncheck = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Uncheck element", + "uncheck", + xpath, + [], + options + ); + + page.performScrollToElement = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Scroll to element", + 
"scrollToElement", + xpath, + [], + options + ); + + page.performScrollToPercentage = ( + xpath: string, + position: string | number, + options?: PerformOptions + ) => + runCachedAction( + agent, + page, + options?.performInstruction || "Scroll to percentage", + "scrollToPercentage", + xpath, + [position], + options + ); + + page.performNextChunk = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Scroll next chunk", + "nextChunk", + xpath, + [], + options + ); + + page.performPrevChunk = (xpath: string, options?: PerformOptions) => + runCachedAction( + agent, + page, + options?.performInstruction || "Scroll previous chunk", + "prevChunk", + xpath, + [], + options + ); +} + +export { DEFAULT_MAX_STEPS }; diff --git a/src/agent/shared/action-cache-script.ts b/src/agent/shared/action-cache-script.ts new file mode 100644 index 0000000..61406aa --- /dev/null +++ b/src/agent/shared/action-cache-script.ts @@ -0,0 +1,145 @@ +import { ActionCacheEntry } from "@/types"; + +interface CreateScriptFromActionCacheParams { + taskId?: string; + steps: ActionCacheEntry[]; +} + +export function createScriptFromActionCache( + params: CreateScriptFromActionCacheParams +): string { + const { steps } = params; + + const METHOD_TO_CALL: Record< + string, + { fn: string; needsValue?: boolean; valueName?: string } + > = { + click: { fn: "performClick" }, + fill: { fn: "performFill", needsValue: true, valueName: "text" }, + type: { fn: "performType", needsValue: true, valueName: "text" }, + press: { fn: "performPress", needsValue: true, valueName: "key" }, + selectOptionFromDropdown: { + fn: "performSelectOption", + needsValue: true, + valueName: "option", + }, + check: { fn: "performCheck" }, + uncheck: { fn: "performUncheck" }, + hover: { fn: "performHover" }, + scrollToElement: { fn: "performScrollToElement" }, + scrollToPercentage: { + fn: "performScrollToPercentage", + needsValue: true, + valueName: "position", + }, + 
nextChunk: { fn: "performNextChunk" }, + prevChunk: { fn: "performPrevChunk" }, + }; + + const formatCall = (step: ActionCacheEntry): string => { + const indent = " "; + const argIndent = `${indent} `; + + if (step.actionType === "complete") { + return `${indent}// Step ${step.stepIndex} (complete skipped in script)`; + } + + if (step.actionType === "goToUrl") { + const urlArg = + (step.arguments && step.arguments[0]) || "https://example.com"; + return `${indent}// Step ${step.stepIndex} +${indent}await page.goto( +${argIndent}${JSON.stringify(urlArg)}, +${argIndent}{ waitUntil: "domcontentloaded" } +${indent});`; + } + + if (step.actionType === "refreshPage") { + return `${indent}// Step ${step.stepIndex} +${indent}await page.reload({ waitUntil: "domcontentloaded" });`; + } + + if (step.actionType === "wait") { + const waitMs = + (step.arguments && Number(step.arguments[0])) || + (step.actionParams as any)?.duration || + 1000; + return `${indent}// Step ${step.stepIndex} +${indent}await page.waitForTimeout(${waitMs});`; + } + + if (step.actionType === "extract") { + return `${indent}// Step ${step.stepIndex} +${indent}await page.extract("${step.instruction}");`; + } + + const call = step.method ? METHOD_TO_CALL[step.method] : undefined; + if (call) { + const args: string[] = []; + args.push(JSON.stringify(step.xpath)); + if (call.needsValue) { + const value = step.arguments?.[0] ?? ""; + args.push(JSON.stringify(value)); + } + const options: Record = {}; + if (step.instruction) { + options.performInstruction = step.instruction; + } + if ( + step.frameIndex !== null && + step.frameIndex !== undefined && + step.frameIndex !== 0 + ) { + options.frameIndex = step.frameIndex; + } + + const optionEntries = Object.entries(options).map( + ([key, value]) => `${argIndent} ${key}: ${JSON.stringify(value)},` + ); + const optionsBlock = + optionEntries.length > 0 + ? 
`${argIndent}{\n${optionEntries.join("\n")}\n${argIndent}}` + : ""; + + const callArgs = [ + `${argIndent}${JSON.stringify(step.xpath)},`, + call.needsValue + ? `${argIndent}${JSON.stringify(step.arguments?.[0] ?? "")},` + : null, + optionsBlock ? `${optionsBlock},` : null, + ] + .filter(Boolean) + .join("\n"); + + return `${indent}// Step ${step.stepIndex} +${indent}await page.${call.fn}( +${callArgs} +${indent});`; + } + + return `${indent}// Step ${step.stepIndex} (unsupported actionType=${step.actionType}, method=${step.method ?? "N/A"})`; + }; + + const stepSnippets = steps.map((step) => formatCall(step)).join("\n\n"); + + const script = `import { HyperAgent } from "@hyperbrowser/agent"; +async function main() { + const agent = new HyperAgent({ + // Configure your LLM/API keys + }); + + const page = await agent.newPage(); + +${stepSnippets} + + await agent.closeAgent(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); +`; + + return script; +} diff --git a/src/agent/shared/action-cache.ts b/src/agent/shared/action-cache.ts new file mode 100644 index 0000000..11b7b5b --- /dev/null +++ b/src/agent/shared/action-cache.ts @@ -0,0 +1,131 @@ +import { ActionOutput, ActionType } from "@/types"; +import { ActionCacheEntry } from "@/types/agent/types"; +import { + A11yDOMState, + asEncodedId, +} from "@/context-providers/a11y-dom/types"; + +const TEXT_NODE_SUFFIX = /\/text\(\)(\[\d+\])?$/iu; + +const isString = (value: unknown): value is string => + typeof value === "string"; + +const isStringOrNumberArray = ( + value: unknown +): value is Array => + Array.isArray(value) && + value.every((item) => typeof item === "string" || typeof item === "number"); + +const normalizeXPath = (raw?: string | null): string | null => { + if (!raw) { + return null; + } + return raw.replace(TEXT_NODE_SUFFIX, ""); +}; + +const extractInstruction = (action: ActionType): string => { + const params = action.params as Record; + if (isString(params.instruction)) { + 
return params.instruction; + } + return action.type; +}; + +const extractElementId = (action: ActionType): string | null => { + const params = action.params as Record; + if (isString(params.elementId)) { + return params.elementId; + } + return null; +}; + +const extractMethod = (action: ActionType): string | null => { + const params = action.params as Record; + if (isString(params.method)) { + return params.method; + } + return null; +}; + +const extractArguments = (action: ActionType): string[] => { + const params = action.params as Record; + if (isStringOrNumberArray(params.arguments)) { + return params.arguments.map((item) => item.toString()); + } + return []; +}; + +const extractFrameIndex = (elementId: string | null): number | null => { + if (!elementId) { + return null; + } + const encodedId = asEncodedId(elementId); + if (!encodedId) { + return null; + } + const [framePart] = encodedId.split("-"); + const parsed = Number.parseInt(framePart, 10); + return Number.isNaN(parsed) ? null : parsed; +}; + +const extractXPathFromDebug = (actionOutput: ActionOutput): string | null => { + const debug = actionOutput.debug as Record | undefined; + if (!debug || typeof debug !== "object") { + return null; + } + + const metadata = debug.elementMetadata as Record | undefined; + if (metadata && isString(metadata.xpath)) { + return metadata.xpath; + } + return null; +}; + +export const buildActionCacheEntry = ({ + stepIndex, + action, + actionOutput, + domState, +}: { + stepIndex: number; + action: ActionType; + actionOutput: ActionOutput; + domState: A11yDOMState; +}): ActionCacheEntry => { + const instruction = extractInstruction(action); + const elementId = extractElementId(action); + const method = extractMethod(action); + const args = extractArguments(action); + const encodedId = elementId ? 
asEncodedId(elementId) : undefined; + const frameIndex = extractFrameIndex(elementId); + + // Normalize goToUrl to use arguments[0] for URL to simplify replay paths + let normalizedArgs = args; + if ( + action.type === "goToUrl" && + (!args || args.length === 0) && + action.params && + typeof (action.params as any).url === "string" + ) { + normalizedArgs = [(action.params as any).url as string]; + } + + const xpathFromDom = encodedId ? domState.xpathMap?.[encodedId] || null : null; + const xpath = normalizeXPath( + xpathFromDom || extractXPathFromDebug(actionOutput) + ); + + return { + stepIndex, + instruction, + elementId, + method, + arguments: normalizedArgs, + actionParams: (action.params as Record) || undefined, + frameIndex, + xpath, + actionType: action.type, + success: actionOutput.success, + message: actionOutput.message, + }; +}; diff --git a/src/agent/shared/element-locator.ts b/src/agent/shared/element-locator.ts index c43799d..ba4c109 100644 --- a/src/agent/shared/element-locator.ts +++ b/src/agent/shared/element-locator.ts @@ -4,7 +4,11 @@ */ import type { Page } from "playwright-core"; -import { toEncodedId, type IframeInfo, resolveFrameByXPath } from "../../context-providers/a11y-dom"; +import { + toEncodedId, + type IframeInfo, + resolveFrameByXPath, +} from "../../context-providers/a11y-dom"; import { HyperagentError } from "../error"; /** diff --git a/src/agent/shared/run-cached-action.ts b/src/agent/shared/run-cached-action.ts new file mode 100644 index 0000000..17bae9d --- /dev/null +++ b/src/agent/shared/run-cached-action.ts @@ -0,0 +1,311 @@ +import { v4 as uuidv4 } from "uuid"; +import { ActionContext } from "@/types"; +import { performAction } from "@/agent/actions/shared/perform-action"; +import { captureDOMState } from "@/agent/shared/dom-capture"; +import { waitForSettledDOM } from "@/utils/waitForSettledDOM"; +import { markDomSnapshotDirty } from "@/context-providers/a11y-dom/dom-cache"; +import { initializeRuntimeContext } from 
"@/agent/shared/runtime-context"; +import { resolveXPathWithCDP } from "@/agent/shared/xpath-cdp-resolver"; +import { resolveElement, dispatchCDPAction } from "@/cdp"; +import { TaskOutput, TaskStatus } from "@/types/agent/types"; + +export interface CachedActionInput { + actionType: string; + xpath?: string | null; + frameIndex?: number | null; + method?: string | null; + arguments?: Array; + actionParams?: Record; +} + +export interface RunCachedStepParams { + page: import("playwright-core").Page; + instruction: string; + cachedAction: CachedActionInput; + maxSteps?: number; + debug?: boolean; + tokenLimit: number; + llm: any; + mcpClient: any; + variables: Array<{ key: string; value: string; description: string }>; + preferScriptBoundingBox?: boolean; + cdpActionsEnabled?: boolean; + performFallback?: (instruction: string) => Promise; +} + +export async function runCachedStep( + params: RunCachedStepParams +): Promise { + const { + page, + instruction, + cachedAction, + maxSteps = 3, + debug, + tokenLimit, + llm, + mcpClient, + variables, + preferScriptBoundingBox, + cdpActionsEnabled, + } = params; + + const taskId = uuidv4(); + + if (cachedAction.actionType === "goToUrl") { + const url = + (cachedAction.arguments && cachedAction.arguments[0]) || + (cachedAction.actionParams as any)?.url || + ""; + if (!url || typeof url !== "string") { + return { + taskId, + status: TaskStatus.FAILED, + steps: [], + output: "Missing URL for goToUrl", + }; + } + await page.goto(url, { waitUntil: "domcontentloaded" }); + await waitForSettledDOM(page); + markDomSnapshotDirty(page); + return { + taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: `Navigated to ${url}`, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 1, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } + + if (cachedAction.actionType === "complete") { + return { + taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: "Task 
Complete", + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: 1, + cachedXPath: null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } + + if ( + cachedAction.actionType !== "actElement" || + !cachedAction.xpath || + !cachedAction.method + ) { + return { + taskId, + status: TaskStatus.FAILED, + steps: [], + output: "Unsupported cached action", + }; + } + + let lastError: unknown = null; + + for (let attempt = 0; attempt < maxSteps; attempt++) { + const attemptIndex = attempt + 1; + const attemptResult = await runCachedAttempt({ + page, + instruction, + cachedAction, + debug, + tokenLimit, + llm, + mcpClient, + variables, + preferScriptBoundingBox, + cdpActionsEnabled, + }).catch((err) => { + lastError = err; + return null; + }); + + if (!attemptResult) { + if (attempt < maxSteps - 1) { + continue; + } + // will fall through to fallback/final failure below + } else if (!attemptResult.success) { + lastError = new Error(attemptResult.message); + if (attempt < maxSteps - 1) { + continue; + } + // will fall through to fallback/final failure below + } else { + await waitForSettledDOM(page); + markDomSnapshotDirty(page); + lastError = null; + return { + taskId, + status: TaskStatus.COMPLETED, + steps: [], + output: `Executed cached action: ${instruction}`, + replayStepMeta: { + usedCachedAction: true, + fallbackUsed: false, + retries: attemptIndex, + cachedXPath: cachedAction.xpath ?? null, + fallbackXPath: null, + fallbackElementId: null, + }, + }; + } + } + + // All cached attempts failed; optionally fall back to LLM perform + if (params.performFallback) { + const fb = await params.performFallback(instruction); + const cachedXPath = cachedAction.xpath || "N/A"; + const resolvedXPath = fb.replayStepMeta?.fallbackXPath || "N/A"; + // eslint-disable-next-line no-console + console.log( + ` +⚠️ [runCachedStep] Cached action failed. Falling back to LLM... 
/**
 * Runs a single replay attempt of a cached element action on the live page.
 *
 * Sequence: wait for the DOM to settle, take a fresh (non-cached, non-visual)
 * DOM capture, resolve the cached XPath to a backend node id over CDP,
 * register that node in the captured state under a synthetic
 * `<frameIndex>-<backendNodeId>` element id, then execute the cached method
 * through `performAction`.
 *
 * Any rejection from the awaited steps propagates to the caller, which treats
 * it as a failed attempt.
 *
 * @param args Page, cached action and the agent dependencies needed to build
 *   the action context.
 * @returns The `success` flag and `message` of the `performAction` output.
 */
async function runCachedAttempt(args: {
  page: import("playwright-core").Page;
  instruction: string;
  cachedAction: CachedActionInput;
  debug?: boolean;
  tokenLimit: number;
  llm: any;
  mcpClient: any;
  variables: Array<{ key: string; value: string; description: string }>;
  preferScriptBoundingBox?: boolean;
  cdpActionsEnabled?: boolean;
}): Promise<{ success: boolean; message: string }> {
  const { page, cachedAction, debug } = args;
  // A missing frame index means the first frame (index 0).
  const frameIndex = cachedAction.frameIndex ?? 0;

  await waitForSettledDOM(page);
  const domState = await captureDOMState(page, {
    useCache: false,
    debug,
    enableVisualMode: false,
  });

  const runtime = await initializeRuntimeContext(page, debug);
  const { backendNodeId } = await resolveXPathWithCDP({
    xpath: cachedAction.xpath!,
    frameIndex,
    cdpClient: runtime.cdpClient,
    frameContextManager: runtime.frameContextManager,
    debug,
  });

  // Register the resolved node under a synthetic element id so that
  // performAction can look it up in the freshly captured DOM state.
  const encodedId = `${frameIndex}-${backendNodeId}`;
  domState.backendNodeMap = {
    ...(domState.backendNodeMap || {}),
    [encodedId]: backendNodeId,
  };
  domState.xpathMap = {
    ...(domState.xpathMap || {}),
    [encodedId]: cachedAction.xpath!,
  };

  const actionContext: ActionContext = {
    domState,
    page,
    tokenLimit: args.tokenLimit,
    llm: args.llm,
    debug,
    // CDP actions stay on unless the caller explicitly passed `false`.
    cdpActions: args.cdpActionsEnabled !== false,
    cdp: {
      client: runtime.cdpClient,
      frameContextManager: runtime.frameContextManager,
      resolveElement,
      dispatchCDPAction,
      preferScriptBoundingBox: args.preferScriptBoundingBox ?? debug,
      debug,
    },
    debugDir: undefined,
    mcpClient: args.mcpClient,
    variables: args.variables,
    invalidateDomCache: () => markDomSnapshotDirty(page),
  };

  // Cached arguments are replayed as strings; null/undefined become "".
  const toArgumentString = (value: unknown): string =>
    value == null ? "" : String(value);
  const replayArguments = (cachedAction.arguments ?? []).map((value) =>
    toArgumentString(value)
  );

  const outcome = await performAction(actionContext, {
    elementId: encodedId,
    method: cachedAction.method!,
    arguments: replayArguments,
    instruction: args.instruction,
    confidence: 1,
  });

  return { success: outcome.success, message: outcome.message };
}

/**
 * Navigates `page` to `url`, waits for the DOM to settle and marks the DOM
 * snapshot of the page as dirty.
 *
 * @param page Playwright page to navigate.
 * @param url Destination URL.
 * @param waitUntil Playwright load state to wait for; defaults to
 *   "domcontentloaded".
 */
export async function performGoTo(
  page: import("playwright-core").Page,
  url: string,
  waitUntil: "domcontentloaded" | "load" | "networkidle" = "domcontentloaded"
): Promise<void> {
  const navigationOptions = { waitUntil };
  await page.goto(url, navigationOptions);
  await waitForSettledDOM(page);
  markDomSnapshotDirty(page);
}
/**
 * Resolves an XPath expression to a CDP node reference inside one frame.
 *
 * The frame is looked up by index through `frameContextManager`; the XPath is
 * then evaluated with `Runtime.evaluate` (first matching node) and the
 * resulting remote object is mapped to a backend node id with
 * `DOM.describeNode`.
 *
 * @param params.xpath XPath expression to evaluate.
 * @param params.frameIndex Index of the target frame; `null` and `undefined`
 *   both mean frame 0.
 * @param params.cdpClient Client used to acquire the "dom" session.
 * @param params.frameContextManager Optional in the type, but required in
 *   practice: without it no frame id can be resolved and the call throws.
 * @param params.debug When true, a warning is logged if no execution context
 *   id is available for the frame.
 * @returns The backend node id, the frame id and the remote object id.
 * @throws HyperagentError (404) when the frame id cannot be resolved, when the
 *   XPath does not yield an object handle, or when `DOM.describeNode` returns
 *   no backend node id.
 */
export async function resolveXPathWithCDP(
  params: ResolveXPathWithCDPParams
): Promise<ResolvedCDPFromXPath> {
  const { xpath, cdpClient, frameContextManager, debug } = params;
  // A destructuring default only covers `undefined`, while the parameter type
  // also allows `null`. Normalise once so the lookup and every diagnostic
  // message agree on the effective frame index.
  const frameIndex = params.frameIndex ?? 0;

  // Use a DOM session without detaching the shared session; this keeps root session intact.
  const session = await cdpClient.acquireSession("dom");

  const targetFrameId =
    frameContextManager?.getFrameByIndex(frameIndex)?.frameId;
  if (!frameContextManager || !targetFrameId) {
    throw new HyperagentError(
      `Unable to resolve frameId for frameIndex ${frameIndex}`,
      404
    );
  }

  const executionContextId =
    await frameContextManager.waitForExecutionContext(targetFrameId);

  // NOTE(review): without a context id, Runtime.evaluate presumably runs in the
  // session's default context, which may not be the requested frame. Confirm
  // that continuing is intended for non-zero frame indexes.
  if (!executionContextId && debug) {
    console.warn(
      `[resolveXPathWithCDP] Missing executionContextId for frame ${frameIndex} (${targetFrameId}), continuing`
    );
  }

  // Best effort: failures to enable the domains are deliberately ignored.
  await session.send("DOM.enable").catch(() => {});
  await session.send("Runtime.enable").catch(() => {});

  const evalResponse = await session.send<{
    result: { objectId?: string | null };
    exceptionDetails?: unknown;
  }>("Runtime.evaluate", {
    expression: buildXPathEvaluationExpression(xpath),
    contextId: executionContextId,
    includeCommandLineAPI: false,
    returnByValue: false,
    awaitPromise: false,
  });

  // No object id means the expression evaluated to null (no match, or the
  // in-page evaluation threw and was caught).
  const objectId = evalResponse.result.objectId || undefined;
  if (!objectId) {
    throw new HyperagentError(
      `Failed to resolve XPath to objectId in frame ${frameIndex}`,
      404
    );
  }

  const describeNode = await session.send<{
    node?: { backendNodeId?: number };
  }>("DOM.describeNode", { objectId });
  const backendNodeId = describeNode.node?.backendNodeId;
  if (typeof backendNodeId !== "number") {
    throw new HyperagentError(
      `DOM.describeNode did not return backendNodeId for frame ${frameIndex}`,
      404
    );
  }

  return {
    backendNodeId,
    frameId: targetFrameId,
    objectId,
  };
}

/**
 * Builds the JavaScript source that evaluates `xpath` in the page and returns
 * the first matching node, or `null` when nothing matches or evaluation throws.
 * The XPath is embedded as a JSON string literal so quotes are escaped.
 */
function buildXPathEvaluationExpression(xpath: string): string {
  const escaped = JSON.stringify(xpath);
  return `(function() {
    try {
      const result = document.evaluate(${escaped}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
      return result.singleNodeValue || null;
    } catch (error) {
      return null;
    }
  })();`;
}
/**
 * One recorded agent step in an action cache.
 *
 * Entries are produced per executed step from the step's action, its output
 * and the DOM state at that time.
 */
export interface ActionCacheEntry {
  /** Index of the agent step this entry was recorded for. */
  stepIndex: number;
  instruction: string;
  elementId: string | null;
  method: string | null;
  arguments: string[];
  // NOTE(review): the type arguments were missing here; `Record<string, unknown>`
  // is the safest reading. Confirm against the action definitions.
  actionParams?: Record<string, unknown>;
  frameIndex: number | null;
  xpath: string | null;
  actionType: string;
  success: boolean;
  message: string;
}

/**
 * Subset of a cached action supplied as a hint when replaying; every field
 * except `actionType` is optional.
 */
export interface CachedActionHint {
  actionType: string;
  xpath?: string | null;
  frameIndex?: number | null;
  method?: string | null;
  arguments?: string[];
  elementId?: string | null;
  // NOTE(review): type arguments restored as for ActionCacheEntry.actionParams.
  actionParams?: Record<string, unknown>;
}

/** Metadata describing how a single replayed step was executed. */
export interface ReplayStepMeta {
  usedCachedAction: boolean;
  /** True when the cached action failed and a fallback produced the result. */
  fallbackUsed: boolean;
  /** Number of cached attempts made for the step. */
  retries?: number;
  cachedXPath?: string | null;
  fallbackXPath?: string | null;
  fallbackElementId?: string | null;
}

/** Serializable action cache for one task run. */
export interface ActionCacheOutput {
  taskId: string;
  /** Creation time as an ISO-8601 string. */
  createdAt: string;
  status?: TaskStatus;
  steps: ActionCacheEntry[];
}

/** Outcome of replaying one cached step. */
export interface ActionCacheReplayStepResult {
  stepIndex: number;
  actionType: string;
  usedXPath: boolean;
  fallbackUsed: boolean;
  cachedXPath?: string | null;
  fallbackXPath?: string | null;
  fallbackElementId?: string | null;
  retries: number;
  success: boolean;
  message: string;
}

/** Outcome of replaying a whole action cache; always ends COMPLETED or FAILED. */
export interface ActionCacheReplayResult {
  replayId: string;
  sourceTaskId: string;
  steps: ActionCacheReplayStepResult[];
  status: TaskStatus.COMPLETED | TaskStatus.FAILED;
}

/** Options accepted when running from an action cache. */
export interface RunFromActionCacheParams {
  // presumably the per-step number of cached XPath attempts — TODO confirm
  maxXPathRetries?: number;
  debug?: boolean;
}
/**
 * Dependencies passed to agent helpers that run outside the main agent loop.
 */
export interface AgentDeps {
  debug?: boolean;
  tokenLimit: number;
  // NOTE(review): `llm` and `mcpClient` are untyped; narrow them to the
  // project's LLM / MCP client types once those can be imported here.
  llm: any;
  mcpClient: any;
  /** Same shape as `HyperVariable`, declared below, instead of an inline copy. */
  variables: HyperVariable[];
  // presumably CDP actions are treated as enabled unless this is `false` — TODO confirm
  cdpActionsEnabled?: boolean;
}

export interface HyperVariable {
  key: string;
  value: string;
  description: string;
}

/**
 * Common options for all perform* helper methods on HyperPage.
 */
export interface PerformOptions {
  frameIndex?: number | null;
  performInstruction?: string | null;
  maxSteps?: number;
}
overlays) */ - ai: (task: string, params?: TaskParams) => Promise; + ai: (task: string, params?: TaskParams) => Promise; /** * Execute a single granular action using a11y mode @@ -112,4 +236,9 @@ export interface HyperPage extends Page { outputSchema?: T, params?: Omit ): Promise ? z.infer : string>; + getActionCache: (taskId: string) => ActionCacheOutput | null; + runFromActionCache: ( + cache: ActionCacheOutput, + params?: RunFromActionCacheParams + ) => Promise; } diff --git a/src/types/index.ts b/src/types/index.ts index 9924f9a..9177fa7 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -12,12 +12,18 @@ import { AgentOutputFn, AgentOutput, AgentStep, + ActionCacheEntry, + ActionCacheOutput, + ActionCacheReplayResult, + ActionCacheReplayStepResult, + RunFromActionCacheParams, TaskParams, TaskOutput, Task, TaskStatus, TaskState, endTaskStatuses, + PerformOptions, } from "./agent/types"; // Config Types @@ -39,11 +45,17 @@ export { AgentOutputFn, AgentOutput, AgentStep, + ActionCacheEntry, + ActionCacheOutput, + ActionCacheReplayResult, + ActionCacheReplayStepResult, + RunFromActionCacheParams, TaskParams, TaskOutput, Task, TaskStatus, TaskState, + PerformOptions, // Config Types MCPServerConfig,