diff --git a/CHANGELOG.md b/CHANGELOG.md index 68e1532..3de8682 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,115 @@ All notable changes to `@predicatelabs/sdk` will be documented in this file. ## Unreleased +### 2026-02-15 + +#### PredicateBrowserAgent (snapshot-first, verification-first) + +`PredicateBrowserAgent` is a new high-level agent wrapper that gives you a **browser-use-like** `step()` / `run()` surface, but keeps Predicate’s core philosophy: + +- **Snapshot-first perception** (structured DOM snapshot is the default) +- **Verification-first control plane** (you can gate progress with deterministic checks) +- Optional **vision fallback** (bounded) when snapshots aren’t sufficient + +It’s built on top of `AgentRuntime` + `RuntimeAgent`. + +##### Quickstart (single step) + +```ts +import { + AgentRuntime, + PredicateBrowserAgent, + type RuntimeStep, + LocalLLMProvider, // or OpenAIProvider / AnthropicProvider / DeepInfraProvider +} from '@predicatelabs/sdk'; + +const runtime = new AgentRuntime(browserLike, page, tracer); +const llm = new LocalLLMProvider({ model: 'qwen2.5:7b', baseUrl: 'http://localhost:11434/v1' }); + +const agent = new PredicateBrowserAgent({ + runtime, + executor: llm, + config: { + // Token control: include last N step summaries in the prompt (0 disables history). + historyLastN: 2, + }, +}); + +const { ok } = await agent.step({ + taskGoal: 'Find pricing and verify checkout button exists', + step: { goal: 'Open pricing page' } satisfies RuntimeStep, +}); +``` + +##### Customize the compact prompt (advanced) + +```ts +const agent = new PredicateBrowserAgent({ + runtime, + executor: llm, + config: { + compactPromptBuilder: (_taskGoal, _stepGoal, domContext, _snap, historySummary) => ({ + systemPrompt: + 'You are a web automation agent. 
Return ONLY one action: CLICK(id) | TYPE(id,"text") | PRESS("key") | FINISH()', + userPrompt: `RECENT:\n${historySummary}\n\nELEMENTS:\n${domContext}\n\nReturn the single best action:`, + }), + }, +}); +``` + +##### CAPTCHA handling (interface-only; no solver shipped) + +If you set `captcha.policy="callback"`, you must provide a handler. The SDK does **not** include a public CAPTCHA solver. + +```ts +import { HumanHandoffSolver } from '@predicatelabs/sdk'; + +const agent = new PredicateBrowserAgent({ + runtime, + executor: llm, + config: { + captcha: { + policy: 'callback', + // Manual solve in the live session; SDK waits until it clears: + handler: HumanHandoffSolver({ timeoutMs: 10 * 60_000, pollMs: 1_000 }), + }, + }, +}); +``` + +#### RuntimeAgent: structured prompt override hooks + +`RuntimeAgent` now supports optional hooks used by `PredicateBrowserAgent`: + +- `structuredPromptBuilder(...)` +- `domContextPostprocessor(...)` +- `historySummaryProvider(...)` + +#### PredicateBrowserAgent: opt-in token usage accounting (best-effort) + +If you want to measure token spend, you can enable best-effort accounting (depends on provider reporting token counts): + +```ts +const agent = new PredicateBrowserAgent({ + runtime, + executor: llm, + config: { + tokenUsageEnabled: true, + }, +}); + +const usage = agent.getTokenUsage(); +agent.resetTokenUsage(); +``` + +#### RuntimeAgent: actOnce without step lifecycle (orchestrators) + +`RuntimeAgent` now exposes `actOnce(...)` helpers that execute exactly one action **without** calling `runtime.beginStep()` / `runtime.emitStepEnd()`. This is intended for external orchestrators (e.g. WebBench) that already own step lifecycle and just want the SDK’s snapshot-first propose+execute block. + +- `await agent.actOnce(...) -> string` +- `await agent.actOnceWithSnapshot(...) -> { action, snap }` +- `await agent.actOnceResult(...) 
-> { action, snap, usedVision }` + ### 2026-02-13 #### Expanded deterministic verifications (adaptive resnapshotting) diff --git a/examples/agent/README.md b/examples/agent/README.md new file mode 100644 index 0000000..c42a9ef --- /dev/null +++ b/examples/agent/README.md @@ -0,0 +1,6 @@ +Predicate agent examples. + +- `predicate-browser-agent-minimal.ts`: minimal `PredicateBrowserAgent` usage. +- `predicate-browser-agent-custom-prompt.ts`: customize the compact prompt builder. +- `predicate-browser-agent-video-recording-playwright.ts`: enable Playwright video recording via context options (recommended). + diff --git a/examples/agent/predicate-browser-agent-custom-prompt.ts b/examples/agent/predicate-browser-agent-custom-prompt.ts new file mode 100644 index 0000000..aaf67f7 --- /dev/null +++ b/examples/agent/predicate-browser-agent-custom-prompt.ts @@ -0,0 +1,114 @@ +/** + * Example: PredicateBrowserAgent with compact prompt customization. + * + * Usage: + * ts-node examples/agent/predicate-browser-agent-custom-prompt.ts + */ + +import { Page } from 'playwright'; +import { + AgentRuntime, + PredicateBrowserAgent, + type PredicateBrowserAgentConfig, + RuntimeStep, + SentienceBrowser, +} from '../../src'; +import { createTracer } from '../../src/tracing/tracer-factory'; +import { LLMProvider, type LLMResponse } from '../../src/llm-provider'; +import type { Snapshot } from '../../src/types'; + +function createBrowserAdapter(browser: SentienceBrowser) { + return { + snapshot: async (_page: Page, options?: Record): Promise => { + return await browser.snapshot(options); + }, + }; +} + +class RecordingProvider extends LLMProvider { + public lastSystem: string | null = null; + public lastUser: string | null = null; + + constructor(private action: string = 'FINISH()') { + super(); + } + + get modelName(): string { + return 'recording-provider'; + } + supportsJsonMode(): boolean { + return false; + } + async generate( + systemPrompt: string, + userPrompt: string, + _options: 
Record = {} + ): Promise { + this.lastSystem = systemPrompt; + this.lastUser = userPrompt; + return { content: this.action, modelName: this.modelName }; + } +} + +const config: PredicateBrowserAgentConfig = { + historyLastN: 2, + compactPromptBuilder: ( + taskGoal: string, + stepGoal: string, + domContext: string, + _snap: Snapshot, + historySummary: string + ) => { + const systemPrompt = + 'You are a web automation executor. Return ONLY ONE action: CLICK(id) | TYPE(id,"text") | PRESS("key") | FINISH(). No prose.'; + const userPrompt = + `TASK GOAL:\n${taskGoal}\n\n` + + (historySummary ? `RECENT STEPS:\n${historySummary}\n\n` : '') + + `STEP GOAL:\n${stepGoal}\n\n` + + `DOM CONTEXT:\n${domContext.slice(0, 4000)}\n`; + return { systemPrompt, userPrompt }; + }, +}; + +async function main() { + const apiKey = (process.env.PREDICATE_API_KEY || + process.env.SENTIENCE_API_KEY) as string | undefined; + if (!apiKey) { + console.error('Error: PREDICATE_API_KEY or SENTIENCE_API_KEY not set'); + process.exit(1); + } + + const runId = 'predicate-browser-agent-custom-prompt'; + const tracer = await createTracer({ apiKey, runId, uploadTrace: false }); + + const browser = new SentienceBrowser(apiKey, undefined, false); + await browser.start(); + const page = browser.getPage(); + + try { + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle'); + + const runtime = new AgentRuntime(createBrowserAdapter(browser), page, tracer); + const executor = new RecordingProvider('FINISH()'); + + const agent = new PredicateBrowserAgent({ runtime, executor, config }); + + const out = await agent.step({ + taskGoal: 'Open example.com', + step: { goal: 'Take no action; just finish' } satisfies RuntimeStep, + }); + + console.log(`step ok: ${out.ok}`); + console.log('--- prompt preview (system) ---'); + console.log((executor.lastSystem || '').slice(0, 300)); + console.log('--- prompt preview (user) ---'); + console.log((executor.lastUser || '').slice(0, 300)); + } 
finally { + await tracer.close(true); + await browser.close(); + } +} + +main().catch(console.error); + diff --git a/examples/agent/predicate-browser-agent-minimal.ts b/examples/agent/predicate-browser-agent-minimal.ts new file mode 100644 index 0000000..8927eb0 --- /dev/null +++ b/examples/agent/predicate-browser-agent-minimal.ts @@ -0,0 +1,108 @@ +/** + * Example: PredicateBrowserAgent minimal demo. + * + * Usage: + * ts-node examples/agent/predicate-browser-agent-minimal.ts + * + * Requires: + * - PREDICATE_API_KEY or SENTIENCE_API_KEY (SentienceBrowser API key) + */ + +import { Page } from 'playwright'; +import { + AgentRuntime, + PredicateBrowserAgent, + type PredicateBrowserAgentConfig, + RuntimeStep, + StepVerification, + SentienceBrowser, + exists, + urlContains, +} from '../../src'; +import { createTracer } from '../../src/tracing/tracer-factory'; +import { LLMProvider, type LLMResponse } from '../../src/llm-provider'; +import type { Snapshot } from '../../src/types'; + +function createBrowserAdapter(browser: SentienceBrowser) { + return { + snapshot: async (_page: Page, options?: Record): Promise => { + return await browser.snapshot(options); + }, + }; +} + +class FixedActionProvider extends LLMProvider { + constructor(private action: string) { + super(); + } + get modelName(): string { + return 'fixed-action'; + } + supportsJsonMode(): boolean { + return false; + } + async generate( + _systemPrompt: string, + _userPrompt: string, + _options: Record = {} + ): Promise { + return { content: this.action, modelName: this.modelName }; + } +} + +async function main() { + const apiKey = (process.env.PREDICATE_API_KEY || + process.env.SENTIENCE_API_KEY) as string | undefined; + if (!apiKey) { + console.error('Error: PREDICATE_API_KEY or SENTIENCE_API_KEY not set'); + process.exit(1); + } + + const runId = 'predicate-browser-agent-minimal'; + const tracer = await createTracer({ apiKey, runId, uploadTrace: false }); + + const browser = new SentienceBrowser(apiKey, 
undefined, false); + await browser.start(); + const page = browser.getPage(); + + try { + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle'); + + const runtime = new AgentRuntime(createBrowserAdapter(browser), page, tracer); + + const executor = new FixedActionProvider('FINISH()'); + const config: PredicateBrowserAgentConfig = { historyLastN: 2 }; + + const agent = new PredicateBrowserAgent({ runtime, executor, config }); + + const steps: RuntimeStep[] = [ + { + goal: 'Verify Example Domain is loaded', + verifications: [ + { + predicate: urlContains('example.com'), + label: 'url_contains_example', + required: true, + } satisfies StepVerification, + { + predicate: exists('role=heading'), + label: 'has_heading', + required: true, + } satisfies StepVerification, + ], + maxSnapshotAttempts: 2, + snapshotLimitBase: 60, + }, + ]; + + const ok = await agent.run({ taskGoal: 'Open example.com and verify', steps }); + console.log(`run ok: ${ok}`); + } finally { + await tracer.close(true); + await browser.close(); + } +} + +main().catch(console.error); + diff --git a/examples/agent/predicate-browser-agent-video-recording-playwright.ts b/examples/agent/predicate-browser-agent-video-recording-playwright.ts new file mode 100644 index 0000000..bd95e1e --- /dev/null +++ b/examples/agent/predicate-browser-agent-video-recording-playwright.ts @@ -0,0 +1,101 @@ +/** + * Example: PredicateBrowserAgent + Playwright video recording (recommended approach). + * + * Video recording is a Playwright context feature (recordVideo), not an agent constructor flag. + * This example shows how to: + * 1) create a Playwright context with recordVideo enabled + * 2) wrap the existing page with SentienceBrowser.fromPage(...) 
+ * 3) use AgentRuntime + PredicateBrowserAgent normally + * + * Usage: + * ts-node examples/agent/predicate-browser-agent-video-recording-playwright.ts + */ + +import { chromium } from 'playwright'; +import * as fs from 'fs'; +import * as path from 'path'; + +import { + AgentRuntime, + PredicateBrowserAgent, + type PredicateBrowserAgentConfig, + SentienceBrowser, + type RuntimeStep, +} from '../../src'; +import { createTracer } from '../../src/tracing/tracer-factory'; +import { LLMProvider, type LLMResponse } from '../../src/llm-provider'; +import type { Snapshot } from '../../src/types'; + +function createBrowserAdapter(browser: SentienceBrowser) { + return { + snapshot: async (_page: any, options?: Record): Promise => { + return await browser.snapshot(options); + }, + }; +} + +class FixedActionProvider extends LLMProvider { + constructor(private action: string) { + super(); + } + get modelName(): string { + return 'fixed-action'; + } + supportsJsonMode(): boolean { + return false; + } + async generate(_system: string, _user: string, _opts: Record = {}): Promise { + return { content: this.action, modelName: this.modelName }; + } +} + +async function main() { + const apiKey = (process.env.PREDICATE_API_KEY || + process.env.SENTIENCE_API_KEY) as string | undefined; + + const recordingsDir = path.join(process.cwd(), 'recordings'); + if (!fs.existsSync(recordingsDir)) fs.mkdirSync(recordingsDir, { recursive: true }); + + const pw = await chromium.launch({ headless: false }); + const context = await pw.newContext({ + recordVideo: { dir: recordingsDir, size: { width: 1280, height: 720 } }, + }); + const page = await context.newPage(); + + const runId = 'predicate-browser-agent-video-recording'; + const tracer = await createTracer({ apiKey, runId, uploadTrace: false }); + + // Wrap existing Playwright page. 
+ const sentienceBrowser = SentienceBrowser.fromPage(page, apiKey); + + try { + await page.goto('https://example.com'); + await page.waitForLoadState('networkidle'); + + const runtime = new AgentRuntime(createBrowserAdapter(sentienceBrowser), page as any, tracer); + const config: PredicateBrowserAgentConfig = { historyLastN: 0 }; + + const agent = new PredicateBrowserAgent({ + runtime, + executor: new FixedActionProvider('FINISH()'), + config, + }); + + const out = await agent.step({ + taskGoal: 'Open example.com', + step: { goal: 'Finish immediately' } satisfies RuntimeStep, + }); + console.log(`step ok: ${out.ok}`); + console.log(`videos will be saved under: ${recordingsDir}`); + } finally { + await tracer.close(true); + await context.close(); // flush video + await pw.close(); + } +} + +main().catch(err => { + console.error(err); + process.exit(1); +}); + diff --git a/src/agents/browser-agent.ts b/src/agents/browser-agent.ts new file mode 100644 index 0000000..7245af6 --- /dev/null +++ b/src/agents/browser-agent.ts @@ -0,0 +1,341 @@ +import type { Snapshot, StepHookContext } from '../types'; +import type { PermissionPolicy } from '../browser'; +import type { AgentRuntime } from '../agent-runtime'; +import { LLMProvider } from '../llm-provider'; +import { RuntimeAgent } from '../runtime-agent'; +import type { RuntimeStep } from '../runtime-agent'; +import type { CaptchaOptions } from '../captcha/types'; +import type { CaptchaHandler } from '../captcha/types'; + +export interface PermissionRecoveryConfig { + enabled?: boolean; + maxRestarts?: number; + autoGrant?: string[]; + geolocation?: Record | null; + origin?: string | null; +} + +export interface VisionFallbackConfig { + enabled?: boolean; + maxVisionCalls?: number; + triggerRequiresVision?: boolean; + triggerRepeatedNoop?: boolean; + triggerCanvasOrLowActionables?: boolean; +} + +export interface CaptchaConfig { + policy?: 'abort' | 'callback'; + // Interface-only: SDK does not ship captcha solvers. 
Users provide a handler/callback. + handler?: CaptchaHandler | null; + timeoutMs?: number | null; + pollMs?: number | null; + minConfidence?: number; +} + +export interface PredicateBrowserAgentConfig { + // Permissions + permissionStartup?: PermissionPolicy | null; + permissionRecovery?: PermissionRecoveryConfig | null; + + // Vision fallback + vision?: VisionFallbackConfig; + + // CAPTCHA handling + captcha?: CaptchaConfig; + + // Prompt / token controls + historyLastN?: number; // 0 disables LLM-facing step history + + // Opt-in: track token usage from LLM provider responses (best-effort). + tokenUsageEnabled?: boolean; + + // Compact prompt customization + // builder(taskGoal, stepGoal, domContext, snapshot, historySummary) -> {systemPrompt, userPrompt} + compactPromptBuilder?: ( + taskGoal: string, + stepGoal: string, + domContext: string, + snap: Snapshot, + historySummary: string + ) => { systemPrompt: string; userPrompt: string }; + + compactPromptPostprocessor?: (domContext: string) => string; +} + +function historySummary(items: string[]): string { + if (!items.length) return ''; + return items.map(s => `- ${s}`).join('\n'); +} + +function applyCaptchaConfigToRuntime(runtime: AgentRuntime, cfg: CaptchaConfig | undefined): void { + if (!cfg) return; + + const policy = (cfg.policy ?? 'abort').toLowerCase() as 'abort' | 'callback'; + if (policy === 'abort') { + runtime.setCaptchaOptions({ + policy: 'abort', + minConfidence: cfg.minConfidence ?? 0.7, + } satisfies CaptchaOptions); + return; + } + + const pollMs = cfg.pollMs ?? 1_000; + const timeoutMs = cfg.timeoutMs ?? 120_000; + const minConfidence = cfg.minConfidence ?? 0.7; + + const handler = cfg.handler ?? null; + if (!handler) { + throw new Error( + 'captcha.handler is required when captcha.policy="callback". ' + + 'Provide a handler callback (e.g. human handoff or your external system).' 
+ ); + } + + runtime.setCaptchaOptions({ + policy: 'callback', + handler, + timeoutMs, + pollMs, + minConfidence, + } satisfies CaptchaOptions); +} + +type TokenUsageTotals = { + calls: number; + promptTokens: number; + completionTokens: number; + totalTokens: number; +}; + +class TokenUsageCollector { + private byRole: Record = {}; + private byModel: Record = {}; + + record(role: string, resp: any): void { + const pt = typeof resp?.promptTokens === 'number' ? resp.promptTokens : 0; + const ct = typeof resp?.completionTokens === 'number' ? resp.completionTokens : 0; + const tt = typeof resp?.totalTokens === 'number' ? resp.totalTokens : pt + ct; + const model = String(resp?.modelName ?? 'unknown') || 'unknown'; + + const bump = (dst: Record, key: string) => { + const cur = + dst[key] ?? + ({ calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0 } as TokenUsageTotals); + cur.calls += 1; + cur.promptTokens += Math.max(0, pt); + cur.completionTokens += Math.max(0, ct); + cur.totalTokens += Math.max(0, tt); + dst[key] = cur; + }; + + bump(this.byRole, role); + bump(this.byModel, model); + } + + reset(): void { + this.byRole = {}; + this.byModel = {}; + } + + summary(): { + total: TokenUsageTotals; + byRole: Record; + byModel: Record; + } { + const sum = (src: Record): TokenUsageTotals => { + return Object.values(src).reduce( + (acc, v) => ({ + calls: acc.calls + v.calls, + promptTokens: acc.promptTokens + v.promptTokens, + completionTokens: acc.completionTokens + v.completionTokens, + totalTokens: acc.totalTokens + v.totalTokens, + }), + { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0 } + ); + }; + return { total: sum(this.byRole), byRole: this.byRole, byModel: this.byModel }; + } +} + +class TokenAccountingProvider extends LLMProvider { + constructor( + private inner: LLMProvider, + private collector: TokenUsageCollector, + private role: string + ) { + super(); + } + get modelName(): string { + return this.inner.modelName; + } + 
supportsJsonMode(): boolean { + return this.inner.supportsJsonMode(); + } + supportsVision(): boolean { + return this.inner.supportsVision?.() ?? false; + } + async generate( + systemPrompt: string, + userPrompt: string, + options: Record = {} + ): Promise { + const resp = await this.inner.generate(systemPrompt, userPrompt, options); + try { + this.collector.record(this.role, resp); + } catch { + // best-effort + } + return resp; + } + async generateWithImage( + systemPrompt: string, + userPrompt: string, + imageBase64: string, + options: Record = {} + ): Promise { + const fn = (this.inner as any).generateWithImage; + if (typeof fn !== 'function') { + throw new Error('Inner provider does not implement generateWithImage'); + } + const resp = await fn.call(this.inner, systemPrompt, userPrompt, imageBase64, options); + try { + this.collector.record(this.role, resp); + } catch { + // best-effort + } + return resp; + } +} + +export type StepOutcome = { stepGoal: string; ok: boolean }; + +export class PredicateBrowserAgent { + readonly runtime: AgentRuntime; + readonly executor: LLMProvider; + readonly visionExecutor?: LLMProvider; + readonly visionVerifier?: LLMProvider; + readonly config: PredicateBrowserAgentConfig; + + private history: string[] = []; + private visionCallsUsed = 0; + private runner: RuntimeAgent; + private tokenUsage: TokenUsageCollector | null = null; + + constructor(opts: { + runtime: AgentRuntime; + executor: LLMProvider; + visionExecutor?: LLMProvider; + visionVerifier?: LLMProvider; + config?: PredicateBrowserAgentConfig; + }) { + const tokenUsageEnabled = Boolean(opts.config?.tokenUsageEnabled); + const collector = tokenUsageEnabled ? new TokenUsageCollector() : null; + + this.runtime = opts.runtime; + this.tokenUsage = collector; + this.executor = collector + ? new TokenAccountingProvider(opts.executor, collector, 'executor') + : opts.executor; + this.visionExecutor = + collector && opts.visionExecutor + ? 
new TokenAccountingProvider(opts.visionExecutor, collector, 'vision_executor') + : opts.visionExecutor; + this.visionVerifier = + collector && opts.visionVerifier + ? new TokenAccountingProvider(opts.visionVerifier, collector, 'vision_verifier') + : opts.visionVerifier; + this.config = { + permissionStartup: null, + permissionRecovery: null, + vision: { enabled: false, maxVisionCalls: 0 }, + captcha: { policy: 'abort', handler: null }, + historyLastN: 0, + ...(opts.config ?? {}), + }; + + applyCaptchaConfigToRuntime(this.runtime, this.config.captcha); + + this.runner = new RuntimeAgent({ + runtime: this.runtime, + executor: this.executor, + visionExecutor: this.visionExecutor, + visionVerifier: this.visionVerifier, + structuredPromptBuilder: this.config.compactPromptBuilder, + domContextPostprocessor: this.config.compactPromptPostprocessor, + historySummaryProvider: () => { + const n = Math.max(0, this.config.historyLastN ?? 0); + if (n <= 0) return ''; + const slice = this.history.slice(Math.max(0, this.history.length - n)); + return historySummary(slice); + }, + } as any); + } + + getTokenUsage(): any { + if (!this.tokenUsage) { + return { enabled: false, reason: 'tokenUsageEnabled is false' }; + } + return { enabled: true, ...this.tokenUsage.summary() }; + } + + resetTokenUsage(): void { + this.tokenUsage?.reset(); + } + + private recordHistory(stepGoal: string, ok: boolean) { + const n = Math.max(0, this.config.historyLastN ?? 0); + if (n <= 0) return; + this.history.push(`${stepGoal} -> ${ok ? 'ok' : 'fail'}`); + if (this.history.length > n) { + this.history = this.history.slice(this.history.length - n); + } + } + + async step(opts: { + taskGoal: string; + step: RuntimeStep; + onStepStart?: (ctx: StepHookContext) => void | Promise; + onStepEnd?: (ctx: StepHookContext) => void | Promise; + }): Promise { + let step = opts.step; + + const maxVisionCalls = Math.max(0, this.config.vision?.maxVisionCalls ?? 
0); + if ( + this.config.vision?.enabled && + maxVisionCalls > 0 && + this.visionCallsUsed >= maxVisionCalls + ) { + step = { ...step, visionExecutorEnabled: false, maxVisionExecutorAttempts: 0 }; + } + + const ok = await this.runner.runStep({ + taskGoal: opts.taskGoal, + step, + onStepStart: opts.onStepStart, + onStepEnd: opts.onStepEnd, + }); + + this.recordHistory(step.goal, ok); + return { stepGoal: step.goal, ok }; + } + + async run(opts: { + taskGoal: string; + steps: RuntimeStep[]; + onStepStart?: (ctx: StepHookContext) => void | Promise; + onStepEnd?: (ctx: StepHookContext) => void | Promise; + stopOnFailure?: boolean; + }): Promise { + const stopOnFailure = opts.stopOnFailure ?? true; + for (const step of opts.steps) { + const out = await this.step({ + taskGoal: opts.taskGoal, + step, + onStepStart: opts.onStepStart, + onStepEnd: opts.onStepEnd, + }); + if (stopOnFailure && !out.ok) return false; + } + return true; + } +} diff --git a/src/index.ts b/src/index.ts index fdbb4de..a114a98 100644 --- a/src/index.ts +++ b/src/index.ts @@ -91,6 +91,13 @@ export { SentienceDebugger, PredicateDebugger } from './debugger'; export { RuntimeAgent } from './runtime-agent'; export type { RuntimeStep, StepVerification } from './runtime-agent'; export { parseVisionExecutorAction, executeVisionExecutorAction } from './vision-executor'; +export { + PredicateBrowserAgent, + type PredicateBrowserAgentConfig, + type PermissionRecoveryConfig, + type VisionFallbackConfig, + type CaptchaConfig, +} from './agents/browser-agent'; export * from './captcha/types'; export * from './captcha/strategies'; export * from './tools'; diff --git a/src/runtime-agent.ts b/src/runtime-agent.ts index d72aa54..8a6559a 100644 --- a/src/runtime-agent.ts +++ b/src/runtime-agent.ts @@ -58,6 +58,17 @@ export class RuntimeAgent { readonly shortCircuitCanvas: boolean; private structuredLLM: LLMInteractionHandler; + private structuredPromptBuilder?: + | (( + taskGoal: string, + stepGoal: string, + 
domContext: string, + snap: Snapshot, + historySummary: string + ) => { systemPrompt: string; userPrompt: string }) + | undefined; + private domContextPostprocessor?: ((domContext: string) => string) | undefined; + private historySummaryProvider?: (() => string) | undefined; constructor(opts: { runtime: AgentRuntime; @@ -65,6 +76,15 @@ export class RuntimeAgent { visionExecutor?: LLMProvider; visionVerifier?: LLMProvider; shortCircuitCanvas?: boolean; + structuredPromptBuilder?: ( + taskGoal: string, + stepGoal: string, + domContext: string, + snap: Snapshot, + historySummary: string + ) => { systemPrompt: string; userPrompt: string }; + domContextPostprocessor?: (domContext: string) => string; + historySummaryProvider?: () => string; }) { this.runtime = opts.runtime; this.executor = opts.executor; @@ -72,6 +92,9 @@ export class RuntimeAgent { this.visionVerifier = opts.visionVerifier; this.shortCircuitCanvas = opts.shortCircuitCanvas ?? true; this.structuredLLM = new LLMInteractionHandler(this.executor, false); + this.structuredPromptBuilder = opts.structuredPromptBuilder; + this.domContextPostprocessor = opts.domContextPostprocessor; + this.historySummaryProvider = opts.historySummaryProvider; } async runStep(opts: { @@ -156,6 +179,106 @@ export class RuntimeAgent { } } + /** + * Execute exactly one action for a step without owning step lifecycle. + * + * This is intended for orchestrators that already call `runtime.beginStep(...)` / + * `runtime.emitStepEnd(...)` and want to reuse the SDK's snapshot-first action proposal + * and execution logic without double-counting budgets or emitting duplicate events. + */ + async actOnce(opts: { + taskGoal: string; + step: RuntimeStep; + allowVisionFallback?: boolean; + historySummary?: string; + }): Promise { + const res = await this.actOnceResult(opts); + return res.action; + } + + /** + * Like `actOnce`, but also returns the pre-action snapshot used for proposal. 
+ */ + async actOnceWithSnapshot(opts: { + taskGoal: string; + step: RuntimeStep; + allowVisionFallback?: boolean; + historySummary?: string; + }): Promise<{ action: string; snap: Snapshot }> { + const res = await this.actOnceResult(opts); + return { action: res.action, snap: res.snap }; + } + + /** + * Like `actOnce`, but also indicates whether vision was used. + */ + async actOnceResult(opts: { + taskGoal: string; + step: RuntimeStep; + allowVisionFallback?: boolean; + historySummary?: string; + }): Promise<{ action: string; snap: Snapshot; usedVision: boolean }> { + const { taskGoal, step } = opts; + const allowVisionFallback = opts.allowVisionFallback ?? true; + const historySummary = (opts.historySummary ?? '').trim(); + + const snap = await this.snapshotWithRamp(step); + + if (allowVisionFallback && (await this.shouldShortCircuitToVision(step, snap))) { + const provider = this.visionExecutor; + if (provider && provider.supportsVision?.()) { + const url = this.runtime.page?.url?.() ?? snap?.url ?? '(unknown)'; + const buf = (await (this.runtime.page as any).screenshot({ type: 'png' })) as Buffer; + const imageBase64 = Buffer.from(buf).toString('base64'); + + const { systemPrompt, userPrompt } = this.visionExecutorPrompts({ + taskGoal, + step, + url, + snap, + }); + + const resp = await provider.generateWithImage(systemPrompt, userPrompt, imageBase64, { + temperature: 0.0, + }); + const action = this.extractActionFromText(resp.content); + await this.executeAction(action, snap ?? undefined); + return { action, snap, usedVision: true }; + } + } + + // Structured snapshot-first proposal. 
+ let domContext = this.structuredLLM.buildContext(snap, step.goal); + if (this.domContextPostprocessor) { + domContext = this.domContextPostprocessor(domContext); + } + + let action: string; + if (this.structuredPromptBuilder) { + const { systemPrompt, userPrompt } = this.structuredPromptBuilder( + taskGoal, + step.goal, + domContext, + snap, + historySummary || (this.historySummaryProvider?.() ?? '').trim() + ); + const resp = await this.executor.generate(systemPrompt, userPrompt, { temperature: 0.0 }); + action = this.extractActionFromText(resp.content); + } else { + let combinedGoal = taskGoal; + const hs = historySummary || (this.historySummaryProvider?.() ?? '').trim(); + if (hs) { + combinedGoal = `${taskGoal}\n\nRECENT STEPS:\n${hs}`; + } + combinedGoal = `${combinedGoal}\n\nSTEP: ${step.goal}`; + const resp = await this.structuredLLM.queryLLM(domContext, combinedGoal); + action = this.extractActionFromText(resp.content); + } + + await this.executeAction(action, snap); + return { action, snap, usedVision: false }; + } + private async runHook( hook: ((ctx: StepHookContext) => void | Promise) | undefined, ctx: StepHookContext @@ -210,8 +333,31 @@ export class RuntimeAgent { snap: Snapshot; }): Promise { const { taskGoal, step, snap } = opts; - const domContext = this.structuredLLM.buildContext(snap, step.goal); - const combinedGoal = `${taskGoal}\n\nSTEP: ${step.goal}`; + let domContext = this.structuredLLM.buildContext(snap, step.goal); + if (this.domContextPostprocessor) { + domContext = this.domContextPostprocessor(domContext); + } + + const historySummary = (this.historySummaryProvider?.() ?? 
'').trim(); + + if (this.structuredPromptBuilder) { + const { systemPrompt, userPrompt } = this.structuredPromptBuilder( + taskGoal, + step.goal, + domContext, + snap, + historySummary + ); + const resp = await this.executor.generate(systemPrompt, userPrompt, { temperature: 0.0 }); + return this.extractActionFromText(resp.content); + } + + let combinedGoal = taskGoal; + if (historySummary) { + combinedGoal = `${taskGoal}\n\nRECENT STEPS:\n${historySummary}`; + } + combinedGoal = `${combinedGoal}\n\nSTEP: ${step.goal}`; + const resp = await this.structuredLLM.queryLLM(domContext, combinedGoal); return this.extractActionFromText(resp.content); } diff --git a/tests/predicate-browser-agent.test.ts b/tests/predicate-browser-agent.test.ts new file mode 100644 index 0000000..ad662ea --- /dev/null +++ b/tests/predicate-browser-agent.test.ts @@ -0,0 +1,155 @@ +import { PredicateBrowserAgent } from '../src/agents/browser-agent'; +import { AgentRuntime } from '../src/agent-runtime'; +import { Tracer } from '../src/tracing/tracer'; +import { TraceSink } from '../src/tracing/sink'; +import { MockPage } from './mocks/browser-mock'; +import { LLMProvider } from '../src/llm-provider'; +import type { LLMResponse } from '../src/llm-provider'; +import type { Snapshot, Element } from '../src/types'; + +class MockSink extends TraceSink { + public events: any[] = []; + emit(event: Record): void { + this.events.push(event); + } + async close(): Promise { + // no-op + } + getSinkType(): string { + return 'MockSink'; + } +} + +class ProviderStub extends LLMProvider { + private responses: string[]; + public calls: Array<{ system: string; user: string; options?: any }> = []; + + constructor(responses: string[] = []) { + super(); + this.responses = [...responses]; + } + + get modelName(): string { + return 'stub'; + } + + supportsJsonMode(): boolean { + return true; + } + + async generate( + systemPrompt: string, + userPrompt: string, + options: Record = {} + ): Promise { + this.calls.push({ 
system: systemPrompt, user: userPrompt, options }); + const content = this.responses.length ? (this.responses.shift() as string) : 'FINISH()'; + return { + content, + modelName: this.modelName, + promptTokens: 11, + completionTokens: 7, + totalTokens: 18, + }; + } +} + +function makeClickableElement(id: number): Element { + return { + id, + role: 'button', + text: 'OK', + importance: 100, + bbox: { x: 10, y: 20, width: 100, height: 40 }, + visual_cues: { is_primary: true, is_clickable: true, background_color_name: null }, + in_viewport: true, + is_occluded: false, + z_index: 1, + }; +} + +describe('PredicateBrowserAgent', () => { + it('allows compactPromptBuilder override', async () => { + const sink = new MockSink(); + const tracer = new Tracer('run', sink); + const page = new MockPage('https://example.com/start') as any; + + const snapshots: Snapshot[] = [ + { + status: 'success', + url: 'https://example.com/start', + elements: [makeClickableElement(1)], + timestamp: 't1', + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + const executor = new ProviderStub(['CLICK(1)']); + + const agent = new PredicateBrowserAgent({ + runtime, + executor, + config: { + compactPromptBuilder: () => ({ systemPrompt: 'SYSTEM_CUSTOM', userPrompt: 'USER_CUSTOM' }), + captcha: { policy: 'abort' }, + }, + }); + + const ok = await agent.step({ + taskGoal: 'test', + step: { goal: 'Click OK', maxSnapshotAttempts: 1 }, + }); + + expect(ok.ok).toBe(true); + expect(executor.calls.length).toBe(1); + expect(executor.calls[0].system).toContain('SYSTEM_CUSTOM'); + expect(executor.calls[0].user).toBe('USER_CUSTOM'); + }); + + it('tracks token usage when opt-in enabled', async () => { + const sink = new MockSink(); + const tracer = new Tracer('run', sink); + const page = new MockPage('https://example.com/start') as any; + + const snapshots: Snapshot[] = [ + { + status: 
'success', + url: 'https://example.com/start', + elements: [makeClickableElement(1)], + timestamp: 't1', + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + const executor = new ProviderStub(['FINISH()']); + + const agent = new PredicateBrowserAgent({ + runtime, + executor, + config: { tokenUsageEnabled: true, captcha: { policy: 'abort' } }, + }); + + const out = await agent.step({ + taskGoal: 'test', + step: { goal: 'No-op', maxSnapshotAttempts: 1 }, + }); + expect(out.ok).toBe(true); + + const usage = agent.getTokenUsage(); + expect(usage.enabled).toBe(true); + expect(usage.total.totalTokens).toBeGreaterThanOrEqual(18); + expect(usage.byRole.executor.calls).toBeGreaterThanOrEqual(1); + + agent.resetTokenUsage(); + const usage2 = agent.getTokenUsage(); + expect(usage2.total.totalTokens).toBe(0); + }); +}); diff --git a/tests/runtime-agent.test.ts b/tests/runtime-agent.test.ts index 2cea20a..7ba072c 100644 --- a/tests/runtime-agent.test.ts +++ b/tests/runtime-agent.test.ts @@ -83,6 +83,46 @@ function makeClickableElement(id: number): Element { } describe('RuntimeAgent (runtime-backed agent)', () => { + it('actOnce executes without step lifecycle', async () => { + const sink = new MockSink(); + const tracer = new Tracer('run', sink); + const page = new MockPage('https://example.com/start') as any; + + const snapshots: Snapshot[] = [ + { + status: 'success', + url: 'https://example.com/start', + elements: [makeClickableElement(1)], + timestamp: 't1', + }, + ]; + + const browserLike = { + snapshot: async () => snapshots.shift() as Snapshot, + }; + + const runtime = new AgentRuntime(browserLike as any, page as any, tracer); + // Guard: actOnce must not call step lifecycle APIs. 
+ (runtime as any).beginStep = jest.fn(() => { + throw new Error('beginStep should not be called by actOnce'); + }); + (runtime as any).emitStepEnd = jest.fn(() => { + throw new Error('emitStepEnd should not be called by actOnce'); + }); + + const executor = new ProviderStub(['CLICK(1)']); + const agent = new RuntimeAgent({ runtime, executor }); + + const action = await agent.actOnce({ + taskGoal: 'Do a thing', + step: { goal: 'Click OK', maxSnapshotAttempts: 1 }, + allowVisionFallback: false, + }); + + expect(action.toUpperCase().startsWith('CLICK(')).toBe(true); + expect(page.mouseClickCalls.length).toBeGreaterThan(0); + }); + it('structured executor succeeds without vision', async () => { const sink = new MockSink(); const tracer = new Tracer('run', sink);