browser-use · Cheggin · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/app/src/main/hl/engines/browsercode/adapter.ts b/app/src/main/hl/engines/browsercode/adapter.ts
@@ -8,7 +8,7 @@
 
 import { register } from '../registry';
 import { applyBrowserHarnessEnv } from '../browserHarnessEnv';
-import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt';
+import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt';
 import { resolveThemeMode } from '../../../themeMode';
 import { enrichedEnv } from '../pathEnrich';
 import { runCliCapture } from '../cliSpawn';
@@ -179,6 +179,7 @@ const browserCodeAdapter: EngineAdapter = {
       ...htmlBlockGuidanceLines(resolveThemeMode()),
       ...optionsBlockGuidanceLines(),
       ...askBlockGuidanceLines(),
+      ...loginBlockGuidanceLines(),
       "Use the `browser-harness-js` CLI for browser actions. Start with `browser-harness-js 'await connectToAssignedTarget()'`.",
       'Do not use old helpers.js convenience APIs for browser control.',
       'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.',

diff --git a/app/src/main/hl/engines/claude-code/adapter.ts b/app/src/main/hl/engines/claude-code/adapter.ts
@@ -13,7 +13,7 @@
 import { mainLogger } from '../../../logger';
 import { register } from '../registry';
 import { applyBrowserHarnessEnv } from '../browserHarnessEnv';
-import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt';
+import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt';
 import { resolveThemeMode } from '../../../themeMode';
 import { enrichedEnv } from '../pathEnrich';
 import { runCliCapture, spawnCli } from '../cliSpawn';
@@ -126,6 +126,7 @@ const claudeCodeAdapter: EngineAdapter = {
       ...htmlBlockGuidanceLines(resolveThemeMode()),
       ...optionsBlockGuidanceLines(),
       ...askBlockGuidanceLines(),
+      ...loginBlockGuidanceLines(),
       "Use the `browser-harness-js` CLI for browser actions. Start with `browser-harness-js 'await connectToAssignedTarget()'`.",
       'Do not use old helpers.js convenience APIs for browser control.',
       'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.',

diff --git a/app/src/main/hl/engines/codex/adapter.ts b/app/src/main/hl/engines/codex/adapter.ts
@@ -21,7 +21,7 @@ import path from 'node:path';
 import { mainLogger } from '../../../logger';
 import { register } from '../registry';
 import { applyBrowserHarnessEnv } from '../browserHarnessEnv';
-import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt';
+import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt';
 import { resolveThemeMode } from '../../../themeMode';
 import { enrichedEnv } from '../pathEnrich';
 import { runCliCapture } from '../cliSpawn';
@@ -117,6 +117,7 @@ const codexAdapter: EngineAdapter = {
       ...htmlBlockGuidanceLines(resolveThemeMode()),
       ...optionsBlockGuidanceLines(),
       ...askBlockGuidanceLines(),
+      ...loginBlockGuidanceLines(),
       "Use the `browser-harness-js` CLI for browser actions. Start with `browser-harness-js 'await connectToAssignedTarget()'`.",
       'Do not use old helpers.js convenience APIs for browser control.',
       'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.',

diff --git a/app/src/main/hl/engines/skillIndexPrompt.ts b/app/src/main/hl/engines/skillIndexPrompt.ts
@@ -98,6 +98,23 @@ export function askBlockGuidanceLines(): string[] {
   ];
 }
 
+/**
+ * Provider-neutral guidance for the `login` fenced block — the renderer
+ * surfaces it as a username/password form with a "log in manually in the
+ * browser" escape hatch. The agent reads the credentials from the next
+ * user turn and types them into the live browser view. Full schema and
+ * manual-login fallback contract: the `login-block` interaction skill.
+ * @returns Guidance lines for an engine adapter to spread into its list.
+ */
+export function loginBlockGuidanceLines(): string[] {
+  return [
+    'When the live browser hits a login wall and you need the user to provide credentials, emit a ```login fenced block carrying JSON: { site, url, prompt?, usernameLabel?, passwordLabel? }. `site` is the brand token (e.g. "Amazon", not "amazon.com"); `url` is the absolute http(s) login URL.',
+    'The `login` block ENDS YOUR TURN. After emitting it, do not call any more tools — stop and wait for the user. Their reply arrives as "Login for <site>:\\nusername: <u>\\npassword: <p>" — type these verbatim into the username/password fields of the live tab, then submit. Do NOT echo the password back in your own response.',
+    'The form also offers the user a "log in on <site> myself" affordance that opens the in-app browser view directly; if they take that path you will not get a structured reply, just whatever they type next (e.g. "done"). Treat any plain follow-up message as the signal to resume.',
+    'Use `login` only for real credential walls. For multiple-choice disambiguation, use `ask`; for picking among visible options, use `options`. See the `login-block` interaction skill for the full schema and worked examples.',
+  ];
+}
+
 function normalizeSlash(value: string): string {
   return value.split(path.sep).join('/');
 }

diff --git a/app/src/main/hl/stock/interaction-skills/capture-block.md b/app/src/main/hl/stock/interaction-skills/capture-block.md
@@ -0,0 +1,247 @@
+# Capture block — reCAPTCHA 3×3 tile picker
+
+When you hit a Google reCAPTCHA image challenge ("Select all squares with
+motorcycles", "Select all images with traffic lights"), emit a fenced
+` ```capture ` block. The renderer slices your single screenshot into 9
+clickable tiles + a reCAPTCHA-style header bar; the user picks tiles and
+their selection comes back to you as a new user turn.
+
+## Hard constraints — read first
+
+1. **Only the tile grid** (3×3, rarely 4×4). No prompt header, no Verify/Audio/Info
+   row, no surrounding chrome. The renderer evenly subdivides the PNG into
+   `rows` × `cols` tiles (9 by default); any extra pixels break the tile boundaries.
+2. **CSS pixels everywhere.** `Page.captureScreenshot`'s `clip` uses CSS
+   pixels regardless of devicePixelRatio. NEVER reason about the output
+   PNG's pixel dimensions when picking clip coordinates. Don't run `sips
+   --cropOffset` on the saved PNG — that's output-pixel space and you'll
+   double-scale.
+3. **No tool calls after the closing fence.** Your turn ends; the agent
+   process idles until the user submits.
+
+## The reliable recipe — copy this
+
+The bframe (`https://www.google.com/recaptcha/api2/bframe?...`) is
+**cross-origin with any third-party host** that embeds reCAPTCHA
+(Cloudflare-protected sites, 2captcha demos, etc.). You cannot reach
+its `contentDocument` from the parent page — you must attach to its
+CDP target and run the DOM queries inside that session.
+
+Compute the grid rect from the **individual tile cells** —
+*never* from the `<table>` element itself, because that element's
+bounding rect includes extra layout space (the floating toolbar sometimes
+sits absolutely positioned inside it, and certain challenge variants
+pad the table beyond the visible tile area).
+
+```js
+browser-harness-js <<'EOF'
+const fs = await import('fs')
+
+// 1. Resolve the bframe's outer rect *in the parent page* first, before
+//    we route Runtime calls into the OOPIF.
+const parentTargetId = session.targetId
+const outerR = await session.Runtime.evaluate({
+  expression: `JSON.stringify((() => {
+    const el = Array.from(document.querySelectorAll('iframe'))
+      .find(e => e.src.includes('/recaptcha/api2/bframe'));
+    if (!el) return null;
+    const r = el.getBoundingClientRect();
+    return { x: r.x, y: r.y, w: r.width, h: r.height };
+  })())`,
+  returnByValue: true,
+})
+const outer = JSON.parse(outerR.result.value)
+if (!outer) throw new Error('bframe <iframe> element not found in parent DOM')
+
+// 2. Route subsequent Runtime/DOM calls to the bframe target. See
+//    cross-origin-iframes.md — `session.use(targetId)` auto-attaches
+//    and works whether the bframe is same-origin or cross-origin.
+const { targetInfos } = await session.Target.getTargets({})
+const bframeTarget = targetInfos.find(
+  t => t.type === 'iframe' && t.url.includes('/recaptcha/api2/bframe'),
+)
+if (!bframeTarget) throw new Error('bframe target not found — is the challenge open?')
+await session.use(bframeTarget.targetId)
+
+try {
+  // 3. Inside the bframe target: tile-union rect + challenge text.
+  //    Returns inner-iframe coords; we add `outer` once back outside.
+  const innerR = await session.Runtime.evaluate({
+    expression: `JSON.stringify((() => {
+      const tiles = Array.from(document.querySelectorAll('td.rc-imageselect-tile'));
+      if (tiles.length === 0) return { error: 'no tile cells (is challenge open?)' };
+      let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
+      for (const t of tiles) {
+        const r = t.getBoundingClientRect();
+        if (r.width <= 0 || r.height <= 0) continue;
+        if (r.x < minX) minX = r.x;
+        if (r.y < minY) minY = r.y;
+        if (r.x + r.width > maxX) maxX = r.x + r.width;
+        if (r.y + r.height > maxY) maxY = r.y + r.height;
+      }
+      const cols = Math.round(Math.sqrt(tiles.length)) === 4 ? 4 : 3;
+      const rows = cols;
+      const desc = document.querySelector('.rc-imageselect-desc-no-canonical, .rc-imageselect-desc');
+      const promptText = desc?.textContent?.trim() ?? '';
+      const target = desc?.querySelector('strong')?.textContent?.trim() ?? '';
+      const candidateImg = document.querySelector('.rc-imageselect-candidates img, .rc-canonical-bounding-box img');
+      const targetImage = candidateImg?.src ?? null;
+      return {
+        inner: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
+        rows, cols, promptText, target, targetImage,
+      };
+    })())`,
+    returnByValue: true,
+  })
+  var info = JSON.parse(innerR.result.value)
+  if (info.error) throw new Error(info.error)
+} finally {
+  // 4. Always route back to the parent before the next Page.* call.
+  await session.use(parentTargetId)
+}
+
+// 5. Translate inner → parent-page coords.
+const grid = {
+  x: outer.x + info.inner.x,
+  y: outer.y + info.inner.y,
+  w: info.inner.w,
+  h: info.inner.h,
+}
+const { rows, cols, promptText, target, targetImage } = info
+
+// 6. Clip-screenshot to the tile union, in CSS pixels.
+const shot = await session.Page.captureScreenshot({
+  format: 'png',
+  clip: { x: grid.x, y: grid.y, width: grid.w, height: grid.h, scale: 1 },
+})
+const path = `${process.env.BU_OUTPUTS_DIR}/recaptcha-grid.png`
+fs.writeFileSync(path, Buffer.from(shot.data, 'base64'))
+
+// Stash the grid rect for the click-back step on the next turn.
+globalThis.__captcha_grid = { ...grid, rows, cols }
+return { path, grid, rows, cols, promptText, target, targetImage }
+EOF
+```
+
+### Verification — required before emitting the fence
+
+Read the saved PNG back as an image and confirm:
+
+1. **It's a near-square** — width/height ratio between 0.95 and 1.05.
+2. **No toolbar visible** at the bottom (no refresh/audio/info icons,
+   no Verify/Skip button). If you see them, the tile union query was
+   bypassed somewhere — re-run the recipe; do not crop the saved PNG
+   with `sips` or `magick`.
+3. **9 cells fill the frame edge-to-edge.** You should see grout lines
+   roughly at 33% and 66% horizontally and vertically. If there's
+   blank space on any edge, the bframe rect was stale — wait 500ms and
+   re-query.
+
+If anything fails, click the checkbox again to refresh the challenge
+and re-run the recipe. Don't ship a wrong screenshot — the user will
+see misaligned tiles and click the wrong cells.
+
+### Common failures and how to avoid them
+
+- **Querying `.rc-imageselect-table-33` for height.** Its
+  `getBoundingClientRect` can be 388×388 even when the visible tiles
+  occupy only the top ~290px; the bottom strip contains the floating
+  Verify toolbar. Always use the per-tile union recipe above.
+- **Mixing CSS and output-pixel space.** `Page.captureScreenshot`'s
+  `clip` is in CSS pixels; the saved PNG comes out at CSS×DPR. NEVER
+  run `sips --cropOffset` on the saved PNG — that's output-pixel
+  space and you'll double-scale. If you need to re-crop, do it via
+  another `captureScreenshot` call with a tighter clip.
+- **Reading the bframe rect too early.** Right after clicking the "I'm
+  not a robot" checkbox the bframe is repositioning. Wait 500ms
+  before measuring.
+
+## Emit the fence
+
+````
+```capture
+{
+  "prompt": "Select all images with",
+  "target": "cars",
+  "targetImage": "https://www.gstatic.com/recaptcha/api2/payload?...",
+  "image": "/abs/path/outputs/<session>/recaptcha-grid.png",
+  "rows": 3,
+  "cols": 3
+}
+```
+````
+
+Then **stop**. No more tool calls. Wait for the user reply:
+
+> Captcha selected tiles: 0, 2, 6
+
+Indices are 0-based, left-to-right, top-to-bottom:
+
+```
+0 1 2
+3 4 5
+6 7 8
+```
+
+## Clicking the selected tiles
+
+When the reply arrives, convert each index to a click on the live
+page using the grid rect you stashed:
+
+```js
+browser-harness-js <<'EOF'
+const grid = globalThis.__captcha_grid
+const indices = [0, 2, 6] // from the user reply
+
+const COLS = 3, ROWS = 3
+const tileW = grid.w / COLS
+const tileH = grid.h / ROWS
+
+for (const i of indices) {
+  const row = Math.floor(i / COLS)
+  const col = i % COLS
+  const cx = grid.x + (col + 0.5) * tileW
+  const cy = grid.y + (row + 0.5) * tileH
+  await session.Input.dispatchMouseEvent({ type: 'mousePressed',  x: cx, y: cy, button: 'left', clickCount: 1 })
+  await session.Input.dispatchMouseEvent({ type: 'mouseReleased', x: cx, y: cy, button: 'left', clickCount: 1 })
+  await new Promise(r => setTimeout(r, 120))
+}
+// Verify button is roughly at (grid.x + 250, grid.y + 380); compute via
+// the bframe rect rather than hardcoding once you've handled the picks.
+return 'clicked'
+EOF
+```
+
+## Fields
+
+| field         | required | notes |
+|---------------|----------|-------|
+| `image`       | **yes**  | Absolute path under the outputs dir. Only the 3×3 grid, edge-to-edge. |
+| `prompt`      | recommended | Lead-in text ("Select all images with"). Without it the picker still works but has no header context. |
+| `target`      | recommended | The bold subject ("cars", "motorcycles"). Rendered in the reCAPTCHA-style blue header. |
+| `targetImage` | optional | URL of the example thumbnail (some challenges show one). Renders on the right side of the header. |
+| `rows`        | optional | Default `3`. Set to `4` for the rare 4×4 challenge. |
+| `cols`        | optional | Default `3`. |
+
+## "(none)" replies
+
+If the user confirms without picking any tile, you get:
+
+> Captcha selected tiles: (none)
+
+Click Verify anyway — reCAPTCHA will either accept (rare) or refresh
+to a new challenge. Don't loop emitting fresh `capture` blocks without
+giving the user a chance to actually solve the new one.
+
+## Banned
+
+- Cropping to anything beyond the 3×3 grid. The renderer subdivides
+  the image evenly into 9 tiles — extra pixels = misaligned tiles.
+- Running `sips` / `magick` to crop the saved PNG. Always use the
+  `clip` parameter of `Page.captureScreenshot` so you stay in CSS
+  pixels.
+- Multiple `capture` fences in one turn. One challenge at a time.
+- Inline base64 in `image`. The renderer expects an absolute file path;
+  the recipe above already writes the PNG to a real file in the outputs dir.
+- Eyeballing offsets. Use the rects measured by the recipe above (bframe
+  CDP attach + per-tile union). If they look off, re-measure — never guess.