From 2faf3a166609e355a6db0d5eba904dd94455fa08 Mon Sep 17 00:00:00 2001 From: zerob13 Date: Mon, 25 May 2026 15:52:35 +0800 Subject: [PATCH 1/2] chore(cua): sync driver v0.2.0 --- docs/issues/cua-driver-v0-2-0-sync/plan.md | 38 ++ docs/issues/cua-driver-v0-2-0-sync/spec.md | 41 ++ docs/issues/cua-driver-v0-2-0-sync/tasks.md | 13 + .../vendor/cua-driver/source/.bumpversion.cfg | 2 +- .../vendor/cua-driver/source/Package.swift | 4 + .../Sources/CuaDriverCLI/BundleHelpers.swift | 35 ++ .../CuaDriverCLI/CuaDriverCommand.swift | 186 +++++++- .../CuaDriverCLI/Docs/CLIDocExtractor.swift | 40 +- .../Sources/CuaDriverCLI/DoctorCommand.swift | 262 ++++++++++ .../Sources/CuaDriverCLI/ServeCommand.swift | 27 +- .../CuaDriverCore/Apps/AppLauncher.swift | 58 +++ .../CuaDriverCore/Capture/WindowCapture.swift | 151 +++++- .../Sources/CuaDriverCore/CuaDriverCore.swift | 2 +- .../CuaDriverCore/Focus/FocusGuard.swift | 51 +- .../Focus/SystemFocusStealPreventer.swift | 446 +++++++++++++++++- .../Windows/WindowEnumerator.swift | 19 +- .../CuaDriverServer/CuaDriverMCPServer.swift | 179 +++++++ .../CuaDriverServer/ToolRegistry.swift | 29 +- .../CuaDriverServer/Tools/ClickTool.swift | 27 +- .../Tools/GetWindowStateTool.swift | 60 +++ .../CuaDriverServer/Tools/LaunchAppTool.swift | 108 +++-- .../Tools/ListWindowsTool.swift | 19 + .../Tools/ScreenshotTool.swift | 40 ++ .../Tools/WindowChangeDetector.swift | 260 ++++++++++ .../FocusStealPreventerTests.swift | 320 +++++++++++++ .../test_app_name_locale_fallback.py | 110 +++++ .../test_click_opens_new_window.py | 275 +++++++++++ .../integration/test_hidden_app_capture.py | 151 ++++++ .../scripts/build/build-release-notarized.sh | 15 +- .../cua-driver/source/scripts/install.sh | 116 ++++- plugins/cua/vendor/cua-driver/upstream.json | 14 +- 31 files changed, 2962 insertions(+), 136 deletions(-) create mode 100644 docs/issues/cua-driver-v0-2-0-sync/plan.md create mode 100644 docs/issues/cua-driver-v0-2-0-sync/spec.md create mode 100644 
docs/issues/cua-driver-v0-2-0-sync/tasks.md create mode 100644 plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/BundleHelpers.swift create mode 100644 plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/DoctorCommand.swift create mode 100644 plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/Tools/WindowChangeDetector.swift create mode 100644 plugins/cua/vendor/cua-driver/source/Tests/FocusStealPreventerTests/FocusStealPreventerTests.swift create mode 100644 plugins/cua/vendor/cua-driver/source/Tests/integration/test_app_name_locale_fallback.py create mode 100644 plugins/cua/vendor/cua-driver/source/Tests/integration/test_click_opens_new_window.py create mode 100644 plugins/cua/vendor/cua-driver/source/Tests/integration/test_hidden_app_capture.py diff --git a/docs/issues/cua-driver-v0-2-0-sync/plan.md b/docs/issues/cua-driver-v0-2-0-sync/plan.md new file mode 100644 index 000000000..02a84604c --- /dev/null +++ b/docs/issues/cua-driver-v0-2-0-sync/plan.md @@ -0,0 +1,38 @@ +# Plan + +## Source Review + +- Compare upstream `trycua/cua` tags `cua-driver-v0.1.5` and + `cua-driver-v0.2.0`. +- Apply the Swift driver delta with a three-way merge against DeepChat's + maintained fork. +- Keep upstream Rust driver changes out of this sync. + +## Implementation + +- Merge upstream Swift runtime changes into + `plugins/cua/vendor/cua-driver/source`. +- Adapt new upstream TCC, doctor, and MCP daemon-proxy text and commands to + `DeepChat Computer Use.app` and `com.wefonk.deepchat.computeruse`. +- Preserve DeepChat-only CLI behavior: `deepchat-permission-probe`, nonblocking + MCP startup, and DeepChat-managed `update`. +- Update `plugins/cua/vendor/cua-driver/upstream.json` to `cua-driver-v0.2.0`. +- Leave packaged skills unchanged unless validation shows upstream skill content + changed in the Swift release. + +## Validation + +- Run `swift build --package-path plugins/cua/vendor/cua-driver/source --product cua-driver`. +- Run `pnpm run format`. 
+- Run `pnpm run i18n`. +- Run `pnpm run lint`. +- Run `git diff --check`. +- Run `pnpm run plugin:cua:build:mac:arm64`. +- Run `pnpm run plugin:validate -- --name cua --platform darwin --arch arm64`. + +## Risk + +The vendored driver is a local fork with DeepChat-specific TCC and packaging +behavior. A direct replacement with upstream source would risk regressing the +helper identity, permission flow, and plugin-managed update path, so the sync is +kept as an explicit fork merge. diff --git a/docs/issues/cua-driver-v0-2-0-sync/spec.md b/docs/issues/cua-driver-v0-2-0-sync/spec.md new file mode 100644 index 000000000..61547ba70 --- /dev/null +++ b/docs/issues/cua-driver-v0-2-0-sync/spec.md @@ -0,0 +1,41 @@ +# CUA Driver v0.2.0 Sync + +## Problem + +The bundled DeepChat Computer Use helper is based on upstream +`cua-driver-v0.1.5`. Upstream Swift CUA driver `cua-driver-v0.2.0` contains +macOS reliability fixes for focus suppression, screenshot capture fallback, +hidden app handling, side-effect detection, and MCP daemon proxying. + +## User Story + +As a DeepChat user using the bundled CUA plugin, I need the macOS helper to +include current upstream Swift driver fixes while continuing to use DeepChat's +helper app, TCC permissions, MCP registration, and plugin packaging. + +## Acceptance Criteria + +- Vendored upstream metadata records `cua-driver-v0.2.0` and commit + `d3f3b9325f49aa5302c15fb03f6b66bd1e688e27`. +- The local fork includes the upstream Swift driver runtime improvements from + `v0.1.5` through `v0.2.0`. +- DeepChat-specific behavior remains intact: `DeepChat Computer Use.app`, + bundle id `com.wefonk.deepchat.computeruse`, `deepchat-permission-probe`, + DeepChat-managed updates, and MCP-first plugin skills. +- The Rust `cua-driver-rs` runtime is not introduced in this sync. +- Validation covers Swift build, formatting, i18n, lint, diff checks, CUA + runtime build, and plugin validation where practical. 
+ +## Non-goals + +- No migration to `cua-driver-rs`. +- No changes to the CUA plugin manifest, settings UI, MCP server id, or tool + policy. +- No adoption of upstream standalone installer behavior for DeepChat updates. + +## Constraints + +- Preserve DeepChat's local helper app identity for TCC attribution. +- Keep packaged `plugins/cua/skills/cua-driver` guidance MCP-first. +- Treat upstream standalone scripts as reference material unless required by + the bundled helper build. diff --git a/docs/issues/cua-driver-v0-2-0-sync/tasks.md b/docs/issues/cua-driver-v0-2-0-sync/tasks.md new file mode 100644 index 000000000..aed0581d5 --- /dev/null +++ b/docs/issues/cua-driver-v0-2-0-sync/tasks.md @@ -0,0 +1,13 @@ +# Tasks + +- [x] Identify latest upstream Swift CUA driver release. +- [x] Confirm Rust `cua-driver-rs` remains out of scope. +- [x] Compare `cua-driver-v0.1.5` to `cua-driver-v0.2.0`. +- [x] Merge upstream Swift runtime changes into the DeepChat fork. +- [x] Preserve DeepChat helper app identity, permission probe, update policy, + and MCP-first behavior. +- [x] Update vendored upstream metadata. +- [x] Run Swift build validation. +- [x] Run formatting, i18n, lint, and diff checks. +- [x] Build the CUA plugin runtime. +- [x] Validate the CUA plugin package. 
diff --git a/plugins/cua/vendor/cua-driver/source/.bumpversion.cfg b/plugins/cua/vendor/cua-driver/source/.bumpversion.cfg index 5d7397866..7f322ad7f 100644 --- a/plugins/cua/vendor/cua-driver/source/.bumpversion.cfg +++ b/plugins/cua/vendor/cua-driver/source/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.5 +current_version = 0.2.0 commit = True tag = True tag_name = cua-driver-v{new_version} diff --git a/plugins/cua/vendor/cua-driver/source/Package.swift b/plugins/cua/vendor/cua-driver/source/Package.swift index ce51c9652..ee39eaaee 100644 --- a/plugins/cua/vendor/cua-driver/source/Package.swift +++ b/plugins/cua/vendor/cua-driver/source/Package.swift @@ -39,5 +39,9 @@ let package = Package( name: "ZoomMathTests", dependencies: ["CuaDriverCore"] ), + .testTarget( + name: "FocusStealPreventerTests", + dependencies: ["CuaDriverCore"] + ), ] ) diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/BundleHelpers.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/BundleHelpers.swift new file mode 100644 index 000000000..133ac7894 --- /dev/null +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/BundleHelpers.swift @@ -0,0 +1,35 @@ +import Darwin +import Foundation + +/// Shared "is this binary running from inside an installed DeepChat Computer Use.app +/// bundle?" heuristic used by both `ServeCommand` (for the +/// auto-relaunch-via-`open` path) and `MCPCommand` (for the daemon proxy +/// path). Resolves `Bundle.main.executablePath` (falling back to +/// `CommandLine.arguments.first`) through any symlinks via `realpath` and +/// checks whether the resolved path lives inside some +/// `DeepChat Computer Use.app/Contents/MacOS/` directory. +/// +/// That's the "installed via install-local.sh / install.sh" shape — +/// `/usr/local/bin/cua-driver` is a symlink into +/// `/Applications/DeepChat Computer Use.app`, and `realpath` walks into the +/// bundle. 
Returns `false` for `swift run` / +/// raw `.build//cua-driver` dev invocations, which have no installed +/// bundle to relaunch into. +/// +/// Subcommands may wrap this with additional gating (env vars, flags, +/// parent-pid checks, etc.) when their relaunch heuristics diverge. +func isExecutableInsideCuaDriverApp() -> Bool { + // Prefer Foundation's executablePath (stable, absolute). + // Fall back to argv[0] when unset, which realpath() still + // resolves via $PATH lookup at the shell level — good enough + // for the cases we care about. + let candidate = Bundle.main.executablePath + ?? CommandLine.arguments.first + ?? "" + guard !candidate.isEmpty else { return false } + + var buffer = [CChar](repeating: 0, count: Int(PATH_MAX)) + guard realpath(candidate, &buffer) != nil else { return false } + let resolved = String(cString: buffer) + return resolved.contains("/DeepChat Computer Use.app/Contents/MacOS/") +} diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/CuaDriverCommand.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/CuaDriverCommand.swift index 2a4798795..6414f3584 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/CuaDriverCommand.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/CuaDriverCommand.swift @@ -25,6 +25,7 @@ struct CuaDriverCommand: AsyncParsableCommand { UpdateCommand.self, DiagnoseCommand.self, DoctorCommand.self, + CleanupCommand.self, DumpDocsCommand.self, ] ) @@ -249,6 +250,7 @@ struct CuaDriverEntryPoint { "update", "diagnose", "doctor", + "cleanup", "dump-docs", "help", ] @@ -373,7 +375,23 @@ struct CuaDriverEntryPoint { struct MCPCommand: ParsableCommand { static let configuration = CommandConfiguration( commandName: "mcp", - abstract: "Run the stdio MCP server." 
+ abstract: "Run the stdio MCP server.", + discussion: """ + When invoked from a shell or IDE terminal (Claude Code, Cursor, \ + VS Code, Warp), macOS TCC attributes the process to the parent \ + terminal — not to DeepChat Computer Use.app — so AX probes silently fail \ + against the wrong bundle id. To sidestep this without breaking \ + the stdio MCP transport, `mcp` detects the context, ensures a \ + `cua-driver serve` daemon is running under LaunchServices \ + (relaunching via `open -n -g -a "DeepChat Computer Use" --args serve` if not), \ + and proxies every MCP tool call through the daemon's Unix \ + socket. Tool semantics are identical to the in-process path. \ + Pass `--no-daemon-relaunch` (or set CUA_DRIVER_MCP_NO_RELAUNCH=1) \ + to force in-process execution — useful when the calling context \ + already has the right TCC grants (e.g. spawned from \ + DeepChat Computer Use.app directly), or for diagnosing \ + in-process failures. + """ ) @Flag( @@ -387,7 +405,38 @@ struct MCPCommand: ParsableCommand { ) var claudeCodeComputerUseCompat: Bool = false + @Flag( + name: .long, + help: """ + Stay in the current process instead of auto-launching a daemon \ + and proxying through its Unix socket when invoked from a shell \ + without DeepChat Computer Use.app's TCC grants. Also toggleable via \ + CUA_DRIVER_MCP_NO_RELAUNCH=1. + """ + ) + var noDaemonRelaunch: Bool = false + + @Option( + name: .long, + help: "Override the daemon Unix socket path used by the proxy fallback." + ) + var socket: String? + func run() throws { + // TCC sidestep. Same heuristic the `serve` subcommand uses + // (shell-spawned bare binary that resolves into DeepChat Computer Use.app + // bundle), gated by an explicit env / flag opt-out. When the + // shell already has the right TCC context (e.g. DeepChat Computer Use.app + // launched us directly), this returns false and we stay + // in-process exactly like before. 
The proxy path is purely + // additive: it gives stdio MCP clients spawned from IDE + // terminals a correct TCC context without requiring an external + // bridge. + if shouldUseDaemonProxy() { + try runViaDaemonProxy() + return + } + // MCP stdio runs for the lifetime of the host process, so we // bootstrap AppKit here — the agent cursor overlay (disabled // by default, enabled via `set_agent_cursor_enabled`) needs a @@ -421,6 +470,135 @@ struct MCPCommand: ParsableCommand { } } +extension MCPCommand { + /// Decide whether the current `mcp` invocation should auto-launch a + /// daemon and proxy every MCP tool call through its Unix socket. + /// Mirror of `ServeCommand.shouldRelaunchViaOpen()` — same heuristic, + /// same env override convention, separate flag so callers can opt + /// each surface in/out independently. + fileprivate func shouldUseDaemonProxy() -> Bool { + if noDaemonRelaunch { return false } + if isEnvTruthy(ProcessInfo.processInfo.environment["CUA_DRIVER_MCP_NO_RELAUNCH"]) { + return false + } + // When AppKit already attributes us to DeepChat Computer Use.app — either + // because LaunchServices spawned us, or the user invoked the + // bundle's main executable directly — `Bundle.main.bundlePath` + // ends in `.app`. Either case has the right TCC context. + if Bundle.main.bundlePath.hasSuffix(".app") { return false } + // The bare-binary path must resolve into an installed + // DeepChat Computer Use.app bundle, otherwise there's nothing for the + // daemon side to land in. Raw `swift run` dev invocations fail + // this check and stay in-process. + guard isExecutableInsideCuaDriverApp() else { return false } + // ppid == 1 means launchd already reparented us — we're + // post-LaunchServices and have the right TCC context. 
+ if getppid() == 1 { return false } + return true + } + + /// Ensure a `cua-driver serve` daemon is running under the right TCC + /// context, then run the MCP stdio server with `ListTools` / + /// `CallTool` handlers that forward every request through + /// `~/Library/Caches/cua-driver/cua-driver.sock`. Throws + /// `ExitCode(1)` on launch failure after printing a diagnostic that + /// points at the `--no-daemon-relaunch` escape hatch. + fileprivate func runViaDaemonProxy() throws { + let socketPath = socket ?? DaemonPaths.defaultSocketPath() + + if !DaemonClient.isDaemonListening(socketPath: socketPath) { + FileHandle.standardError.write( + Data( + "cua-driver: mcp launched without DeepChat Computer Use.app's TCC grants; auto-launching the daemon via `open -n -g -a \"DeepChat Computer Use\" --args serve` and proxying MCP requests through it. Pass --no-daemon-relaunch to stay in-process.\n" + .utf8)) + try launchDaemonViaOpen() + try waitForDaemon(socketPath: socketPath, timeout: 10.0) + } + + let serverName = claudeCodeComputerUseCompat ? "computer-use" : "cua-driver" + let compat = claudeCodeComputerUseCompat + + // The MCP `Server` actor + `StdioTransport` use Swift + // concurrency, so we need a live async runtime. Reuse + // `AppKitBootstrap` for that — it's the same sync→async bridge + // the in-process path already takes, and the idle AppKit + // run-loop costs us nothing here (no AX work runs in this + // process). Critically we skip PermissionsGate entirely: the + // daemon owns TCC, and AX probes against this process would + // lie because we're attributed to the calling shell. 
+ AppKitBootstrap.runBlockingAppKitWith { + let server = try await CuaDriverMCPServer.makeProxy( + serverName: serverName, + socketPath: socketPath, + claudeCodeComputerUseCompat: compat + ) + let transport = StdioTransport() + try await server.start(transport: transport) + await server.waitUntilCompleted() + } + } + + /// Spawn `/usr/bin/open -n -g -a "DeepChat Computer Use" --args serve`. Mirror of + /// `ServeCommand.relaunchViaOpen` minus the post-launch probe (we + /// poll separately via `waitForDaemon`, since the timeout there is + /// MCP-specific). + fileprivate func launchDaemonViaOpen() throws { + let process = Process() + process.executableURL = URL(fileURLWithPath: "/usr/bin/open") + // -n: force a new instance. DeepChat Computer Use.app may already be + // running from a previous `mcp` (different MCP client + // session); without -n, `open -a` would re-use it and + // drop our `--args serve`, leaving no daemon up. + // -g: keep the new instance backgrounded. DeepChat Computer Use.app is + // LSUIElement=true anyway, but this makes that explicit. + process.arguments = ["-n", "-g", "-a", "DeepChat Computer Use", "--args", "serve"] + process.standardOutput = FileHandle.nullDevice + process.standardError = FileHandle.nullDevice + do { + try process.run() + } catch { + FileHandle.standardError.write( + Data( + "cua-driver: failed to exec `/usr/bin/open`: \(error). Pass --no-daemon-relaunch to bypass.\n" + .utf8)) + throw ExitCode(1) + } + process.waitUntilExit() + if process.terminationStatus != 0 { + FileHandle.standardError.write( + Data( + "cua-driver: `open -n -g -a \"DeepChat Computer Use\" --args serve` exited \(process.terminationStatus). Check that `/Applications/DeepChat Computer Use.app` is installed, or pass --no-daemon-relaunch to bypass.\n" + .utf8)) + throw ExitCode(1) + } + } + + /// Block (up to `timeout` seconds) until `socketPath` accepts a + /// protocol-speaking probe. 
Throws `ExitCode(1)` with a diagnostic + /// if the daemon never appears — usually means the user hasn't + /// granted Accessibility / Screen Recording to DeepChat Computer Use.app yet + /// and the daemon's PermissionsGate is waiting on a dialog. + fileprivate func waitForDaemon(socketPath: String, timeout: TimeInterval) throws { + let deadline = Date().addingTimeInterval(timeout) + while Date() < deadline { + if DaemonClient.isDaemonListening(socketPath: socketPath) { + return + } + usleep(100_000) // 100ms + } + FileHandle.standardError.write( + Data( + "cua-driver: daemon did not appear on \(socketPath) within \(Int(timeout))s. If this is the first launch, grant Accessibility + Screen Recording to DeepChat Computer Use.app in System Settings and retry. Pass --no-daemon-relaunch to stay in-process.\n" + .utf8)) + throw ExitCode(1) + } + + private func isEnvTruthy(_ value: String?) -> Bool { + guard let value = value?.lowercased() else { return false } + return ["1", "true", "yes", "on"].contains(value) + } +} + /// Bootstrap AppKit on the main thread so `AgentCursor` can draw its /// overlay window + CA animations. The caller's async work runs on a /// detached Task; the main thread blocks inside `NSApplication.run()` @@ -502,7 +680,7 @@ struct UpdateCommand: AsyncParsableCommand { } } -/// `cua-driver doctor` — clean up stale install bits left from older versions. +/// `cua-driver cleanup` — clean up stale install bits left from older versions. /// /// v0.0.5 and earlier installed a weekly LaunchAgent at /// `~/Library/LaunchAgents/com.trycua.cua_driver_updater.plist` and a companion @@ -514,9 +692,9 @@ struct UpdateCommand: AsyncParsableCommand { /// update script. The plist lives under `$HOME` (no sudo). The companion /// script under `/usr/local/bin` is root-owned, so we print the exact /// `sudo rm` command for the user to run if it still exists. 
-struct DoctorCommand: ParsableCommand { +struct CleanupCommand: ParsableCommand { static let configuration = CommandConfiguration( - commandName: "doctor", + commandName: "cleanup", abstract: "Clean up stale install bits left from older cua-driver versions." ) diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/Docs/CLIDocExtractor.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/Docs/CLIDocExtractor.swift index 43c7984be..64ad480ce 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/Docs/CLIDocExtractor.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/Docs/CLIDocExtractor.swift @@ -81,6 +81,7 @@ enum CLIDocExtractor { updateDoc, diagnoseDoc, doctorDoc, + cleanupDoc, dumpDocsDoc, ] } @@ -91,11 +92,28 @@ enum CLIDocExtractor { CommandDoc( name: "mcp", abstract: "Run the stdio MCP server.", - discussion: nil, + discussion: """ + When invoked from a shell or IDE terminal (Claude Code, Cursor, + VS Code, Warp), macOS TCC attributes the process to the parent + terminal — not to DeepChat Computer Use.app — so AX probes silently fail + against the wrong bundle id. To sidestep this without breaking + the stdio MCP transport, `mcp` detects the context, ensures a + `cua-driver serve` daemon is running under LaunchServices + (relaunching via `open -n -g -a "DeepChat Computer Use" --args serve` if not), + and proxies every MCP tool call through the daemon's Unix + socket. Tool semantics are identical to the in-process path. + Pass `--no-daemon-relaunch` (or set CUA_DRIVER_MCP_NO_RELAUNCH=1) + to force in-process execution — useful when the calling context + already has the right TCC grants (e.g. spawned from DeepChat Computer Use.app + directly), or for diagnosing in-process failures. 
+ """, arguments: [], - options: [], + options: [ + OptionDoc(name: "socket", shortName: nil, help: "Override the daemon Unix socket path used by the proxy fallback.", type: "String", defaultValue: nil, isOptional: true), + ], flags: [ FlagDoc(name: "claude-code-computer-use-compat", shortName: nil, help: "Expose normal CuaDriver tools, replacing only `screenshot` with a Claude Code-friendly window-only screenshot that establishes the vision coordinate frame.", defaultValue: false), + FlagDoc(name: "no-daemon-relaunch", shortName: nil, help: "Stay in the current process instead of auto-launching a daemon and proxying through its Unix socket when invoked from a shell without DeepChat Computer Use.app's TCC grants. Also toggleable via CUA_DRIVER_MCP_NO_RELAUNCH=1.", defaultValue: false), ], subcommands: [] ) @@ -191,7 +209,7 @@ enum CLIDocExtractor { OptionDoc(name: "socket", shortName: nil, help: "Override the Unix socket path.", type: "String", defaultValue: nil, isOptional: true), ], flags: [ - FlagDoc(name: "no-relaunch", shortName: nil, help: "Stay in the current process instead of re-execing via `open -n -g -a CuaDriver`.", defaultValue: false), + FlagDoc(name: "no-relaunch", shortName: nil, help: "Stay in the current process instead of re-execing via `open -n -g -a \"DeepChat Computer Use\"`.", defaultValue: false), ], subcommands: [] ) @@ -456,6 +474,22 @@ enum CLIDocExtractor { private static var doctorDoc: CommandDoc { CommandDoc( name: "doctor", + abstract: "Check Accessibility, Screen Recording, and SCK; recommend a capture mode.", + discussion: nil, + arguments: [], + options: [], + flags: [ + FlagDoc(name: "json", shortName: nil, help: "Emit machine-readable JSON instead of human text.", defaultValue: false), + ], + subcommands: [] + ) + } + + // MARK: - cleanup + + private static var cleanupDoc: CommandDoc { + CommandDoc( + name: "cleanup", abstract: "Clean up stale install bits left from older cua-driver versions.", discussion: nil, arguments: [], 
diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/DoctorCommand.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/DoctorCommand.swift new file mode 100644 index 000000000..0d44393af --- /dev/null +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/DoctorCommand.swift @@ -0,0 +1,262 @@ +import AppKit +import ArgumentParser +import CuaDriverCore +import Foundation +import ScreenCaptureKit + +/// `cua-driver doctor` — probe TCC / SCK / AX and print a recommendation. +/// +/// Unlike `diagnose` (which emits a raw paste-able block for support), +/// `doctor` interprets the probe results and recommends a concrete next +/// step. Use it to quickly discover why captures are failing and which +/// `capture_mode` to set. +struct DoctorCommand: AsyncParsableCommand { + static let configuration = CommandConfiguration( + commandName: "doctor", + abstract: "Check Accessibility, Screen Recording, and SCK; recommend a capture mode." + ) + + @Flag(name: .long, help: "Emit machine-readable JSON instead of human text.") + var json: Bool = false + + func run() async throws { + let result = await runProbes() + + if json { + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + if let data = try? encoder.encode(result), + let str = String(data: data, encoding: .utf8) + { + print(str) + } + } else { + print(result.formatted()) + } + + if !result.allOk { + throw ExitCode(1) + } + } + + // MARK: - Probe runner + + private func runProbes() async -> DoctorResult { + // 1. TCC / permission probes. + let axOk = AXIsProcessTrusted() + let sckOk = await probeSCK() + + // 2. Attribution check — are we attributed to DeepChat Computer Use.app or a shell? + let bundleID = Bundle.main.bundleIdentifier ?? "" + let isCorrectBundle = bundleID == "com.wefonk.deepchat.computeruse" + + // 3. AX tree smoke test on Finder. 
+ let finderPid = finderPID() + let axTreeOk: Bool + if axOk, let pid = finderPid { + axTreeOk = probeAXTree(pid: pid) + } else { + axTreeOk = false + } + + // 4. Environment info. + let arch = uname_m() + let osVersion = ProcessInfo.processInfo.operatingSystemVersionString + let locale = Locale.current.identifier + + // 5. Derive recommendation. + let recommendation = recommend( + axOk: axOk, sckOk: sckOk, isCorrectBundle: isCorrectBundle) + + return DoctorResult( + axGranted: axOk, + screenRecordingGranted: sckOk, + correctBundleAttribution: isCorrectBundle, + axTreeSmoke: axTreeOk, + arch: arch, + osVersion: osVersion, + locale: locale, + bundleID: bundleID.isEmpty ? nil : bundleID, + recommendation: recommendation + ) + } + + // MARK: - Individual probes + + /// Check SCK by enumerating shareable content. Cheap — no stream is + /// started. Returns false if SCK is denied or throws (Tahoe regression). + private func probeSCK() async -> Bool { + do { + _ = try await SCShareableContent.excludingDesktopWindows( + false, onScreenWindowsOnly: false) + return true + } catch { + return false + } + } + + /// Fetch the top-level AX children of `pid`. Returns true when the + /// attribute copy succeeds (even with zero children) — sufficient to confirm AX + /// round-trips are working. + private func probeAXTree(pid: pid_t) -> Bool { + let app = AXUIElementCreateApplication(pid) + var value: CFTypeRef? + let err = AXUIElementCopyAttributeValue( + app, kAXChildrenAttribute as CFString, &value) + return err == .success + } + + /// PID of the running Finder process, or nil. + private func finderPID() -> pid_t? { + NSWorkspace.shared.runningApplications + .first { $0.bundleIdentifier == "com.apple.finder" } + .map { $0.processIdentifier } + } + + private func uname_m() -> String { + var info = utsname() + uname(&info) + return withUnsafeBytes(of: &info.machine) { bytes in + let str = bytes.bindMemory(to: CChar.self) + return String(cString: str.baseAddress!) 
+ } + } + + // MARK: - Recommendation logic + + private func recommend( + axOk: Bool, sckOk: Bool, isCorrectBundle: Bool + ) -> Recommendation { + if !axOk { + return Recommendation( + captureMode: nil, + severity: .error, + summary: "Accessibility is denied.", + detail: """ + Grant Accessibility to DeepChat Computer Use.app in System Settings → Privacy & Security → Accessibility, then restart the daemon: + open -n -g -a "DeepChat Computer Use" --args serve + DeepChat's bundled `cua-driver mcp` auto-relaunches through DeepChat Computer Use.app when needed. + """ + ) + } + + if !isCorrectBundle { + return Recommendation( + captureMode: nil, + severity: .warning, + summary: "TCC is attributed to the wrong process (not DeepChat Computer Use.app).", + detail: """ + Your shell or IDE is the responsible process for TCC, not DeepChat Computer Use.app. + DeepChat's bundled `cua-driver mcp` auto-relaunches through DeepChat Computer Use.app. + Or start the daemon manually: open -n -g -a "DeepChat Computer Use" --args serve + """ + ) + } + + if sckOk { + return Recommendation( + captureMode: "som", + severity: .ok, + summary: "All probes passed. Default `capture_mode: som` (or `vision`) recommended.", + detail: nil + ) + } else { + return Recommendation( + captureMode: "ax", + severity: .warning, + summary: "ScreenCaptureKit is unavailable on this build.", + detail: """ + This is a known regression on some macOS builds (see #1467). + Workaround: set capture_mode to `ax`: + cua-driver config set capture_mode ax + AX mode skips screen capture entirely and relies solely on the Accessibility tree. + """ + ) + } + } +} + +// MARK: - Result types + +struct DoctorResult: Encodable { + let axGranted: Bool + let screenRecordingGranted: Bool + let correctBundleAttribution: Bool + let axTreeSmoke: Bool + let arch: String + let osVersion: String + let locale: String + let bundleID: String? 
+ let recommendation: Recommendation + + var allOk: Bool { recommendation.severity == .ok } + + func formatted() -> String { + let tick = "✅" + let warn = "⚠️ " + let fail = "❌" + + func icon(_ ok: Bool) -> String { ok ? tick : fail } + + var lines: [String] = ["── cua-driver doctor ──────────────────────"] + lines.append("") + lines.append("System") + lines.append(" arch: \(arch)") + lines.append(" os: \(osVersion)") + lines.append(" locale: \(locale)") + if let bid = bundleID { + lines.append(" bundle: \(bid)") + } + lines.append("") + lines.append("Probes") + lines.append(" \(icon(axGranted)) Accessibility (AXIsProcessTrusted)") + lines.append(" \(icon(screenRecordingGranted)) Screen Recording (SCShareableContent)") + lines.append(" \(icon(correctBundleAttribution)) Correct bundle attribution") + lines.append(" \(icon(axTreeSmoke)) AX tree smoke test (Finder)") + lines.append("") + lines.append("Recommendation") + let sevIcon: String + switch recommendation.severity { + case .ok: sevIcon = tick + case .warning: sevIcon = warn + case .error: sevIcon = fail + } + lines.append(" \(sevIcon) \(recommendation.summary)") + if let mode = recommendation.captureMode { + lines.append(" capture_mode: \(mode)") + } + if let detail = recommendation.detail { + lines.append("") + for line in detail.split(separator: "\n", omittingEmptySubsequences: false) { + lines.append(" \(line)") + } + } + lines.append("") + lines.append("────────────────────────────────────────────") + return lines.joined(separator: "\n") + } + + private enum CodingKeys: String, CodingKey { + case axGranted = "ax_granted" + case screenRecordingGranted = "screen_recording_granted" + case correctBundleAttribution = "correct_bundle_attribution" + case axTreeSmoke = "ax_tree_smoke" + case arch, osVersion = "os_version", locale + case bundleID = "bundle_id" + case recommendation + } +} + +struct Recommendation: Encodable { + enum Severity: String, Encodable, Equatable { case ok, warning, error } + + let 
captureMode: String? + let severity: Severity + let summary: String + let detail: String? + + private enum CodingKeys: String, CodingKey { + case captureMode = "capture_mode" + case severity, summary, detail + } +} diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/ServeCommand.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/ServeCommand.swift index fc5d857bd..4931fdb17 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/ServeCommand.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCLI/ServeCommand.swift @@ -194,7 +194,7 @@ extension ServeCommand { // bundle on disk — the symlink case. Raw `swift run` dev // invocations resolve into `.build//cua-driver` // instead, and have no bundle to relaunch into. - guard resolvedExecutableIsInsideCuaDriverApp() else { return false } + guard isExecutableInsideCuaDriverApp() else { return false } // ppid == 1 means we're already a LaunchServices-spawned process // (or orphaned into init, in which case relaunching wouldn't // change anything useful anyway). @@ -290,31 +290,6 @@ extension ServeCommand { throw ExitCode(1) } - /// True when the argv[0] / executablePath resolves (through any - /// symlinks) to a binary physically living inside some - /// `DeepChat Computer Use.app/Contents/MacOS/` directory. That's the "installed - /// via install-local.sh / install.sh" shape — `/usr/local/bin/cua-driver` - /// is a symlink into `/Applications/DeepChat Computer Use.app`, and `realpath` - /// walks into the bundle. - /// - /// Returns false for `swift run` / raw `.build//cua-driver` - /// dev invocations, which have no installed bundle to relaunch into. - private func resolvedExecutableIsInsideCuaDriverApp() -> Bool { - // Prefer Foundation's executablePath (stable, absolute). - // Fall back to argv[0] when unset, which realpath() still - // resolves via $PATH lookup at the shell level — good enough - // for the cases we care about. 
- let candidate = Bundle.main.executablePath - ?? CommandLine.arguments.first - ?? "" - guard !candidate.isEmpty else { return false } - - var buffer = [CChar](repeating: 0, count: Int(PATH_MAX)) - guard realpath(candidate, &buffer) != nil else { return false } - let resolved = String(cString: buffer) - return resolved.contains("/DeepChat Computer Use.app/Contents/MacOS/") - } - /// Accepts the same truthy-value conventions the rest of the CLI /// uses for env overrides (see `UpdateCommand` / `TelemetryClient`). private func isEnvTruthy(_ value: String?) -> Bool { diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Apps/AppLauncher.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Apps/AppLauncher.swift index 60343ca1b..8e2701404 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Apps/AppLauncher.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Apps/AppLauncher.swift @@ -228,6 +228,8 @@ public enum AppLauncher { throw LaunchError.notFound("bundle_id '\(bundleId)'") } if let name, !name.isEmpty { + // Pass 1 — filesystem lookup by bundle filename (fastest; locale-independent + // for English app names whose on-disk bundle name matches the display name). let appName = name.hasSuffix(".app") ? name : "\(name).app" // System roots first — they're canonical. User-local paths come // after so an app present in /Applications wins over a same-name @@ -250,6 +252,62 @@ public enum AppLauncher { return URL(fileURLWithPath: path) } } + + // Pass 2 — LaunchServices bundle-ID lookup, in case the caller + // passed a bundle identifier string as `name` rather than using + // the `bundle_id` parameter (e.g. "com.apple.calculator"). 
+ if let url = NSWorkspace.shared.urlForApplication( + withBundleIdentifier: name) + { + return url + } + + // Pass 3 — scan all candidate directories and match against each + // bundle's metadata, in priority order: + // a) localizedName from NSRunningApplication (locale-aware; works + // on non-English systems, e.g. "計算機" on JP macOS) + // b) CFBundleDisplayName / CFBundleName (English; from Info.plist) + // c) bundle URL stem (filename minus .app) + // + // Matching is case-insensitive throughout so "calculator" and + // "Calculator" both resolve. + let needle = name.lowercased() + + // Check running apps first — NSRunningApplication.localizedName + // gives the OS-locale display name without touching the disk. + for app in NSWorkspace.shared.runningApplications { + guard let url = app.bundleURL else { continue } + if (app.localizedName?.lowercased() == needle) { + return url + } + } + + // Fall back to scanning installed bundles in the same roots. + let fm = FileManager.default + for root in roots { + guard let children = try? fm.contentsOfDirectory(atPath: root) + else { continue } + for child in children where child.hasSuffix(".app") { + let path = "\(root)/\(child)" + guard let bundle = Bundle(path: path) else { continue } + // CFBundleDisplayName > CFBundleName > stem + let displayName = + (bundle.infoDictionary?["CFBundleDisplayName"] as? String) + ?? (bundle.infoDictionary?["CFBundleName"] as? String) + ?? 
URL(fileURLWithPath: path) + .deletingPathExtension().lastPathComponent + if displayName.lowercased() == needle { + return URL(fileURLWithPath: path) + } + // Also match against the raw stem ("Calculator" → "Calculator.app") + let stem = URL(fileURLWithPath: path) + .deletingPathExtension().lastPathComponent + if stem.lowercased() == needle { + return URL(fileURLWithPath: path) + } + } + } + throw LaunchError.notFound("name '\(name)'") } throw LaunchError.nothingSpecified diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Capture/WindowCapture.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Capture/WindowCapture.swift index d29f1a16f..2137cda86 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Capture/WindowCapture.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Capture/WindowCapture.swift @@ -33,6 +33,13 @@ public enum CaptureError: Error, Sendable, CustomStringConvertible { case encodeFailed case captureFailed(String) case windowNotFound(UInt32) + /// ScreenCaptureKit could not start streaming for this window. Distinct + /// from `captureFailed` so callers (e.g. `get_window_state`) can surface + /// an actionable hint — switch to `capture_mode: ax`, retry against a + /// different window — without having to grep error strings. Seen + /// regularly on macOS 26.4.x physical Macs against specific windows + /// where even `screencapture -l` fails (rdar / openclaw/Peekaboo#121). 
+ case streamingFailed(String) public var description: String { switch self { @@ -41,6 +48,7 @@ public enum CaptureError: Error, Sendable, CustomStringConvertible { case .encodeFailed: return "failed to encode CGImage" case .captureFailed(let msg): return "capture failed: \(msg)" case .windowNotFound(let id): return "no shareable window with id \(id)" + case .streamingFailed(let msg): return "ScreenCaptureKit streaming failed: \(msg)" } } } @@ -131,12 +139,43 @@ public actor WindowCapture { config.height = max(1, Int(window.frame.height * scale)) config.showsCursor = false + // One-shot SCK call with a single retry on streaming-start failure. + // macOS 26.4.x has a regression where `SCScreenshotManager.captureImage` + // intermittently returns "Could not start streaming because audio/video + // capture failed" (SCStreamError code -3801) on physical Macs, often + // recovering on a second attempt a moment later. We retry once with a + // brief back-off; if it still fails, we surface `.streamingFailed` so + // the tool layer can hint the caller toward `capture_mode: ax` for + // `get_window_state` workflows. let cgImage: CGImage do { - cgImage = try await SCScreenshotManager.captureImage( - contentFilter: filter, - configuration: config - ) + cgImage = try await captureSCKWithRetry(filter: filter, config: config) + } catch let error as CaptureError { + // Already classified — re-throw without wrapping. CGWindowList + // is intentionally NOT tried for permission errors (it'd just + // fail the same way and confuse the user-facing message). + if case .permissionDenied = error { throw error } + // For streaming / generic SCK failures, try the legacy + // CGWindowListCreateImage path. It's deprecated on macOS 15+ + // but still works in many cases where SCK refuses — particularly + // useful as a last-ditch fallback for the 26.4 SCK regression. 
+ if let fallback = legacyCaptureWindow(windowID: windowID) { + let origW = fallback.width + let origH = fallback.height + let resized = resizeIfNeeded(fallback, maxDim: maxImageDimension) + let didResize = resized.width != origW || resized.height != origH + let data = try encode(resized, format: format, quality: quality) + return Screenshot( + imageData: data, + format: format, + width: resized.width, + height: resized.height, + scaleFactor: Double(scale), + originalWidth: didResize ? origW : nil, + originalHeight: didResize ? origH : nil + ) + } + throw error } catch { throw classify(error) } @@ -207,17 +246,121 @@ public actor WindowCapture { return (best ?? NSScreen.main)?.backingScaleFactor ?? 1.0 } + /// Attempt `SCScreenshotManager.captureImage` once; on a streaming-start + /// failure, wait briefly and retry once more. Returns a classified + /// `CaptureError` on persistent failure so the caller can branch on the + /// kind (permission vs. streaming vs. generic) without string-matching. + /// + /// The retry covers the macOS 26.4.x SCK regression where the very first + /// call after the SCK daemon has been idle returns -3801 ("Could not + /// start streaming because audio/video capture failed") but a second + /// call ~250ms later succeeds. A second failure isn't transient and we + /// stop retrying — the caller falls back to CGWindowList or surfaces + /// the error. + private func captureSCKWithRetry( + filter: SCContentFilter, + config: SCStreamConfiguration + ) async throws -> CGImage { + do { + return try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: config + ) + } catch { + let classified = classify(error) + // Only retry on streaming-start failures; permission errors and + // not-found errors won't change on a second attempt. + guard case .streamingFailed = classified else { throw classified } + try? 
await Task.sleep(nanoseconds: 250_000_000) + do { + return try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: config + ) + } catch { + throw classify(error) + } + } + } + + /// Legacy `CGWindowListCreateImage` fallback for the SCK 26.4 regression. + /// Deprecated by Apple in macOS 15 but still functional on most windows, + /// and frequently works where SCK refuses. Returns nil on failure — the + /// caller surfaces the original SCK error in that case so the user knows + /// the real cause. + /// + /// Marked with `@available(*, deprecated)` suppression because the API + /// is the entire point: we *want* the legacy path here. + private func legacyCaptureWindow(windowID: UInt32) -> CGImage? { + // CGWindowListCreateImage is deprecated on macOS 15+. The deprecation + // diagnostic is silenced with the @available pragma. Apple has not + // (yet) removed the symbol, and this path is the only practical + // fallback when SCK's streaming-start is broken for a given window. + let opts: CGWindowImageOption = [.boundsIgnoreFraming, .bestResolution] + let listOption: CGWindowListOption = .optionIncludingWindow + // Wrap the deprecated call so we keep the unsafePointer-style + // signature out of the rest of the code. + let image = legacyCGWindowImage( + windowID: windowID, listOption: listOption, imageOption: opts + ) + // Reject 1×1 placeholder images that the legacy API sometimes returns + // for occluded / off-screen windows — they're worse than no image. + guard let image, image.width > 1, image.height > 1 else { return nil } + return image + } + private func classify(_ error: Error) -> CaptureError { let ns = error as NSError let msg = ns.localizedDescription.lowercased() + + // Permission failure — English and Japanese phrasings observed in + // SCK's `NSError.localizedDescription`. The Japanese strings cover + // users on JP system locale where the SCK error comes back + // localized rather than in English. 
if msg.contains("permission") || msg.contains("not authorized") || msg.contains("declined") || msg.contains("denied") + || ns.localizedDescription.contains("許可") // "permission" + || ns.localizedDescription.contains("拒否") // "denied" { return .permissionDenied } + + // SCStreamError "could not start streaming" — code -3801 in + // `SCStreamErrorDomain`. macOS localizes the message ("Could not + // start streaming because audio/video capture failed" / Japanese: + // "オーディオ/ビデオの取り込みがうまくいかなかったため、ストリーミングを開始できませんでした"), + // so we match on code first and fall through to substring matching + // for the rare case where the domain isn't surfaced. + let isSCStreamDomain = ns.domain == "SCStreamErrorDomain" + || ns.domain == "com.apple.ScreenCaptureKit.SCStreamErrorDomain" + if (isSCStreamDomain && ns.code == -3801) + || msg.contains("could not start streaming") + || msg.contains("streaming") + || ns.localizedDescription.contains("ストリーミング") // "streaming" + { + return .streamingFailed(ns.localizedDescription) + } + return .captureFailed(ns.localizedDescription) } + /// Thin shim around the deprecated `CGWindowListCreateImage` so the + /// deprecation-warning suppression is isolated to one place. Returns nil + /// if the legacy path also refuses to produce an image. + /// + /// Marking the wrapper itself deprecated downgrades the call-site + /// warning to a no-op — we *want* this legacy path because SCK has a + /// well-known regression on macOS 26.4.x where streaming-start fails + /// for specific windows on physical Macs. + @available(*, deprecated, message: "Intentional fallback for SCK streaming-start failures.") + private func legacyCGWindowImage( + windowID: UInt32, + listOption: CGWindowListOption, + imageOption: CGWindowImageOption + ) -> CGImage? 
{ + CGWindowListCreateImage(.null, listOption, windowID, imageOption) + } + /// Capture the topmost layer-0 window owned by `pid`, or `nil` when the /// pid has no such window at all (menubar-only helpers, apps that /// haven't created any window yet). diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/CuaDriverCore.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/CuaDriverCore.swift index d8b1e158f..fa32fe715 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/CuaDriverCore.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/CuaDriverCore.swift @@ -1,5 +1,5 @@ import Foundation public enum CuaDriverCore { - public static let version = "0.1.5" + public static let version = "0.2.0" } diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/FocusGuard.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/FocusGuard.swift index 79e5efc8d..9077489bf 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/FocusGuard.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/FocusGuard.swift @@ -29,6 +29,17 @@ public actor FocusGuard { private let enforcer: SyntheticAppFocusEnforcer private let systemPreventer: SystemFocusStealPreventer? + /// Construct a guard with the three focus-suppression layers wired in. + /// + /// - Parameters: + /// - enablement: AX enablement assertion used to write synthetic + /// focus on the target window/element. + /// - enforcer: synthetic-focus enforcer that flips + /// `kAXEnhancedUserInterface` etc. for the duration of the body. + /// - systemPreventer: optional layer-3 reactive preventer. When + /// supplied, the guard arms a lease around the body so any + /// target self-activation triggered by the AX action is undone + /// before the next compositor frame. 
public init( enablement: AXEnablementAssertion, enforcer: SyntheticAppFocusEnforcer, @@ -84,15 +95,23 @@ public actor FocusGuard { // activation notification and immediately re-activates the prior // frontmost app. Only armed when the target isn't already // frontmost (no point suppressing self → self). - var suppressionHandle: SuppressionHandle? + // + // Lease form: ARC fires `deinit` on every exit path including the + // catch branch below. The lease replaces a previous bug-prone + // pattern of manually pairing begin/end across do/catch — if a + // future edit forgets one cleanup branch, the lease still + // releases when the local goes out of scope. + var suppressionLease: SuppressionLease? if let preventer = systemPreventer { let targetApp = NSRunningApplication(processIdentifier: pid) let isTargetFrontmost = targetApp?.isActive ?? false if !isTargetFrontmost, let frontmost = NSWorkspace.shared.frontmostApplication { - suppressionHandle = await preventer.beginSuppression( - targetPid: pid, restoreTo: frontmost + suppressionLease = await preventer.leaseSuppression( + targetPid: pid, + restoreTo: frontmost, + origin: "FocusGuard.withFocusSuppressed" ) } } @@ -100,27 +119,43 @@ public actor FocusGuard { do { let result = try await body() if let focusState { await enforcer.reenableActivation(focusState) } - if let handle = suppressionHandle { - try? await Task.sleep(nanoseconds: 50_000_000) // 50ms - await systemPreventer?.endSuppression(handle) + if let lease = suppressionLease { + // 50ms gives the target's reflex post-AXPress activation + // (Safari WebKit) time to fire before we tear down the + // observer that catches it. Explicit release awaits any + // pending reactivation tasks scheduled in that window. + try? 
await Task.sleep(nanoseconds: 50_000_000) + await lease.release() } return result } catch { if let focusState { await enforcer.reenableActivation(focusState) } - if let handle = suppressionHandle { - await systemPreventer?.endSuppression(handle) + if let lease = suppressionLease { + await lease.release() } throw error } + // If a future edit ever drops one of the explicit `release()` + // calls above, ARC fires the lease's `deinit` when this scope + // unwinds — the entry still gets released. Belt + suspenders. } // MARK: - Helpers } +/// Errors thrown by ``FocusGuard/withFocusSuppressed(pid:element:body:)``. public enum FocusGuardError: Error, CustomStringConvertible, Sendable { + /// The target window is minimized in the Dock; AX actions on it + /// would force-deminiaturize it (especially in Chrome). Caller must + /// either unminimize first or use a keyboard-input alternative + /// (`type_text_chars`, `press_key`) that does not have this side + /// effect. case windowMinimized(pid: pid_t) + /// Human-readable description of the error including the recovery + /// hint. `Tool.Content.text` propagates this directly to MCP + /// clients. public var description: String { switch self { case .windowMinimized(let pid): diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/SystemFocusStealPreventer.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/SystemFocusStealPreventer.swift index 7145f203e..28522f758 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/SystemFocusStealPreventer.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Focus/SystemFocusStealPreventer.swift @@ -1,10 +1,19 @@ import AppKit import Foundation +import os /// An opaque handle returned by ``SystemFocusStealPreventer/beginSuppression``. 
/// Pass the same handle to ``SystemFocusStealPreventer/endSuppression`` to /// stop suppressing for that particular target; other concurrent suppressions /// stay active until their own handles are ended. +/// +/// **Prefer ``SystemFocusStealPreventer/withSuppression(targetPid:restoreTo:origin:body:)`` +/// over manual `begin`/`end` whenever the suppression's lifetime fits inside +/// a single async function** — the closure form is leak-proof by construction. +/// When the lifetime must span function boundaries (e.g. a snapshot taken +/// before an action and released after side-effect detection), prefer +/// ``SuppressionLease`` over raw handles — the lease releases the entry in +/// `deinit`, so ARC catches leaks that scope-bound defers cannot. public struct SuppressionHandle: Sendable, Hashable { fileprivate let id: UUID @@ -13,6 +22,78 @@ public struct SuppressionHandle: Sendable, Hashable { } } +/// Reference-typed lease for a focus suppression entry. Releases the entry +/// in `deinit`, which is ARC's strongest available guarantee that no exit +/// path — including thrown errors, task cancellation, or future call-site +/// regressions — can leak the underlying registration. +/// +/// Construct via ``SystemFocusStealPreventer/leaseSuppression(targetPid:restoreTo:origin:)``. +/// Call ``release()`` explicitly when you want to await pending reactivation +/// tasks; otherwise just drop the lease and ARC will fire a fire-and-forget +/// cleanup. `release()` is idempotent. +/// +/// This is the recommended API for the snapshot/detect pattern where the +/// suppression's lifetime must span function boundaries — the lease can be +/// stored in a struct and the cleanup is guaranteed by the language, not +/// by call-site discipline. 
+public final class SuppressionLease: @unchecked Sendable { + private let preventer: SystemFocusStealPreventer + private let handle: SuppressionHandle + /// `OSAllocatedUnfairLock` rather than `NSLock`+`var` because Swift 6 + /// bans `NSLock.lock()` from async contexts (the kernel-level priority- + /// inversion guarantees of `os_unfair_lock` mean the runtime can prove + /// the critical section is bounded). This is the platform-idiomatic + /// async-safe replacement for "lock + bool flag" patterns. macOS 13+, + /// and we target macOS 14, so it's freely available. + private let releasedFlag = OSAllocatedUnfairLock(initialState: false) + + /// The handle for the underlying entry. Useful for callers that want to + /// pass through the legacy ``SystemFocusStealPreventer/endSuppression(_:)`` + /// API; new code should prefer ``release()``. + public var rawHandle: SuppressionHandle { handle } + + fileprivate init(preventer: SystemFocusStealPreventer, handle: SuppressionHandle) { + self.preventer = preventer + self.handle = handle + } + + /// Release the lease and await any in-flight reactivation tasks. + /// Idempotent: calling more than once is a no-op. Concurrent calls are + /// race-safe — exactly one will perform the dispatcher remove, the + /// rest return early. + public func release() async { + // Atomic test-and-set. Returns the prior value; we proceed only + // when we were the first caller to flip false→true. + let alreadyReleased = releasedFlag.withLock { released in + let prior = released + released = true + return prior + } + if alreadyReleased { return } + await preventer.endSuppression(handle) + } + + deinit { + // ARC safety net: the holder dropped us without calling release(). + // Same atomic test-and-set as release(), but we can't await from a + // deinit so we hand the cleanup to a detached Task. Pending + // reactivation tasks scheduled by the observer are orphaned — + // they're harmless idempotent `activate(options: [])` calls. 
The + // deadline eviction in the dispatcher (layer 3) catches the same + // case in bounded time even if this Task is never scheduled, so + // we lose nothing by fire-and-forgetting here. + let alreadyReleased = releasedFlag.withLock { released in + let prior = released + released = true + return prior + } + if alreadyReleased { return } + let p = preventer + let h = handle + Task.detached { await p.endSuppression(h) } + } +} + /// Layer 3 of the focus-suppression stack. Reactively counters the /// "target app called `NSApp.activate(ignoringOtherApps:)` in its own /// `applicationDidFinishLaunching`" failure mode. @@ -48,10 +129,33 @@ public struct SuppressionHandle: Sendable, Hashable { /// `CGSRegisterConnectionNotifyProc` / kCPS notifications, which we /// deliberately do not take a dependency on. /// -/// Multiple concurrent suppressions are supported — each `beginSuppression` -/// call returns a distinct handle and adds an entry to the internal map. -/// The shared `NSWorkspace` observer is installed on the first suppression -/// and removed when the last handle is ended. +/// ## Lifetime safety +/// +/// The shared dispatcher applies four overlapping guarantees so that no +/// single bug can resurrect the v0.1.9 focus-trap regression where a +/// leaked wildcard entry hijacked every app activation in the OS for the +/// rest of the process's life: +/// +/// 1. **Closure scope (preferred)** — ``withSuppression(targetPid:restoreTo:origin:body:)`` +/// pairs begin/end with `defer`. No handle escapes the closure. +/// 2. **ARC scope** — ``leaseSuppression(targetPid:restoreTo:origin:)`` returns +/// a ``SuppressionLease`` that ends the entry in `deinit`. Catches any +/// control flow scope-defer cannot — thrown errors between begin and end, +/// task cancellation, future call-site regressions. +/// 3. **Wall-clock deadline** — every entry carries a ``maxLifetimeNs`` +/// expiry (default 5 s). 
The observer evicts expired entries on every +/// fire; a janitor task evicts during idle. **Worst-case leak duration is +/// bounded by ``maxLifetimeNs``, independent of every other layer.** +/// 4. **Observability** — every entry carries an ``origin`` tag and the +/// dispatcher logs a warning when active count crosses +/// ``warnActiveThreshold`` or when the deadline reaper fires. Future +/// leaks surface in `log show --process cua-driver` instead of silently +/// stealing focus. +/// +/// Multiple concurrent suppressions are supported — each registration adds +/// an entry to the internal map. The shared `NSWorkspace` observer is +/// installed on the first suppression and removed when the last entry is +/// gone (whether removed manually, by lease deinit, or by deadline). public actor SystemFocusStealPreventer { /// Delay between observing the target's self-activation and firing /// the restoring `activate(options: [])`. Tradeoff: @@ -74,35 +178,143 @@ public actor SystemFocusStealPreventer { /// several frames' worth of runloop turns inside /// `applicationDidFinishLaunching` BEFORE our demote reaches /// WindowServer — the activation notification itself is async. - /// Calculator still gets its window created (orthogonal path via - /// the `hides=YES` + `unhide()` dance). Chrome still gets its - /// URL handoff processed. Net: zero-delay demote is strictly - /// better. - private static let suppressionDelayNs: UInt64 = 0 + /// Calculator-with-no-window has been verified to be a separate + /// issue (`activates = false` swallows the initial window event) + /// and tuning this delay does not rescue it. + public static let suppressionDelayNs: UInt64 = 0 + + /// Wall-clock upper bound on a suppression entry's lifetime. The + /// dispatcher evicts entries older than this whenever the observer + /// fires or the janitor runs. 
Set well above the longest legitimate + /// click + detect window (≈1.3 s) so the safety net never trips + /// during normal operation, but tight enough that a runaway leak + /// recovers in seconds rather than the entire process lifetime. + /// + /// This bound is the layer-3 safety net that makes ``SuppressionLease`` + /// `deinit` and ``withSuppression`` `defer` mistakes recoverable. + public static let maxLifetimeNs: UInt64 = 5_000_000_000 // 5 s + + /// How often the janitor task wakes up during idle to evict expired + /// entries when no NSWorkspace activation events arrive. Cheap — + /// just a lock + dictionary scan. Keeps the worst-case eviction + /// latency at `maxLifetimeNs + janitorIntervalNs`. + public static let janitorIntervalNs: UInt64 = 1_000_000_000 // 1 s + + /// Active-entry count above which the dispatcher logs a warning to the + /// unified log. Legitimate workloads have at most ~2 concurrent + /// suppressions (one from `WindowChangeDetector.snapshot()`, one from + /// `LaunchAppTool`'s placeholder→pid swap). Anything above 2 is + /// suspicious; above this threshold it's almost certainly a leak. + public static let warnActiveThreshold: Int = 4 + + /// Default origin tag used when a caller doesn't supply one. Surfaces + /// in leak warnings as a fallback so we can still grep for the file. + fileprivate static let unknownOrigin = "" private let dispatcher: Dispatcher + private let janitorIntervalNs: UInt64 + private var janitorTask: Task? - public init() { - self.dispatcher = Dispatcher(suppressionDelayNs: Self.suppressionDelayNs) + /// Designated initializer. Production callers use the default values + /// for `maxLifetimeNs` / `janitorIntervalNs` / `warnActiveThreshold` + /// — those are the safety-net knobs and there's no good reason to + /// vary them in production. Tests pass tight values to verify the + /// layer-3 reaper deterministically. 
+ /// + /// Actors don't support `convenience` inits (they have a flat init + /// model), so we expose one initializer with sensible defaults. + public init( + suppressionDelayNs: UInt64 = SystemFocusStealPreventer.suppressionDelayNs, + maxLifetimeNs: UInt64 = SystemFocusStealPreventer.maxLifetimeNs, + janitorIntervalNs: UInt64 = SystemFocusStealPreventer.janitorIntervalNs, + warnActiveThreshold: Int = SystemFocusStealPreventer.warnActiveThreshold + ) { + self.dispatcher = Dispatcher( + suppressionDelayNs: suppressionDelayNs, + maxLifetimeNs: maxLifetimeNs, + warnActiveThreshold: warnActiveThreshold + ) + self.janitorIntervalNs = janitorIntervalNs } - /// Begin suppressing focus-steal events for `targetPid`. Any - /// `NSWorkspace.didActivateApplicationNotification` that fires while the - /// suppression is active and names `targetPid` as the newly-active app - /// schedules a delayed `restoreTo.activate(options: [])` on the main - /// actor to steal focus back onto whatever was frontmost before the - /// launch. + // MARK: - Closure-scoped (preferred) + + /// Run `body` while a suppression entry is active. The entry is + /// guaranteed to be released on every exit path — return, throw, task + /// cancellation. No handle escapes the closure, so callers cannot + /// forget to release. + /// + /// This is the strongest available API: the language enforces the + /// lifetime. Use it whenever the suppression fits inside a single + /// async function. 
+ @discardableResult + public func withSuppression( + targetPid: pid_t, + restoreTo: NSRunningApplication, + origin: StaticString = #function, + body: @Sendable () async throws -> T + ) async rethrows -> T { + let handle = dispatcher.add( + targetPid: targetPid, restoreTo: restoreTo, origin: "\(origin)" + ) + startJanitorIfNeeded() + do { + let result = try await body() + await endSuppression(handle) + return result + } catch { + await endSuppression(handle) + throw error + } + } + + // MARK: - ARC-scoped + + /// Register a suppression and return a ``SuppressionLease`` that ends + /// it in `deinit`. Use this when the lifetime must span function + /// boundaries (e.g. snapshot/detect pattern) and a closure scope won't + /// work. ARC catches leaks that scope-defers cannot. + /// + /// The caller can call ``SuppressionLease/release()`` to await pending + /// reactivation tasks; if the caller simply drops the lease, ARC fires + /// a fire-and-forget cleanup. Either way the entry is released. + public func leaseSuppression( + targetPid: pid_t, + restoreTo: NSRunningApplication, + origin: StaticString = #function + ) -> SuppressionLease { + let handle = dispatcher.add( + targetPid: targetPid, restoreTo: restoreTo, origin: "\(origin)" + ) + startJanitorIfNeeded() + return SuppressionLease(preventer: self, handle: handle) + } + + // MARK: - Manual (deprecated; kept for migration) + + /// Begin suppressing. Manual lifetime — caller is responsible for + /// matching ``endSuppression(_:)``. **Prefer ``withSuppression`` or + /// ``leaseSuppression`` over this manual API.** Direct begin/end pairs + /// are vulnerable to leaks across error and async boundaries; the + /// scoped APIs above make those leaks impossible. /// /// Returns a handle that must be passed to ``endSuppression(_:)`` to /// stop the suppression. Overlapping calls for different targets are - /// independent — each registers its own `(pid, restoreTo)` entry. 
+ /// independent — each registers its own `(pid, restoreTo)` entry. The + /// underlying entry is also subject to the dispatcher's + /// ``maxLifetimeNs`` deadline, so a forgotten end will self-recover + /// in bounded time. + @available(*, deprecated, message: "Prefer withSuppression { … } (closure-scoped) or leaseSuppression() (ARC-scoped). Manual begin/end pairs are leak-prone across error and async boundaries.") @discardableResult public func beginSuppression( targetPid: pid_t, - restoreTo: NSRunningApplication + restoreTo: NSRunningApplication, + origin: StaticString = #function ) async -> SuppressionHandle { - let handle = SuppressionHandle() - dispatcher.add(handle: handle, targetPid: targetPid, restoreTo: restoreTo) + let handle = dispatcher.add( + targetPid: targetPid, restoreTo: restoreTo, origin: "\(origin)" + ) + startJanitorIfNeeded() return handle } @@ -120,6 +332,49 @@ public actor SystemFocusStealPreventer { _ = await task.value } } + + // MARK: - Diagnostics + + /// Number of currently-active suppression entries. Test/diagnostic-only. + public var activeCount: Int { + dispatcher.activeCount + } + + // MARK: - Janitor + + private func startJanitorIfNeeded() { + if janitorTask != nil { return } + let dispatcher = self.dispatcher + let interval = self.janitorIntervalNs + janitorTask = Task.detached(priority: .background) { [weak self] in + while !Task.isCancelled { + try? await Task.sleep(nanoseconds: interval) + let evicted = dispatcher.reapExpired() + for task in evicted { _ = await task.value } + // Idle shutdown: when the dispatcher has no entries and + // observer is torn down, stop the janitor. + if await self?.shouldStopJanitor() ?? true { break } + } + await self?.clearJanitor() + } + } + + /// Test-only: force a reap pass without waiting for the janitor or + /// an `NSWorkspace` activation. Production code should never call + /// this — eviction is automatic. 
Exposed for unit tests so the + /// layer-3 deadline contract can be verified deterministically. + public func _forceReapForTesting() async { + let pending = dispatcher.reapExpired() + for task in pending { _ = await task.value } + } + + private func shouldStopJanitor() -> Bool { + dispatcher.activeCount == 0 + } + + private func clearJanitor() { + janitorTask = nil + } } // MARK: - Dispatcher @@ -134,27 +389,88 @@ private final class Dispatcher: @unchecked Sendable { private struct Entry { let targetPid: pid_t let restoreTo: NSRunningApplication + let origin: String + /// Wall-clock deadline (mach_absolute_time-style monotonic ns). + /// Layer-3 safety net: when the observer fires or the janitor + /// runs, any entry with `now > deadline` is force-evicted. + let deadline: UInt64 } private let suppressionDelayNs: UInt64 + private let maxLifetimeNs: UInt64 + private let warnActiveThreshold: Int + private let lock = NSLock() private var entries: [UUID: Entry] = [:] private var pendingRestoreTasks: [Task] = [] private var observer: NSObjectProtocol? - init(suppressionDelayNs: UInt64) { + /// Unified-log subsystem. Routed through `os.Logger` so the messages + /// appear in `log show --process cua-driver` and `log stream`. We + /// don't take a swift-log dependency — `os.Logger` is free, builds + /// into Console.app, and is the right tool for "operator wants to + /// see what the driver did last Tuesday" diagnostics. + private let logger = Logger( + subsystem: "io.trycua.cua-driver", category: "FocusStealPreventer" + ) + + init(suppressionDelayNs: UInt64, maxLifetimeNs: UInt64, warnActiveThreshold: Int) { self.suppressionDelayNs = suppressionDelayNs + self.maxLifetimeNs = maxLifetimeNs + self.warnActiveThreshold = warnActiveThreshold } - func add(handle: SuppressionHandle, targetPid: pid_t, restoreTo: NSRunningApplication) { + var activeCount: Int { + lock.lock(); defer { lock.unlock() } + return entries.count + } + + /// Register a new entry and return its handle. 
Installs the shared + /// `NSWorkspace` observer if this is the first entry. Logs a warning + /// if the active count crosses the leak-suspicion threshold so future + /// regressions surface in the unified log instead of silently + /// stealing focus. + func add( + targetPid: pid_t, restoreTo: NSRunningApplication, origin: String + ) -> SuppressionHandle { + let handle = SuppressionHandle() + let deadline = monotonicNow() &+ maxLifetimeNs + lock.lock() - entries[handle.id] = Entry(targetPid: targetPid, restoreTo: restoreTo) + entries[handle.id] = Entry( + targetPid: targetPid, + restoreTo: restoreTo, + origin: origin, + deadline: deadline + ) + let count = entries.count let needsObserver = (observer == nil) + // Snapshot a description list while holding the lock so we can + // log without re-acquiring it. + let leakSuspect = count > warnActiveThreshold + let originList = leakSuspect ? entries.values.map(\.origin).sorted() : [] lock.unlock() if needsObserver { installObserver() } + + if leakSuspect { + // Surface, don't crash. A leak is a bug we want to fix; an + // assert in production breaks the user's automation. Log it + // loudly in the unified log instead — operators can grep for + // "FocusStealPreventer leak" and the origin list pinpoints + // the call sites holding the entries. + logger.warning( + """ + FocusStealPreventer leak suspect: \(count, privacy: .public) active \ + entries (threshold \(self.warnActiveThreshold, privacy: .public)). \ + Origins: \(originList.joined(separator: ", "), privacy: .public) + """ + ) + } + + return handle } /// Removes the entry for `handle` and returns any in-flight @@ -182,6 +498,56 @@ private final class Dispatcher: @unchecked Sendable { return pending } + /// Layer-3 safety net: scan for entries past their deadline and force- + /// evict them. Returns any pending reactivation tasks that the caller + /// can drain. 
+ /// + /// Called from two places: (1) the janitor task on a timer, (2) the + /// activation observer on every fire. The observer-side reap is what + /// makes a leaked wildcard entry stop hijacking activations *before* + /// the next user app-switch — even if the janitor is starved. + @discardableResult + func reapExpired() -> [Task] { + let now = monotonicNow() + + lock.lock() + var evicted: [(UUID, Entry)] = [] + for (id, entry) in entries where now > entry.deadline { + evicted.append((id, entry)) + entries.removeValue(forKey: id) + } + let shouldRemoveObserver = entries.isEmpty && !evicted.isEmpty + let token = observer + if shouldRemoveObserver { + observer = nil + } + let pending = shouldRemoveObserver ? pendingRestoreTasks : [] + if shouldRemoveObserver { + pendingRestoreTasks = [] + } + lock.unlock() + + if shouldRemoveObserver, let token { + NSWorkspace.shared.notificationCenter.removeObserver(token) + } + + for (_, entry) in evicted { + // Errors, not warnings: deadline reap means a higher-layer + // guarantee (closure defer / lease deinit) failed. Surface + // loudly so the next operator pass can find it. + logger.error( + """ + FocusStealPreventer deadline reap: evicted entry origin=\ + \(entry.origin, privacy: .public) targetPid=\ + \(entry.targetPid, privacy: .public). This indicates a \ + missing release path; investigate the named origin. + """ + ) + } + + return pending + } + private func installObserver() { // queue: nil delivers the callback synchronously on the posting // thread. NSWorkspace posts on main, so the activation handler @@ -218,9 +584,27 @@ private final class Dispatcher: @unchecked Sendable { let activatedPid = app.processIdentifier + // Reap on every fire. Cheap (one dictionary scan) and bounds the + // worst-case leak duration to `maxLifetimeNs` — the leaked entry + // stops hijacking activations *before* this very fire schedules a + // restore task. 
+ reapExpired() + lock.lock() + // Match entries where: + // - targetPid == activatedPid (specific target suppression), OR + // - targetPid == 0 (wildcard: suppress any activation that + // isn't restoreTo — used by the side-effect + // guard in WindowChangeDetector so that a + // background click opening a new app, e.g. + // UTM Gallery → Safari, is suppressed even + // though we didn't know Safari's pid ahead + // of time.) let restoreCandidates = entries.values - .filter { $0.targetPid == activatedPid } + .filter { + $0.targetPid == activatedPid || + ($0.targetPid == 0 && activatedPid != $0.restoreTo.processIdentifier) + } .map { $0.restoreTo } lock.unlock() @@ -245,3 +629,15 @@ private final class Dispatcher: @unchecked Sendable { lock.unlock() } } + +// MARK: - Time + +/// Monotonic nanosecond clock for entry deadlines. Uses +/// `clock_gettime(CLOCK_MONOTONIC_RAW)` so jumps in wall time (sleep, +/// NTP slew) cannot accidentally expire entries early or extend leaks. +@inline(__always) +private func monotonicNow() -> UInt64 { + var ts = timespec() + clock_gettime(CLOCK_MONOTONIC_RAW, &ts) + return UInt64(ts.tv_sec) &* 1_000_000_000 &+ UInt64(ts.tv_nsec) +} diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Windows/WindowEnumerator.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Windows/WindowEnumerator.swift index 0afc8464d..026e2fbc5 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Windows/WindowEnumerator.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverCore/Windows/WindowEnumerator.swift @@ -50,10 +50,25 @@ public enum WindowEnumerator { /// callers that also need `bounds` (e.g. the auth-signed click recipe that /// computes a window-local point via `CGEventSetWindowLocation`) can /// read both off a single query. 
+ /// + /// Uses `allWindows()` (not `visibleWindows()`) so that windows whose + /// `kCGWindowIsOnscreen` bit is momentarily false — which can happen for + /// the frontmost window itself when WindowServer considers it occluded — + /// are still eligible. Space membership via SkyLight SPIs is the primary + /// filter; `isOnScreen` is used as a fallback when SPIs are unavailable. public static func frontmostWindow(forPid pid: Int32) -> WindowInfo? { - let candidates = visibleWindows() - .filter { $0.pid == pid && $0.isOnScreen } + let currentSpace = SpaceMigrator.currentSpaceID() + let candidates = allWindows() + .filter { $0.pid == pid && $0.layer == 0 } .filter { $0.bounds.width > 1 && $0.bounds.height > 1 } + .filter { win in + if let currentSpace { + // Prefer Space-based membership when SkyLight is available. + let spaces = SpaceMigrator.spaceIDs(forWindowID: UInt32(win.id)) + return spaces?.contains(currentSpace) ?? win.isOnScreen + } + return win.isOnScreen + } return candidates.max(by: { $0.zIndex < $1.zIndex }) } diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/CuaDriverMCPServer.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/CuaDriverMCPServer.swift index 3add416cc..2b67bcf0e 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/CuaDriverMCPServer.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/CuaDriverMCPServer.swift @@ -27,4 +27,183 @@ public enum CuaDriverMCPServer { return server } + + /// Build an MCP Server whose `ListTools` / `CallTool` handlers forward + /// every request to a running `cua-driver serve` daemon over its Unix + /// domain socket. Used by the `mcp` subcommand's TCC-sidestep path: + /// when stdio MCP is spawned from an IDE terminal, the process inherits + /// the terminal's TCC responsibility chain so AX probes silently fail. 
+ /// Proxying through the daemon — which runs under LaunchServices and is + /// correctly attributed to `com.wefonk.deepchat.computeruse` — gives MCP clients + /// identical behavior without requiring an external Python bridge. + /// + /// `claudeCodeComputerUseCompat` advertises the compat tool set in + /// `ListTools`, but every `CallTool` still hits the daemon. The daemon + /// always exposes the full native registry; the shim is purely a + /// client-side rename of `screenshot` and is implemented entirely by + /// the in-process MCP layer. When proxying, we therefore rewrite the + /// `screenshot` tool advertised to the client into its compat-mode + /// shape and translate inbound `screenshot` calls back into the + /// equivalent native daemon call. + public static func makeProxy( + serverName: String = "cua-driver", + version: String = CuaDriverCore.version, + socketPath: String, + claudeCodeComputerUseCompat: Bool = false + ) async throws -> Server { + let server = Server( + name: serverName, + version: version, + capabilities: Server.Capabilities(tools: .init(listChanged: false)) + ) + + // Cache the tool list once at startup. Daemon registries are + // static — every connected client sees the same handlers — so a + // single fetch is enough for the life of the stdio MCP session. + // Fail fast on a missing/unhealthy daemon so the MCP client sees + // a clear startup error instead of a "successful" handshake that + // advertises zero tools and then errors on every `CallTool`. 
+ let cachedToolList = try await fetchProxyToolList( + socketPath: socketPath, + claudeCodeComputerUseCompat: claudeCodeComputerUseCompat + ) + + await server.withMethodHandler(ListTools.self) { _ in + ListTools.Result(tools: cachedToolList) + } + + await server.withMethodHandler(CallTool.self) { params in + let (name, args) = rewriteForProxy( + name: params.name, + arguments: params.arguments, + claudeCodeComputerUseCompat: claudeCodeComputerUseCompat + ) + return try await forwardCallToDaemon( + name: name, + arguments: args, + socketPath: socketPath + ) + } + + return server + } + + /// Translate `(name, arguments)` from the MCP client's view of the + /// compat tool surface into the native daemon registry's view. + /// + /// Compat-mode `screenshot` takes `{pid, window_id}` and returns a + /// JPEG; the daemon's native `screenshot` takes `{window_id, format, + /// quality}` and defaults to PNG. We map the former onto the latter + /// by dropping the unused `pid` and pinning `format: "jpeg", + /// quality: 85` to match the compat shim's output shape. + /// + /// Non-compat mode passes through unchanged. + private static func rewriteForProxy( + name: String, + arguments: [String: Value]?, + claudeCodeComputerUseCompat: Bool + ) -> (String, [String: Value]?) { + guard claudeCodeComputerUseCompat else { return (name, arguments) } + if name == "screenshot" { + var rewritten: [String: Value] = [:] + if let windowID = arguments?["window_id"] { + rewritten["window_id"] = windowID + } + rewritten["format"] = .string("jpeg") + rewritten["quality"] = .int(85) + return (name, rewritten) + } + return (name, arguments) + } + + /// One-shot daemon `list` over the UDS, with the compat-mode rename + /// applied client-side. 
Throws a descriptive `MCPError.internalError` + /// if the daemon is unreachable, transport-failed, or returned an + /// unexpected envelope — surfacing the failure during `makeProxy`'s + /// init rather than producing a proxy that advertises zero tools and + /// errors on every subsequent `CallTool`. + private static func fetchProxyToolList( + socketPath: String, + claudeCodeComputerUseCompat: Bool + ) async throws -> [Tool] { + let request = DaemonRequest(method: "list") + let result = DaemonClient.sendRequest(request, socketPath: socketPath) + let tools: [Tool] + switch result { + case .noDaemon: + throw MCPError.internalError( + "cua-driver daemon not reachable on \(socketPath). " + + "Start it with `open -n -g -a \"DeepChat Computer Use\" --args serve` and retry." + ) + case .error(let message): + throw MCPError.internalError( + "cua-driver daemon transport error while listing tools on \(socketPath): \(message)" + ) + case .ok(let response): + guard response.ok, case let .list(listed) = response.result else { + let reason = response.error ?? "daemon returned unexpected result kind for list" + throw MCPError.internalError( + "cua-driver daemon refused tool list on \(socketPath): \(reason)" + ) + } + tools = listed + } + if !claudeCodeComputerUseCompat { + return tools + } + // Compat mode: swap the native `screenshot` tool descriptor for + // the window-only shim's descriptor so MCP clients see the same + // schema they'd see in the in-process compat registry. + let compatHandlers = ClaudeCodeComputerUseCompatTools.all + let compatToolsByName = Dictionary( + uniqueKeysWithValues: compatHandlers.map { ($0.tool.name, $0.tool) } + ) + return tools.map { tool in + compatToolsByName[tool.name] ?? tool + } + } + + /// Forward a single `CallTool` invocation to the daemon and translate + /// the `DaemonResponse` back into an MCP `CallTool.Result` (or throw + /// `MCPError` on protocol-level failures). + /// + /// Tool-level errors — i.e. 
the tool ran but returned `isError: true` + /// — round-trip cleanly as part of the `.call` payload, so MCP clients + /// see exactly the same error envelope they would in the in-process + /// path. Only daemon-level failures (socket gone, decode error, unknown + /// tool) throw. + private static func forwardCallToDaemon( + name: String, + arguments: [String: Value]?, + socketPath: String + ) async throws -> CallTool.Result { + let request = DaemonRequest(method: "call", name: name, args: arguments) + // Match the daemon's own per-call read budget. AX-heavy tools + // (e.g. `screenshot`, `get_window_state`) regularly take a few + // seconds; the default 120s in `DaemonClient` is plenty. + let result = DaemonClient.sendRequest(request, socketPath: socketPath) + switch result { + case .noDaemon: + throw MCPError.internalError( + "cua-driver daemon not reachable on \(socketPath). " + + "Start it with `open -n -g -a \"DeepChat Computer Use\" --args serve` and retry." + ) + case .error(let message): + throw MCPError.internalError("daemon transport: \(message)") + case .ok(let response): + if !response.ok { + let reason = response.error ?? "daemon reported failure" + if response.exitCode == DaemonExit.usage { + throw MCPError.invalidParams(reason) + } + throw MCPError.internalError(reason) + } + guard case let .call(callResult) = response.result else { + throw MCPError.internalError( + "daemon returned unexpected result kind for call" + ) + } + return callResult + } + } } diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/ToolRegistry.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/ToolRegistry.swift index 6a152bf3e..6113c345f 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/ToolRegistry.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/ToolRegistry.swift @@ -52,12 +52,29 @@ public struct ToolRegistry: Sendable { ] public func call(_ name: String, arguments: [String: Value]?) 
async throws -> CallTool.Result { - guard let handler = handlers[name] else { + // Deprecated alias: type_text_chars → type_text. Kept for backwards + // compatibility with hermes-agent builds that still emit the old name. + // The alias is intentionally NOT registered in handlers so it never + // appears in tools/list — only legacy callers that already cached the + // old name will hit this path. + let effectiveName: String + if name == "type_text_chars" { + FileHandle.standardError.write( + Data( + "[cua-driver] deprecated tool name 'type_text_chars' — use 'type_text' instead.\n" + .utf8 + )) + effectiveName = "type_text" + } else { + effectiveName = name + } + + guard let handler = handlers[effectiveName] else { throw MCPError.invalidParams("Unknown tool: \(name)") } // Capture monotonic start time before any animation or side-effect // so the recorded span brackets the full action duration. - let actionStartNs: UInt64 = Self.actionToolNames.contains(name) + let actionStartNs: UInt64 = Self.actionToolNames.contains(effectiveName) ? clock_gettime_nsec_np(CLOCK_UPTIME_RAW) : 0 let result = try await handler.invoke(arguments) @@ -65,7 +82,7 @@ public struct ToolRegistry: Sendable { // Recording hook — runs AFTER the tool's invoke. Errors inside // the recorder are swallowed by the actor; the tool caller // never sees a recording-path failure. - if Self.actionToolNames.contains(name), + if Self.actionToolNames.contains(effectiveName), await RecordingSession.shared.isEnabled() { // Bind the shared engine lazily. `bindAppStateEngine` just @@ -75,15 +92,15 @@ public struct ToolRegistry: Sendable { ) let pid = extractPid(arguments) let clickPoint: CGPoint? 
- if Self.clickFamilyToolNames.contains(name) { + if Self.clickFamilyToolNames.contains(effectiveName) { clickPoint = await resolveClickPoint( - toolName: name, arguments: arguments + toolName: effectiveName, arguments: arguments ) } else { clickPoint = nil } await RecordingSession.shared.record( - toolName: name, + toolName: effectiveName, arguments: snapshotArguments(arguments), pid: pid, clickPoint: clickPoint, diff --git a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/Tools/ClickTool.swift b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/Tools/ClickTool.swift index 8ada96ae6..8d138c202 100644 --- a/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/Tools/ClickTool.swift +++ b/plugins/cua/vendor/cua-driver/source/Sources/CuaDriverServer/Tools/ClickTool.swift @@ -252,6 +252,8 @@ public enum ClickTool { guard let axAction = axActionByName[actionName] else { return errorResult("Unknown action: \(actionName).") } + // Snapshot before the action so we can detect cross-app side-effects. + let snap = await WindowChangeDetector.snapshot() do { let element = try await AppStateRegistry.engine.lookup( pid: pid, @@ -345,6 +347,15 @@ public enum ClickTool { // period and arm the idle-hide timer. No-op when // disabled. await AgentCursor.shared.finishClick(pid: pid) + // Detect side-effects: new windows or foreground-app change triggered + // by this action (e.g. "Browse UTM Gallery" opens Safari, or + // "Open in UTM" hands off to UTM via a URL scheme). + let changes = await WindowChangeDetector.detectChanges(snapshot: snap) + if let origPid = snap.frontPid, changes.needsRestore { + await MainActor.run { + WindowChangeDetector.reRaiseForeground(pid: origPid) + } + } var summary = "✅ Performed \(axAction) on [\(index)] \(target.role ?? "?") \"\(target.title ?? "")\"." // For popup buttons (HTML