diff --git a/.env.example b/.env.example index 66eecbf..3bcaecf 100644 --- a/.env.example +++ b/.env.example @@ -10,3 +10,4 @@ HYPERBROWSER_API_KEY= ONKERNEL_API_KEY= REBROWSER_API_KEY= STEEL_API_KEY= +DRIVER_API_KEY= \ No newline at end of file diff --git a/browsers/__init__.py b/browsers/__init__.py index 808ce40..0862591 100644 --- a/browsers/__init__.py +++ b/browsers/__init__.py @@ -21,6 +21,7 @@ async def disconnect() -> None -- cleans up the session "anchor", "browserbase", "browserless", + "driver", "hyperbrowser", "local_headful", "local_headless", diff --git a/browsers/driver.py b/browsers/driver.py new file mode 100644 index 0000000..00b8ec7 --- /dev/null +++ b/browsers/driver.py @@ -0,0 +1,43 @@ +import os + +import httpx + +from browsers import retry_on_429 + +_sessions: list[str] = [] + +CDP_PROXY_URL = os.environ.get( + "CDP_PROXY_URL", "https://bu-compat.driver.dev" +).rstrip("/") + + +async def connect() -> str: + async def _create(): + async with httpx.AsyncClient() as client: + resp = await client.post( + f"{CDP_PROXY_URL}/v1/proxy/session", + headers={"Authorization": f"Bearer {os.environ['DRIVER_API_KEY']}"}, + json={"captchaSolver": True, "type": "hosted", "country": "CA"}, + timeout=60, + ) + resp.raise_for_status() + return resp.json() + + data = await retry_on_429(_create) + _sessions.append(data["data"]["sessionId"]) + return data["data"]["cdpUrl"] + + +async def disconnect() -> None: + if not _sessions: + return + session_id = _sessions.pop() + try: + async with httpx.AsyncClient() as client: + await client.delete( + f"{CDP_PROXY_URL}/v1/proxy/session/{session_id}", + headers={"Authorization": f"Bearer {os.environ['DRIVER_API_KEY']}"}, + timeout=30, + ) + except Exception: + pass # Best effort cleanup diff --git a/run_eval.py b/run_eval.py index 104ed4c..3e98e36 100644 --- a/run_eval.py +++ b/run_eval.py @@ -4,11 +4,13 @@ uv run python run_eval.py # defaults: browser-use-cloud + bu-2-0 uv run python run_eval.py --browser anchor # use Anchor Browser provider uv run python run_eval.py --browser local_headless # use local headless Chromium - uv run python run_eval.py --tasks 5 # run only 5 tasks + uv run python run_eval.py --tasks 5 # run first 5 tasks + uv run python run_eval.py --task-ids 2 5 12 14 # rerun specific task IDs + uv run python run_eval.py --task-ids 29-35 # rerun a range of task IDs Available browsers: browser-use-cloud (default), anchor, browserbase, - browserless, hyperbrowser, local_headful, local_headless, onkernel, - rebrowser, steel + browserless, driver, hyperbrowser, local_headful, local_headless, + onkernel, rebrowser, steel """ # Fix for MacOS users using uv without SSL certificate setup @@ -41,8 +43,11 @@ # Judge LLM - always use gemini-2.5-flash for consistent judging across all evaluations JUDGE_LLM = ChatGoogle(model="gemini-2.5-flash", api_key=os.getenv("GOOGLE_API_KEY")) -TASKS_FILE = Path(__file__).parent / "BU_Bench_V1.enc" -MAX_CONCURRENT = 3 +BENCH_NAMES = { + "bu": "BU_Bench_V1", + "stealth": "Stealth_Bench_V1", +} +MAX_CONCURRENT = 1 TASK_TIMEOUT = 1800 # 30 minutes max per task AGENT_FRAMEWORK_NAME = "BrowserUse" @@ -60,9 +65,10 @@ def encode_screenshots(paths: list[str]) -> list[str]: return result -def load_tasks() -> list[dict]: - key = base64.urlsafe_b64encode(hashlib.sha256(b"BU_Bench_V1").digest()) - encrypted = base64.b64decode(TASKS_FILE.read_text()) +def load_tasks(bench_name: str) -> list[dict]: + tasks_file = Path(__file__).parent / f"{bench_name}.enc" + key = base64.urlsafe_b64encode(hashlib.sha256(bench_name.encode()).digest()) + encrypted = base64.b64decode(tasks_file.read_text()) return json.loads(Fernet(key).decrypt(encrypted)) @@ -211,7 +217,13 @@ async def run_task( async def main(): - parser = argparse.ArgumentParser(description="Run BU_Bench_V1 evaluation") + parser = argparse.ArgumentParser(description="Run benchmark evaluation") + parser.add_argument( + "--bench", + default="bu", + choices=list(BENCH_NAMES.keys()), + help="Benchmark to run: bu (BU_Bench_V1) or stealth (Stealth_Bench_V1) (default: bu)", + ) parser.add_argument( "--browser", default="browser-use-cloud", @@ -222,10 +234,19 @@ async def main(): "--tasks", type=int, default=None, - help="Number of tasks to run (default: all)", + help="Number of tasks to run from the start (default: all)", + ) + parser.add_argument( + "--task-ids", + nargs="+", + default=None, + help="Specific task IDs to run (e.g. 2 5 12 or 29-35 for ranges)", ) args = parser.parse_args() + # Resolve bench name + bench_name = BENCH_NAMES[args.bench] + # Resolve browser provider (None = use native browser-use-cloud path) browser_name = args.browser if browser_name == "browser-use-cloud": @@ -235,15 +256,36 @@ async def main(): # Build run key and paths run_start = datetime.now().strftime("%Y%m%d_%H%M%S") - run_key = f"{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}" + run_key = f"{bench_name}_{AGENT_FRAMEWORK_NAME}_{AGENT_FRAMEWORK_VERSION}_browser_{browser_name}_model_{MODEL_NAME}" run_data_dir = ( Path(__file__).parent / "run_data" / f"{run_key}_start_at_{run_start}" ) results_file = Path(__file__).parent / "results" / f"{run_key}.json" - tasks = load_tasks() - if args.tasks: - tasks = tasks[: args.tasks] + all_tasks = load_tasks(bench_name) + + # Filter tasks + if args.task_ids: + # Parse task IDs: supports individual IDs and ranges (e.g. "29-35") + selected_ids: set[str] = set() + for spec in args.task_ids: + if "-" in spec and not spec.startswith("-"): + start, end = spec.split("-", 1) + for i in range(int(start), int(end) + 1): + selected_ids.add(str(i)) + else: + selected_ids.add(spec) + tasks = [t for t in all_tasks if str(t.get("task_id", "")) in selected_ids] + if not tasks: + print(f"No tasks matched IDs: {selected_ids}") + print(f"Available IDs: {[t.get('task_id') for t in all_tasks[:10]]}...") + return + print(f"Running {len(tasks)} selected task(s): {sorted(selected_ids, key=lambda x: int(x) if x.isdigit() else x)}") + elif args.tasks: + tasks = all_tasks[: args.tasks] + else: + tasks = all_tasks + sem = asyncio.Semaphore(MAX_CONCURRENT) results = await asyncio.gather( *[ diff --git a/stealth_bench/official_plots/accuracy_by_browser_dark.png b/stealth_bench/official_plots/accuracy_by_browser_dark.png index 1d7b0da..32566e9 100644 Binary files a/stealth_bench/official_plots/accuracy_by_browser_dark.png and b/stealth_bench/official_plots/accuracy_by_browser_dark.png differ diff --git a/stealth_bench/official_plots/accuracy_by_browser_dark_old.png b/stealth_bench/official_plots/accuracy_by_browser_dark_old.png new file mode 100644 index 0000000..1d7b0da Binary files /dev/null and b/stealth_bench/official_plots/accuracy_by_browser_dark_old.png differ diff --git a/stealth_bench/official_plots/accuracy_by_browser_light.png b/stealth_bench/official_plots/accuracy_by_browser_light.png index 414a7b8..3e0ef8d 100644 Binary files a/stealth_bench/official_plots/accuracy_by_browser_light.png and b/stealth_bench/official_plots/accuracy_by_browser_light.png differ diff --git a/stealth_bench/official_plots/accuracy_by_browser_light_old.png b/stealth_bench/official_plots/accuracy_by_browser_light_old.png new file mode 100644 index 0000000..414a7b8 Binary files /dev/null and b/stealth_bench/official_plots/accuracy_by_browser_light_old.png differ diff --git a/stealth_bench/official_plots/category_heatmap_dark.png b/stealth_bench/official_plots/category_heatmap_dark.png index 46456f9..20f2d49 100644 Binary files a/stealth_bench/official_plots/category_heatmap_dark.png and b/stealth_bench/official_plots/category_heatmap_dark.png differ diff --git a/stealth_bench/official_plots/category_heatmap_dark_old.png b/stealth_bench/official_plots/category_heatmap_dark_old.png new file mode 100644 index 0000000..46456f9 Binary files /dev/null and b/stealth_bench/official_plots/category_heatmap_dark_old.png differ diff --git a/stealth_bench/official_plots/category_heatmap_light.png b/stealth_bench/official_plots/category_heatmap_light.png index dd184a4..fb84fcc 100644 Binary files a/stealth_bench/official_plots/category_heatmap_light.png and b/stealth_bench/official_plots/category_heatmap_light.png differ diff --git a/stealth_bench/official_plots/category_heatmap_light_old.png b/stealth_bench/official_plots/category_heatmap_light_old.png new file mode 100644 index 0000000..dd184a4 Binary files /dev/null and b/stealth_bench/official_plots/category_heatmap_light_old.png differ diff --git a/stealth_bench/official_results/Stealth_Bench_V1_browser_driver_model_bu-2-0.json b/stealth_bench/official_results/Stealth_Bench_V1_browser_driver_model_bu-2-0.json new file mode 100644 index 0000000..2133769 --- /dev/null +++ b/stealth_bench/official_results/Stealth_Bench_V1_browser_driver_model_bu-2-0.json @@ -0,0 +1,35 @@ +[ + { + "run_start": "20260326_160529", + "tasks_completed": 80, + "tasks_successful": 69, + "total_steps": 1436, + "total_duration": 29982.2, + "total_cost": 0.0, + "tasks_successful_by_category": { + "Cloudflare": 17, + "reCaptcha": 5, + "PerimeterX": 17, + "Datadome": 11, + "hCaptcha": 3, + "GeeTest": 3, + "Akamai": 6, + "Shape": 1, + "Kasada": 1, + "Custom Antibot": 5 + }, + "tasks_total_by_category": { + "Cloudflare": 22, + "reCaptcha": 6, + "PerimeterX": 18, + "Datadome": 13, + "hCaptcha": 3, + "GeeTest": 4, + "Akamai": 6, + "Shape": 1, + "Kasada": 1, + "Temu Slider": 1, + "Custom Antibot": 5 + } + } +] \ No newline at end of file