diff --git a/pyproject.toml b/pyproject.toml index 7bf5f92..4380313 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "scrapingbee-cli" -version = "1.4.1" +version = "1.5.0" description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal." readme = "README.md" license = "MIT" @@ -40,6 +40,8 @@ dependencies = [ "certifi", "click>=8.0", "click-option-group>=0.5.6", + "prompt_toolkit>=3.0", + "rich>=13.0", "scrapy>=2.11", "scrapy-scrapingbee>=0.0.5", ] diff --git a/src/scrapingbee_cli/__init__.py b/src/scrapingbee_cli/__init__.py index dc7d57e..9ba2602 100644 --- a/src/scrapingbee_cli/__init__.py +++ b/src/scrapingbee_cli/__init__.py @@ -3,7 +3,7 @@ import platform import sys -__version__ = "1.4.1" +__version__ = "1.5.0" def user_agent_headers() -> dict[str, str]: @@ -12,7 +12,7 @@ def user_agent_headers() -> dict[str, str]: Returns a dict of headers: User-Agent: ScrapingBee/CLI User-Agent-Client: scrapingbee-cli - User-Agent-Client-Version: 1.4.1 + User-Agent-Client-Version: 1.5.0 User-Agent-Environment: python User-Agent-Environment-Version: 3.14.2 User-Agent-OS: Darwin arm64 diff --git a/src/scrapingbee_cli/batch.py b/src/scrapingbee_cli/batch.py index 2b7a94b..fff83e0 100644 --- a/src/scrapingbee_cli/batch.py +++ b/src/scrapingbee_cli/batch.py @@ -5,6 +5,7 @@ import asyncio import hashlib import os +import sys import time from collections.abc import Awaitable, Callable from dataclasses import dataclass @@ -17,6 +18,13 @@ from .client import Client, parse_usage from .config import BASE_URL, get_api_key +from .theme import ( + echo_warning, + is_repl_mode, + notify_completion, + print_completion_summary, + styled_echo, +) # Map Content-Type (main part, lowercased) to file extension for batch output. 
def get_batch_usage(api_key_flag: str | None) -> dict:
    """Return usage info (max_concurrency, credits) for the resolved API key.

    The usage file cache is consulted first when either of these holds:

    * the process is running inside the REPL, so the REPL-side callers share
      one live ``/usage`` call instead of each issuing their own; or
    * ``SCRAPINGBEE_USAGE_CACHE=1`` is set (legacy test escape hatch).

    On a cache miss the live result is fetched and written back to the cache.
    Without cache opt-in (plain CLI invocation) every call hits the API and
    the cache is neither read nor written.
    """
    api_key = get_api_key(api_key_flag)

    # REPL detection is best-effort: any failure is treated as "not in REPL".
    try:
        from .theme import is_repl_mode

        inside_repl = is_repl_mode()
    except Exception:
        inside_repl = False

    use_cache = inside_repl or os.environ.get("SCRAPINGBEE_USAGE_CACHE") == "1"
    if not use_cache:
        return asyncio.run(_fetch_usage_async(api_key))

    hit = read_usage_file_cache(api_key)
    if hit is not None:
        return hit

    fresh = asyncio.run(_fetch_usage_async(api_key))
    # A failed cache write must never cost the caller the live result.
    try:
        write_usage_file_cache(api_key, fresh)
    except Exception:
        pass
    return fresh
+ ) + else: + click.echo( + f"Warning: concurrency capped at {cap} (plan limit or max {CONCURRENCY_CAP}). " + "Very high concurrency can overload your network.", + err=True, + ) return min(user_concurrency, cap) return max(1, from_usage) @@ -524,11 +554,25 @@ async def run_batch_async( concurrency = min(max(1, concurrency), len(inputs)) source = "from --concurrency" if from_user else "from usage API" total = len(inputs) - click.echo(f"Batch: {total} items, concurrency {concurrency} ({source})", err=True) + if is_repl_mode(): + styled_echo(f"Batch: {total} items, concurrency {concurrency} ({source})", style="info") + else: + click.echo(f"Batch: {total} items, concurrency {concurrency} ({source})", err=True) sem = asyncio.Semaphore(concurrency) completed = 0 failure_count = 0 start_time = time.monotonic() + # Seed the REPL progress widget at 0/total so the user sees the + # honeycomb the moment the batch starts, not after the first item + # finishes. Without this, a slow first request can leave the user + # staring at silence for ~1s before any visual feedback. 
+ if is_repl_mode() and show_progress and total > 0: + try: + from .theme import update_progress_state + + update_progress_state(0, total, rps=None, eta=None, failure_pct=None) + except Exception: + pass async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: nonlocal completed, failure_count @@ -567,23 +611,57 @@ async def run_one(i: int, inp: str) -> tuple[int, BatchResult]: failure_count += 1 if show_progress: elapsed = time.monotonic() - start_time - parts = [f"[{completed}/{total}]"] + rps_val = None + eta_val = None + fail_pct = None if elapsed > 0: - rps = completed / elapsed - parts.append(f"{rps:.0f} req/s") + rps_val = completed / elapsed remaining = total - completed - if rps > 0 and remaining > 0: - parts.append(f"ETA {_format_eta(remaining / rps)}") + if rps_val > 0 and remaining > 0: + eta_val = _format_eta(remaining / rps_val) if failure_count > 0: - pct = failure_count / completed * 100 - parts.append(f"Failures: {pct:.0f}%") - click.echo(f" {' | '.join(parts)}", err=True) + fail_pct = failure_count / completed * 100 + if is_repl_mode(): + # Push the latest counts/rates into the shared progress + # state. ``update_progress_state`` renders immediately + # AND the REPL ticker will keep re-rendering at ~10 Hz + # so the boundary hex shimmers between completions. 
def _batch_done(
    plain_msg: str,
    *,
    succeeded: int = 0,
    failed: int = 0,
    duration_s: float | None = None,
    output_path: str | None = None,
    err: bool = True,
) -> None:
    """Report that a batch finished.

    Outside the REPL this is a single ``click.echo`` of *plain_msg* (to stderr
    when *err* is true). Inside the REPL the counts, duration and output path
    are handed to ``print_completion_summary`` and *plain_msg* is passed to
    ``notify_completion``; *err* is not used on that path.
    """
    if not is_repl_mode():
        click.echo(plain_msg, err=err)
        return

    print_completion_summary(
        succeeded=succeeded,
        failed=failed,
        duration_s=duration_s,
        output_path=output_path,
    )
    notify_completion("ScrapingBee", plain_msg)
Output: {out_file_resolved}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_file_resolved, ) else: out_dir_resolved, succeeded, failed = write_batch_output_to_dir( @@ -1203,9 +1310,12 @@ async def do_one(item: str): verbose, post_process=post_process, ) - click.echo( + _batch_done( f"Batch complete: {succeeded} succeeded, {failed} failed. Output: {out_dir_resolved}", - err=True, + succeeded=succeeded, + failed=failed, + duration_s=_duration, + output_path=out_dir_resolved, ) if on_complete: from .cli_utils import run_on_complete diff --git a/src/scrapingbee_cli/cli.py b/src/scrapingbee_cli/cli.py index cb45f93..b4d5cc5 100644 --- a/src/scrapingbee_cli/cli.py +++ b/src/scrapingbee_cli/cli.py @@ -8,9 +8,15 @@ from .commands import register_commands from .config import load_dotenv +# Guard against REPL re-entry when cli.main(args) is called from within REPL +_in_repl = False + def _show_active_schedules_hint() -> None: """If there are active schedules, print a one-line hint to stderr.""" + if _in_repl: + return # Don't show on every REPL command + import json import sys from pathlib import Path @@ -63,10 +69,17 @@ def _show_active_schedules_hint() -> None: ) -@click.group() +@click.group(invoke_without_command=True) @click.version_option(version=__version__) +@click.option( + "--keep-bg", + is_flag=True, + default=False, + help="Keep the terminal's current background and theme colours instead " + "of forcing the REPL to black/light-grey.", +) @click.pass_context -def cli(ctx: click.Context) -> None: +def cli(ctx: click.Context, keep_bg: bool) -> None: """ScrapingBee CLI - Web scraping API client. 
Commands: scrape (single or batch), crawl (Scrapy/quick-crawl), usage, @@ -77,6 +90,15 @@ def cli(ctx: click.Context) -> None: load_dotenv() _show_active_schedules_hint() ctx.ensure_object(dict) + global _in_repl # noqa: PLW0603 + if ctx.invoked_subcommand is None and not _in_repl: + from .interactive import run_repl + + _in_repl = True + try: + run_repl(cli, __version__, keep_bg=keep_bg) + finally: + _in_repl = False register_commands(cli) @@ -169,6 +191,17 @@ def main() -> None: sys.exit(0) _handle_scraping_config() + # Let users write ``--verbose true`` / ``--verbose false`` in + # addition to the bare ``--verbose`` shortcut, so all boolean + # options behave like the scraping-side ones (--render-js, etc.). + try: + from .cli_utils import collect_bool_flag_names, normalize_bool_flag_args + + _bool_flags = collect_bool_flag_names(cli) + sys.argv[1:] = normalize_bool_flag_args(sys.argv[1:], _bool_flags) + except Exception: + pass + try: cli.main(standalone_mode=False) except click.ClickException as e: diff --git a/src/scrapingbee_cli/cli_utils.py b/src/scrapingbee_cli/cli_utils.py index 42e0d60..d4814e8 100644 --- a/src/scrapingbee_cli/cli_utils.py +++ b/src/scrapingbee_cli/cli_utils.py @@ -10,6 +10,142 @@ import click +from .theme import ( + echo_bee_error, + echo_error, + echo_key_value, + echo_separator, + echo_warning, + is_repl_mode, + styled_echo, +) + +_REPL_PREVIEW_MAX_LINES = 30 +_REPL_PREVIEW_MAX_BYTES = 4000 + + +def _format_bytes(n: int) -> str: + if n >= 1_048_576: + return f"{n / 1_048_576:.1f} MB" + if n >= 1024: + return f"{n / 1024:.1f} KB" + return f"{n} B" + + +def _maybe_repl_preview(data: bytes) -> tuple[bytes, str | None, str | None]: + """If we're in REPL mode and `data` is a large text payload, shrink it + down to a preview and save the full payload to a fixed cache path. 
+ + Triggers truncation on EITHER too many lines OR too many bytes — single- + line minified HTML often hits the byte cap without ever wrapping, so a + line-only check would let it through unchanged. + + Returns ``(bytes_to_print, summary_or_none, saved_path_or_none)``. Outside + REPL mode (or for binary data, or short outputs), returns ``(data, None, + None)`` unchanged so piped/redirected use is unaffected. + """ + if not data: + return data, None, None + if not is_repl_mode(): + return data, None, None + + # Skip binary data (screenshots, PDFs, etc.) — keep the original behaviour. + is_text = data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in data[:512] + if not is_text: + return data, None, None + + # Always overwrite the ``last-output`` cache for every response, even + # short ones. Otherwise ``:view`` would happily display a stale large + # response from a previous command — the cache file would only get + # refreshed by responses big enough to trigger the truncation branch. + full_path: str | None = None + try: + from pathlib import Path + + cache_dir = Path.home() / ".cache" / "scrapingbee-cli" + cache_dir.mkdir(parents=True, exist_ok=True) + cache_path = cache_dir / "last-output" + cache_path.write_bytes(data) + full_path = str(cache_path) + except Exception: + full_path = None + + line_count = data.count(b"\n") + 1 + if len(data) <= _REPL_PREVIEW_MAX_BYTES and line_count <= _REPL_PREVIEW_MAX_LINES: + # Small enough to print inline — but the cache is still fresh. + return data, None, None + + text = data.decode("utf-8", errors="replace") + lines = text.split("\n") + line_preview = "\n".join(lines[:_REPL_PREVIEW_MAX_LINES]) + + # Decide whether to truncate by lines or by chars. Single-line minified + # HTML/JSON would have line_preview == text but len > byte cap; truncate by + # chars there so the preview really does stay small on screen. 
+ if len(line_preview.encode("utf-8")) > _REPL_PREVIEW_MAX_BYTES: + preview = text[:_REPL_PREVIEW_MAX_BYTES] + more_chars = len(text) - len(preview) + truncation_note = ( + f"showing first {_REPL_PREVIEW_MAX_BYTES:,} chars · +{more_chars:,} more chars" + ) + else: + preview = line_preview + more_lines = max(0, len(lines) - _REPL_PREVIEW_MAX_LINES) + shown = min(_REPL_PREVIEW_MAX_LINES, len(lines)) + truncation_note = f"showing {shown}/{len(lines):,} lines · +{more_lines:,} more lines" + + summary = f"… preview truncated · {_format_bytes(len(data))} · {truncation_note}" + return preview.encode("utf-8"), summary, full_path + + +def collect_bool_flag_names(cli_group: click.Group) -> set[str]: + """Walk a click group + every subcommand and return the set of all + option strings declared as ``is_flag=True``. Used by + ``normalize_bool_flag_args`` to extend bool flags so they ALSO + accept ``true``/``false`` values for consistency with the + scraping-side flags that already take string bools + (``--render-js true`` etc.). 
def normalize_bool_flag_args(args: list[str], flag_names: set[str]) -> list[str]:
    """Pre-parse boolean flags so they accept an explicit true/false value.

    Rewrites, for every option string in *flag_names*:

    * ``--verbose true``   → ``--verbose`` (value dropped, flag kept)
    * ``--verbose false``  → flag dropped entirely (the option's default applies)
    * ``--verbose=true`` / ``--verbose=false`` → same as the two forms above
    * ``--verbose``        → unchanged
    * ``--no-verbose``     → unchanged (Click's own ``--no-x`` form)

    Accepted spellings (case-insensitive): ``true/1/yes/on`` and
    ``false/0/no/off``. Any other following token is left alone.

    Everything from a literal ``--`` onwards is copied through untouched:
    by convention those tokens are positional arguments, so a value such as
    ``-- --verbose false`` must not be rewritten or deleted.

    Args:
        args: Raw argument vector (typically ``sys.argv[1:]``). Not mutated.
        flag_names: Option strings declared as boolean flags.

    Returns:
        A new list with the boolean-flag values normalised.
    """
    truthy = {"true", "1", "yes", "on"}
    falsy = {"false", "0", "no", "off"}
    out: list[str] = []
    total = len(args)
    i = 0
    while i < total:
        tok = args[i]

        if tok == "--":
            # End of options: the rest is positional, pass it through verbatim.
            out.extend(args[i:])
            break

        # ``--flag=value`` spelling (long options only).
        name, sep, inline = tok.partition("=")
        if sep and name.startswith("--") and name in flag_names:
            value = inline.strip().lower()
            if value in truthy:
                out.append(name)
                i += 1
                continue
            if value in falsy:
                i += 1
                continue
        elif tok in flag_names and i + 1 < total:
            value = args[i + 1].strip().lower()
            if value in truthy:
                out.append(tok)
                i += 2
                continue
            if value in falsy:
                # Skip the flag entirely; default value applies.
                i += 2
                continue

        out.append(tok)
        i += 1
    return out
Overwrite?"): click.echo("Cancelled.", err=True) raise SystemExit(0) @@ -1220,7 +1363,10 @@ def _validate_range( return if value < min_val or value > max_val: u = f" {unit}" if unit else "" - click.echo(f"{name} must be between {min_val} and {max_val}{u}", err=True) + if is_repl_mode(): + echo_error(f"{name} must be between {min_val} and {max_val}{u}") + else: + click.echo(f"{name} must be between {min_val} and {max_val}{u}", err=True) raise SystemExit(1) @@ -1372,7 +1518,10 @@ def check_api_response(data: bytes, status_code: int, err_prefix: str = "Error") from .client import pretty_json if status_code >= 400: - click.echo(f"{err_prefix}: HTTP {status_code}", err=True) + if is_repl_mode(): + echo_bee_error(status_code, f"{err_prefix}: HTTP {status_code}") + else: + click.echo(f"{err_prefix}: HTTP {status_code}", err=True) try: click.echo(pretty_json(data), err=True) except Exception: @@ -1459,7 +1608,12 @@ async def scrape_with_escalation( already = any(scrape_kwargs.get(k) for k in tier_overrides) if already: continue - click.echo(f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy", err=True) + if is_repl_mode(): + echo_warning(f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy") + else: + click.echo( + f"[escalate-proxy] {url}: blocked, retrying with {tier_name} proxy", err=True + ) escalated = {**scrape_kwargs, **tier_overrides} data, headers, status_code = await client.scrape(url, **escalated) if verbose: @@ -1557,29 +1711,57 @@ def write_output( Precedence: *smart_extract* > *extract_field* > *fields*. 
""" if verbose: - click.echo(f"HTTP Status: {status_code}", err=True) - headers_lower = {k.lower(): (k, v) for k, v in headers.items()} - spb_cost_present = False - for key, label in [ - ("spb-cost", "Credit Cost"), - ("spb-resolved-url", "Resolved URL"), - ("spb-initial-status-code", "Initial Status Code"), - ]: - if key in headers_lower: - _, val = headers_lower[key] - if val: - click.echo(f"{label}: {val}", err=True) - if key == "spb-cost": - spb_cost_present = True - if not spb_cost_present: - if credit_cost is not None: - click.echo(f"Credit Cost: {credit_cost}", err=True) - elif command: - from scrapingbee_cli.credits import ESTIMATED_CREDITS - - if command in ESTIMATED_CREDITS: - click.echo(f"Credit Cost (estimated): {ESTIMATED_CREDITS[command]}", err=True) - click.echo("---", err=True) + if is_repl_mode(): + status_style = "success" if status_code < 400 else "error" + styled_echo(f"HTTP Status: {status_code}", style=status_style) + headers_lower = {k.lower(): (k, v) for k, v in headers.items()} + spb_cost_present = False + for key, label in [ + ("spb-cost", "Credit Cost"), + ("spb-resolved-url", "Resolved URL"), + ("spb-initial-status-code", "Initial Status Code"), + ]: + if key in headers_lower: + _, val = headers_lower[key] + if val: + echo_key_value(label, str(val)) + if key == "spb-cost": + spb_cost_present = True + if not spb_cost_present: + if credit_cost is not None: + echo_key_value("Credit Cost", str(credit_cost)) + elif command: + from scrapingbee_cli.credits import ESTIMATED_CREDITS + + if command in ESTIMATED_CREDITS: + echo_key_value("Credit Cost (estimated)", str(ESTIMATED_CREDITS[command])) + echo_separator() + else: + click.echo(f"HTTP Status: {status_code}", err=True) + headers_lower = {k.lower(): (k, v) for k, v in headers.items()} + spb_cost_present = False + for key, label in [ + ("spb-cost", "Credit Cost"), + ("spb-resolved-url", "Resolved URL"), + ("spb-initial-status-code", "Initial Status Code"), + ]: + if key in headers_lower: + _, 
val = headers_lower[key] + if val: + click.echo(f"{label}: {val}", err=True) + if key == "spb-cost": + spb_cost_present = True + if not spb_cost_present: + if credit_cost is not None: + click.echo(f"Credit Cost: {credit_cost}", err=True) + elif command: + from scrapingbee_cli.credits import ESTIMATED_CREDITS + + if command in ESTIMATED_CREDITS: + click.echo( + f"Credit Cost (estimated): {ESTIMATED_CREDITS[command]}", err=True + ) + click.echo("---", err=True) if smart_extract: from .extract import smart_extract as _smart_extract_fn @@ -1597,10 +1779,26 @@ def write_output( with fh: fh.write(data) else: - sys.stdout.buffer.write(data) + # In REPL mode, truncate large text dumps to a tidy preview and surface + # a path to the full output. Non-REPL invocations (`scrapingbee scrape ...`) + # keep the original behaviour so pipes and redirects work unchanged. + preview_data, repl_summary, repl_full_path = _maybe_repl_preview(data) + sys.stdout.buffer.write(preview_data) # Only add a trailing newline for text-like content; binary data (PNG, PDF, etc.) # must not have extra bytes appended. 
- if data and not data.endswith(b"\n"): - is_text = data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in data[:512] + if preview_data and not preview_data.endswith(b"\n"): + is_text = ( + preview_data[:1] in (b"{", b"[", b"<", b"#") or b"\x00" not in preview_data[:512] + ) if is_text: click.echo() + if repl_summary: + from .theme import BEE_DIM, BEE_YELLOW, err_console + + err_console.print(f" [{BEE_DIM}]{repl_summary}[/]") + if repl_full_path: + err_console.print( + f" [bold {BEE_YELLOW}]:view[/] " + f"[{BEE_DIM}]to scroll the full output · or pass[/] " + f"[bold {BEE_YELLOW}]--output-file FILE[/]" + ) diff --git a/src/scrapingbee_cli/client.py b/src/scrapingbee_cli/client.py index 32b420a..a6a768c 100644 --- a/src/scrapingbee_cli/client.py +++ b/src/scrapingbee_cli/client.py @@ -588,6 +588,7 @@ def parse_usage(body: bytes) -> dict: avail = int(max_credit) - int(used_credit) if avail >= 0: out["credits"] = avail + out["max_api_credit"] = int(max_credit) max_concurrency_val = data.get("max_concurrency") if max_concurrency_val is not None and isinstance(max_concurrency_val, (int, float)): diff --git a/src/scrapingbee_cli/commands/auth.py b/src/scrapingbee_cli/commands/auth.py index fce226a..0cc879c 100644 --- a/src/scrapingbee_cli/commands/auth.py +++ b/src/scrapingbee_cli/commands/auth.py @@ -83,8 +83,25 @@ async def _check() -> tuple[int, bytes]: data, _, status_code = await client.usage(retries=1, backoff=1.0) return status_code, data + def _run_check() -> tuple[int, bytes]: + return asyncio.run(_check()) + try: - status, data = asyncio.run(_check()) + # ``asyncio.run`` refuses to start when a loop is already running + # in the current thread. The REPL's ``auth`` flow runs us on the + # main thread (via ``run_in_terminal``) while prompt_toolkit's + # Application loop is still active — offload the coroutine to a + # short-lived worker thread in that case. From a plain CLI + # invocation no loop is running, so we just use ``asyncio.run`` + # directly. 
+ try: + asyncio.get_running_loop() + from concurrent.futures import ThreadPoolExecutor + + with ThreadPoolExecutor(max_workers=1) as pool: + status, data = pool.submit(_run_check).result() + except RuntimeError: + status, data = _run_check() if status == 200: return True, "" # API returned an error — try to extract the message diff --git a/src/scrapingbee_cli/commands/crawl.py b/src/scrapingbee_cli/commands/crawl.py index e854b22..b124869 100644 --- a/src/scrapingbee_cli/commands/crawl.py +++ b/src/scrapingbee_cli/commands/crawl.py @@ -431,17 +431,24 @@ def crawl_cmd( if not target: click.echo("Provide a spider name, one or more URLs, or --from-sitemap URL.", err=True) raise SystemExit(1) + usage_info: dict | None = None try: usage_info = get_batch_usage(None) concurrency = resolve_batch_concurrency(obj["concurrency"], usage_info, 1) from_concurrency = obj["concurrency"] > 0 plan_concurrency = usage_info.get("max_concurrency") or 0 - except Exception: + except Exception as e: + # The /usage endpoint is rate-limited; bursts of crawl runs can + # trip it. Surface the actual reason so the user can tell apart + # "rate limited, retry in a moment" from real network / auth + # problems. + reason = str(e).strip() or type(e).__name__ click.echo( - "Warning: could not check plan concurrency. Defaulting to 1 concurrent request. " - "Use --concurrency to set explicitly.", + f"Warning: could not check plan concurrency ({reason}). " + "Defaulting to 1 concurrent request. Use --concurrency to set explicitly.", err=True, ) + usage_info = None concurrency = 1 from_concurrency = False plan_concurrency = 0 @@ -538,6 +545,23 @@ def crawl_cmd( if allowed_domains: allowed_list = [d.strip() for d in allowed_domains.split(",") if d.strip()] try: + # ``known_total`` enables a batch-style honeycomb + # progress bar in the REPL widget. 
Used when the total + # is bounded up front: + # - sitemap mode (--from-sitemap) gives an exact list + # - max_depth=1 stops at the seed URLs themselves + # - --max-pages N caps the crawl, even when + # link-following could otherwise discover more + # For genuinely open-ended crawls (max_pages=0) we fall + # back to a rolling "fetching: " line driven by + # the spider signal handlers. + _kt: int | None = None + if from_sitemap: + _kt = len(urls) + elif max_depth == 1: + _kt = len(urls) + elif max_pages and max_pages > 0: + _kt = max_pages run_urls_spider( urls, key, @@ -555,6 +579,7 @@ def crawl_cmd( include_pattern=include_pattern, exclude_pattern=exclude_pattern, save_pattern=save_pattern, + known_total=_kt, ) except ValueError as e: click.echo(str(e), err=True) diff --git a/src/scrapingbee_cli/commands/scrape.py b/src/scrapingbee_cli/commands/scrape.py index 53cba9a..3b04f1c 100644 --- a/src/scrapingbee_cli/commands/scrape.py +++ b/src/scrapingbee_cli/commands/scrape.py @@ -38,6 +38,7 @@ from ..client import Client, pretty_json from ..config import BASE_URL, get_api_key from ..crawl import _preferred_extension_from_scrape_params +from ..theme import echo_error, is_repl_mode def _apply_chunking(url: str, data: bytes, chunk_size: int, chunk_overlap: int) -> bytes: @@ -726,7 +727,10 @@ async def _single() -> None: else: data, resp_headers, status_code = await client.scrape(scrape_url, **scrape_kwargs) if not scrape_kwargs.get("transparent_status_code") and status_code >= 400: - click.echo(f"Error: HTTP {status_code}", err=True) + if is_repl_mode(): + echo_error(f"Error: HTTP {status_code}") + else: + click.echo(f"Error: HTTP {status_code}", err=True) try: click.echo(pretty_json(data), err=True) except Exception: diff --git a/src/scrapingbee_cli/commands/usage.py b/src/scrapingbee_cli/commands/usage.py index 418fc15..b140b59 100644 --- a/src/scrapingbee_cli/commands/usage.py +++ b/src/scrapingbee_cli/commands/usage.py @@ -3,6 +3,7 @@ from __future__ import 
annotations import asyncio +import json as _json import click @@ -10,6 +11,7 @@ from ..cli_utils import _output_options, store_common_options from ..client import Client, parse_usage, pretty_json from ..config import BASE_URL, get_api_key +from ..theme import is_repl_mode @click.command() @@ -37,15 +39,69 @@ async def _run() -> None: raise SystemExit(1) # Warm the shared file cache so concurrent batch subprocesses skip the API call. write_usage_file_cache(key, parse_usage(data)) - output_file = obj.get("output_file") - if output_file: - with open(output_file, "w", encoding="utf-8") as f: - f.write(pretty_json(data) + "\n") + + if is_repl_mode(): + _show_repl_usage(data) else: - click.echo(pretty_json(data)) + output_file = obj.get("output_file") + if output_file: + with open(output_file, "w", encoding="utf-8") as f: + f.write(pretty_json(data) + "\n") + else: + click.echo(pretty_json(data)) asyncio.run(_run()) +def _show_repl_usage(data: bytes) -> None: + """Render a fully styled usage dashboard to stderr (REPL mode only).""" + from rich.text import Text + + from ..theme import ( + BEE_YELLOW, + echo_key_value, + echo_separator, + err_console, + format_honeycomb_meter, + ) + + raw = _json.loads(data) + + header = Text() + header.append(" Credit Usage", style=f"bold {BEE_YELLOW}") + err_console.print(header) + err_console.print() + + used = raw.get("used_api_credit", 0) or 0 + total = raw.get("max_api_credit", 0) or 0 + remaining = total - used + + meter = format_honeycomb_meter(used, total) + err_console.print(meter) + err_console.print() + + echo_key_value("Credits used", f"{used:,}") + echo_key_value("Credits remaining", f"{remaining:,}") + echo_key_value("Total credits", f"{total:,}") + err_console.print() + + max_conc = raw.get("max_concurrency", "N/A") + cur_conc = raw.get("current_concurrency", 0) + echo_key_value("Max concurrency", str(max_conc)) + echo_key_value("Current concurrency", str(cur_conc)) + err_console.print() + + renewal = 
class _CrawlerReactorAlreadyUsedError(RuntimeError):
    """Raised when Twisted's reactor has already been started and stopped
    in this Python process and can't be re-used for another crawl. The
    REPL surfaces a friendly message asking the user to restart the
    session, rather than letting Scrapy's raw error bubble up.
    """


def stop_running_reactor() -> bool:
    """Thread-safely stop the Twisted reactor if it is currently running.

    Returns True if a stop was scheduled, False if no reactor is installed
    or running (so the caller can fall through to its other Ctrl+C paths).

    The stop is scheduled through ``reactor.callFromThread``, the
    cross-thread entry point: it wakes the reactor and runs
    ``reactor.stop()`` on the reactor's own thread.

    The reactor is looked up in ``sys.modules`` rather than imported.
    ``from twisted.internet import reactor`` installs the platform default
    reactor when none exists yet, so merely pressing Ctrl+C before the
    first crawl would pin a reactor that may not match the one Scrapy is
    configured to use and break every later crawl in the session.
    """
    import sys as _sys

    reactor = _sys.modules.get("twisted.internet.reactor")
    if reactor is None:
        return False  # Nothing installed, so nothing can be running.
    if not getattr(reactor, "running", False):
        return False
    try:
        # ``callFromThread`` / ``stop`` are populated dynamically when the
        # reactor is installed, hence ``getattr`` instead of attribute access.
        call_from_thread = getattr(reactor, "callFromThread", None)
        stop = getattr(reactor, "stop", None)
        if call_from_thread is None or stop is None:
            return False
        call_from_thread(stop)
        return True
    except Exception:
        # Ctrl+C handling must never raise; report "nothing stopped".
        return False


def _ensure_reactor_usable() -> None:
    """Fail early if Twisted's reactor was already run in this process.

    Twisted's reactor is a process-wide singleton: once ``reactor.run()``
    has returned, its ``_startedBefore`` flag stays set and running it
    again raises ``ReactorNotRestartable``. This check turns that state
    into a clean, user-facing error before a new crawl is handed over.

    The reactor is inspected via ``sys.modules`` instead of being imported,
    because a bare import installs the default reactor eagerly and can
    conflict with the reactor Scrapy is configured to use.

    Raises:
        _CrawlerReactorAlreadyUsedError: a reactor is installed and has
            been started before.
    """
    import sys as _sys

    reactor = _sys.modules.get("twisted.internet.reactor")
    if reactor is None:
        return  # No reactor has been installed yet, nothing to check.
    if getattr(reactor, "_startedBefore", False):
        raise _CrawlerReactorAlreadyUsedError(
            "Crawls in this REPL session have ended. Twisted's reactor "
            "is single-shot per process — please run ``:q`` and relaunch "
            "scrapingbee to crawl again."
        )
+ """ + raw = "" + try: + meta_url = (request.meta or {}).get("scrapingbee_target_url") + if meta_url: + raw = meta_url + except Exception: + pass + if not raw: + raw = getattr(request, "url", "") or "" + if "app.scrapingbee.com" in raw and "url=" in raw: + try: + from urllib.parse import parse_qs, unquote, urlparse + + qs = parse_qs(urlparse(raw).query) + target = qs.get("url", [None])[0] + if target: + raw = unquote(target, errors="replace") + except Exception: + pass + if isinstance(raw, bytes): + raw = raw.decode("utf-8", errors="replace") + # Keep only ASCII printable code points (32–126). URLs are + # supposed to be 7-bit ASCII with %-encoding for everything + # else; anything outside that range here is decoded garbage + # (sites like crawler-test.com host pages with deliberate + # binary blobs in their paths for scraper-stress-testing). + # ``isprintable()`` alone passes too much through — combining + # marks, zero-width chars, exotic whitespace all render weird + # in the status widget. + return "".join(ch for ch in raw if 32 <= ord(ch) <= 126) + + +def _install_signal_handlers() -> bool: + """Whether Scrapy / Twisted should install Unix signal handlers. + + Returns False when running inside the REPL — there we run crawl in a + worker thread (to avoid asyncio.run conflicting with prompt_toolkit's + main-thread loop), and ``signal.signal()`` is restricted to the main + thread, so any attempt to install handlers raises ``ValueError: + signal only works in main thread of the main interpreter``. The REPL + provides its own Ctrl+C handling that injects ``KeyboardInterrupt`` + into the worker thread, so we don't need Scrapy's handlers there. + + Returns True for direct ``scrapingbee crawl ...`` invocations — those + run on the main thread and benefit from Twisted's graceful shutdown. 
+ """ + try: + from .theme import is_repl_mode + + return not is_repl_mode() + except Exception: + return True + + +def _maybe_set_repl_log_file(settings) -> str | None: + """In REPL mode (or a REPL-spawned subprocess), pipe the full Scrapy + log to a file on disk and silence the noisy ``py.warnings`` logger + so the in-flight crawl UI isn't drowned in deprecation tracebacks. + + The REPL's virtual scrollback caps at ~10K lines and drops the + oldest 10% when full, so long crawls would otherwise lose their + history. ``LOG_FILE`` mirrors everything Scrapy emits (at the + configured ``LOG_LEVEL``) to ``~/.cache/scrapingbee-cli/crawl.log``; + the user can open it any time with ``:view crawl``. + + ``py.warnings`` is the logger Scrapy uses to forward Python + ``warnings.warn`` calls. Multi-line deprecation tracebacks (Scrapy + nagging about old middleware APIs etc.) belong in the file, not on + screen — we raise THAT specific logger to ERROR so those entries + stop reaching the terminal stream while the rest of Scrapy's + routine logging continues at its configured level. + """ + try: + from .theme import is_repl_mode + + in_repl = is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + if not in_repl: + return None + log_dir = Path.home() / ".cache" / "scrapingbee-cli" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / "crawl.log" + settings.set("LOG_FILE", str(log_path)) + settings.set("LOG_FILE_APPEND", False) # fresh log per run + try: + import logging as _logging + + _logging.getLogger("py.warnings").setLevel(_logging.ERROR) + except Exception: + pass + return str(log_path) + except Exception: + return None + + # 0 means unlimited DEFAULT_MAX_DEPTH = 0 DEFAULT_MAX_PAGES = 0 -# URL extensions that will never contain HTML links — skip discovery re-requests for these. 
-_NON_HTML_URL_EXTENSIONS = frozenset( - { - "jpg", - "jpeg", - "png", - "gif", - "webp", - "svg", - "ico", # images - "pdf", - "zip", # binary downloads - "css", - "js", # web assets - } -) - def _normalize_url(url: str) -> str: """Strip fragment and trailing slash for deduplication.""" @@ -129,9 +305,13 @@ def _requires_discovery_phase(scrape_params: dict[str, Any]) -> bool: if _param_truthy(scrape_params, "return_page_text"): return True # Raw screenshot (no JSON wrapper) → binary PNG, no extractable links. - if _param_truthy(scrape_params, "screenshot") and not _param_truthy( - scrape_params, "json_response" - ): + # All three screenshot params produce PNG output unless wrapped in JSON. + screenshot_requested = ( + _param_truthy(scrape_params, "screenshot") + or _param_truthy(scrape_params, "screenshot_full_page") + or scrape_params.get("screenshot_selector") + ) + if screenshot_requested and not _param_truthy(scrape_params, "json_response"): return True return False @@ -217,9 +397,16 @@ def __init__( include_pattern: str | None = None, exclude_pattern: str | None = None, save_pattern: str | None = None, + known_total: int | None = None, **kwargs: Any, ) -> None: super().__init__(name=name, **kwargs) + # Optional: when the caller knows up front how many pages will be + # fetched (e.g. sitemap mode), we surface a batch-style honeycomb + # progress bar in the REPL. Left None for open-ended crawls. + self._known_total: int | None = ( + int(known_total) if known_total and known_total > 0 else None + ) self.start_urls = start_urls or [] self.scrape_params = scrape_params or {} self.custom_headers = custom_headers @@ -241,7 +428,263 @@ def __init__( self._exclude_re = re.compile(exclude_pattern) if exclude_pattern else None self._save_re = re.compile(save_pattern) if save_pattern else None self._save_count = 0 + # Save requests that have been dispatched but not yet completed. 
+ # Used together with ``_save_count`` to enforce ``max_pages`` + # tightly even when several discovery callbacks fire saves + # before the first save completes (without this we overshoot + # the cap by ``concurrency``). + self._save_pending = 0 + # Pool-based discovery for binary modes (screenshot / extract / + # ai / return_page_text). Discovery callbacks accumulate URLs + # into ``_save_queue`` without firing a save per page; once the + # queue contains >= ``max_pages`` candidates we flip + # ``_discovery_done`` and dispatch all save requests in one go, + # then drop any further discoveries that come back late. This + # avoids paying for an HTML discovery per saved page when a + # handful of pages already expose more URLs than the cap. + # ``_save_queue_next`` is the index of the next un-dispatched + # URL in ``_save_queue`` — used by ``_on_save_error`` to backfill + # from the remainder of the pool when a dispatched save fails, + # so a few errors don't leave the user with < max_pages files + # despite there being more candidates available. + self._save_queue: list[str] = [] + self._save_queue_set: set[str] = set() + self._save_queue_next: int = 0 + self._discovery_done: bool = False self._fetch_count = 0 + # Live-status counters surfaced to the REPL via theme._crawl_status. + # Only populated under REPL mode; the signal handlers below early- + # exit otherwise so the standalone CLI path stays unchanged. + self._queued_count = 0 + # Counted at signal time (response_received), independent of the + # parse callbacks that increment ``_fetch_count`` later in the + # pipeline. Used for the dim-row "X fetched" indicator and the + # honeycomb progress widget so the count advances the instant a + # response lands, not when its body is parsed. 
+ self._response_count = 0 + + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + """Standard Scrapy hook — instantiate the spider AND wire signal + handlers that push live status into ``theme._crawl_status`` so + the REPL's dim row can show the current URL + fetched count + in real time. Outside REPL mode the handlers are no-ops. + """ + spider = super().from_crawler(crawler, *args, **kwargs) + try: + from scrapy import signals as _scrapy_signals + + from .theme import is_repl_mode + + # Stash the crawler so signal handlers can dispatch new + # requests via ``crawler.engine.crawl`` (needed from + # ``spider_idle`` to flush the pool when discovery exhausts + # without saturating). ``Spider`` doesn't declare this slot + # so we use ``setattr`` to keep the type checker happy. + setattr(spider, "_crawler", crawler) + + # The pool-based discovery flow needs to flush queued URLs + # at spider_idle (when discovery exhausts before reaching + # ``max_pages``). Wire this regardless of REPL mode — it's + # a credit-saving optimisation, not a UI feature. + crawler.signals.connect(spider._on_spider_idle, signal=_scrapy_signals.spider_idle) + + # Register signal handlers when running inside the REPL + # (legacy in-process path) OR when the parent REPL spawned + # us as a subprocess and set the status-file env var (the + # new subprocess-per-crawl path). The handlers themselves + # call ``update_crawl_status`` which atomically mirrors + # state to the file if the env var is set. 
+ _want_status = is_repl_mode() or bool(os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE")) + if _want_status: + crawler.signals.connect( + spider._on_spider_opened, signal=_scrapy_signals.spider_opened + ) + crawler.signals.connect( + spider._on_request_scheduled, + signal=_scrapy_signals.request_scheduled, + ) + crawler.signals.connect( + spider._on_request_reached, + signal=_scrapy_signals.request_reached_downloader, + ) + crawler.signals.connect( + spider._on_response_received, + signal=_scrapy_signals.response_received, + ) + crawler.signals.connect( + spider._on_spider_closed, signal=_scrapy_signals.spider_closed + ) + except Exception: + pass + return spider + + # ── Live-status signal handlers (REPL mode only) ────────────────────── + def _on_spider_opened(self, spider) -> None: + try: + from .theme import update_crawl_status, update_progress_state + + update_crawl_status( + current_url=None, + fetched=0, + queued=0, + saved=0, + phase="discovering", + ) + # If we already know the total (sitemap mode), seed the + # progress widget at 0/total so the user sees the bar from + # frame one. + if self._known_total is not None and self._known_total > 0: + update_progress_state(0, self._known_total) + except Exception: + pass + + def _on_request_scheduled(self, request, spider) -> None: + try: + self._queued_count += 1 + from .theme import update_crawl_status + + update_crawl_status(queued=self._queued_count) + except Exception: + pass + + def _on_request_reached(self, request, spider) -> None: + try: + from .theme import update_crawl_status + + # Scrapy sees the outgoing proxy URL + # (``app.scrapingbee.com/api/v1/?api_key=…&url=…``) — that's + # leaky (API key) and not what the user thinks of as "their" + # URL. Pull the target out of the ``url`` query param so the + # status widget reads naturally: ``fetching: https://example.com``. 
+ display_url = _target_url_from_request(request) + update_crawl_status(current_url=display_url) + except Exception: + pass + + def _on_response_received(self, response, request, spider) -> None: + try: + self._response_count += 1 + from .theme import update_crawl_status, update_progress_state + + update_crawl_status( + fetched=self._response_count, + saved=self._save_count, + phase="fetching", + ) + if self._known_total is not None and self._known_total > 0: + update_progress_state( + min(self._response_count, self._known_total), + self._known_total, + ) + except Exception: + pass + + def _on_spider_closed(self, spider, reason) -> None: + try: + from .theme import clear_crawl_status, clear_progress_state + + clear_crawl_status() + clear_progress_state() + except Exception: + pass + + def _on_spider_idle(self, spider) -> None: + """Flush the pool when discovery exhausts before saturation. + + Pool-based binary mode only dispatches saves once the queue + reaches ``max_pages``. If the site is smaller than the cap (or + ``max_pages`` is 0 / unlimited), the queue never reaches the + threshold and would never trigger save dispatch — the spider + would close with the pool full and zero files saved. + + ``spider_idle`` fires when the scheduler is empty and no + requests are in flight. We use it to commit whatever URLs we + gathered: dispatch save requests for every queued URL (capped + at ``max_pages`` if set), then raise ``DontCloseSpider`` so + Scrapy waits for the saves to complete before shutting down. + + Only relevant for binary-mode crawls (the same-mode and + HTML-save-pattern flows save in place, no pool involved). 
+ """ + if self._discovery_done: + return + if not _requires_discovery_phase(self.scrape_params): + return + if not self._save_queue: + return + # Resolve the engine BEFORE latching ``_discovery_done`` — if + # the engine isn't available (very unlikely by the time + # spider_idle fires, but worth being defensive), bail without + # leaving the flag set, so a later idle tick gets another + # chance instead of permanently skipping flush. + engine = getattr(getattr(self, "_crawler", None), "engine", None) + if engine is None: + return + self._discovery_done = True + budget = ( + min(self.max_pages, len(self._save_queue)) if self.max_pages else len(self._save_queue) + ) + for url in self._save_queue[:budget]: + self._save_pending += 1 + self._save_queue_next += 1 + try: + engine.crawl(self._make_save_request(url), spider) + except Exception: + if self._save_pending > 0: + self._save_pending -= 1 + from scrapy.exceptions import DontCloseSpider + + raise DontCloseSpider + + def _push_saved_status(self) -> None: + """Re-push the live ``saved`` count after a successful save, + and tear the spider down once we've hit ``max_pages``. + + ``_on_response_received`` (Scrapy signal) fires BEFORE the + ``parse``/``_parse_save_only`` callback writes the file, so the + widget's ``saved`` count always lags by one until the next + response arrives. With ``--max-pages N`` the spider closes + before that next response, leaving a stale ``N fetched + N-1 saved`` reading on screen until ``_on_spider_closed`` + clears the widget. Calling this right after the save commits + keeps the display honest. + + Once the cap is reached we also raise ``CloseSpider`` so the + engine drops anything still queued (e.g. the ~N follow-up + discoveries that the seed callback already yielded). Without + this the spider would happily keep fetching no-op pages until + the framework safety cap ``CLOSESPIDER_PAGECOUNT`` kicks in — + burning credits the user expects ``--max-pages`` to bound. 
+ """ + try: + from .theme import update_crawl_status + + update_crawl_status(saved=self._save_count) + except Exception: + pass + if self.max_pages != 0 and self._save_count >= self.max_pages: + from scrapy.exceptions import CloseSpider + + raise CloseSpider("max_pages") + + def _on_request_error(self, failure) -> None: + """Swallow request-level errors so one bad URL doesn't kill the + whole crawl. ``scrapy_scrapingbee`` ships an errback that + crashes on binary error responses (``response.text`` raises + ``AttributeError`` when the body isn't decodable as text — + which happens any time the API returns a non-200 in screenshot + mode). Attaching our own errback to every request short- + circuits that and just logs the failure. + """ + try: + req = getattr(failure, "request", None) + url = getattr(req, "url", "?") if req is not None else "?" + exc = type(failure.value).__name__ if hasattr(failure, "value") else "error" + self.logger.warning("Skipped %s (%s)", url, exc) + except Exception: + pass + return None def _allowed_netlocs_set(self) -> set[str]: if self._allowed_netlocs is not None: @@ -263,16 +706,29 @@ def _url_allowed(self, url: str) -> bool: return not allowed or netloc in allowed def start_requests(self) -> Iterator[Request]: + # Two flows: + # 1. "Same-mode": one request per page; the response is both saved + # and parsed for outgoing links. Works only when scrape_params + # yield HTML/JSON-with-body (no screenshot/extract/etc). + # 2. "Discovery-first": fetch each page in HTML mode for link + # extraction, and (if it should be saved) fire a SECOND + # request with the user's full scrape_params to obtain the + # saved artifact (PNG, extract-rules JSON, etc). + # Discovery-first is required whenever the user asks for binary or + # non-link-bearing output, AND whenever --save-pattern is set + # (so the cheap HTML pass can find links without spending the full + # per-page cost on every crawled URL). 
+ use_discovery_flow = self._save_re is not None or _requires_discovery_phase( + self.scrape_params + ) for url in self.start_urls: normalized = _normalize_url(url) if normalized in self.seen_urls: continue - if self.max_pages != 0 and self._fetch_count >= self.max_pages: + if self.max_pages != 0 and self._save_count >= self.max_pages: continue self.seen_urls.add(normalized) - # When --save-pattern is set, use discovery params for initial crawl - # (HTML for link finding). Full params only for save-worthy pages. - if self._save_re: + if use_discovery_flow: params = _params_for_discovery(self.scrape_params) callback = self._parse_crawl_and_save else: @@ -284,6 +740,7 @@ def start_requests(self) -> Iterator[Request]: headers=self.custom_headers, meta={"depth": 0}, callback=callback, + errback=self._on_request_error, ) def _response_headers_dict(self, response: Response) -> dict: @@ -352,20 +809,19 @@ def closed(self, reason: str) -> None: _save_batch_meta(abs_dir, len(self._url_file_map), len(self._url_file_map), 0) - def _iter_follow_requests( - self, - response: Response, - params: dict[str, Any], - callback: Any, - ) -> Any: - """Yield ScrapingBeeRequests for allowed, same-domain - (or allowed-domains) links from response.""" + def _iter_follow_urls(self, response: Response) -> Any: + """Yield ``(url, next_depth)`` for each link in ``response`` that + passes the spider's URL filters (scheme, ASCII, domain + allow-list, include/exclude regex, dedup). Centralised so the + same filter chain is used by both the request-yielding flow + (``_iter_follow_requests``) and the pool-based discovery flow + (``_parse_crawl_and_save`` for binary modes). 
+ """ depth = response.meta.get("depth", 0) if self.max_depth != 0 and depth >= self.max_depth: return - # max_pages = max pages fetched from API (credits spent) - if self.max_pages != 0 and self._fetch_count >= self.max_pages: - return + from urllib.parse import unquote as _unquote + for href in _extract_hrefs_from_response(response): if not href or href.startswith(("#", "mailto:", "javascript:")): continue @@ -373,6 +829,24 @@ def _iter_follow_requests( parsed = urlparse(full_url) if parsed.scheme not in ("http", "https"): continue + # Skip URLs whose decoded path/query carries non-printable + # or non-ASCII bytes. Such URLs (common on the + # crawler-test.com fixture pages) trip a known + # ``scrapy_scrapingbee`` bug: when ScrapingBee's API + # returns 500 for the malformed URL, the library's errback + # tries to format the error using ``response.text`` — + # which raises ``AttributeError`` on a binary + # screenshot-mode response and kills the whole spider. + # Filtering them out keeps the crawl going. + try: + _path_tail = _unquote( + (parsed.path or "") + (parsed.query or ""), + errors="replace", + ) + if not all(32 <= ord(ch) <= 126 for ch in _path_tail): + continue + except Exception: + continue if not self._url_allowed(full_url): continue if self._include_re and not self._include_re.search(full_url): @@ -383,90 +857,233 @@ def _iter_follow_requests( if normalized in self.seen_urls: continue self.seen_urls.add(normalized) + yield full_url, depth + 1 + + def _iter_follow_requests( + self, + response: Response, + params: dict[str, Any], + callback: Any, + ) -> Any: + """Yield ScrapingBeeRequests for allowed links from response. + Used by the same-mode ``parse()`` flow (HTML crawl) and by the + HTML-save-pattern branch of ``_parse_crawl_and_save``. + """ + # max_pages = max saved pages. Stop queueing follow-ups once + # the budget (already-saved + in-flight saves) is committed. 
+ if self.max_pages != 0 and self._save_count + self._save_pending >= self.max_pages: + return + for full_url, next_depth in self._iter_follow_urls(response): yield ScrapingBeeRequest( full_url, params=params, headers=self.custom_headers, - meta={"depth": depth + 1}, + meta={"depth": next_depth}, callback=callback, + errback=self._on_request_error, ) + def _make_save_request(self, url: str) -> ScrapingBeeRequest: + """Build a save request (full ``scrape_params``) for ``url``. + Used in the pool-based discovery flow once we've accumulated + enough candidate URLs. Caller is responsible for incrementing + ``_save_pending`` before yielding. + """ + return ScrapingBeeRequest( + url, + params=dict(self.scrape_params), + headers=self.custom_headers, + callback=self._parse_save_only, + errback=self._on_save_error, + dont_filter=True, + priority=10, + ) + def parse(self, response: Response, **kwargs: object) -> Any: - """Save response, then yield follow requests. If no links found in response, - yield a discovery request (same URL with HTML-only params) to extract links.""" + """Same-mode callback: the response is both saved and parsed for + outgoing links. Only used when scrape_params return HTML or + json_response with a parseable body — binary/extract modes are + routed through ``_parse_crawl_and_save`` from ``start_requests``. 
+ """ + from scrapy.exceptions import CloseSpider + self._fetch_count += 1 self.logger.info("Fetched %s (%d bytes)", response.url, len(response.body)) - # Only save if URL matches --save-pattern (or no pattern set) - if not self._save_re or self._save_re.search(response.url): - try: - self._save_response(response) - except Exception as e: - self.logger.warning("Failed to save %s: %s", response.url, e) + try: + self._save_response(response) + self._save_count += 1 + self._push_saved_status() + except CloseSpider: + # The cap-reached signal from _push_saved_status MUST + # propagate to Scrapy's engine — catching it as a generic + # exception below would silence the shutdown and let the + # already-queued follow requests keep firing. + raise + except Exception as e: + self.logger.warning("Failed to save %s: %s", response.url, e) try: hrefs = _extract_hrefs_from_response(response) except Exception: hrefs = [] if hrefs: yield from self._iter_follow_requests(response, dict(self.scrape_params), self.parse) - else: - # Skip discovery re-request for URLs that are clearly binary/non-HTML resources - # (images, PDFs, CSS, JS, etc.) — they will never contain links. - url_ext = extension_from_url_path(response.url) - if url_ext in _NON_HTML_URL_EXTENSIONS: - return - discovery_params = _params_for_discovery(self.scrape_params) - yield ScrapingBeeRequest( - response.url, - params=discovery_params, - headers=self.custom_headers, - meta=response.meta, - callback=self._parse_discovery_links_only, - dont_filter=True, - ) def _parse_crawl_and_save(self, response: Response, **kwargs: object) -> Any: - """Used when --save-pattern is set. Receives HTML (discovery params), - extracts links, follows them, and fires a save request for matching pages.""" + """Discovery-first callback. Two flows live here: + + * **Binary / extract modes** (``_requires_discovery_phase``): + POOL-BASED. Each discovery response contributes its own URL + and its outbound links to ``_save_queue``. 
We do NOT fire a + save per page. Once the queue reaches ``max_pages`` we flip + ``_discovery_done``, dispatch one save request per queued + URL up to the cap, and stop discovering. Save credits paid + per pre-cap discovery: 0. Compare the old "save+follow each + page" flow, which paid one full-param fetch per saved page + PLUS one HTML discovery per saved page — roughly 2× credits. + + * **HTML save-pattern mode**: SAVE-IN-PLACE. The response IS + the HTML we want to save (the user's ``scrape_params`` + already yield HTML), so we write it directly and follow + links. No separate save request needed. + """ + from scrapy.exceptions import CloseSpider as _CloseSpider + self._fetch_count += 1 self.logger.info("Fetched %s (%d bytes) [crawl]", response.url, len(response.body)) - # If this page matches --save-pattern, fire a separate request with full params to save - if self._save_re and self._save_re.search(response.url): + binary_mode = _requires_discovery_phase(self.scrape_params) + + if not binary_mode: + # ── HTML save-pattern flow (unchanged) ─────────────────── + save_this = (self._save_re is None) or bool(self._save_re.search(response.url)) + within_cap = ( + self.max_pages == 0 or self._save_count + self._save_pending < self.max_pages + ) + if save_this and within_cap: + try: + self._save_response(response) + self._save_count += 1 + self._push_saved_status() + except _CloseSpider: + raise + except Exception as e: + self.logger.warning("Failed to save %s: %s", response.url, e) + try: + hrefs = _extract_hrefs_from_response(response) + except Exception: + hrefs = [] + if hrefs: + yield from self._iter_follow_requests( + response, + _params_for_discovery(self.scrape_params), + self._parse_crawl_and_save, + ) + return + + # ── Binary / extract mode: pool-based discovery ────────────── + if self._discovery_done: + # A late-arriving discovery response after saturation. 
The + # save dispatches for the first ``max_pages`` URLs are + # already in flight; this page contributes nothing new. + return + + # Add the current URL to the save queue (if it passes the + # save filter) so the seed and every successfully-discovered + # page becomes a save candidate. ``seen_urls`` and the pool + # both dedup on the normalized form so a trailing-slash + # difference between the seed and an extracted link doesn't + # produce two entries for the same logical page. + if (self._save_re is None) or bool(self._save_re.search(response.url)): + norm = _normalize_url(response.url) + if norm not in self._save_queue_set: + self._save_queue.append(response.url) + self._save_queue_set.add(norm) + + # Extract links from this page and grow both queues. + new_discovery_targets: list[tuple[str, int]] = [] + for full_url, next_depth in self._iter_follow_urls(response): + new_discovery_targets.append((full_url, next_depth)) + if (self._save_re is None) or bool(self._save_re.search(full_url)): + norm = _normalize_url(full_url) + if norm not in self._save_queue_set: + self._save_queue.append(full_url) + self._save_queue_set.add(norm) + + # Saturation: pool has enough candidates → stop discovery, + # dispatch saves for the first ``max_pages`` URLs in queue + # order (seed first, then breadth-first by discovery). The + # remaining URLs stay in the queue as reserves — ``_on_save_error`` + # pulls from them if a dispatched save fails. + if self.max_pages and len(self._save_queue) >= self.max_pages: + self._discovery_done = True + for url in self._save_queue[: self.max_pages]: + self._save_pending += 1 + self._save_queue_next += 1 + yield self._make_save_request(url) + return + + # Still hungry — yield discoveries for the newly-extracted URLs. 
+ discovery_params = _params_for_discovery(self.scrape_params) + for full_url, next_depth in new_discovery_targets: yield ScrapingBeeRequest( - response.url, - params=dict(self.scrape_params), + full_url, + params=discovery_params, headers=self.custom_headers, - meta=response.meta, - callback=self._parse_save_only, - dont_filter=True, - ) - # Extract links from HTML and follow them - try: - hrefs = _extract_hrefs_from_response(response) - except Exception: - hrefs = [] - if hrefs: - yield from self._iter_follow_requests( - response, - _params_for_discovery(self.scrape_params), - self._parse_crawl_and_save, + meta={"depth": next_depth}, + callback=self._parse_crawl_and_save, + errback=self._on_request_error, ) def _parse_save_only(self, response: Response, **kwargs: object) -> Any: """Save the response (fetched with full params). No link following.""" + from scrapy.exceptions import CloseSpider + self.logger.info("Fetched %s (%d bytes) [save]", response.url, len(response.body)) try: self._save_response(response) self._save_count += 1 + self._push_saved_status() + except CloseSpider: + raise except Exception as e: self.logger.warning("Failed to save %s: %s", response.url, e) + finally: + # ``finally`` runs even when CloseSpider is re-raised, so the + # pending counter is still decremented cleanly during shutdown. + if self._save_pending > 0: + self._save_pending -= 1 - def _parse_discovery_links_only(self, response: Response, **kwargs: object) -> Any: - """Handle HTML response from discovery request: extract links and follow (no save).""" - self.logger.info("Fetched %s (%d bytes) [discovery]", response.url, len(response.body)) + def _on_save_error(self, failure) -> None: + """Errback for save requests — decrement the pending counter, + log, and backfill from the pool if the user's cap isn't yet + committed. Without backfill, a handful of network failures + would silently shrink the user's effective ``max_pages``. 
+ """ + if self._save_pending > 0: + self._save_pending -= 1 + # If we have reserves in ``_save_queue`` AND the cap (already- + # saved + still-in-flight) hasn't been committed yet, dispatch + # a replacement save. Only relevant when discovery is done + # (i.e. we've already started flushing the queue). try: - yield from self._iter_follow_requests(response, dict(self.scrape_params), self.parse) - except Exception as e: - self.logger.warning("Discovery failed for %s: %s", response.url, e) + if ( + self._discovery_done + and self.max_pages + and self._save_queue_next < len(self._save_queue) + and self._save_count + self._save_pending < self.max_pages + ): + engine = getattr(getattr(self, "_crawler", None), "engine", None) + if engine is not None: + url = self._save_queue[self._save_queue_next] + self._save_queue_next += 1 + self._save_pending += 1 + try: + engine.crawl(self._make_save_request(url), self) + except Exception: + if self._save_pending > 0: + self._save_pending -= 1 + except Exception: + pass + return self._on_request_error(failure) def _fetch_sitemap_urls(url: str, *, api_key: str | None = None, depth: int = 0) -> list[str]: @@ -617,9 +1234,30 @@ def run_project_spider( download_delay=download_delay, autothrottle_enabled=autothrottle_enabled, ) + from .theme import is_repl_mode as _is_repl_mode + + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + if _repl_log_active: + # Verbose file log, quiet stream — see run_urls_spider for why. 
+ settings.set("LOG_LEVEL", "INFO") + log_path = _maybe_set_repl_log_file(settings) + if log_path: + click.echo( + f"REPL mode: full crawl log → {log_path} (use `:view crawl` to scroll through it)", + err=True, + ) + _ensure_reactor_usable() process = CrawlerProcess(settings) + if _repl_log_active: + import logging as _logging + + for _h in _logging.getLogger().handlers: + if isinstance(_h, _logging.FileHandler): + continue + if isinstance(_h, _logging.StreamHandler): + _h.setLevel(_logging.WARNING) process.crawl(spider_name) - process.start() + process.start(install_signal_handlers=_install_signal_handlers()) finally: os.chdir(orig_cwd) @@ -641,6 +1279,7 @@ def run_urls_spider( include_pattern: str | None = None, exclude_pattern: str | None = None, save_pattern: str | None = None, + known_total: int | None = None, ) -> None: """Run the built-in generic spider: start from URLs and follow links. By default only same-domain links are followed; use allowed_domains or @@ -679,10 +1318,52 @@ def run_urls_spider( download_delay=download_delay, autothrottle_enabled=autothrottle_enabled, ) - settings.set("LOG_LEVEL", "WARNING") + # In REPL mode we want the *file* log to be verbose (so ``:view crawl`` + # is actually useful) while keeping the *stream* output quiet (so the + # REPL scrollback isn't drowned in per-request INFO chatter). We do + # that by raising LOG_LEVEL to INFO globally and then bumping ONLY + # the StreamHandler back up to WARNING after CrawlerProcess wires up + # the handlers (see below). Outside REPL there's no file log, so the + # stream handler picks up LOG_LEVEL directly — keep that at WARNING. 
+ from .theme import is_repl_mode as _is_repl_mode + + _repl_log_active = _is_repl_mode() or os.environ.get("SCRAPINGBEE_FROM_REPL") == "1" + settings.set("LOG_LEVEL", "INFO" if _repl_log_active else "WARNING") if max_pages > 0: - settings.set("CLOSESPIDER_PAGECOUNT", max_pages) + # The authoritative cap is the spider's ``_save_count >= + # max_pages`` check (in both ``_iter_follow_requests`` and the + # per-page save dispatch in ``_parse_crawl_and_save``). Scrapy's + # ``CLOSESPIDER_PAGECOUNT`` counts EVERY response — in the + # discovery-flow modes that fire one HTML pass plus one save + # request per page, the response count can easily reach + # ``max_pages × N`` where N depends on how many hrefs a typical + # page exposes. Set the framework cap to a generous multiple + # so it never fires before the spider's own cap stops queuing. + use_discovery_flow = bool(save_pattern) or _requires_discovery_phase(scrape_params or {}) + framework_cap = max_pages * 20 if use_discovery_flow else max_pages + settings.set("CLOSESPIDER_PAGECOUNT", framework_cap) + log_path = _maybe_set_repl_log_file(settings) + if log_path: + click.echo( + f"REPL mode: full crawl log → {log_path} (use `:view crawl` to scroll through it)", + err=True, + ) + _ensure_reactor_usable() process = CrawlerProcess(settings) + # CrawlerProcess just configured the root logger with handlers + # honouring LOG_LEVEL. In REPL mode we asked for INFO so the file + # captures everything, but the StreamHandler also got INFO and + # would spam the REPL scrollback. Demote ONLY the StreamHandler + # (not the FileHandler, which is a StreamHandler subclass) so the + # file stays verbose while stderr stays clean. 
+ if _repl_log_active: + import logging as _logging + + for _h in _logging.getLogger().handlers: + if isinstance(_h, _logging.FileHandler): + continue + if isinstance(_h, _logging.StreamHandler): + _h.setLevel(_logging.WARNING) process.crawl( GenericScrapingBeeSpider, start_urls=urls, @@ -698,5 +1379,6 @@ def run_urls_spider( include_pattern=include_pattern, exclude_pattern=exclude_pattern, save_pattern=save_pattern, + known_total=known_total, ) - process.start() + process.start(install_signal_handlers=_install_signal_handlers()) diff --git a/src/scrapingbee_cli/help_formatter.py b/src/scrapingbee_cli/help_formatter.py new file mode 100644 index 0000000..da2c0b2 --- /dev/null +++ b/src/scrapingbee_cli/help_formatter.py @@ -0,0 +1,156 @@ +"""Custom Rich-powered help formatter for ScrapingBee CLI.""" + +from __future__ import annotations + +import sys +from typing import Any + +import click + +from .theme import BEE_AMBER, BEE_YELLOW, err_console + + +def _should_style() -> bool: + """True when stderr is a real TTY (styled help goes to stderr).""" + return sys.stderr.isatty() + + +class BeeHelpFormatter(click.HelpFormatter): + """Click help formatter that outputs styled text via Rich.""" + + def write(self, string: str) -> None: + """Collect raw text — we'll style it in getvalue().""" + super().write(string) + + +class BeeCommand(click.Command): + """Command subclass that renders help through Rich.""" + + def format_help(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + """Override to render help with Rich styling.""" + self.format_usage(ctx, formatter) + self.format_help_text(ctx, formatter) + self.format_options(ctx, formatter) + self.format_epilog(ctx, formatter) + + def get_help(self, ctx: click.Context) -> str: + """Return plain help AND print styled version to stderr if TTY.""" + formatter = ctx.make_formatter() + self.format_help(ctx, formatter) + plain = formatter.getvalue() + if _should_style(): + _print_styled_help(plain, self.name or "") 
+ return plain + + +class BeeGroup(click.Group): + """Group subclass that renders help through Rich.""" + + def get_help(self, ctx: click.Context) -> str: + formatter = ctx.make_formatter() + self.format_help(ctx, formatter) + plain = formatter.getvalue() + if _should_style(): + _print_styled_help(plain, self.name or "scrapingbee") + return plain + + def format_help(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + self.format_usage(ctx, formatter) + self.format_help_text(ctx, formatter) + self.format_options(ctx, formatter) + self.format_commands(ctx, formatter) + self.format_epilog(ctx, formatter) + + def command(self, *args: Any, **kwargs: Any) -> Any: + kwargs.setdefault("cls", BeeCommand) + return super().command(*args, **kwargs) + + def group(self, *args: Any, **kwargs: Any) -> Any: + kwargs.setdefault("cls", BeeGroup) + return super().group(*args, **kwargs) + + +def _print_styled_help(plain_help: str, command_name: str) -> None: + """Parse plain Click help text and render it with Rich styling.""" + from rich.text import Text + + lines = plain_help.split("\n") + + # Header + err_console.print() + header = Text() + header.append(f" {command_name}", style=f"bold {BEE_YELLOW}") + err_console.print(header) + err_console.print() + + in_commands = False + + for line in lines: + stripped = line.strip() + + # Skip the "Usage:" line (we already printed header) + if stripped.startswith("Usage:"): + # Print usage in dim + err_console.print(f" [dim]{stripped}[/dim]") + continue + + # Section headers + if stripped.endswith(":") and not stripped.startswith("-") and not stripped.startswith("["): + in_commands = stripped == "Commands:" + err_console.print( + f" [bold {BEE_YELLOW}]~~ {stripped[:-1]} ~~{'~' * (36 - len(stripped))}[/]" + ) + continue + + # Option group headers (from click-option-group) + if stripped.endswith(":") and len(stripped) < 40 and not stripped.startswith("-"): + err_console.print( + f" [bold {BEE_YELLOW}]~~ {stripped[:-1]} ~~{'~' * 
(36 - len(stripped))}[/]" + ) + continue + + # Empty lines + if not stripped: + err_console.print() + continue + + # Description text (not indented or lightly indented, not starting with -) + if not line.startswith(" ") or ( + line.startswith(" ") and not line.startswith(" ") and not stripped.startswith("-") + ): + if stripped and not stripped.startswith("-"): + err_console.print(f" [dim]{stripped}[/dim]") + continue + + # Options: --flag Description + if stripped.startswith("-") or stripped.startswith("["): + # Split on double space to separate flag from description + parts = stripped.split(" ", 1) + if len(parts) == 2: + flag, desc = parts[0].strip(), parts[1].strip() + text = Text() + text.append(f" {flag:<30}", style=f"bold {BEE_AMBER}") + text.append(f" {desc}", style="dim") + err_console.print(text) + else: + err_console.print(f" [{BEE_AMBER}]{stripped}[/]") + continue + + # Commands list + if in_commands and stripped: + parts = stripped.split(" ", 1) + if len(parts) == 2: + cmd, desc = parts[0].strip(), parts[1].strip() + text = Text() + text.append(f" {cmd:<20}", style=f"bold {BEE_YELLOW}") + text.append(f" {desc}", style="dim") + err_console.print(text) + else: + err_console.print(f" [{BEE_YELLOW}]{stripped}[/]") + continue + + # Indented description continuation + if stripped: + err_console.print(f" [dim]{stripped}[/dim]") + + err_console.print() diff --git a/src/scrapingbee_cli/interactive.py b/src/scrapingbee_cli/interactive.py new file mode 100644 index 0000000..a0dd8b8 --- /dev/null +++ b/src/scrapingbee_cli/interactive.py @@ -0,0 +1,4888 @@ +"""Interactive REPL — Ink-style hybrid (real scrollback + persistent bottom prompt). + +The pattern is the same one Claude CLI uses (see Ink's `` component): +- Past command output is printed to real terminal stdout → goes into terminal + scrollback. Mouse-wheel scrolling and selection work normally, resize is + handled by the terminal, and quitting leaves a clean record behind. 
+- The input area + status toolbar live at the very bottom of the terminal as + a small persistent `Application(full_screen=False)`. prompt_toolkit's + `patch_stdout` redraws this strip whenever something prints, so the prompt + is always visible no matter how many lines of output flow above. + +That means: typing a command, hitting enter, watching output stream in +*above* the prompt — exactly the Claude experience — without losing real +terminal scrollback or selection. + +Implementation notes: +- ONE persistent Application for the whole REPL session (not one-per-prompt). +- Enter key binding runs the click command synchronously inside the handler. + Output from the command goes through patched stdout/stderr and lands above + the prompt. +- Interactive commands (tutorial, auth) take over the terminal via + `run_in_terminal` so click.prompt() works. +- On launch we pad with newlines so the prompt anchors at the bottom from + the first frame. +""" + +from __future__ import annotations + +import os +import re +import shlex +import sys +import threading +import time +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any + +from rich.text import Text + +from .theme import BEE_DIM, BEE_RED, BEE_YELLOW, err_console + +if TYPE_CHECKING: + pass + + +# --------------------------------------------------------------------------- +# Refined palette +# --------------------------------------------------------------------------- + +_AMBER = "#E5A800" # frame border / soft accent +_GREEN = "#22C55E" # success +_DIM2 = "#555555" # darker chrome (toolbar labels, hint) +_BG_CHIP = "#1a1400" # chip background (settings) +_URL_CYAN = "#7DD3FC" # URLs in input lexer + +_STYLE_DICT = { + # Top/bottom horizontal rules around the input + "rule": _AMBER, + # Prompt mark inside the input area + "promptmark": f"{BEE_YELLOW} bold", + # Lexer (input syntax highlighting). 
Specific categories have explicit + # colours; unstyled tokens fall through to the application's default + # style (key `""`), which is set per-session in `_style_dict_for`. + "lexer.cmd": f"{BEE_YELLOW} bold", + "lexer.flag": _AMBER, + "lexer.url": _URL_CYAN, + "lexer.string": _GREEN, + # Bottom toolbar + "toolbar": f"{BEE_DIM}", + "toolbar.label": _DIM2, + "toolbar.value": f"{BEE_YELLOW} bold", + "toolbar.ok": f"{_GREEN} bold", + "toolbar.fail": f"{BEE_RED} bold", + "toolbar.hint": _DIM2, + "toolbar.chip": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "toolbar.gauge": f"{BEE_YELLOW}", + # Completion menu + "completion-menu": f"bg:{_BG_CHIP}", + "completion-menu.completion": f"bg:{_BG_CHIP} {BEE_YELLOW}", + "completion-menu.completion.current": f"bg:{BEE_YELLOW} #000000 bold", + "completion-menu.meta.completion": f"bg:{_BG_CHIP} #886600", + "completion-menu.meta.completion.current": f"bg:{BEE_YELLOW} #000000", + "auto-suggestion": "fg:#777777 italic", +} + + +def _style_dict_for(keep_bg: bool) -> dict[str, str]: + """Return the prompt_toolkit Style dict for the REPL session. + + When `keep_bg` is False (default), set the empty class `""` (the default + style) to a dark-theme foreground. Combined with the OSC 11/10 escapes + that switch the *terminal* fg/bg to dark, this gives a single coherent + "dark theme" applied at both layers — explicit class colours stay as-is, + and any unstyled text falls back to a readable light-grey. + + With `keep_bg=True`, the default class is empty and the terminal's own + fg/bg are untouched — the user's system theme drives all defaults. + """ + style = dict(_STYLE_DICT) + if not keep_bg: + style[""] = "fg:#EAEAEA" + return style + + +# --------------------------------------------------------------------------- +# Binary-write adapter +# --------------------------------------------------------------------------- + + +class _BinaryAdapter: + """Adapter that exposes a ``.write(bytes)`` interface on top of a text + stream. 
Bolted onto prompt_toolkit's StdoutProxy at runtime so callers + that write bytes (``sys.stdout.buffer.write(b"...")``) work transparently + while we're inside a ``patch_stdout`` context. + """ + + def __init__(self, text_stream) -> None: + self._stream = text_stream + + def write(self, data) -> int: + if data is None or len(data) == 0: + return 0 + if isinstance(data, (bytes, bytearray, memoryview)): + text = bytes(data).decode("utf-8", errors="replace") + else: + text = str(data) + self._stream.write(text) + return len(data) + + def flush(self) -> None: + try: + self._stream.flush() + except Exception: + pass + + @property + def closed(self) -> bool: + return False + + +# --------------------------------------------------------------------------- +# Virtual scrollback (for full_screen=True mode) +# --------------------------------------------------------------------------- + + +try: + from prompt_toolkit.auto_suggest import AutoSuggest as _PTKAutoSuggest +except Exception: # pragma: no cover — prompt_toolkit should always be present + _PTKAutoSuggest = object # type: ignore[misc,assignment] + + +class BeeAutoSuggest(_PTKAutoSuggest): + """Context-aware ghost-text autosuggest for the REPL prompt. + + On each keystroke prompt_toolkit calls ``get_suggestion`` with the + current buffer; we look at the partial token under the cursor and + return a single greyed-out continuation (or ``None`` for silence). + + Sources used, in order: + - **First word** → match against known command names. + - **A flag** (token starts with ``-``) → match flags registered for + the current command. + - **Token after a choice/bool flag** → match valid choice values. + - **Free text otherwise** → match the start of a previous history + line that begins with the same prefix. + + Candidates are ranked by recency in command history (most-recently- + used wins → behaves like frequency for active users). 
If the + partial token doesn't prefix any known candidate, we return + ``None`` — typos get no suggestion, even if they happen to be + substrings of past commands. + + Accepting a suggestion (Right arrow / End, or Ctrl+F for the first + word in emacs-style bindings) is handled by prompt_toolkit's + built-in ``auto_suggest_apply`` key processors — no extra wiring + needed here. + """ + + def __init__( + self, + command_names, + command_flags, + bool_flags, + choice_flags, + history, + is_disabled=None, + ) -> None: + self._command_names = sorted(command_names) + self._command_flags = command_flags + self._bool_flags = bool_flags + self._choice_flags = choice_flags + self._history = history + # Optional callable; when it returns True we skip suggestions + # entirely. Used during first-run API key entry — we don't want + # history-based suggestions (which might leak a previously-typed + # secret) or command-name suggestions (irrelevant in that mode). + self._is_disabled = is_disabled + # Cache history lines (newest-first). Refreshed lazily when the + # underlying length changes — cheap O(1) check, avoids re-listing + # the history on every keystroke. + self._cached_lines: list[str] = [] + self._cached_len = -1 + + def _refresh_history(self) -> None: + if self._history is None: + return + try: + lines = list(self._history.get_strings()) + except Exception: + return + if len(lines) != self._cached_len: + self._cached_len = len(lines) + self._cached_lines = lines + + def _rank_by_recency(self, candidates: list[str]) -> list[str]: + """Sort candidates by first occurrence in (newest-first) history. 
+ Unseen candidates fall to the end, then ordered alphabetically.""" + self._refresh_history() + recency: dict[str, int] = {} + for i, line in enumerate(self._cached_lines): + for tok in line.split(): + if tok in candidates and tok not in recency: + recency[tok] = i + return sorted(candidates, key=lambda c: (recency.get(c, 10**9), c)) + + def get_suggestion(self, buffer, document): + from prompt_toolkit.auto_suggest import Suggestion + + try: + if self._is_disabled is not None and self._is_disabled(): + return None + text = document.text_before_cursor + if not text: + return None + words = text.split() + if not words: + return None + first = words[0] + + # Gate against typos at the command level. We only allow a + # suggestion if the first token is either a recognised command + # or a valid PREFIX of one — otherwise we'd risk surfacing + # history junk for a clear typo (the user's explicit ask). + first_is_known = first in self._command_flags + first_is_prefix = not first_is_known and any( + c.startswith(first) for c in self._command_names + ) + if not (first_is_known or first_is_prefix): + return None + + # 1) Prefer a full history-line continuation. Catches the most + # natural case: "scrape https://exam" → finish the URL + # and any flags the user last paired with it. + self._refresh_history() + for line in self._cached_lines: + if line.startswith(text) and line != text: + return Suggestion(line[len(text) :]) + + # 2) No matching history line. Suggest from the structured + # options (command names, flags, choice values). + has_trailing_space = text.endswith(" ") + last = words[-1] + on_first = (len(words) == 1) and not has_trailing_space + + if on_first: + cands = [c for c in self._command_names if c.startswith(last) and c != last] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last) :]) + + # Multi-word — need a recognised command to suggest structure. 
+ if not first_is_known: + return None + if has_trailing_space: + return None # no partial token to complete + + if last.startswith("-"): + flags = self._command_flags.get(first, []) + cands = [f for f in flags if f.startswith(last) and f != last] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last) :]) + + if len(words) >= 2: + prev = words[-2] + if prev in self._choice_flags: + cands = [ + v for v in self._choice_flags[prev] if v.startswith(last) and v != last + ] + if not cands: + return None + best = self._rank_by_recency(cands)[0] + return Suggestion(best[len(last) :]) + if prev in self._bool_flags: + for v in ("true", "false"): + if v.startswith(last.lower()) and v != last.lower(): + return Suggestion(v[len(last) :]) + return None + return None + except Exception: + return None + + +def _make_capped_history(filename: str, max_entries: int = 10_000): + """Construct a ``FileHistory`` with the on-disk file pre-trimmed to + keep at most ``max_entries`` most-recent entries. + + prompt_toolkit's stock ``FileHistory`` appends forever — every + command you ever type lives in ``.history`` until you delete the + file manually. For long-running CLI users that file grows unbounded + and slows down the REPL's initial history-load. We keep the last + 10000 entries on disk (a few months of normal use, file stays + under ~2 MB). + + Trim runs once at construction. During the session, ``FileHistory`` + appends as normal — no per-write overhead. The file may briefly + exceed the cap mid-session; the excess is dropped on next startup. 
+ """ + import datetime as _dt + import os as _os + + from prompt_toolkit.history import FileHistory + + if _os.path.exists(filename): + try: + tmp_history = FileHistory(filename) + strings = list(tmp_history.load_history_strings()) # newest-first + if len(strings) > max_entries: + keep_newest_first = strings[:max_entries] + keep_oldest_first = list(reversed(keep_newest_first)) + tmp = filename + ".tmp" + now = _dt.datetime.now() + try: + with open(tmp, "wb") as f: + for s in keep_oldest_first: + f.write(f"\n# {now}\n".encode()) + for line in s.split("\n"): + f.write(f"+{line}\n".encode()) + _os.replace(tmp, filename) + except Exception: + try: + _os.unlink(tmp) + except Exception: + pass + except Exception: + pass + return FileHistory(filename) + + +def _split_fragments_to_width( + line: list[tuple[str, str]], width: int +) -> list[list[tuple[str, str]]]: + """Split a logical line's (style, text) fragments into a list of + visual rows, each at most ``width`` characters wide. + + Empty input → one empty row (so blank lines still occupy one row). + Preserves styles across the split — if a styled fragment crosses a + row boundary, the boundary lands inside the fragment with the same + style on both sides. + """ + if width <= 0: + return [list(line)] + if not line: + return [[]] + out: list[list[tuple[str, str]]] = [] + current: list[tuple[str, str]] = [] + current_len = 0 + for sty, text in line: + if not text: + continue + i = 0 + n = len(text) + while i < n: + room = width - current_len + if room <= 0: + out.append(current) + current = [] + current_len = 0 + room = width + chunk = text[i : i + room] + current.append((sty, chunk)) + current_len += len(chunk) + i += len(chunk) + if current or not out: + out.append(current) + return out + + +class ScrollbackBuffer: + """In-memory line buffer that backs the scrollable output Window. + + When the REPL runs in full_screen mode we own the alt buffer, so command + output can't flow into real terminal scrollback. 
Instead, every line of + output gets parsed for ANSI escapes and stored as a list of + ``(style, text)`` fragments. The render callback for the output Window + asks the buffer for a slice based on the current scroll offset. + + Thread-safe append: command output is written from worker threads and + the renderer reads from the loop thread; a lock keeps the list + consistent without trying to be clever. + """ + + MAX_LINES = 10_000 # ring-buffer cap so a runaway scrape can't OOM us + + def __init__(self) -> None: + self.lines: list[list[tuple[str, str]]] = [] + # How many lines we're scrolled up from the bottom. 0 = at bottom + # (auto-follow); positive = locked at some scrolled-up position. + self.scroll_offset = 0 + self._lock = threading.Lock() + + def append_fragments(self, fragments: list) -> None: + """Append one rendered line (already styled) as the final entry. + ``fragments`` is the prompt_toolkit ``StyleAndTextTuples`` + shape — either ``(style, text)`` or ``(style, text, handler)``. + Typed loosely so callers using either variant are accepted. + """ + with self._lock: + self.lines.append(list(fragments)) + if len(self.lines) > self.MAX_LINES: + # Drop the oldest 10% — cheaper than dropping one at a time + # if a scrape produces tens of thousands of lines. + drop = self.MAX_LINES // 10 + del self.lines[:drop] + + def replace_last_line(self, fragments: list) -> None: + """Overwrite the most recent line. Used for in-place progress + updates via the standard terminal ``\\r`` idiom — write + ``\\r\\n`` and the previous line gets replaced rather + than another row appended. + """ + with self._lock: + if self.lines: + self.lines[-1] = list(fragments) + else: + self.lines.append(list(fragments)) + + def replace_last_n_lines(self, n: int, lines: list) -> None: + """Replace the most recent ``n`` lines with the given ``lines``. + If fewer than ``n`` lines exist, the remainder is appended. + Used for multi-line in-place progress widgets (e.g. 
the + 3-row honeycomb progress bar). + """ + with self._lock: + if len(self.lines) >= n and n > 0: + # Replace tail in place — same count, no shift. + self.lines[len(self.lines) - n :] = [list(f) for f in lines] + else: + # Not enough prior lines to replace; append. + for f in lines: + self.lines.append(list(f)) + + def append_ansi_text(self, text: str) -> None: + """Parse ANSI codes in ``text`` and append the resulting line(s). + + Handles partial-line writes: callers may write text without a + trailing newline (e.g. an in-progress progress bar). We split on + ``\\n``; the final post-split chunk goes into a pending buffer + that gets prepended to the next write. + + Carriage-return (``\\r``) handling: anything before the last + ``\\r`` on a line is discarded (standard terminal "go to start + of line" semantics), AND the resulting line replaces the + previous line in scrollback instead of appending. This lets + callers do in-place progress updates by writing + ``\\r\\n`` repeatedly. + """ + from prompt_toolkit.formatted_text import ANSI, to_formatted_text + + # Combine with anything pending from a previous partial write. + with self._lock: + pending = self._pending if hasattr(self, "_pending") else "" + combined = pending + text + chunks = combined.split("\n") + self._pending = chunks[-1] # may be empty if text ended with \n + complete = chunks[:-1] + + for raw in complete: + had_cr = "\r" in raw + if had_cr: + # Everything before the last \r is overwritten — keep + # only what comes after it. 
+ raw = raw.rsplit("\r", 1)[1] + try: + fragments = list(to_formatted_text(ANSI(raw))) + except Exception: + fragments = [("", raw)] + if had_cr: + self.replace_last_line(fragments) + else: + self.append_fragments(fragments) + + def flush_pending(self) -> None: + """Commit any pending partial line as its own row.""" + with self._lock: + pending = getattr(self, "_pending", "") + self._pending = "" + if pending: + from prompt_toolkit.formatted_text import ANSI, to_formatted_text + + try: + fragments = list(to_formatted_text(ANSI(pending))) + except Exception: + fragments = [("", pending)] + self.append_fragments(fragments) + + def get_visible_window(self, height: int) -> list[list[tuple[str, str]]]: + """Backwards-compatible: visible slice in *logical* lines.""" + with self._lock: + total = len(self.lines) + if total == 0: + return [] + max_offset = max(0, total - height) + if self.scroll_offset > max_offset: + self.scroll_offset = max_offset + end = total - self.scroll_offset + start = max(0, end - height) + return [list(line) for line in self.lines[start:end]] + + def get_visible_visual(self, height: int, width: int) -> list[list[tuple[str, str]]]: + """Return visible content in *visual rows* (post-wrap). + + Long single lines that wrap to multiple terminal rows are + pre-split here at ``width`` characters so each entry in the + returned list is exactly one terminal row. ``scroll_offset`` + is in visual rows too, so one ``scroll_up(1)`` step moves the + view by exactly one visible row — even through a 5000-char + JSON blob that wraps to dozens of rows. This is what makes + wheel/trackpad scrolling feel consistent regardless of line + length. + """ + if width <= 1: + return self.get_visible_window(height) + with self._lock: + # Walk from the bottom up, accumulating visual rows until we + # have enough to fill the window at the requested scroll offset. + # Stops early on large buffers — we don't need to wrap content + # the user can't see this frame. 
+ need = max(0, self.scroll_offset) + max(1, height) + collected: list[list[tuple[str, str]]] = [] # newest-first + for line in reversed(self.lines): + for visual_row in reversed(_split_fragments_to_width(line, width)): + collected.append(visual_row) + if len(collected) >= need: + break + collected.reverse() # back to oldest-first + total = len(collected) + max_offset = max(0, total - height) + if self.scroll_offset > max_offset: + self.scroll_offset = max_offset + end = total - self.scroll_offset + start = max(0, end - height) + return collected[start:end] + + def scroll_up(self, n: int = 1) -> None: + with self._lock: + # Soft cap — get_visible_window will further clamp based on + # the actual rendered height, but capping here at total-1 + # avoids letting offset grow unboundedly between renders. + self.scroll_offset = min(max(0, len(self.lines) - 1), self.scroll_offset + n) + + def scroll_down(self, n: int = 1) -> None: + with self._lock: + self.scroll_offset = max(0, self.scroll_offset - n) + + def scroll_to_top(self) -> None: + with self._lock: + self.scroll_offset = max(0, len(self.lines) - 1) + + def scroll_to_bottom(self) -> None: + with self._lock: + self.scroll_offset = 0 + + @property + def at_bottom(self) -> bool: + with self._lock: + return self.scroll_offset == 0 + + def insert_line(self, index: int, fragments: list) -> None: + """Insert a single line at ``index`` (clamped to current length). + + Used to retroactively splice the command-echo line in front of a + finished command's output, so the user sees ``❯ `` above the + output rows the command produced — without the echo being visible + during execution itself (where the shimmer is the live indicator). + """ + with self._lock: + i = max(0, min(index, len(self.lines))) + self.lines.insert(i, list(fragments)) + + def current_length(self) -> int: + with self._lock: + return len(self.lines) + + +class ScrollbackWriter: + """File-like writer that pipes everything into a ScrollbackBuffer. 
+ + Installed as ``sys.stdout`` / ``sys.stderr`` while the REPL runs. + Click commands, rich consoles, plain ``print`` calls — all flow + through here, get parsed for ANSI, and end up as rows in the + scrollback. The renderer then displays them. + + Thread-safe: command output comes from worker threads while the + prompt_toolkit loop renders on the main thread. + """ + + encoding = "utf-8" + + def __init__(self, scrollback: ScrollbackBuffer, on_write: Any = None) -> None: + self._sb = scrollback + self._on_write = on_write # callable to nudge the app to re-render + + def write(self, s: Any) -> int: + if not s: + return 0 + if isinstance(s, (bytes, bytearray, memoryview)): + s = bytes(s).decode("utf-8", errors="replace") + elif not isinstance(s, str): + s = str(s) + self._sb.append_ansi_text(s) + if self._on_write is not None: + try: + self._on_write() + except Exception: + pass + return len(s) + + def flush(self) -> None: + # No-op — we don't buffer beyond ScrollbackBuffer's pending partial. + pass + + def isatty(self) -> bool: + return True # let click / rich treat us as a tty so colors stay on + + @property + def closed(self) -> bool: + return False + + def writable(self) -> bool: + return True + + +# --------------------------------------------------------------------------- +# Shimmer (prompt_toolkit-formatted) +# --------------------------------------------------------------------------- + +# Used for the live "running command" line above the input. A bright white +# "peak" cell sweeps across the line, flanked by warm-yellow cells, with the +# rest in brand yellow — reads as a glow running along the command text. +_SHIMMER_PEAK_PT = "#FFFFFF" +_SHIMMER_FLANK_PT = "#FFE780" + + +def _shimmer_pt(text: str, position: int, base_color: str) -> list[tuple[str, str]]: + """Return prompt_toolkit formatted-text tuples with a shimmer at `position`. + + Character at `position` is peak white, neighbours at ±1 are warm yellow, + everything else uses ``base_color``. 
Combined with a position that + advances each tick this reads as a wave of light along the text. + """ + out: list[tuple[str, str]] = [] + for i, ch in enumerate(text): + distance = abs(i - position) + if distance == 0: + style = f"bold fg:{_SHIMMER_PEAK_PT}" + elif distance == 1: + style = f"bold fg:{_SHIMMER_FLANK_PT}" + else: + style = f"bold fg:{base_color}" + out.append((style, ch)) + return out + + +# --------------------------------------------------------------------------- +# Click tree introspection +# --------------------------------------------------------------------------- + + +def _walk_click_tree( + cli_group: Any, +) -> tuple[dict[str, str], dict[str, list[str]], set[str], dict[str, list[str]]]: + """Return (command_help, command_flags, bool_flags, choice_flags).""" + import click + + command_help: dict[str, str] = {} + command_flags: dict[str, list[str]] = {} + bool_flags: set[str] = set() + choice_flags: dict[str, list[str]] = {} + + for name, cmd in cli_group.commands.items(): + first_line = "" + for source in (cmd.short_help, cmd.help): + if source: + first_line = source.strip().splitlines()[0] + break + command_help[name] = first_line + + flags: list[str] = [] + for param in cmd.params: + if not isinstance(param, click.Option): + continue + for opt in param.opts: + if opt.startswith("--"): + flags.append(opt) + if param.is_flag: + bool_flags.add(opt) + if isinstance(param.type, click.Choice): + choice_flags[opt] = list(param.type.choices) + command_flags[name] = sorted(set(flags)) + + return command_help, command_flags, bool_flags, choice_flags + + +# --------------------------------------------------------------------------- +# Session state +# --------------------------------------------------------------------------- + + +class SessionState: + """REPL-wide mutable state surfaced in the bottom toolbar.""" + + USAGE_REFRESH_INTERVAL = 30.0 # seconds between background usage API calls + + def __init__(self) -> None: + self.last_command: str | 
None = None + self.last_status: str | None = None # "ok" | "fail" + self.last_duration: float | None = None + # Live account state — surfaced in the toolbar. None ⇒ unknown / N/A. + self.credits: int | None = None # available = max - used + self.credits_total: int | None = None # max_api_credit + self.used_credits: int | None = None # used_api_credit (latest) + self.used_credits_at_start: int | None = None # snapshotted after first ok refresh + self.max_concurrency: int | None = None + self.current_concurrency: int | None = None + # Whether the API key was present when the REPL started (or after auth). + # Drives "N/A" rendering in the toolbar while False. + self.api_key_set: bool = False + # Short hash of the live API key. Used to detect logout/relogin with + # the same key — when the key is unchanged we keep the session + # counter going instead of resetting it to 0. + self.api_key_hash: str | None = None + self.last_usage_refresh_mono: float | None = None # time.monotonic() of last ok refresh + self.settings: dict[str, str] = {} + # In-flight execution state — drives the live "running" line above + # the input (with shimmer sweep) and the toolbar's running indicator. + self.is_running: bool = False + self.running_command: str | None = None + self.running_command_text: str | None = None # full line as typed + self.run_start: float | None = None + self.tick: int = 0 # frame counter for the shimmer position + # Mouse mode toggle: "scroll" = mouse_support on (wheel scrolls the + # virtual buffer, drag-select needs a per-terminal modifier); + # "select" = mouse_support off (native drag-select works everywhere + # without a modifier, but wheel scroll stops). Alt+S toggles. 
+ self.mouse_mode: str = "scroll" + + def apply_settings_to_args( + self, args: list[str], accepted: set[str] | None = None + ) -> list[str]: + """Append session defaults to ``args`` for any flag that: + - is not already present on the command line, AND + - is accepted by the target command (when ``accepted`` is given). + + Without the ``accepted`` filter, session defaults would leak into + commands that don't take them (e.g. ``--json-response`` into + ``usage``), causing "No such option" errors. + """ + if not self.settings: + return args + present = {a for a in args if a.startswith("--")} + out = list(args) + for key, value in self.settings.items(): + flag = f"--{key}" + if flag in present: + continue + if accepted is not None and flag not in accepted: + continue + out.extend([flag, value]) + return out + + def refresh_credits_from_cache(self) -> None: + """Populate live fields from the on-disk usage cache. + + Cache file shape (written by ``batch.write_usage_file_cache``): + ``{"ts": , "key_hash": , "data": }`` + where ``data`` is the output of ``client.parse_usage``: + ``{"credits": int, "max_api_credit": int, "max_concurrency": int}`` + + Only the ``data`` sub-dict has the values we care about; reading any + other key would just see metadata. Earlier versions iterated + ``data.values()`` and relied on the fact that the inner dict happened + to have matching keys — works by accident, brittle if the cache + format ever grows. 
+ """ + try: + import json + from pathlib import Path + + cache = Path.home() / ".config" / "scrapingbee-cli" / "usage_cache.json" + if not cache.exists(): + return + entry = json.loads(cache.read_text(encoding="utf-8")) + if not isinstance(entry, dict): + return + data = entry.get("data") + if not isinstance(data, dict): + return + if isinstance(data.get("credits"), int): + self.credits = data["credits"] + if isinstance(data.get("max_api_credit"), int): + self.credits_total = data["max_api_credit"] + if isinstance(data.get("max_concurrency"), int): + self.max_concurrency = data["max_concurrency"] + except Exception: + return + + def update_from_usage_response(self, raw: dict, key_hash: str | None = None) -> None: + """Apply a parsed JSON usage-API response to the live state. + + Snapshots ``used_credits_at_start`` on first successful update so the + toolbar's "used this session" remains accurate even if the REPL was + launched before the first refresh succeeded. If ``key_hash`` is + provided and differs from the previous one, the session start + snapshot is reset — so logging out and back in with a *different* + key starts the counter at 0, but re-auth with the *same* key keeps + counting from where it left off. + """ + if key_hash is not None and key_hash != self.api_key_hash: + # Key changed (initial set OR switched to a different key) — + # forget the previous session's baseline so the next snapshot + # below establishes a fresh one. 
+ self.used_credits_at_start = None + self.api_key_hash = key_hash + max_credit = raw.get("max_api_credit") + used_credit = raw.get("used_api_credit") + if isinstance(max_credit, (int, float)): + self.credits_total = int(max_credit) + if isinstance(used_credit, (int, float)): + self.used_credits = int(used_credit) + if self.used_credits_at_start is None: + self.used_credits_at_start = int(used_credit) + if self.credits_total is not None and self.used_credits is not None: + self.credits = max(0, self.credits_total - self.used_credits) + mc = raw.get("max_concurrency") + if isinstance(mc, (int, float)): + self.max_concurrency = int(mc) + cc = raw.get("current_concurrency") + if isinstance(cc, (int, float)): + self.current_concurrency = int(cc) + self.last_usage_refresh_mono = time.monotonic() + + @property + def session_credits_used(self) -> int | None: + if self.used_credits is None or self.used_credits_at_start is None: + return None + return max(0, self.used_credits - self.used_credits_at_start) + + @property + def seconds_until_next_refresh(self) -> int | None: + if self.last_usage_refresh_mono is None: + return None + remaining = self.last_usage_refresh_mono + self.USAGE_REFRESH_INTERVAL - time.monotonic() + return max(0, int(remaining + 0.999)) # ceil so the countdown never shows -1 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _format_credits(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + if n >= 1_000: + return f"{n / 1_000:.1f}K" + return str(n) + + +def _credit_gauge(used_pct: int) -> str: + """Tiny block-bar showing credit usage (0..100).""" + blocks = "▁▂▃▄▅▆▇█" + n = min(7, max(0, int(used_pct * 8 / 100))) + return blocks[n] + + +def _levenshtein(a: str, b: str) -> int: + if a == b: + return 0 + if not a: + return len(b) + if not b: + return len(a) + prev = list(range(len(b) + 1)) + for i, ca in 
enumerate(a, 1): + curr = [i] + [0] * len(b) + for j, cb in enumerate(b, 1): + cost = 0 if ca == cb else 1 + curr[j] = min(curr[j - 1] + 1, prev[j] + 1, prev[j - 1] + cost) + prev = curr + return prev[-1] + + +def _suggest(typed: str, candidates: Iterable[str], threshold: int = 2) -> str | None: + best: tuple[int, str] | None = None + for c in candidates: + d = _levenshtein(typed.lower(), c.lower()) + if d <= threshold and (best is None or d < best[0]): + best = (d, c) + return best[1] if best else None + + +# --------------------------------------------------------------------------- +# Lexer (syntax highlighting in the input buffer) +# --------------------------------------------------------------------------- + + +def _make_lexer(): + from prompt_toolkit.lexers import Lexer + + class CmdLexer(Lexer): + def lex_document(self, document): + def get_line(lineno: int): + if lineno >= len(document.lines): + return [] + line = document.lines[lineno] + tokens: list[tuple[str, str]] = [] + first_word_seen = False + for piece in re.split(r"(\s+)", line): + if not piece: + continue + if piece.isspace(): + tokens.append(("", piece)) + continue + if not first_word_seen: + # First word coloured even if it's a slash-command + tokens.append(("class:lexer.cmd", piece)) + first_word_seen = True + elif piece.startswith("--"): + tokens.append(("class:lexer.flag", piece)) + elif piece.startswith(("http://", "https://")): + tokens.append(("class:lexer.url", piece)) + elif len(piece) > 1 and piece[0] in ("'", '"') and piece[-1] == piece[0]: + tokens.append(("class:lexer.string", piece)) + else: + # Inherit the app default style (`""`), which is set + # to light-grey foreground when --keep-bg is off and + # left empty (terminal default) when --keep-bg is on. 
+ tokens.append(("", piece)) + return tokens + + return get_line + + return CmdLexer() + + +# --------------------------------------------------------------------------- +# Bottom toolbar +# --------------------------------------------------------------------------- + + +def _make_toolbar(state: SessionState): + """Return a callable producing toolbar segments. + + The toolbar adapts to terminal width: + - Wide: credits gauge · last cmd · all chips · hint + - Medium: credits gauge · last cmd · chip count · hint + - Narrow: credits · last cmd · chip count + + While a command is in flight (``state.is_running``) the toolbar shows a + plain "running · s" label; the visual animation lives on the + shimmering command line just above the input. + """ + + def render() -> list[tuple[str, str]]: + # Width: prefer prompt_toolkit's live SIGWINCH-tracked size when an + # app is actually running (so the toolbar stays in lockstep with + # what prompt_toolkit's own renderer is using). Outside a run loop, + # ``get_app()`` returns a dummy whose output reports a constant 80 + # — useless — so we fall through to shutil in that case. + width = 0 + try: + from prompt_toolkit.application import get_app as _get_app + + _app = _get_app() + # get_app() returns a dummy outside a real run loop; its output + # reports a constant 80 — useless. Only trust the live app. + if getattr(_app, "is_running", False): + width = _app.output.get_size().columns + except Exception: + pass + if not width: + import shutil + + width = shutil.get_terminal_size((80, 24)).columns + segs: list[tuple[str, str]] = [("class:toolbar", " ")] + + # Unified toolbar pipeline for both idle and in-flight modes: + # build fields → greedy-pack into pages → render the current + # page with a pinned hint on the right. 
While running we + # prepend a live ``12.3s`` elapsed-time field so the user can + # see how long the command has been going; the bee verb that + # used to live here now alternates with bee facts in the dim + # row above the input. + fields: list[list[tuple[str, str]]] = [] + + if state.is_running and state.run_start is not None: + elapsed = time.monotonic() - state.run_start + fields.append( + [ + ("class:toolbar.label", "Elapsed "), + ("class:toolbar.value", f"{elapsed:.1f}s"), + ] + ) + + # Available Credits + avail: list[tuple[str, str]] = [("class:toolbar.label", "Available Credits ")] + if state.api_key_set and state.credits is not None: + avail.append(("class:toolbar.value", _format_credits(state.credits))) + if state.credits_total: + used_pct = max( + 0, + min(100, 100 - int(state.credits / state.credits_total * 100)), + ) + avail.append(("class:toolbar.hint", f" ({used_pct}% used)")) + else: + avail.append(("class:toolbar.value", "N/A")) + fields.append(avail) + + # Used (Current Session) + used_chunk: list[tuple[str, str]] = [("class:toolbar.label", "Used (Current Session) ")] + scu = state.session_credits_used if state.api_key_set else None + used_chunk.append( + ("class:toolbar.value", _format_credits(scu) if scu is not None else "N/A") + ) + fields.append(used_chunk) + + # Concurrency + conc_chunk: list[tuple[str, str]] = [("class:toolbar.label", "Concurrency ")] + if state.api_key_set and state.max_concurrency is not None: + cur = state.current_concurrency if state.current_concurrency is not None else 0 + conc_chunk.append(("class:toolbar.value", f"{cur}/{state.max_concurrency}")) + else: + conc_chunk.append(("class:toolbar.value", "N/A")) + fields.append(conc_chunk) + + # Next Update countdown (only after first successful refresh) + if state.api_key_set: + nxt = state.seconds_until_next_refresh + if nxt is not None: + fields.append( + [ + ("class:toolbar.label", "Next Update "), + ("class:toolbar.value", f"{nxt}s"), + ] + ) + + # (Removed "last cmd" 
indicator — the typed command and its + # ✓/✗ footer are already visible in the scrollback echo, so a + # toolbar copy doesn't add information and just consumes width.) + + # Session setting chips — one chunk PER setting so the pagination + # loop below can split them across pages. Long values (e.g. a + # multi-step ``--js-scenario`` JSON blob) are truncated so a + # single chip never overflows the toolbar line. + if state.settings: + _max_chip_value = 28 + for k, v in state.settings.items(): + display_v = v if len(v) <= _max_chip_value else v[: _max_chip_value - 1] + "…" + fields.append([("class:toolbar.chip", f" {k}={display_v} ")]) + + # Hint chunk pinned bottom-right. Always shows the active mouse + # mode label (Scroll / Select) so the user can see what mouse + # behaviour they have at a glance — even while a command is + # running. The Shift+Tab toggle is documented in ``:help`` to + # keep this strip clean. While running we additionally append + # ``Ctrl+C to stop`` so the cancel affordance stays visible. + if not state.api_key_set: + hint_text = "type `auth` to set API key" + hint_chunk: list[tuple[str, str]] = [("class:toolbar.hint", hint_text)] + else: + mode_label = "Scroll mode" if state.mouse_mode == "scroll" else "Select mode" + hint_chunk = [("class:toolbar.value", mode_label)] + if state.is_running: + hint_chunk.append(("class:toolbar.hint", " · Ctrl+C to stop")) + + _leading = " " + _sep = " · " + _page_seconds = 5 # how long each page is displayed before rotating + + def _seg_len(chunk: list[tuple[str, str]]) -> int: + return sum(len(t) for _, t in chunk) + + # The mode hint ("Scroll mode · Tab to switch" / auth nudge) is the + # one piece of toolbar content the user needs to see at all times — + # it advertises the only globally-mutable runtime mode. Pin it on + # every page by reserving its width up-front and pagination only + # packs the *other* fields into the remaining space. 
+ hint_len = _seg_len(hint_chunk) + budget = max(10, width - 2) + # Reserve room for hint + separator on every page. If the hint alone + # is wider than the budget, we'll still try to render it (final + # hard-truncate at the bottom of this function will clip). + field_budget = max(0, budget - hint_len - len(_sep)) + + # Greedy-pack the non-hint fields into pages, each ≤ field_budget. + pages: list[list[list[tuple[str, str]]]] = [] + cur: list[list[tuple[str, str]]] = [] + cur_len = len(_leading) + for chunk in fields: + chunk_len = _seg_len(chunk) + added = chunk_len + (len(_sep) if cur else 0) + if cur and cur_len + added > field_budget: + pages.append(cur) + cur = [chunk] + cur_len = len(_leading) + chunk_len + else: + cur.append(chunk) + cur_len += added + if cur: + pages.append(cur) + # Even if there are no non-hint fields (extreme narrow), produce + # one empty page so the hint still renders. + if not pages: + pages = [[]] + + # Rotate pages by wall-clock time. Single-page case is static. + if len(pages) == 1: + page_idx = 0 + else: + page_idx = int(time.monotonic() / _page_seconds) % len(pages) + page = pages[page_idx] + + # Compose the chosen page. + segs: list[tuple[str, str]] = [("class:toolbar", _leading)] + for i, chunk in enumerate(page): + if i > 0: + segs.append(("class:toolbar", _sep)) + segs.extend(chunk) + + # Page indicator (e.g. "1/3") trailing — only when rotating. + if len(pages) > 1: + indicator = f" ({page_idx + 1}/{len(pages)})" + cur_total = sum(len(t) for _, t in segs) + if cur_total + len(indicator) <= field_budget: + segs.append(("class:toolbar.hint", indicator)) + + # Hint always rendered on the right edge of every page. + cur_total = sum(len(t) for _, t in segs) + pad = max(2, width - cur_total - hint_len - 2) + segs.append(("class:toolbar", " " * pad)) + segs.extend(hint_chunk) + + # Final safety: hard-truncate so we never emit a line wider than + # the terminal (prevents the soft-wrap ghost-toolbar artifact). 
+ total = sum(len(t) for _, t in segs) + if total > width - 1: + cap = max(0, width - 1) + kept: list[tuple[str, str]] = [] + used_len = 0 + for sty, text in segs: + room = cap - used_len + if room <= 0: + break + if len(text) <= room: + kept.append((sty, text)) + used_len += len(text) + else: + kept.append((sty, text[: max(0, room - 1)] + "…")) + break + segs = kept + return segs + + return render + + +# --------------------------------------------------------------------------- +# Application (Frame around input + toolbar) +# --------------------------------------------------------------------------- + + +def _build_application(state: SessionState, completer: Any, history_path: str): + from prompt_toolkit.application import Application + from prompt_toolkit.auto_suggest import AutoSuggestFromHistory + from prompt_toolkit.buffer import Buffer + from prompt_toolkit.filters import has_completions + from prompt_toolkit.history import FileHistory + from prompt_toolkit.key_binding import KeyBindings + from prompt_toolkit.layout import Layout + from prompt_toolkit.layout.containers import HSplit, Window + from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl + from prompt_toolkit.layout.dimension import D + from prompt_toolkit.styles import Style + + try: + history = FileHistory(history_path) + except Exception: + history = None + + buffer = Buffer( + history=history, + completer=completer, + complete_while_typing=False, + auto_suggest=AutoSuggestFromHistory(), + multiline=False, + ) + + # The input is a single Window with a per-line prefix (the chevron). + # `dont_extend_height=True` makes the Window report its preferred height as + # the content's line count — so the layout shrinks to fit, no greedy fill. 
+ def _line_prefix(line_no, _wrap_count): + if line_no == 0: + return [("class:promptmark", "❯ ")] + return [("", " ")] + + input_window = Window( + content=BufferControl(buffer=buffer, lexer=_make_lexer()), + get_line_prefix=_line_prefix, + wrap_lines=True, + height=D(min=1), + dont_extend_height=True, + ) + + toolbar_window = Window( + content=FormattedTextControl(_make_toolbar(state)), + height=D.exact(1), + ) + + # No horizontal rules above/below the input. Earlier versions had `─` + # rules for visual structure, but every resize redraws the layout at the + # new width and leaves the old wider rule fragments behind in scrollback — + # piles of yellow horizontal lines accumulate. Visual hierarchy still + # holds via the yellow chevron prompt mark and the dim toolbar. + layout = Layout(HSplit([input_window, toolbar_window])) + + kb = KeyBindings() + + @kb.add("enter") + def _enter(event): + text = buffer.text + if text.strip(): + event.app.exit(result=text) + + @kb.add("c-c") + def _ctrl_c(event): + event.app.exit(result=None) + + @kb.add("c-d") + def _ctrl_d(event): + if not buffer.text: + event.app.exit(result=None) + + # Tab opens / advances the completion menu. (Custom KeyBindings override + # prompt_toolkit's default Tab handler, so we re-bind it explicitly.) 
+ @kb.add("tab", filter=~has_completions) + def _tab_open(event): + event.current_buffer.start_completion(select_first=False) + + @kb.add("tab", filter=has_completions) + def _tab_next(event): + event.current_buffer.complete_next() + + @kb.add("s-tab", filter=has_completions) + def _shift_tab(event): + event.current_buffer.complete_previous() + + @kb.add("escape", filter=has_completions, eager=True) + def _escape_menu(event): + event.current_buffer.cancel_completion() + + app = Application( + layout=layout, + key_bindings=kb, + style=Style.from_dict(_STYLE_DICT), + full_screen=False, + mouse_support=False, + # Erase the rendered prompt area on exit so rules + input + toolbar + # don't pile up in scrollback as stale-width artifacts after every + # submit (or after a terminal resize). The submitted command is + # echoed manually by the main loop so the user can still see what + # they typed. + erase_when_done=True, + ) + return app, buffer + + +# --------------------------------------------------------------------------- +# Banner / help / output frame +# --------------------------------------------------------------------------- + + +# ScrapingBee wordmark — approximation of the actual brand logo +# (https://www.scrapingbee.com/images/favico.svg): three honeycomb cells +# arranged in an L-shape (top, bottom-left, bottom-right) next to the +# "ScrapingBee" text rendered in the figlet ``smblock`` font. +# All rendered in brand yellow (terminal limits us to single-colour per +# Window; the real SVG has the bottom-left cell highlighted vs the other +# two). ~42 cols × 4 rows. +# "ScrapingBee" rendered in the figlet ``smblock`` font — 4 rows × 32 cols, +# roughly the same width as the "Web scraping from the terminal" tagline. +# Same block-letter style as the old 6-row logo, just compact. 
+# ANSI Shadow letters for "SCRAPING" and "BEE", kept as separate halves +# so each can carry its own colour (yellow + white, matching the brand +# wordmark) when stitched together at render time. +# +# Note on widths: the rightmost letter ``G`` has a natural 1-column +# narrower silhouette on its top and bottom rows (its shape leaves a +# trailing space on rows 1, 2, 6 but extends to a full ``╗``/``║``/``╝`` +# on rows 3, 4, 5). Without explicit padding, that imbalance shifts +# BEE one column right on the middle rows when we concat them, which +# reads as a misaligned bottom-left/last-bottom-right on the BEE side. +# Each row below is normalised to the same width with a trailing space +# where the font naturally has one. +_SCRAPING_LETTERS = [ + "███████╗ ██████╗██████╗ █████╗ ██████╗ ██╗███╗ ██╗ ██████╗ ", + "██╔════╝██╔════╝██╔══██╗██╔══██╗██╔══██╗██║████╗ ██║██╔════╝ ", + "███████╗██║ ██████╔╝███████║██████╔╝██║██╔██╗ ██║██║ ███╗", + "╚════██║██║ ██╔══██╗██╔══██║██╔═══╝ ██║██║╚██╗██║██║ ██║", + "███████║╚██████╗██║ ██║██║ ██║██║ ██║██║ ╚████║╚██████╔╝", + "╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚═════╝ ", +] +_BEE_LETTERS = [ + "██████╗ ███████╗███████╗", + "██╔══██╗██╔════╝██╔════╝", + "██████╔╝█████╗ █████╗ ", + "██╔══██╗██╔══╝ ██╔══╝ ", + "██████╔╝███████╗███████╗", + "╚═════╝ ╚══════╝╚══════╝", +] +# Combined "SCRAPING BEE" wordmark on a single row of letterforms — 6 +# lines tall, ~90 cols wide. Replaces the prior 4-row smblock SCRAPING +# + 6-row BEE stack (10 logo rows) with this single 6-row version. +_SCRAPINGBEE_LOGO = [" " + s + " " + b for s, b in zip(_SCRAPING_LETTERS, _BEE_LETTERS)] +# Column at which "BEE" begins inside each combined row, used by the +# pinned banner renderer to split the row into a yellow "SCRAPING" half +# and a white "BEE" half. +_BEE_OFFSET = 2 + len(_SCRAPING_LETTERS[0]) + 2 +# Legacy alias kept so any external callers still resolve. 
_BEE_LOGO = _BEE_LETTERS


def _render_banner(version: str) -> str:
    """Return the startup banner as one ANSI-formatted string.

    The banner is drawn by a rich ``Console`` that writes into an in-memory
    buffer, so the whole thing is assembled before any of it reaches the
    terminal and cannot interleave with other stdout writes (clear-screen,
    padding newlines).

    Args:
        version: Version string shown as ``v<version>`` under the logo.
    """
    from io import StringIO

    from rich.console import Console

    from .theme import SCRAPINGBEE_THEME

    sink = StringIO()
    console = Console(
        file=sink,
        theme=SCRAPINGBEE_THEME,
        highlight=False,
        force_terminal=True,
        width=200,  # don't wrap the wide ASCII logo
    )
    accent = f"bold {BEE_YELLOW}"

    console.print()
    # Split every logo row at the known BEE column so the two halves get
    # the brand colours (SCRAPING yellow, BEE white).
    for row in _SCRAPINGBEE_LOGO:
        scraping_half, bee_half = row[:_BEE_OFFSET], row[_BEE_OFFSET:]
        console.print(f"[{accent}]{scraping_half}[/][bold white]{bee_half}[/]")
    console.print()
    console.print(f" [{accent}]v{version}[/]")  # version
    console.print(f" [{BEE_DIM}]Web scraping from the terminal[/]")  # tagline
    console.print()

    hint = Text()
    for piece, style in (
        (" Type ", BEE_DIM),
        (":help", accent),
        (" for commands, ", BEE_DIM),
        (":q", accent),
        (" to quit", BEE_DIM),
    ):
        hint.append(piece, style=style)
    console.print(hint)
    console.print()
    return sink.getvalue()


def _print_help(commands: dict[str, str]) -> None:
    """Print the command list to ``err_console`` in two columns.

    Each row is a 4-column margin, a 20-column command name, a 2-column gap
    and then the description. Descriptions are pre-wrapped to the remaining
    terminal width (at least 20 columns) and continuation lines get a
    26-column hanging indent so they line up under the description column.

    Args:
        commands: Maps command name → one-line description; commands missing
            from the mapping are listed with an empty description.
    """
    import shutil
    import textwrap

    cmd_col, leading, gap = 20, 4, 2
    indent_width = leading + cmd_col + gap  # 26
    hanging_indent = " " * indent_width

    def _print_row(cmd: str, desc: str) -> None:
        try:
            term_w = shutil.get_terminal_size((80, 24)).columns
        except Exception:
            term_w = 80
        head, *tail = textwrap.wrap(desc, width=max(20, term_w - indent_width)) or [""]
        # Plain Text objects with ``soft_wrap=True`` rather than markup
        # strings: the markup path would strip the leading whitespace and
        # re-wrap our pre-wrapped lines at the console's own width.
        first = Text()
        first.append(" " * leading)
        first.append(cmd.ljust(cmd_col), style=f"bold {BEE_YELLOW}")
        first.append(" " * gap)
        first.append(head, style=BEE_DIM)
        err_console.print(first, soft_wrap=True)
        for extra in tail:
            cont = Text()
            cont.append(hanging_indent)
            cont.append(extra, style=BEE_DIM)
            err_console.print(cont, soft_wrap=True)

    groups = {
        "Pages": ["scrape", "crawl"],
        "Search": ["google", "fast-search"],
        "Marketplaces": ["amazon-product", "amazon-search", "walmart-product", "walmart-search"],
        "Media": ["youtube-search", "youtube-metadata"],
        "AI": ["chatgpt"],
        "Learn": ["tutorial"],
        "Account": ["auth", "logout"],
        "Tools": ["usage", "schedule", "export", "docs", "unsafe"],
    }
    repl_rows = [
        (":help, :?", "Show this command list"),
        (":clear", "Clear the screen"),
        (
            ":view",
            "Scroll the last command's output (auto-picks crawl.log after crawl; pass a path to view any file)",
        ),
        (":set K=V ...", "Set one or more session defaults"),
        (":unset K", "Remove a session default ('all' or '*' clears every)"),
        (":reset", "Clear every session default"),
        (":show, :list", "Show current session defaults"),
        ("!", "Run a shell command (requires unsafe mode)"),
        (":q, :quit", "Quit the REPL"),
    ]
    shortcut_rows = [
        ("Tab", "Complete (inline if 1 match, popup if many, ghost word otherwise)"),
        ("Shift+Tab", "Cycle popup back / toggle Scroll ↔ Select mode"),
        ("Esc", "Close the completion popup"),
        ("→", "Accept the next word of the ghost suggestion"),
        ("End", "Accept the whole ghost suggestion"),
        ("↑ / ↓", "Walk history (single-line) / move cursor (multi-line)"),
        ("PgUp / PgDn", "Scroll the scrollback buffer up / down"),
        ("Ctrl+Home/End", "Jump to top / bottom of scrollback"),
        ("Ctrl+J", "Insert a newline (multi-line compose; also Alt/Option+Enter)"),
        ("Ctrl+W", "Delete the word before the cursor (also Alt/Option+⌫)"),
        ("Click", "Open a highlighted path in Finder / default app"),
        ("Ctrl+C", "Stop running command / cancel queue / clear multi-line / exit when idle"),
        ("Ctrl+D", "Exit the REPL (when no command is running)"),
    ]

    # Every section is "heading + rows"; the command groups resolve their
    # descriptions from ``commands``, the last two carry their own.
    sections = [
        (title, [(name, commands.get(name, "")) for name in names])
        for title, names in groups.items()
    ]
    sections.append(("REPL", repl_rows))
    sections.append(("Shortcuts", shortcut_rows))

    err_console.print()
    for position, (title, rows) in enumerate(sections):
        if position:
            err_console.print()  # blank row between categories for breathing room
        err_console.print(f" [{BEE_DIM}]{title}[/]")
        for name, description in rows:
            _print_row(name, description)
    err_console.print()


def _print_command_header(args: list[str]) -> None:
    """Print a dim horizontal rule with the space-joined ``args`` as its label.

    The rule is filled out towards the terminal width, with a minimum of
    three fill characters after the label.
    """
    import shutil

    columns = shutil.get_terminal_size((80, 24)).columns
    label = f" {' '.join(args)} "
    rule = Text()
    rule.append("─── ", style=BEE_DIM)
    rule.append(label, style=f"bold {BEE_YELLOW}")
    rule.append("─" * max(3, columns - len(label) - 6), style=BEE_DIM)
    err_console.print(rule)


def _print_command_footer(status: str, duration: float) -> None:
    """Print the status glyph and elapsed time, followed by a blank line.

    ``status`` of ``"ok"``, ``"fail"`` or ``"stopped"`` selects ✓, ✗ or ■;
    any other value prints the duration with no glyph.
    """
    markers = {
        "ok": ("✓", _GREEN),
        "fail": ("✗", BEE_RED),
        "stopped": ("■", BEE_YELLOW),
    }
    footer = Text()
    footer.append(" ")
    marker = markers.get(status)
    if marker is not None:
        glyph, colour = marker
        footer.append(glyph, style=f"bold {colour}")
    footer.append(f" {duration:.2f}s", style=BEE_DIM)
    err_console.print(footer)
    err_console.print()


# ---------------------------------------------------------------------------
# Slash-command dispatcher
# ---------------------------------------------------------------------------


def _open_pager(path: str) -> None:
    """Cross-platform scrollable pager built on prompt_toolkit.

    Replaces external tools (`less` on Unix, `more` on Windows) with an
    in-process viewer so the CLI works identically everywhere with no extra
    install. Arrow keys / page up-down / home / end / mouse wheel scroll;
    `q` or `Esc` exits back to the REPL. Long lines wrap to the terminal
    width so you can see all of a wide JSON or HTML response without
    horizontal scrolling. Press `p` to toggle pretty-printed JSON.
    """
    import json
    from pathlib import Path

    from prompt_toolkit.application import Application
    from prompt_toolkit.buffer import Buffer
    from prompt_toolkit.document import Document
    from prompt_toolkit.filters import Condition
    from prompt_toolkit.key_binding import KeyBindings
    from prompt_toolkit.layout import Layout
    from prompt_toolkit.layout.containers import HSplit, Window
    from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl
    from prompt_toolkit.layout.dimension import D
    from prompt_toolkit.styles import Style

    raw_text = Path(path).read_text(encoding="utf-8", errors="replace")

    # If the cached output is valid JSON or recognisable HTML, prepare
    # a pretty-printed version up-front. We default to pretty mode so
    # the user sees the human-readable form first; ``r`` toggles raw
    # if they need to grep the original bytes. When the content
    # matches neither, pretty is unavailable and we stick with raw.
+ pretty_text: str | None + try: + pretty_text = json.dumps(json.loads(raw_text), indent=2, ensure_ascii=False) + except Exception: + pretty_text = None + if pretty_text is None: + # Cheap heuristic: looks like HTML if a leading non-whitespace + # chunk starts with ``<``. lxml accepts both well-formed XML + # and tag-soup HTML, so this stays fast and lenient. + stripped = raw_text.lstrip() + if stripped.startswith("<"): + try: + # lxml's compiled submodules aren't visible to static + # type checkers; import via ``importlib`` so the + # checker doesn't try to resolve them. + import importlib + + _etree = importlib.import_module("lxml.etree") + _lxml_html = importlib.import_module("lxml.html") + tree = _lxml_html.fromstring(raw_text) + pretty_text = _etree.tostring( + tree, pretty_print=True, encoding="unicode", method="html" + ) + except Exception: + pretty_text = None + + mode = ["pretty" if pretty_text is not None else "raw"] + + buffer = Buffer(read_only=Condition(lambda: True)) + + def _set_text(s: str) -> None: + buffer.set_document(Document(text=s, cursor_position=0), bypass_readonly=True) + + _set_text(pretty_text if (mode[0] == "pretty" and pretty_text is not None) else raw_text) + + def _current_line_count() -> int: + return buffer.document.line_count + + text_window = Window( + content=BufferControl(buffer=buffer), + # Wrap long lines so a multi-KB JSON / HTML response is fully + # visible without horizontal scrolling. The previous default + # (wrap_lines=False) clipped at column-N and the rest was just + # gone unless the user used Left/Right scrolling. + wrap_lines=True, + ) + + def _status_line(): + cursor_line = buffer.document.cursor_position_row + 1 + total = _current_line_count() + pct = int(cursor_line / max(1, total) * 100) + mode_label = "pretty" if mode[0] == "pretty" else "raw" + # `r` toggles raw on/off. Hidden when there's no pretty version + # available (non-JSON content) — there'd be nothing to toggle to. 
+ toggle_hint = ( + ("r: pretty" if mode[0] == "raw" else "r: raw") if pretty_text is not None else "" + ) + right_hint = ( + "↑↓ PgUp/PgDn scroll" + + (f" · {toggle_hint}" if toggle_hint else "") + + " · q to exit" + ) + return [ + ("class:pager.bar", " "), + ("class:pager.value", f"{cursor_line}/{total}"), + ("class:pager.bar", f" ({pct}%) · {mode_label} · "), + ("class:pager.label", path), + ("class:pager.bar", " "), + ("class:pager.hint", right_hint), + ] + + status_window = Window( + content=FormattedTextControl(_status_line), + height=D.exact(1), + ) + + layout = Layout(HSplit([text_window, status_window])) + + kb = KeyBindings() + + @kb.add("q") + @kb.add("c-c") + def _exit(event): + event.app.exit() + + # Esc gets its own binding with ``eager=True`` so it fires immediately + # instead of waiting through prompt_toolkit's internal key-processor + # ``timeoutlen`` (the buffered-input default + any partial-match + # search across implicit bindings). Without eager the user perceives + # a multi-second pause between pressing Esc and the pager exiting. + @kb.add("escape", eager=True) + def _exit_esc(event): + event.app.exit() + + @kb.add("r") + def _toggle_raw(_e): + # No-op if the content isn't JSON — pretty isn't available, so + # we're already showing raw and there's nothing to toggle to. 
+ if pretty_text is None: + return + if mode[0] == "pretty": + mode[0] = "raw" + _set_text(raw_text) + else: + mode[0] = "pretty" + _set_text(pretty_text) + + @kb.add("up") + def _up(_e): + buffer.cursor_up() + + @kb.add("down") + def _down(_e): + buffer.cursor_down() + + @kb.add("pageup") + def _pgup(_e): + buffer.cursor_up(count=20) + + @kb.add("pagedown") + def _pgdn(_e): + buffer.cursor_down(count=20) + + @kb.add("home") + def _home(event): + buffer.cursor_position = 0 + + @kb.add("end") + def _end(event): + buffer.cursor_position = len(buffer.text) + + @kb.add("left") + def _left(_e): + buffer.cursor_left() + + @kb.add("right") + def _right(_e): + buffer.cursor_right() + + style = Style.from_dict( + { + "pager.bar": f"bg:{_BG_CHIP} {BEE_DIM}", + "pager.value": f"bg:{_BG_CHIP} {BEE_YELLOW} bold", + "pager.label": f"bg:{_BG_CHIP} {BEE_DIM}", + "pager.hint": f"bg:{_BG_CHIP} {_DIM2}", + } + ) + + pager_app = Application( + layout=layout, + key_bindings=kb, + style=style, + full_screen=True, + mouse_support=True, + ) + # Shrink BOTH escape-related timeouts. ``ttimeoutlen`` is the parser- + # level wait for "is this Esc-byte the start of an escape sequence", + # default 0.5s. ``timeoutlen`` is the key-processor wait for "is this + # complete key the start of a multi-key binding", default 1.0s. + # Together with eager=True on the Esc-exit binding above, this makes + # Esc fire essentially instantly in the pager. 50ms is enough for + # any well-formed escape sequence from a modern terminal. + pager_app.ttimeoutlen = 0.05 + pager_app.timeoutlen = 0.05 + + # We're (almost certainly) called from inside the REPL's prompt_toolkit + # event loop — a sync key-binding handler invoked `:view`. Calling + # ``pager_app.run()`` here would hit ``asyncio.run()`` from inside a + # running loop and raise. Detect that and farm the pager out to a + # worker thread which has no loop of its own, so ``app.run()`` can + # safely create a fresh one. 
def _normalize_setting_key(key: str) -> str:
    """Return ``key`` without surrounding whitespace or leading dashes.

    Settings keys are stored without the ``--`` prefix. Hyphen vs
    underscore is left to the user — we don't normalise either way
    because click options exist in both forms across the codebase. The
    validation check (against the click flag list) settles which is correct.
    """
    return key.strip().lstrip("-")


def _parse_set_args(rest: str) -> list[tuple[str, str]] | str:
    """Parse the argument string for `:set`.

    Accepted forms (mix and match in one line):
        :set country-code=fr
        :set --country-code fr
        :set country-code=fr premium-proxy=true device=mobile
        :set --country-code fr --premium-proxy true

    Both forms are validated the same way: the key must be non-empty
    once its dashes are stripped, and the value must be non-empty once
    its surrounding whitespace is stripped. In the ``--key value`` form a
    following token that starts with ``--`` is treated as the next option
    rather than as the value, so ``:set --premium-proxy --country-code fr``
    reports the missing value instead of storing ``--country-code``.
    A value that really starts with ``--`` can be given as ``key=--value``.

    Args:
        rest: Everything the user typed after ``:set``.

    Returns:
        A list of ``(key, value)`` pairs in input order, or an error
        string explaining what is wrong with the input.
    """
    try:
        tokens = shlex.split(rest)
    except ValueError as e:
        # Raised by shlex for malformed input such as an unclosed quote.
        return f"parse error: {e}"

    pairs: list[tuple[str, str]] = []
    remaining = iter(tokens)
    for tok in remaining:
        if "=" in tok and not tok.startswith("="):
            raw_key, _, raw_value = tok.partition("=")
            key = _normalize_setting_key(raw_key)
            value = raw_value.strip()
            if not key or not value:
                return f"empty key or value in '{tok}'"
        elif tok.startswith("--"):
            key = _normalize_setting_key(tok)
            if not key:
                # A bare ``--`` (or only dashes) names no option at all.
                return f"empty key or value in '{tok}'"
            # Consume the value from the same iterator so the outer loop
            # resumes at the token after it.
            raw_value = next(remaining, None)
            if raw_value is None or raw_value.startswith("--"):
                return f"missing value for --{key}"
            value = raw_value.strip()
            if not value:
                return f"empty key or value in '{tok}'"
        else:
            return (
                f"unexpected '{tok}'. Use key=value or --key value "
                f"(e.g. :set country-code=fr or :set --country-code fr)"
            )
        pairs.append((key, value))
    return pairs
def _handle_meta(
    line: str,
    state: SessionState,
    command_help: dict[str, str],
    all_known_flags: set[str],
    bool_flags: set[str],
    choice_flags: dict[str, list[str]],
    scrollback: ScrollbackBuffer | None = None,
) -> str | None:
    """Dispatch a REPL meta command (``:set``, ``:unset``, ``:view``, ...).

    Args:
        line: The raw line typed by the user.
        state: Session state; ``state.settings`` is read and mutated by
            ``:set`` / ``:unset`` / ``:reset`` / ``:show``, and
            ``state.last_command`` selects the default ``:view`` target.
        command_help: Passed through to ``_print_help``.
        all_known_flags: Every option name accepted by any command; used
            to validate ``:set`` keys.
        bool_flags: Options (with ``--`` prefix) that take a bool value.
        choice_flags: Options (with ``--`` prefix) mapped to their
            allowed values.
        scrollback: Virtual scrollback buffer to empty on ``:clear``.
            When ``None`` the terminal is cleared with an ANSI sequence.

    Returns:
        ``"quit"`` when the REPL should exit, ``"ok"`` when the line was
        handled here, and ``None`` when it is not a meta command. Blank
        input is not a meta command and also returns ``None``.
    """
    parts = line.strip().split(None, 1)
    if not parts:
        # Blank / whitespace-only input: nothing to dispatch. Indexing
        # ``parts[0]`` here would raise IndexError.
        return None
    head_low = parts[0].lower()
    rest = parts[1] if len(parts) > 1 else ""

    def _esc(value: object) -> str:
        # User-supplied text goes into rich markup strings below; without
        # escaping, input such as ``[/]`` makes rich raise MarkupError.
        # Imported lazily so the non-printing paths don't need rich.
        from rich.markup import escape

        return escape(str(value))

    def _clear_all_settings() -> None:
        # Shared by ``:reset`` / ``:unset-all`` and ``:unset *`` / ``:unset all``.
        n = len(state.settings)
        state.settings.clear()
        err_console.print(f" [{BEE_DIM}]cleared {n} setting(s)[/]")

    if head_low in {":q", ":quit", "exit", "quit", "q"}:
        return "quit"
    if head_low in {":help", ":?", "help", "?"}:
        _print_help(command_help)
        return "ok"
    if head_low in {":clear", "clear"}:
        if scrollback is not None:
            # full_screen mode — clear our virtual buffer
            with scrollback._lock:
                scrollback.lines.clear()
                scrollback.scroll_offset = 0
        else:
            # Legacy fallback (shouldn't trigger in current REPL)
            sys.stderr.write("\033[2J\033[H")
            sys.stderr.flush()
        return "ok"
    if head_low in (":show", ":list"):
        if not state.settings:
            err_console.print(f" [{BEE_DIM}]No session defaults set.[/]")
        else:
            err_console.print()
            for k, v in state.settings.items():
                err_console.print(f" [bold {BEE_YELLOW}]{k:<20}[/] [dim]{_esc(v)}[/]")
            err_console.print()
        return "ok"
    if head_low == ":view":
        from pathlib import Path

        cache_dir = Path.home() / ".cache" / "scrapingbee-cli"
        crawl_log = cache_dir / "crawl.log"
        last_output = cache_dir / "last-output"
        target_arg = rest.strip()
        # `:view`        → whatever the most-recent command produced.
        #                  ``crawl`` writes a Scrapy log to crawl.log; every
        #                  other API command (scrape, google, batch items,
        #                  …) writes its response body to last-output. So
        #                  the routing key is ``state.last_command``.
        # `:view crawl`  → backwards-compat shortcut for crawl.log; still
        #                  useful when the user just wants to peek at the
        #                  log without having re-run crawl most recently.
        # `:view <path>` → arbitrary file (must exist).
        if not target_arg:
            if state.last_command == "crawl" and crawl_log.exists():
                target_path = crawl_log
                missing_msg = "no crawl log yet — run `crawl ...` first"
            else:
                target_path = last_output
                missing_msg = "no recent output to view"
        elif target_arg.lower() == "crawl":
            target_path = crawl_log
            missing_msg = "no crawl log yet — run `crawl ...` first"
        else:
            target_path = Path(target_arg).expanduser()
            missing_msg = f"file not found: {_esc(target_arg)}"
        if not target_path.exists():
            err_console.print(f" [{BEE_DIM}]{missing_msg}[/]")
            return "ok"
        try:
            _open_pager(str(target_path))
        except FileNotFoundError:
            # File got deleted between exists() and read() — race with cleanup
            err_console.print(f" [{BEE_DIM}]file no longer available[/]")
        except Exception as e:
            # Best-effort viewer: report the failure and point at the file
            # instead of taking the REPL down.
            err_console.print(f" [bold {BEE_RED}]pager error:[/] {_esc(e)}")
            err_console.print(
                f" [{BEE_DIM}]full output saved at[/] [bold {BEE_YELLOW}]{_esc(target_path)}[/]"
            )
        return "ok"

    if head_low in {":reset", ":unset-all"}:
        _clear_all_settings()
        return "ok"
    if head_low == ":unset":
        target = rest.strip()
        if not target:
            err_console.print(f" [bold {BEE_RED}]usage:[/] :unset KEY | :unset * | :reset")
            return "ok"
        if target in {"*", "all"}:
            _clear_all_settings()
            return "ok"
        # Allow space- or comma-separated multiple keys.
        keys = [_normalize_setting_key(k) for k in re.split(r"[,\s]+", target) if k]
        for key in keys:
            if key in state.settings:
                del state.settings[key]
                err_console.print(f" [{BEE_DIM}]unset[/] [bold {BEE_YELLOW}]{_esc(key)}[/]")
            else:
                err_console.print(f" [{BEE_DIM}]not set:[/] {_esc(key)}")
        return "ok"
    if head_low == ":set":
        if not rest.strip():
            err_console.print(f" [bold {BEE_RED}]usage:[/] :set KEY=VALUE [KEY=VALUE ...]")
            err_console.print(f" [{BEE_DIM}] or:[/] :set --KEY VALUE [--KEY VALUE ...]")
            return "ok"
        parsed = _parse_set_args(rest)
        if isinstance(parsed, str):
            # A string result is the parser's error message.
            err_console.print(f" [bold {BEE_RED}]:set[/] {_esc(parsed)}")
            return "ok"

        valid_keys = {f.lstrip("-") for f in all_known_flags}
        bool_literals = {"true", "false", "yes", "no", "1", "0", "on", "off"}
        applied: list[tuple[str, str]] = []
        for key, value in parsed:
            if key not in valid_keys:
                err_console.print(
                    f" [bold {BEE_RED}]unknown option:[/] [bold {BEE_YELLOW}]--{_esc(key)}[/]"
                )
                suggestion = _suggest(key, valid_keys, threshold=2)
                if suggestion:
                    err_console.print(
                        f" [{BEE_DIM}] did you mean[/] "
                        f"[bold {BEE_YELLOW}]--{suggestion}[/][{BEE_DIM}]?[/]"
                    )
                continue
            flag = f"--{key}"
            # Validate choices
            if flag in choice_flags and value not in choice_flags[flag]:
                err_console.print(
                    f" [bold {BEE_RED}]invalid value for[/] "
                    f"[bold {BEE_YELLOW}]--{key}[/][bold {BEE_RED}]:[/] {_esc(value)}"
                )
                err_console.print(f" [{BEE_DIM}] choices:[/] " + ", ".join(choice_flags[flag]))
                continue
            # Validate bool values
            if flag in bool_flags and value.lower() not in bool_literals:
                err_console.print(
                    f" [bold {BEE_RED}]--{key} expects a bool, got:[/] {_esc(value)}"
                )
                continue
            state.settings[key] = value
            applied.append((key, value))

        # Confirmations are printed after all validation messages.
        for key, value in applied:
            err_console.print(
                f" [{BEE_DIM}]set[/] [bold {BEE_YELLOW}]{key}[/] = [dim]{_esc(value)}[/]"
            )
        return "ok"
    return None
def _make_completer(
    commands: list[str],
    command_flags: dict[str, list[str]],
    bool_flags: set[str],
    choice_flags: dict[str, list[str]],
    command_help: dict[str, str],
):
    """Build the prompt_toolkit completer for the REPL input buffer.

    Args:
        commands: Command names offered while the first word is typed.
        command_flags: Command name → option names offered after it.
        bool_flags: Options whose value completes to ``true`` / ``false``.
        choice_flags: Options mapped to the values offered for them.
        command_help: Command name → help text shown beside the candidate.

    Returns:
        A ``Completer`` instance. While the first word is typed it yields
        command and meta-command names; afterwards it yields values for
        the preceding bool / choice option, or option names.
    """
    from prompt_toolkit.completion import Completer, Completion

    meta_cmds = [
        ":help",
        ":?",
        ":clear",
        ":view",
        ":set",
        ":unset",
        ":reset",
        ":show",
        ":list",
        ":q",
        ":quit",
    ]
    # Meta commands that take option names as arguments. Their flag
    # completions come from the all-flags pool but must not be labelled
    # "(unknown command)".
    flag_taking_meta = {":set", ":unset"}

    # Precompute the union of every flag known to any command. Used as a
    # fallback completion pool when the user's typed command isn't
    # recognised (typo, in-progress rename, etc.) — without this the
    # completer would silently stop suggesting anything as soon as the
    # first word is unknown, which is confusing UX.
    _all_known_flags: list[str] = sorted({f for flags in command_flags.values() for f in flags})

    # First-word candidates never change after construction, so build and
    # sort them once instead of on every completion request.
    first_word_pool: list[tuple[str, str]] = sorted(
        [(c, command_help.get(c, "")) for c in commands]
        + [(m, "REPL meta") for m in meta_cmds]
    )

    class BeeCompleter(Completer):
        """Completer over commands, meta commands, options and option values."""

        def get_completions(self, document, complete_event):
            text = document.text_before_cursor.lstrip()
            words = text.split()
            on_first = (not text) or (len(words) == 1 and not text.endswith(" "))

            if on_first:
                partial = words[0].lower() if words else ""
                for cmd, meta in first_word_pool:
                    if cmd.startswith(partial):
                        yield Completion(cmd, start_position=-len(partial), display_meta=meta)
                return

            cmd_name = words[0]
            # If cmd_name is unknown, fall back to the union of all flags
            # so the user still gets *some* suggestions instead of silence.
            # Display "(unknown command)" so they know completions may
            # not actually apply to what they typed.
            cmd_known = cmd_name in command_flags
            flags_for_cmd = command_flags[cmd_name] if cmd_known else _all_known_flags
            ends_with_space = text.endswith(" ")
            last_word = words[-1] if words else ""
            # When the buffer ends with a space the user has *finished*
            # typing the previous arg and is starting a new one. The
            # "current partial" is empty; the "previous arg" (used for
            # bool/choice value suggestions) shifts to the last typed
            # word. Earlier this was off-by-one and would cause Tab to
            # replace the wrong span — e.g. ``--verbose `` + Tab would
            # corrupt to ``---verbose``.
            if ends_with_space:
                last = ""
                prev = last_word
            else:
                last = last_word
                prev = words[-2] if len(words) >= 2 else ""

            if ends_with_space and prev in bool_flags:
                yield Completion("true", display_meta="enable")
                yield Completion("false", display_meta="disable")
                return
            if ends_with_space and prev in choice_flags:
                for v in choice_flags[prev]:
                    yield Completion(v)
                return
            if (not ends_with_space) and len(words) >= 2 and not last.startswith("-"):
                # Partially typed value: filter by the typed prefix and
                # replace exactly that prefix.
                if prev in bool_flags:
                    for v in ("true", "false"):
                        if v.startswith(last.lower()):
                            yield Completion(v, start_position=-len(last))
                    return
                if prev in choice_flags:
                    for v in choice_flags[prev]:
                        if v.startswith(last.lower()):
                            yield Completion(v, start_position=-len(last))
                    return
            # Flag completions: either the user is typing a partial flag
            # (``--ver``), or they're at a trailing space ready for a
            # new flag (``last == ""`` here matches every flag). In both
            # cases start_position is ``-len(last)`` — which is 0 in
            # the trailing-space case, so flags get inserted at the
            # cursor without disturbing previous text.
            if last.startswith("-") or (ends_with_space and last == ""):
                recognised = cmd_known or cmd_name.lower() in flag_taking_meta
                meta_label = "" if recognised else "(unknown command)"
                for flag in flags_for_cmd:
                    if flag.startswith(last):
                        yield Completion(flag, start_position=-len(last), display_meta=meta_label)

    return BeeCompleter()
+ """ + import shutil + from pathlib import Path + + import click + from prompt_toolkit.application import Application + from prompt_toolkit.application.run_in_terminal import run_in_terminal + from prompt_toolkit.buffer import Buffer + from prompt_toolkit.filters import Condition, has_completions + from prompt_toolkit.key_binding import KeyBindings + from prompt_toolkit.layout import Layout + from prompt_toolkit.layout.containers import ConditionalContainer, HSplit, Window + from prompt_toolkit.layout.controls import BufferControl, FormattedTextControl + from prompt_toolkit.layout.dimension import D + from prompt_toolkit.styles import Style + + from .theme import set_repl_mode + + set_repl_mode(True) + + # ── Asyncio loop tracking for fast Ctrl+C ─────────────────────────────── + # Commands like ``scrape`` run ``asyncio.run(...)`` inside a worker + # thread to drive aiohttp. While the loop is in ``select()`` waiting + # on a socket, ``PyThreadState_SetAsyncExc`` doesn't deliver an + # interrupt — it only fires at the next Python bytecode boundary, and + # no bytecode runs until ``select()`` returns (typically when the + # ScrapingBee API responds, which can be 30+ seconds). + # + # We monkey-patch ``asyncio.run`` for the duration of this REPL + # session so we can keep a handle to the worker's loop. The Ctrl+C + # handler then uses ``call_soon_threadsafe`` to cancel in-flight + # tasks — that wakes the selector immediately and raises + # ``CancelledError`` on the await, which propagates out cleanly + # (the worker's except clause turns it into "stopped"). + import asyncio as _asyncio_mod + import threading as _threading_mod + + _active_worker_loop: list[Any] = [None] + _original_asyncio_run = _asyncio_mod.run + _main_thread = _threading_mod.main_thread() + + def _tracking_loop_factory(): + loop = _asyncio_mod.new_event_loop() + # CRITICAL: only track loops that belong to *worker* threads. 
The + # main thread's loop is prompt_toolkit's own — cancelling tasks + # on it kills the entire REPL. ``app.run()`` calls + # ``asyncio.run`` (which routes through us here), so without this + # guard the very first call at REPL startup registers the main + # loop as the "worker" loop and any subsequent Ctrl+C tears the + # REPL down with a CancelledError. + if _threading_mod.current_thread() is _main_thread: + return loop + _active_worker_loop[0] = loop + return loop + + def _tracking_asyncio_run(main, *, debug=None, loop_factory=None): + # Same guard on the cleanup side — only clear the worker-loop + # ref if THIS call was a worker-thread call. If we're on the main + # thread we never touched the ref in the first place. + # ``loop_factory`` was added to ``asyncio.run`` in Python 3.12; + # we pass it through ``**kwargs`` so the call works on both 3.11 + # (no kwarg) and 3.12+, and so the type checker doesn't reject + # the kwarg against the older stub. + try: + kwargs: dict = {"debug": debug} + factory = loop_factory or _tracking_loop_factory + kwargs["loop_factory"] = factory + return _original_asyncio_run(main, **kwargs) + finally: + if _threading_mod.current_thread() is not _main_thread: + _active_worker_loop[0] = None + + # Monkey-patch asyncio.run via setattr so the type checker doesn't + # complain about the wrapper's slightly broader signature (it also + # accepts ``loop_factory`` for forward-compat with Python 3.12+). 
+ setattr(_asyncio_mod, "run", _tracking_asyncio_run) + + # ── Click tree introspection ──────────────────────────────────────────── + command_help, command_flags, bool_flags, choice_flags = _walk_click_tree(cli_group) + command_names = sorted(command_flags.keys()) + all_known_flags: set[str] = set() + for flags_list in command_flags.values(): + all_known_flags.update(flags_list) + + state = SessionState() + state.refresh_credits_from_cache() + + from .config import get_api_key_if_set + + state.api_key_set = bool(get_api_key_if_set(None)) + + history_path = str(Path.home() / ".config" / "scrapingbee-cli" / ".history") + Path(history_path).parent.mkdir(parents=True, exist_ok=True) + try: + history = _make_capped_history(history_path, max_entries=10_000) + except Exception: + history = None + + completer = _make_completer( + command_names, command_flags, bool_flags, choice_flags, command_help + ) + + # Set the terminal background to pure black AND the default foreground to + # light grey for the REPL session. We need both — otherwise, any text the + # terminal renders with its theme-default foreground (e.g. a number or an + # unstyled token in the lexer) keeps the user's theme's fg colour, which + # may be near-black on a light theme → invisible on our forced-black bg. + # OSC 11 sets bg, OSC 10 sets fg. BEL terminator (`\x07`) is the most + # compatible across Mac Terminal, Warp, iTerm2, kitty, alacritty, + # gnome-terminal, Windows Terminal. Opt out with `scrapingbee --keep-bg`. + _set_black_bg = not keep_bg + if _set_black_bg: + sys.stdout.write("\033]11;#000000\007") + sys.stdout.write("\033]10;#EAEAEA\007") + sys.stdout.flush() + + # ── Request a usable terminal size (best-effort) ──────────────────────── + # The banner is 90 cols wide; with margins + input + toolbar the REPL + # really wants ~100 cols × ~30 rows. XTERM Window Manipulation + # sequence "CSI 8 ; H ; W t" asks the terminal to resize itself to + # the given rows/cols. 
Honoured by xterm (with allowWindowOps), + # iTerm2, kitty, alacritty, WezTerm, Windows Terminal, GNOME + # Terminal. macOS Terminal.app and SSH/tmux sessions ignore it — + # we silently accept whatever size we end up with. Only fires when + # the current size is below the target so a user who's already on a + # large window isn't disrupted. + try: + _cur_cols, _cur_rows = shutil.get_terminal_size((80, 24)) + _min_cols, _min_rows = 150, 50 + if _cur_cols < _min_cols or _cur_rows < _min_rows: + _new_cols = max(_cur_cols, _min_cols) + _new_rows = max(_cur_rows, _min_rows) + sys.stdout.write(f"\033[8;{_new_rows};{_new_cols}t") + sys.stdout.flush() + except Exception: + pass + + # Create the virtual scrollback buffer and seed it with the banner. + # In full_screen mode we own the alt buffer entirely. The banner is + # rendered as a FIXED Window at the top of the layout (not pushed into + # scrollback), so it stays anchored while command output flows in the + # scrollback area below it. Trade-off: banner consumes its natural + # height of terminal rows every frame, but the user keeps the brand + # surface visible (their explicit ask: "when scraping banner should + # not disappear"). + scrollback = ScrollbackBuffer() + + # ── Multi-line in-place progress renderer ─────────────────────────────── + # Wired so batch operations (``scrape --input-file ...``) can update a + # 3-row honeycomb progress widget in place rather than appending a new + # row per completion. The renderer keeps track of how many lines the + # previous frame consumed so the next frame overwrites the same band. + # Install a no-op progress renderer in the REPL. ``emit_progress_lines`` + # would otherwise fall back to writing the honeycomb directly to + # stderr — which lands in scrollback via patch_stdout and causes + # duplicate rows. The fixed ``crawl_status_window`` widget renders + # the live honeycomb directly from ``_progress_state``, so the + # scrollback path is no longer needed in REPL mode. 
+ from .theme import set_progress_renderer as _set_progress_renderer + + _set_progress_renderer(lambda _lines: None) + + # ── First-run API key state ───────────────────────────────────────────── + # When no API key is configured we open the REPL UI in a "first-run" + # mode: the bottom prompt changes from ``❯`` to ``API key: ``, the + # input field is masked via PasswordProcessor, and ``_submit`` routes + # to ``_handle_first_run_key`` (which validates against /usage and + # writes to ~/.config/scrapingbee-cli/.env). Once a key validates we + # flip the flag and the prompt transitions to normal command mode in + # place — no app restart, no screen flicker. + _first_run_needs_key = [not state.api_key_set] + if _first_run_needs_key[0]: + # Render the welcome lines into the scrollback area so the user + # sees them right below the banner while the input field shows + # ``API key:``. We use a throwaway rich Console to produce ANSI, + # then append to the scrollback buffer (the live ``err_console`` + # path doesn't work yet — patch_stdout isn't installed until + # ``app.run()`` starts). + try: + from io import StringIO + + from rich.console import Console + + _buf = StringIO() + _c = Console( + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=shutil.get_terminal_size((80, 24)).columns, + ) + _c.print( + f" [{BEE_DIM}]Welcome! Enter your API key to get started — " + f"find it at [bold {BEE_YELLOW}]dashboard.scrapingbee.com/dashboard[/]" + f"[{BEE_DIM}].[/]" + ) + _c.print() + scrollback.append_ansi_text(_buf.getvalue()) + except Exception: + pass + + # ── Input buffer ──────────────────────────────────────────────────────── + # Locked while a worker thread is running a command so the user can't + # submit another command on top of the first one (their outputs would + # interleave through patched stdout). + is_input_locked = [False] + # Reference to the currently-running worker thread (or None). 
Used by the + # Ctrl+C handler to inject KeyboardInterrupt into the worker so the user + # can stop a long scrape without exiting the REPL. + current_worker: list[threading.Thread | None] = [None] + # Currently-running shell subprocess (when the user submits ``!cmd``). + # Ctrl+C uses this to terminate the child process directly — injecting + # KeyboardInterrupt into the worker thread alone doesn't fire while the + # thread is blocked reading the subprocess's stdout in a C-level read(). + current_subprocess: list[Any] = [None] + # Monotonic timestamp of the most recent Ctrl+C while a command was + # running. Lets the next Ctrl+C escalate from SIGTERM → SIGKILL if + # the user is impatient (subprocess didn't exit within 2 s). + _last_ctrl_c_time: list[float] = [0.0] + # Queue of pending commands. Populated when ``_submit`` receives a + # buffer with newlines (typically from a multi-line paste) — only + # the first line runs immediately, the rest wait their turn. + # ``_ticker`` drains the queue once the input lock clears. + _pending_commands: list[str] = [] + # ``_multiline_visible[0]`` toggles the input buffer between single- + # line and multi-line mode. Default False (single-line). Multi-line + # paste flips it True so the pasted commands stick in the buffer + # (otherwise prompt_toolkit's single-line buffer would strip the + # newlines on insert). The user can then edit each line and press + # Enter to submit the whole batch — ``_submit`` already splits + # multi-line text into the queue. Reset on submit / Ctrl+C. 
+ _multiline_visible: list[bool] = [False] + + input_buffer = Buffer( + history=history, + completer=completer, + complete_while_typing=False, + auto_suggest=BeeAutoSuggest( + command_names=command_names, + command_flags=command_flags, + bool_flags=bool_flags, + choice_flags=choice_flags, + history=history, + is_disabled=lambda: _first_run_needs_key[0], + ), + multiline=Condition(lambda: _multiline_visible[0]), + read_only=Condition(lambda: is_input_locked[0]), + ) + + def _line_prefix(line_no, wrap_count): + # ``❯`` marks the START of a logical command line — both the + # first line and any subsequent line introduced by an explicit + # newline (multi-line paste or Alt+Enter). Visual wraps of a + # single long command get the continuation indent instead, so + # one long command stays visually one command. + if wrap_count > 0: + return [("", " ")] + if line_no == 0 and _first_run_needs_key[0]: + return [("class:promptmark", "API key: ")] + return [("class:promptmark", "❯ ")] + + # While a command is in flight we collapse the input window's height to + # 0 — instead of hiding it via ConditionalContainer. Hiding via Conditional + # makes the focused window invisible, but prompt_toolkit still places the + # terminal cursor *somewhere*, and Mac Terminal renders that cursor as a + # visible `[` block on the first visible row. With the input still in the + # layout but 0-rows tall, the cursor is "on" the input but in an invisible + # row → no stray indicator anywhere. + def _input_height(): + if state.is_running: + return D.exact(0) + return D(min=1, max=8) + + # ``AppendAutoSuggestion`` is the input processor that renders ghost-text + # auto-suggestions after the cursor. Without it, ``buffer.suggestion`` + # is set correctly but never drawn — BufferControl alone only handles + # the typed text + lexer styling. ``HighlightMatchingBracketProcessor`` + # isn't applied so we don't add it. 
+ # + # ``PasswordProcessor`` masks the input when ``_first_run_needs_key`` is + # True so an API key isn't visible on-screen. Wrapped in a + # ``ConditionalProcessor`` so masking flips off automatically once the + # key validates and we transition to normal command mode. + from prompt_toolkit.layout.processors import ( + AppendAutoSuggestion, + ConditionalProcessor, + PasswordProcessor, + ) + + input_window = Window( + content=BufferControl( + buffer=input_buffer, + lexer=_make_lexer(), + input_processors=[ + ConditionalProcessor( + PasswordProcessor(), + Condition(lambda: _first_run_needs_key[0]), + ), + AppendAutoSuggestion(), + ], + ), + get_line_prefix=_line_prefix, + wrap_lines=True, + height=_input_height, + dont_extend_height=True, + always_hide_cursor=Condition(lambda: state.is_running), + ) + + toolbar_window = Window( + content=FormattedTextControl(_make_toolbar(state)), + height=D.exact(1), + wrap_lines=False, # pin explicitly so toolbar can never grow to 2 rows + ) + + # Live "running command" line that appears above the input only while a + # command is in flight. Renders the typed line with a sweeping white-glim + # shimmer so the user has clear visual feedback that something is happening. + def _running_text() -> list[tuple[str, str]]: + if not state.is_running or not state.running_command_text: + return [] + text = f"❯ {state.running_command_text}" + pos = state.tick % max(1, len(text)) + return _shimmer_pt(text, pos, BEE_YELLOW) + + running_window = ConditionalContainer( + content=Window( + content=FormattedTextControl(_running_text), + height=D.exact(1), + ), + filter=Condition(lambda: state.is_running), + ) + + # ── Bee-blurb row (only while a command is running) ───────────────────── + # A single dim italic line just above the input that alternates every + # ~5 seconds between a bee fact ("Did you know? Bees have 5 eyes.") + # and a bee verb ("pollinating…"). 
Adds personality during long + # scrapes / crawls without competing with the shimmering command + # line right above it. Hidden when idle so the prompt is the only + # thing below the scrollback. + def _bee_fact_text() -> list[tuple[str, str]]: + if not state.is_running: + return [] + from .theme import current_bee_blurb + + blurb = current_bee_blurb(state.tick) + return [(f"italic {BEE_DIM}", f" {blurb}")] + + # Shared FormattedTextControl that forwards wheel-scroll events to + # the scrollback buffer. Used for every fixed-area Window (banner, + # crawl status, bee facts) so the user can scroll regardless of + # where their mouse pointer is — without this, mouse events that + # land on the fixed widgets get dropped because those windows + # don't have their own scroll handler. + from prompt_toolkit.layout.controls import FormattedTextControl + from prompt_toolkit.mouse_events import ( + MouseEventType, + ) + + # ── Path detection for Ctrl/Alt+Click open ─────────────────────────────── + # Matches just the *start* of a path candidate — absolute (``/``), + # home-relative (``~/``), or directory-relative (``./``, ``../``). + # The lookbehind excludes word chars and ``:`` so URLs like + # ``http://...`` don't match their ``//path`` suffix as a path + # start. From each matched start, ``_find_path_at`` greedily + # extends to end of line and trims back at whitespace / slash + # boundaries until it finds the longest substring that exists on + # disk. This is what lets real-world paths with spaces work — + # ``/Applications/Some App.app``, ``~/Library/Application Support/...``, + # ``/var/folders/.../Screenshot 2026-05-18 at 11.44.12 PM.png``. + _path_start_re = re.compile(r"(? 
'\"\t" + + def _resolve_path_str(raw: str) -> str: + if raw.startswith("~/"): + return os.path.expanduser(raw) + if raw.startswith(("./", "../")): + return os.path.abspath(raw) + return raw + + def _resolve_clicked_path(raw: str) -> str | None: + """Backwards-compat single-string resolver: return the resolved + absolute path if it exists, else ``None``. + """ + resolved = _resolve_path_str(raw) + return resolved if os.path.exists(resolved) else None + + def _open_path(path: str) -> None: + """Open ``path`` with the OS default handler (Finder, Explorer, + ``xdg-open``). Non-blocking; failures are silently swallowed so + a broken handler doesn't crash the REPL. + """ + import platform + import subprocess + + system = platform.system() + try: + if system == "Darwin": + subprocess.Popen(["open", path]) + elif system == "Windows": + getattr(os, "startfile")(path) # noqa: B009 + else: + subprocess.Popen(["xdg-open", path]) + except Exception: + pass + + # The scrollback renderer caches the visible visual rows here so the + # click handler can find what text was at the click position without + # re-running expensive layout calculations. + _last_scrollback_view: dict[str, list] = {"rows": []} + + class _ScrollForwardingFTC(FormattedTextControl): + """Wheel forwarder + optional modifier+click → path opener. + + ``click_handler`` is invoked on MOUSE_DOWN events that carry a + modifier (Ctrl, Alt, or Shift). Plain clicks are ignored so the + terminal's native drag-select stays functional. 
+ """ + + _click_handler = None + + def set_click_handler(self, handler) -> None: + self._click_handler = handler + + def mouse_handler(self, mouse_event): + et = mouse_event.event_type + if et == MouseEventType.SCROLL_UP: + scrollback.scroll_up(1) + try: + app.invalidate() + except Exception: + pass + return None + if et == MouseEventType.SCROLL_DOWN: + scrollback.scroll_down(1) + try: + app.invalidate() + except Exception: + pass + return None + if et == MouseEventType.MOUSE_DOWN and self._click_handler is not None: + # Plain click opens highlighted paths. The scrollback is + # read-only so a click has no other purpose there. The + # click handler returns NotImplemented when the click + # didn't land on a path, which falls through to default + # mouse handling — drag-to-select still works in Select + # mode (toggle with Shift+Tab) because that mode turns + # mouse capture off entirely. + try: + return self._click_handler(mouse_event) + except Exception: + pass + return NotImplemented + + # ── Path-existence cache for render-time linkification ─────────────────── + # Path detection runs on every invalidate (10 Hz ticker + every + # keystroke), so a naive ``os.path.exists`` per match would issue + # thousands of stat() syscalls per second. Cache the result for + # 30 s — long enough to be cheap, short enough that a file written + # during a crawl shows up as clickable within half a minute. 
+ _path_exists_cache: dict[str, tuple[float, bool]] = {} + _path_exists_ttl = 30.0 + + def _path_exists_cached(path: str) -> bool: + now = time.monotonic() + hit = _path_exists_cache.get(path) + if hit is not None and (now - hit[0]) < _path_exists_ttl: + return hit[1] + try: + exists = os.path.exists(path) + except Exception: + exists = False + _path_exists_cache[path] = (now, exists) + if len(_path_exists_cache) > 512: + cutoff = sorted(_path_exists_cache.items(), key=lambda kv: kv[1][0])[:128] + for k, _ in cutoff: + _path_exists_cache.pop(k, None) + return exists + + def _find_path_at(text: str, start: int) -> tuple[int, str | None]: + """Greedy-then-shrink: starting at ``start``, take everything up + to end-of-line / clear delimiter, then trim back at whitespace + and slash boundaries until the substring exists on disk. + Returns ``(end_index, raw_match)`` or ``(start, None)`` if no + prefix resolves to an existing path. + """ + end = start + while end < len(text) and text[end] not in "\n\r\"'<>|`": + end += 1 + while end > start: + candidate = text[start:end].rstrip(_path_trim_chars) + if len(candidate) < 2: + return (start, None) + resolved = _resolve_path_str(candidate) + if _path_exists_cached(resolved): + return (start + len(candidate), candidate) + # Shrink at the rightmost of whitespace or colon — both are + # common boundaries between a real path and trailing text: + # "/tmp/foo bar baz" → trim at last space + # "/tmp/foo.py:42:10" → trim at the colon (line/col suffix) + # Then fall back to the last slash if neither produced a hit. 
+ last_space = max(candidate.rfind(" "), candidate.rfind("\t")) + last_colon = candidate.rfind(":") + last_punct = max(last_space, last_colon) + if last_punct > 0: + end = start + last_punct + continue + last_slash = candidate.rfind("/") + if last_slash > 0: + end = start + last_slash + continue + return (start, None) + return (start, None) + + def _existing_paths_in(text: str): + """Yield ``(start, end, raw)`` for every existing path substring + in ``text``. Non-overlapping; resumes scanning past each match. + """ + i = 0 + while i < len(text): + m = _path_start_re.search(text, i) + if not m: + break + start = m.start() + end, raw = _find_path_at(text, start) + if raw is not None: + yield (start, end, raw) + i = end + else: + # No existing path here — advance past the ``/`` so we + # don't infinite-loop on the same candidate start. + i = m.end() + + def _scrollback_click_handler(mouse_event): + """Resolve a modifier-click on the scrollback to a path open. + Looks at the visual row at click.y and the existing path-like + substring spanning click.x — opens it if found. + """ + rows = _last_scrollback_view.get("rows") or [] + pos = mouse_event.position + y, x = pos.y, pos.x + if y < 0 or y >= len(rows): + return NotImplemented + text = "".join(t for _, t in rows[y]) + for start, end, raw in _existing_paths_in(text): + if start <= x < end: + _open_path(_resolve_path_str(raw)) + return None + return NotImplemented + + def _styled_with_links( + fragments: list[tuple[str, str]], + ) -> list[tuple[str, str]]: + """Re-emit each fragment with brand-yellow + underline applied + to any path-like substring that exists on disk. The detection + runs on the concatenated text of the row so paths split across + style boundaries (e.g. when ANSI styling colours just the + filename) still get caught. 
+ """ + if not fragments: + return fragments + text = "".join(t for _, t in fragments) + if "/" not in text and "~" not in text: + return fragments + # Build an offset map: position → (fragment_index, char_offset_in_fragment). + # Used to split fragments at path boundaries. + spans = list(_existing_paths_in(text)) + if not spans: + return fragments + # Walk fragments + spans together, splitting where needed. + out: list[tuple[str, str]] = [] + cursor = 0 # absolute offset in concatenated text + span_iter = iter(spans) + cur_span = next(span_iter, None) + for style, frag_text in fragments: + if not frag_text: + out.append((style, frag_text)) + continue + frag_end = cursor + len(frag_text) + i = 0 + while i < len(frag_text): + # Skip past consumed spans. + while cur_span is not None and cur_span[1] <= cursor + i: + cur_span = next(span_iter, None) + if cur_span is None or cur_span[0] >= frag_end: + out.append((style, frag_text[i:])) + i = len(frag_text) + break + span_start, span_end, _raw = cur_span + local_start = max(0, span_start - cursor) + local_end = min(len(frag_text), span_end - cursor) + if local_start > i: + out.append((style, frag_text[i:local_start])) + link_style = f"{style} underline fg:{BEE_YELLOW}".strip() + out.append((link_style, frag_text[local_start:local_end])) + i = local_end + cursor = frag_end + return out + + bee_fact_window = ConditionalContainer( + content=Window( + content=_ScrollForwardingFTC(_bee_fact_text), + height=D.exact(1), + ), + filter=Condition(lambda: state.is_running), + ) + + # ── Crawl status line (fixed Window, not scrollback) ──────────────────── + # Originally we rendered this via ``emit_progress_lines`` which + # APPENDS / REPLACES tail rows of the scrollback. That works for + # batch (writes between ticks are file writes), but crawl pumps + # Scrapy logs into stderr → scrollback constantly. 
Every Scrapy + # log line invalidated the "last N lines are mine" assumption, + # causing the widget to multiply into ghost copies interleaved + # with logs. A fixed layout Window sits at a known position and + # gets re-rendered each frame — no scrollback noise. + def _has_crawl_status_safe() -> bool: + try: + from .theme import has_crawl_status + + return has_crawl_status() + except Exception: + return False + + def _has_active_job_status() -> bool: + """True when the fixed task widget should be visible — either a + crawl is in flight (``_crawl_status``) or a batch is reporting + progress (``_progress_state``). Used as the ConditionalContainer + filter for ``crawl_status_window``.""" + if _has_crawl_status_safe(): + return True + try: + from .theme import has_progress_state + + return has_progress_state() + except Exception: + return False + + def _crawl_status_text() -> list[tuple[str, str]]: + """Build the fragments for the active-job status widget pinned + right below the (compact) banner. + + Layout: + - Honeycomb progress bar + counter, when ``_progress_state`` + is set (crawl-with-known-total or any batch). + - ``: (X fetched[, Y saved])`` line ONLY when a + crawl is in flight (``_crawl_status`` is set). Batch has no + per-item URL to show, so its widget is honeycomb-only. + """ + from . import theme as _theme # live module reference + from .theme import BEE_WHITE, format_honeycomb_grid, get_crawl_status + + cs = get_crawl_status() + ps = getattr(_theme, "_progress_state", None) + if cs is None and ps is None: + return [] + + frags: list[tuple[str, str]] = [] + + # Honeycomb row when progress total is known. 
+ if ps is not None: + try: + rows = format_honeycomb_grid( + completed=ps["completed"], + total=ps["total"], + rps=ps.get("rps"), + eta=ps.get("eta"), + failure_pct=ps.get("failure_pct"), + animate=True, + ) + for i, row_text in enumerate(rows): + if i > 0 or (cs is not None): + frags.append(("", "\n")) + if i == 0 and cs is None: + # First (and usually only) honeycomb row for + # batch-only mode — no preceding \n. + pass + frags.extend(_text_to_fragments(row_text)) + if cs is not None: + # Separator between honeycomb and URL row. + frags.append(("", "\n")) + except Exception: + pass + + # URL / fetched-count line — crawl only. + if cs is not None: + phase = cs.get("phase") or "fetching" + url = cs.get("current_url") + fetched = cs.get("fetched") or 0 + saved = cs.get("saved") or 0 + if url and len(url) > 80: + url = url[:48] + "…" + url[-25:] + frags.append(("", " ")) + frags.append((f"bold {BEE_YELLOW}", f"{phase}: ")) + if url: + frags.append((BEE_WHITE, url)) + else: + frags.append((f"{BEE_DIM}", "…")) + suffix = f" ({fetched} fetched" + if saved: + suffix += f", {saved} saved" + suffix += ")" + frags.append((f"{BEE_DIM}", suffix)) + return frags + + def _crawl_status_height() -> D: + """Compute widget height based on what's shown. 
+ Cases: + • crawl only (no progress) → 1 row (URL line) + • crawl + progress (known total) → 2 rows + • batch only (progress, no crawl URL) → 1 row (honeycomb only) + """ + cs_set = _has_crawl_status_safe() + try: + from .theme import has_progress_state + + ps_set = has_progress_state() + except Exception: + ps_set = False + if cs_set and ps_set: + return D.exact(2) + return D.exact(1) + + crawl_status_window = ConditionalContainer( + content=Window( + content=_ScrollForwardingFTC(_crawl_status_text), + height=_crawl_status_height, + ), + filter=Condition(_has_active_job_status), + ) + + # ── Scrollback Window — virtual buffer rendered as the top section ───── + # This Window fills the vertical space above the running line / input / + # toolbar. It renders whatever ScrollbackBuffer says is visible based + # on the current scroll offset. The user scrolls it with PgUp/PgDn etc. + def _scrollback_render() -> list[tuple[str, str]]: + height = 20 + width = 80 + try: + from prompt_toolkit.application import get_app as _get_app + + _app = _get_app() + if getattr(_app, "is_running", False): + size = _app.output.get_size() + # Reserve rows for the banner + everything below the + # scrollback in the layout: banner_height + spacer_top(1) + # + separator(1) + running_or_input(1) + spacer_bottom(1) + # + toolbar(1) = banner + 5. Banner is now dynamic + # (full ASCII when idle, single line during crawl / + # batch), so we ask ``_banner_height`` for the live + # value rather than using the static visual height. + banner_h = 1 if _active_job_in_progress() else _banner_visual_height + reserved = banner_h + 5 + if state.is_running: + # bee_fact_window row above the (collapsed) input. + reserved += 1 + if _has_active_job_status(): + # The active-job status widget is pinned right under + # the banner — 2 rows when both crawl URL and + # honeycomb are shown, otherwise 1 row (URL-only + # crawl, or honeycomb-only batch). 
+ cs_set = _has_crawl_status_safe() + try: + from .theme import has_progress_state + + ps_set = has_progress_state() + except Exception: + ps_set = False + reserved += 2 if (cs_set and ps_set) else 1 + height = max(1, size.rows - reserved) + width = max(1, size.columns) + except Exception: + pass + # Use visual-row pagination so scrolling moves exactly one terminal + # row per step, even through long single-line content that would + # otherwise wrap into many visual rows. We split at width-1 so a + # full-width row never accidentally pushes the cursor onto the + # next terminal row (which some terminals do at col == width). + visual_rows = scrollback.get_visible_visual(height, max(1, width - 1)) + # Re-style each row so path-like substrings that exist on disk + # are rendered in brand-yellow with an underline — a visible + # affordance for the Ctrl/Alt+Click open-in-Finder feature. + visual_rows = [_styled_with_links(row) for row in visual_rows] + # Cache so the modifier+click handler on the scrollback Window + # can look up what text was at the click position without + # recomputing wrap/scroll math. + _last_scrollback_view["rows"] = visual_rows + out: list[tuple[str, str]] = [] + for i, row in enumerate(visual_rows): + if i > 0: + out.append(("", "\n")) + out.extend(row) + return out + + # The scrollback window uses the same scroll-forwarding control as + # the rest of the fixed-area widgets so a wheel event anywhere on + # screen feeds the scrollback buffer. The click_handler hook + # additionally opens path-like substrings under Ctrl/Alt/Shift+Click. + _scrollback_ftc = _ScrollForwardingFTC(_scrollback_render) + _scrollback_ftc.set_click_handler(_scrollback_click_handler) + scrollback_window = Window( + content=_scrollback_ftc, + # We pre-wrap content ourselves (see _split_fragments_to_width) so + # each line passed to prompt_toolkit is already ≤ terminal width. 
+ # Disable prompt_toolkit's own line-wrapping so it doesn't try to + # second-guess us — we want exact control of which visual rows + # appear for accurate scroll-by-row behaviour. + wrap_lines=False, + always_hide_cursor=True, + ) + + # ── Pinned banner Window (smaller logo, original stacked structure) ─── + # Restores the original banner layout — ASCII logo, then version, + # tagline, blank, hint — but uses only the SCRAPING logo (6 rows) + # instead of stacking SCRAPING + BEE (which was 12 rows). Half the + # vertical footprint, same look. + _banner_visual_height = len(_SCRAPINGBEE_LOGO) + 5 # logo + 5 text rows + + def _banner_render() -> list[tuple[str, str]]: + # While a long-running command (crawl / batch scrape) is in + # flight, collapse the ASCII wordmark to a single-line + # ``ScrapingBee v1.5.0`` so the freed rows above scrollback can + # show the live task widget — URL, fetched count, honeycomb + # progress bar. The big banner returns once the run ends. + if _active_job_in_progress(): + line = Text() + line.append(" ScrapingBee ", style=f"bold {BEE_YELLOW}") + line.append(f"v{version}", style="bold white") + return _text_to_fragments(line) + out: list[tuple[str, str]] = [] + # SCRAPING half in brand yellow, BEE half in white — matches the + # wordmark in the official brand assets. 
+ for i, logo_line in enumerate(_SCRAPINGBEE_LOGO): + if i > 0: + out.append(("", "\n")) + left = logo_line[:_BEE_OFFSET] + right = logo_line[_BEE_OFFSET:] + out.append((f"bold {BEE_YELLOW}", left)) + out.append(("bold white", right)) + # Spacer row + out.append(("", "\n")) + # v1.5.0 + out.append(("", "\n")) + out.append((f"bold {BEE_YELLOW}", f" v{version}")) + # Tagline + out.append(("", "\n")) + out.append((f"{BEE_DIM}", " Web scraping from the terminal")) + out.append(("", "\n")) + # Hint + out.append((f"{BEE_DIM}", " Type ")) + out.append((f"bold {BEE_YELLOW}", ":help")) + out.append((f"{BEE_DIM}", " for commands, ")) + out.append((f"bold {BEE_YELLOW}", ":q")) + out.append((f"{BEE_DIM}", " to quit")) + return out + + def _active_job_in_progress() -> bool: + """True while a crawl or batch is running — used to collapse + the banner so the live task widget gets prominent placement.""" + if _has_crawl_status_safe(): + return True + try: + from .theme import has_progress_state + + return has_progress_state() + except Exception: + return False + + def _text_to_fragments(t: Text) -> list: + """Render a rich Text object to the (style, text) fragment list + prompt_toolkit's ``FormattedTextControl`` expects.""" + try: + from io import StringIO + + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( + to_formatted_text as _tft, + ) + from rich.console import Console + + buf = StringIO() + _c = Console( + file=buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, + ) + _c.print(t, end="") + return list(_tft(_ANSI(buf.getvalue()))) + except Exception: + return [("", t.plain)] + + def _banner_height() -> D: + # Compact one-liner while a crawl / batch is active; full ASCII + # banner otherwise. 
+ if _active_job_in_progress(): + return D.exact(1) + return D.exact(_banner_visual_height) + + banner_window = Window( + content=_ScrollForwardingFTC(_banner_render), + height=_banner_height, + wrap_lines=False, + always_hide_cursor=True, + ) + + # Breathing room around the prompt area (Claude-CLI-style). + # - blank row above the separator → visual gap from output + # - dim horizontal rule → clear boundary between "history" and "input" + # - blank row below the toolbar → keeps the toolbar from sitting right + # on the bottom edge of the terminal + def _hr_render() -> list[tuple[str, str]]: + try: + from prompt_toolkit.application import get_app as _get_app + + _app = _get_app() + if getattr(_app, "is_running", False): + cols = _app.output.get_size().columns + else: + cols = 80 + except Exception: + cols = 80 + return [("class:toolbar.hint", "─" * max(1, cols))] + + spacer_top = Window(height=D.exact(1), char=" ") + separator = Window( + content=FormattedTextControl(_hr_render), + height=D.exact(1), + always_hide_cursor=True, + ) + spacer_bottom = Window(height=D.exact(1), char=" ") + + # FloatContainer wraps the main layout so we can hover a completion + # popup near the cursor. Without the Float + CompletionsMenu prompt- + # toolkit's `start_completion()` enters completion *state* but nothing + # visible changes — the user thought Tab did nothing and pressed + # again, hitting `complete_next` which cycled invisibly. With the + # menu in place, the first Tab opens the popup; Up/Down navigate + # entries; Enter / Tab inserts; Esc dismisses. 
+ from prompt_toolkit.layout.containers import Float, FloatContainer + from prompt_toolkit.layout.menus import CompletionsMenu + + main_split = HSplit( + [ + banner_window, + crawl_status_window, + scrollback_window, + spacer_top, + separator, + running_window, + bee_fact_window, + input_window, + spacer_bottom, + toolbar_window, + ] + ) + layout = Layout( + FloatContainer( + content=main_split, + floats=[ + Float( + xcursor=True, + ycursor=True, + content=CompletionsMenu(max_height=10, scroll_offset=1), + ), + ], + ) + ) + + # ── Command echo ──────────────────────────────────────────────────────── + def _echo_to_scrollback(line: str) -> None: + """Echo the submitted command into scrollback (dim grey). + + Both chevron and line use the explicit ``#888888`` colour rather + than mixing in Rich's ``dim`` attribute on top — on our dark + background the compound was rendering nearly black, making the + echo invisible. A single mid-grey shade is subdued enough to feel + like "history" without disappearing. + + Only the *live* input prompt at the bottom uses the bright yellow + chevron, so the eye can find "where I'm typing now" without it + competing with past commands above. + """ + echo = Text() + echo.append("❯ ", style=BEE_DIM) + echo.append(line, style=BEE_DIM) + err_console.print(echo) + + # ── First-run API key validation ──────────────────────────────────────── + # Called from ``_submit`` on the main thread when ``_first_run_needs_key`` + # is True. The user just submitted the masked key — we validate it + # against the live /usage endpoint, persist on success, and flip the + # flag so subsequent submits route to ``_execute`` (normal commands). + def _handle_first_run_key(key_raw: str, raw_with_ws: str) -> None: + from .commands.auth import _validate_api_key + from .config import ENV_API_KEY, save_api_key_to_dotenv + + key = key_raw.strip() + # Pasted keys from password managers often pick up surrounding + # whitespace. 
Silently strip but warn so the user knows we did. + if key and key != raw_with_ws.rstrip("\n"): + err_console.print( + f" [{BEE_DIM}]Note: stripped surrounding whitespace from your key.[/]" + ) + if not key: + err_console.print( + f" [bold {BEE_RED}]Empty key.[/] [{BEE_DIM}]Please paste your API key.[/]" + ) + return + err_console.print(f" [{BEE_DIM}]Validating…[/]") + valid, err_msg = _validate_api_key(key) + if valid: + try: + save_api_key_to_dotenv(key) + except Exception as e: + err_console.print(f" [bold {BEE_RED}]Could not save:[/] [{BEE_DIM}]{e}[/]") + os.environ[ENV_API_KEY] = key + state.api_key_set = True + _first_run_needs_key[0] = False + err_console.print(f" [bold {BEE_YELLOW}]✓[/] API key saved.") + # Toolbar credits/concurrency are stale (None); trigger a fresh + # /usage fetch so the bottom strip populates without waiting + # for the 30s tick. + _signal_refresh_from_thread() + try: + app.invalidate() + except Exception: + pass + else: + err_console.print( + f" [bold {BEE_RED}]Invalid:[/] [{BEE_DIM}]{err_msg or 'unknown error'}. Try again.[/]" + ) + + # ── Shell command execution (`!cmd` in the REPL) ──────────────────────── + # Runs in a worker thread so the REPL stays responsive. stdout+stderr + # are merged and streamed line-by-line through the patched + # ``sys.stdout`` (which writes into scrollback). Ctrl+C terminates the + # child process via ``current_subprocess[0].terminate()`` AND injects + # KeyboardInterrupt into the worker thread (so a hung read returns + # promptly). + def _execute_shell(shell_cmd: str, original_line: str, echo_idx: int) -> None: + import subprocess + + output_start_index = echo_idx + start = time.monotonic() + status_ref = ["ok"] + state.is_running = True + state.running_command = "shell" + state.running_command_text = original_line + state.run_start = start + + def _run() -> None: + try: + # Use the system shell so users can pipe / redirect / glob + # naturally. 
Merge stderr into stdout for unified streaming; + # any separation is the user's problem (they'd redirect + # 2>&1 themselves if they cared). + proc = subprocess.Popen( # noqa: S602 — gated by exec_gate + shell_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + current_subprocess[0] = proc + try: + assert proc.stdout is not None + for chunk in iter(proc.stdout.readline, ""): + sys.stdout.write(chunk) + finally: + code = proc.wait() + current_subprocess[0] = None + if code != 0: + status_ref[0] = "fail" + err_console.print(f" [{BEE_DIM}]exit code {code}[/]") + except KeyboardInterrupt: + # Ctrl+C: stop the child if it's still running, then mark + # the command as cancelled in the footer. + proc = current_subprocess[0] + if proc is not None: + try: + proc.terminate() + except Exception: + pass + err_console.print(f" [{BEE_DIM}]stopped[/]") + status_ref[0] = "stopped" + except Exception as e: + err_console.print(f" [bold {BEE_RED}]error:[/] {e}") + status_ref[0] = "fail" + + def _finish() -> None: + duration = time.monotonic() - start + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + # Splice the dim echo line above the streamed output. 
+ try: + from io import StringIO + + from prompt_toolkit.formatted_text import ( + ANSI as _ANSI, + ) + from prompt_toolkit.formatted_text import ( + to_formatted_text as _tft, + ) + from rich.console import Console + + _buf = StringIO() + _c = Console( + file=_buf, + force_terminal=True, + color_system="truecolor", + highlight=False, + width=200, + ) + _echo_t = Text() + _echo_t.append("❯ ", style=BEE_DIM) + _echo_t.append(original_line, style=BEE_DIM) + _c.print(_echo_t, end="") + _echo_fragments = list(_tft(_ANSI(_buf.getvalue()))) + scrollback.insert_line(output_start_index, _echo_fragments) + except Exception: + pass + _print_command_footer(status_ref[0], duration) + state.last_command = "shell" + state.last_status = status_ref[0] + state.last_duration = duration + is_input_locked[0] = False + # Clear the typed ``!cmd`` only when it ran cleanly. A + # non-zero exit or Ctrl+C-stopped run leaves the line in the + # buffer so the user can tweak it and retry without retyping. + if status_ref[0] == "ok": + try: + input_buffer.reset() + except Exception: + pass + try: + app.invalidate() + except Exception: + pass + + is_input_locked[0] = True + try: + app.invalidate() + except Exception: + pass + + def _worker() -> None: + try: + _run() + finally: + current_worker[0] = None + try: + _finish() + except Exception: + state.is_running = False + state.running_command = None + state.running_command_text = None + state.run_start = None + is_input_locked[0] = False + try: + app.invalidate() + except Exception: + pass + + worker_thread = threading.Thread(target=_worker, daemon=True) + current_worker[0] = worker_thread + worker_thread.start() + + # ── Crawl execution via subprocess ────────────────────────────────────── + # Twisted's reactor is a single-shot process-wide singleton. Once + # ``reactor.run()`` has been entered and returned, the same Python + # process can never call it again. 
# Running each crawl in a fresh
# ``python -m scrapingbee_cli.cli crawl ...`` subprocess gives us
# a brand-new reactor per crawl, so the user can issue many
# crawls in one REPL session.
#
# IPC for live status: the parent sets ``SCRAPINGBEE_CRAWL_STATUS_FILE``
# to a per-pid path; the child's spider signal handlers atomically
# mirror ``theme._crawl_status`` to that file via
# ``_maybe_mirror_to_status_file``. A polling thread on the parent
# reads the file every 100 ms and forwards updates into the
# parent's own ``_crawl_status`` so the layout-window crawl status
# display keeps showing live URL / fetched count.
def _execute_crawl_subprocess(crawl_args: list[str], original_line: str, echo_idx: int) -> None:
    """Run ``crawl`` in a child interpreter and stream its output into the REPL.

    Args:
        crawl_args: Arguments appended after ``python -m scrapingbee_cli.cli``.
        original_line: The line the user typed; shown while running and
            spliced into scrollback as the dim ``❯`` echo on completion.
        echo_idx: Scrollback index at which that echo line is inserted.

    Returns immediately after starting the worker thread; the worker owns
    the subprocess, the status-file poller and the completion footer.
    """
    import os as _os
    import subprocess

    output_start_index = echo_idx
    start = time.monotonic()
    status_ref = ["ok"]
    state.is_running = True
    state.running_command = "crawl"
    state.running_command_text = original_line
    state.run_start = start

    status_file = (
        Path.home() / ".cache" / "scrapingbee-cli" / f"crawl-status-{_os.getpid()}.json"
    )
    try:
        status_file.parent.mkdir(parents=True, exist_ok=True)
    except Exception:
        pass

    # Clear any leftover state file from a prior run.
    try:
        status_file.unlink()
    except Exception:
        pass

    # Pre-populate the parent's _crawl_status so the layout window
    # shows "starting…" the instant the user submits, rather than
    # waiting for the child to fire its first signal.
    try:
        from .theme import update_crawl_status

        update_crawl_status(
            current_url=None,
            fetched=0,
            queued=0,
            saved=0,
            phase="starting",
        )
    except Exception:
        pass

    _stop_poll = threading.Event()

    def _poll_status_file() -> None:
        """Watch the child's status JSON file and forward updates
        into the parent's in-memory state. 100 ms cadence so URL +
        counter changes feel live in the fixed status widget.

        The payload also carries the child's progress state
        (``progress_total``, ``progress_completed``) when a known
        total is in play — sitemap mode, ``--max-depth 1``, or
        ``--max-pages N``. That's how the parent's fixed widget
        learns to show the honeycomb above the URL line.
        """
        import json as _json

        from .theme import update_crawl_status, update_progress_state

        # Change detection uses (mtime_ns, size, inode) rather than the
        # float ``st_mtime`` alone: on filesystems with coarse timestamps
        # two writes can share one mtime, and an mtime-only comparison
        # would silently drop the second update. A rewrite that replaces
        # the file also shows up as a new inode.
        last_signature = None
        while not _stop_poll.wait(0.1):
            try:
                stat = status_file.stat()
            except FileNotFoundError:
                continue
            except Exception:
                continue
            signature = (stat.st_mtime_ns, stat.st_size, stat.st_ino)
            if signature == last_signature:
                continue
            last_signature = signature
            try:
                with open(status_file, encoding="utf-8") as fh:
                    data = _json.load(fh)
                update_crawl_status(
                    current_url=data.get("current_url"),
                    fetched=data.get("fetched"),
                    queued=data.get("queued"),
                    saved=data.get("saved"),
                    phase=data.get("phase"),
                )
                pt = data.get("progress_total")
                pc = data.get("progress_completed")
                if isinstance(pt, int) and pt > 0 and isinstance(pc, int):
                    # ``update_progress_state`` no-ops on the
                    # scrollback render path when ``_crawl_status``
                    # is set, so we just set state — the fixed
                    # widget will pick it up next frame.
                    update_progress_state(
                        pc,
                        pt,
                        rps=data.get("progress_rps"),
                        eta=data.get("progress_eta"),
                        failure_pct=data.get("progress_failure_pct"),
                    )
            except Exception:
                pass
            try:
                app.invalidate()
            except Exception:
                pass

    poll_thread = threading.Thread(target=_poll_status_file, daemon=True)
    poll_thread.start()

    def _run() -> None:
        """Spawn the child, relay its merged stdout/stderr, record the outcome."""
        proc = None
        try:
            env = _os.environ.copy()
            env["SCRAPINGBEE_CRAWL_STATUS_FILE"] = str(status_file)
            # Mark the child as "spawned by REPL" so it can adjust
            # output (no colors / no spinner) if we ever want that.
            env["SCRAPINGBEE_FROM_REPL"] = "1"
            cmd = [sys.executable, "-m", "scrapingbee_cli.cli"] + crawl_args
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )
            current_subprocess[0] = proc
            try:
                assert proc.stdout is not None
                for chunk in iter(proc.stdout.readline, ""):
                    sys.stdout.write(chunk)
            finally:
                code = proc.wait()
            if code != 0:
                # ``terminate()`` from Ctrl+C exits with -SIGTERM
                # (-15 on POSIX). A second Ctrl+C escalates to
                # ``proc.kill()`` which exits with -SIGKILL (-9).
                # Treat any of these as a deliberate stop rather
                # than a failure so the footer reads ■ stopped.
                if code in (-15, -9, -2):
                    status_ref[0] = "stopped"
                else:
                    status_ref[0] = "fail"
                    err_console.print(f" [{BEE_DIM}]exit code {code}[/]")
        except KeyboardInterrupt:
            if proc is not None:
                try:
                    proc.terminate()
                except Exception:
                    pass
            err_console.print(f" [{BEE_DIM}]stopped[/]")
            status_ref[0] = "stopped"
        except Exception as e:
            err_console.print(f" [bold {BEE_RED}]error:[/] {e}")
            status_ref[0] = "fail"
        finally:
            # Always unregister the child and release its pipe, even when
            # an injected KeyboardInterrupt lands between ``proc.wait()``
            # and the bookkeeping — otherwise a later Ctrl+C would keep
            # signalling a process that is already gone.
            current_subprocess[0] = None
            if proc is not None and proc.stdout is not None:
                try:
                    proc.stdout.close()
                except Exception:
                    pass

    def _finish() -> None:
        """Tear down run state, splice the echo line, print the footer."""
        duration = time.monotonic() - start
        state.is_running = False
        state.running_command = None
        state.running_command_text = None
        state.run_start = None
        # Stop polling BEFORE clearing in-memory state. Join the poll
        # thread so it can't race past the event check, read the file
        # one last time, and resurrect ``_crawl_status`` after we
        # cleared it — the bug that left the crawl status window
        # visible after Ctrl+C.
        _stop_poll.set()
        try:
            poll_thread.join(timeout=0.5)
        except Exception:
            pass
        try:
            from .theme import clear_crawl_status, clear_progress_state

            clear_crawl_status()
            clear_progress_state()
        except Exception:
            pass
        try:
            status_file.unlink()
        except Exception:
            pass
        # Splice the dim echo line above the streamed output.
        try:
            from io import StringIO

            from prompt_toolkit.formatted_text import (
                ANSI as _ANSI,
            )
            from prompt_toolkit.formatted_text import (
                to_formatted_text as _tft,
            )
            from rich.console import Console

            _buf = StringIO()
            _c = Console(
                file=_buf,
                force_terminal=True,
                color_system="truecolor",
                highlight=False,
                width=200,
            )
            _echo_t = Text()
            _echo_t.append("❯ ", style=BEE_DIM)
            _echo_t.append(original_line, style=BEE_DIM)
            _c.print(_echo_t, end="")
            _echo_fragments = list(_tft(_ANSI(_buf.getvalue())))
            scrollback.insert_line(output_start_index, _echo_fragments)
        except Exception:
            pass
        _print_command_footer(status_ref[0], duration)
        state.last_command = "crawl"
        state.last_status = status_ref[0]
        state.last_duration = duration
        is_input_locked[0] = False

        # Buffer mutations have to run on the prompt_toolkit main
        # loop thread — this ``_finish`` is on the worker thread,
        # and calling ``input_buffer.reset()`` from here directly
        # doesn't actually propagate to the displayed input
        # (which is why the typed crawl command was still
        # appearing in the prompt after ``✓ 28.10s``). Marshal
        # the clear + invalidate through ``call_soon_threadsafe``.
        def _apply_finish_state() -> None:
            if status_ref[0] == "ok":
                try:
                    input_buffer.reset()
                except Exception:
                    pass
            try:
                app.invalidate()
            except Exception:
                pass

        try:
            loop = getattr(app, "loop", None)
            if loop is not None:
                loop.call_soon_threadsafe(_apply_finish_state)
            else:
                _apply_finish_state()
        except Exception:
            _apply_finish_state()

    is_input_locked[0] = True
    try:
        app.invalidate()
    except Exception:
        pass

    def _worker() -> None:
        """Thread target: run the crawl, then always attempt cleanup."""
        try:
            _run()
        finally:
            current_worker[0] = None
            try:
                _finish()
            except Exception:
                # Last-resort unlock so a broken ``_finish`` cannot leave
                # the REPL permanently marked as running / input-locked.
                state.is_running = False
                state.running_command = None
                state.running_command_text = None
                state.run_start = None
                is_input_locked[0] = False
                try:
                    app.invalidate()
                except Exception:
                    pass

    worker_thread = threading.Thread(target=_worker, daemon=True)
    current_worker[0] = worker_thread
    worker_thread.start()


# ── Command execution (synchronous, output flows via patched stdout) ────
def _execute(line: str) -> bool:
    """Run a single REPL submission: meta-command or click command.

    Returns ``True`` if the submission was consumed (whether it
    succeeded, failed at runtime, or was an unknown command) — in
    every such case the user has gotten feedback and the input buffer
    should be cleared. Returns ``False`` only when the submission
    couldn't even be parsed (shlex error); the caller leaves the
    buffer untouched so the user can correct and retry without
    re-typing.
    """
    line = line.strip()
    if not line:
        return True

    # Meta-commands (`:set`, `:help`, `:show`, ...) and unknown / parse
    # errors echo the command immediately — there's no shimmer pass for
    # those. Click commands defer the echo until after completion, so
    # the live shimmering line above the input is the only on-screen
    # representation while the command runs.
    # `:q` is handled at the key-binding layer so we don't get here for it.
#
    # Snapshot scrollback length before running the meta-handler so we
    # can splice the ``❯ line`` echo at this position afterwards. Without
    # this, the echo lands AFTER any error/info the meta-handler
    # printed (e.g. ``file not found: foo`` then ``❯ :view foo``), which
    # reads upside-down. Insert-at-position keeps the conversational
    # order: command, then its output.
    meta_echo_idx = scrollback.current_length()
    meta = _handle_meta(
        line,
        state,
        command_help,
        all_known_flags,
        bool_flags,
        choice_flags,
        scrollback=scrollback,
    )
    if meta == "ok":
        # If we just ran :view, the nested pager Application emitted
        # ``\x1b[?1049l`` on its exit, kicking us out of the alt screen
        # buffer. Re-enter it and reset the outer renderer so the next
        # paint goes into the fresh alt buffer instead of leaking into
        # main-screen scrollback.
        if line.strip().lower().startswith(":view"):
            try:
                # ``sys.__stdout__`` is ``Optional[TextIO]`` in the
                # stdlib stubs; in practice it's set for any TTY
                # invocation we care about. Guard against the rare
                # None case rather than juggling type-ignores.
                out = sys.__stdout__
                if out is not None:
                    out.write("\x1b[?1049h")
                    out.flush()
            except Exception:
                pass
            try:
                app.renderer.reset()
            except Exception:
                pass
            try:
                app.invalidate()
            except Exception:
                pass
        # Splice the dim echo line ABOVE whatever the meta-handler
        # printed during its run. Fall back to appending if the
        # rich-render or insert path fails.
        try:
            from io import StringIO

            from prompt_toolkit.formatted_text import (
                ANSI as _ANSI,
            )
            from prompt_toolkit.formatted_text import (
                to_formatted_text as _tft,
            )
            from rich.console import Console

            _buf = StringIO()
            _c = Console(
                file=_buf,
                force_terminal=True,
                color_system="truecolor",
                highlight=False,
                width=200,
            )
            _echo_t = Text()
            _echo_t.append("❯ ", style=BEE_DIM)
            _echo_t.append(line, style=BEE_DIM)
            _c.print(_echo_t, end="")
            _echo_fragments = list(_tft(_ANSI(_buf.getvalue())))
            scrollback.insert_line(meta_echo_idx, _echo_fragments)
        except Exception:
            _echo_to_scrollback(line)
        # Successful meta command — clear the input so the prompt is
        # ready for the next entry. Failed parses / typos take the
        # ``meta is None`` path (unknown command) which leaves the
        # buffer in place for the user to edit.
        try:
            input_buffer.reset()
        except Exception:
            pass
        return True
    if meta == "quit":  # belt-and-braces; key binding usually catches it
        return True

    # `!shell command` — run a shell command in a worker thread,
    # streaming output into scrollback. Gated by the same unsafe-mode
    # check used by --post-process / --on-complete / schedule.
    if line.startswith("!"):
        shell_cmd = line[1:].strip()
        shell_echo_idx = scrollback.current_length()
        if not shell_cmd:
            err_console.print(f" [{BEE_DIM}]usage: ![/][bold {BEE_YELLOW}][/]")
        else:
            from .exec_gate import (
                is_command_whitelisted,
                is_exec_enabled,
                is_whitelist_enabled,
            )

            if not is_exec_enabled():
                err_console.print(
                    f" [bold {BEE_RED}]Shell execution disabled.[/] "
                    f"[{BEE_DIM}]Enable it with `auth --unsafe` "
                    f"(requires SCRAPINGBEE_ALLOW_EXEC=1).[/]"
                )
            elif is_whitelist_enabled() and not is_command_whitelisted(shell_cmd):
                err_console.print(
                    f" [bold {BEE_RED}]Blocked:[/] "
                    f"[{BEE_DIM}]command not in whitelist or contains "
                    f"shell-injection patterns.[/]"
                )
            else:
                # The shell runner takes over from here (it receives the
                # echo index), so skip the immediate echo below.
                _execute_shell(shell_cmd, line, shell_echo_idx)
                return True
        # Echo the typed line above whatever error we just printed.
        try:
            from io import StringIO

            from prompt_toolkit.formatted_text import (
                ANSI as _ANSI,
            )
            from prompt_toolkit.formatted_text import (
                to_formatted_text as _tft,
            )
            from rich.console import Console

            _buf = StringIO()
            _c = Console(
                file=_buf,
                force_terminal=True,
                color_system="truecolor",
                highlight=False,
                width=200,
            )
            _echo_t = Text()
            _echo_t.append("❯ ", style=BEE_DIM)
            _echo_t.append(line, style=BEE_DIM)
            _c.print(_echo_t, end="")
            _echo_fragments = list(_tft(_ANSI(_buf.getvalue())))
            scrollback.insert_line(shell_echo_idx, _echo_fragments)
        except Exception:
            _echo_to_scrollback(line)
        return True

    # Tolerate users typing `scrapingbee ...` out of muscle memory.
    if line.lower().startswith("scrapingbee "):
        line = line[len("scrapingbee ") :].strip()

    original_line = line  # what to echo after completion

    try:
        args = shlex.split(line)
    except ValueError as e:
        # Parse error — DO NOT consume the buffer. The user almost
        # certainly has an unclosed quote; let them fix it in-place.
        err_console.print(f" [bold {BEE_RED}]parse error:[/] {e}")
        return False
    if not args:
        return True

    cmd_name = args[0]
    if cmd_name not in command_flags:
        _echo_to_scrollback(original_line)
        suggestion = _suggest(cmd_name, command_names)
        if suggestion:
            err_console.print(
                f" [bold {BEE_RED}]unknown:[/] {cmd_name} "
                f"[{BEE_DIM}]did you mean[/] "
                f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]"
            )
        else:
            err_console.print(f" [bold {BEE_RED}]unknown:[/] {cmd_name}")
        return True

    # Bare ``auth`` in the REPL (no flags) is best served by flipping
    # the bottom prompt into first-run mode instead of routing through
    # ``run_in_terminal`` — the suspend/resume cycle to read a key in
    # the bare terminal feels jarring, and the masked in-place prompt
    # is the same flow the user just learned at startup. ``auth
    # --api-key KEY`` is non-interactive and still goes through click.
    if cmd_name == "auth" and len(args) == 1:
        _echo_to_scrollback(original_line)
        _first_run_needs_key[0] = True
        try:
            input_buffer.reset()
        except Exception:
            pass
        err_console.print(f" [{BEE_DIM}]Enter your API key below.[/]")
        try:
            app.invalidate()
        except Exception:
            pass
        return True

    # ``auth --unsafe`` opens a multi-step disclaimer + masked-key
    # prompt that fights our alt-buffer / termios state when invoked
    # through ``run_in_terminal``. The flow appears to exit the REPL
    # and leaves the terminal non-reactive while it blocks on
    # synchronous stdin reads. Redirect the user to run it from a
    # plain shell, where its interactive prompts work correctly.
    if cmd_name == "auth" and "--unsafe" in args:
        _echo_to_scrollback(original_line)
        err_console.print(
            f" [bold {BEE_YELLOW}]auth --unsafe[/] must be run from a "
            f"plain shell, not inside the REPL."
        )
        err_console.print(
            f" [{BEE_DIM}]exit the REPL ([bold {BEE_YELLOW}]:q[/][{BEE_DIM}]) "
            f"then run:[/] [bold]scrapingbee auth --unsafe[/]"
        )
        return True

    # Only inject session defaults that the target command actually
    # accepts; otherwise ``:set --json-response true`` would also
    # apply to ``usage``, which rejects it as an unknown option.
    args = state.apply_settings_to_args(args, accepted=set(command_flags.get(cmd_name, [])))
    # Let users type ``--verbose true|false`` (etc.) in the REPL
    # too — same normalisation as the CLI ``main()`` entry.
    try:
        from .cli_utils import (
            collect_bool_flag_names,
            normalize_bool_flag_args,
        )

        args = normalize_bool_flag_args(args, collect_bool_flag_names(cli_group))
    except Exception:
        pass

    # Mark the scrollback position where this command's output will
    # start. We DO NOT echo here — while the command runs, only the
    # shimmering running line is the live indicator. After the
    # command finishes, _finish inserts the dim echo at this index
    # so the rendered order becomes:
    #     ❯ scrape https://…   (echo, inserted post-completion)
    #     <command output>     (was streamed in during execution)
    #     ✓ 0.45s              (footer, appended in _finish)
    # i.e. echo + output + footer atomically appear together at the
    # moment of completion, without doubling up the live shimmer.
    output_start_index = scrollback.current_length()

    # ``crawl`` is special — Twisted's reactor is a process-wide
    # singleton, so we run each crawl in a fresh subprocess to make
    # multiple crawls per REPL session work. The function below
    # owns the full lifecycle (worker thread, status-file polling,
    # _finish), so we return immediately here.
    if cmd_name == "crawl":
        _execute_crawl_subprocess(args, original_line, output_start_index)
        return True

    start = time.monotonic()
    # One-element list so the nested closures can write the outcome
    # ("ok" / "fail" / "stopped") without ``nonlocal``.
    status_ref = ["ok"]
    state.is_running = True
    state.running_command = cmd_name
    state.running_command_text = original_line  # used by shimmer above input
    state.run_start = start

    def _run() -> None:
        """Invoke the click command and translate its exceptions into a status."""
        try:
            cli_group.main(args, standalone_mode=False)
        except click.UsageError as e:
            msg = str(e)
            err_console.print(f" [bold {BEE_RED}]usage:[/] {msg}")
            if "no such option" in msg.lower():
                # Pull the offending flag out of click's message so we can
                # offer a "did you mean" against this command's flags.
                m = re.search(r"--?[A-Za-z0-9-]+", msg)
                if m:
                    bad = m.group(0)
                    suggestion = _suggest(bad, command_flags.get(cmd_name, []))
                    if suggestion:
                        err_console.print(
                            f" [{BEE_DIM}]did you mean[/] "
                            f"[bold {BEE_YELLOW}]{suggestion}[/][{BEE_DIM}]?[/]"
                        )
            status_ref[0] = "fail"
        except click.ClickException as e:
            e.show()
            status_ref[0] = "fail"
        except (KeyboardInterrupt, _asyncio_mod.CancelledError):
            # Ctrl+C while running — the keybinding either cancelled
            # our asyncio tasks (CancelledError propagates out of the
            # await chain) or injected KeyboardInterrupt via
            # PyThreadState_SetAsyncExc. Either way surface it as a
            # deliberate stop in the footer rather than a generic
            # failure. (CancelledError is a BaseException since
            # Python 3.8 and won't be caught by ``except Exception``.)
            err_console.print(f" [{BEE_DIM}]stopped[/]")
            status_ref[0] = "stopped"
        except SystemExit as e:
            # A zero / None exit code counts as success; anything else fails.
            code = e.code if e.code is not None else 0
            if code not in (0, None):
                status_ref[0] = "fail"
        except Exception as e:
            err_console.print(f" [bold {BEE_RED}]error:[/] {e}")
            status_ref[0] = "fail"

    def _finish() -> None:
        """Commit echo + footer to scrollback and release the input lock."""
        duration = time.monotonic() - start
        # Stop the shimmer first so the echo + footer commit cleanly to
        # scrollback without competing with the live above-input line.
        state.is_running = False
        state.running_command = None
        state.running_command_text = None
        state.run_start = None
        # Splice the dim echo line in *front of* the output rows that
        # streamed into scrollback during execution. We marked the
        # position at the start of _execute (output_start_index); any
        # rows past that index belong to this command. Inserting at
        # that index puts the echo right above its output.
        try:
            from io import StringIO

            from prompt_toolkit.formatted_text import (
                ANSI as _ANSI,
            )
            from prompt_toolkit.formatted_text import (
                to_formatted_text as _tft,
            )
            from rich.console import Console

            _buf = StringIO()
            _c = Console(
                file=_buf,
                force_terminal=True,
                color_system="truecolor",
                highlight=False,
                width=200,
            )
            _echo_t = Text()
            _echo_t.append("❯ ", style=BEE_DIM)
            _echo_t.append(original_line, style=BEE_DIM)
            _c.print(_echo_t, end="")
            _echo_fragments = list(_tft(_ANSI(_buf.getvalue())))
            scrollback.insert_line(output_start_index, _echo_fragments)
        except Exception:
            # Defensive fallback: if anything goes wrong with the rich
            # render, drop the echo rather than crash the REPL.
            pass
        _print_command_footer(status_ref[0], duration)
        state.last_command = cmd_name
        state.last_status = status_ref[0]
        state.last_duration = duration
        state.refresh_credits_from_cache()
        is_input_locked[0] = False

        # State mutations triggered by auth/logout need to be visible to
        # the asyncio loop's _usage_refresher and the toolbar render —
        # both run on the main loop thread while we're in the worker
        # thread. Bouncing the writes through call_soon_threadsafe
        # guarantees a happens-before edge with the loop's next tick.
        #
        # We deliberately keep ``used_credits_at_start`` across logout —
        # if the user re-authenticates with the *same* key, the next
        # refresh detects an unchanged ``api_key_hash`` and continues the
        # session counter. A *different* key triggers a reset there.
        def _apply_post_cmd_state() -> None:
            if cmd_name == "auth":
                if get_api_key_if_set(None):
                    state.api_key_set = True
            elif cmd_name == "logout":
                state.api_key_set = False
                state.credits = None
                state.credits_total = None
                state.used_credits = None
                state.max_concurrency = None
                state.current_concurrency = None
                state.last_usage_refresh_mono = None
                # Flip back into first-run mode in place — the prompt
                # transitions to ``API key: `` and the input is masked
                # so the user can paste a new key without re-running
                # ``auth`` (which would suspend the REPL via
                # ``run_in_terminal`` and feel jarring).
                _first_run_needs_key[0] = True
                err_console.print(
                    f" [{BEE_DIM}]Enter a new API key to continue, or "
                    f"[bold {BEE_YELLOW}]:q[/][{BEE_DIM}] to exit.[/]"
                )
            # Clear the input buffer only on success — failed or
            # cancelled commands leave the line in place so the user
            # can edit and re-run without re-typing. Buffer mutations
            # have to run on the main thread (this callback is
            # already marshalled there via call_soon_threadsafe).
            if status_ref[0] == "ok":
                try:
                    input_buffer.reset()
                except Exception:
                    pass
            try:
                app.invalidate()
            except Exception:
                pass

        try:
            loop = getattr(app, "loop", None)
            if loop is not None:
                loop.call_soon_threadsafe(_apply_post_cmd_state)
            else:
                _apply_post_cmd_state()
        except Exception:
            # Scheduling failed — apply directly rather than lose the update.
            _apply_post_cmd_state()

        # `usage` and `auth` are the two commands whose completion implies
        # the live toolbar values are stale — trigger an immediate refresh
        # rather than waiting for the next 30s tick.
        if cmd_name in ("usage", "auth"):
            _signal_refresh_from_thread()
        try:
            app.invalidate()
        except Exception:
            pass

    if cmd_name in _INTERACTIVE_COMMANDS:
        # tutorial / auth use click.prompt() and need raw terminal access.
        # Suspend the persistent prompt-toolkit app, run the command in
        # the bare terminal, then resume. Synchronous — we wait for it.
        is_input_locked[0] = True
        try:
            run_in_terminal(_run, in_executor=False)
        finally:
            _finish()
        return True

    # Network commands run in a worker thread so they don't fight
    # prompt_toolkit's asyncio loop. (scrape, google, etc. each call
    # `asyncio.run(...)` internally — and asyncio.run refuses to start
    # when a loop is already running, which is the case while
    # prompt_toolkit's Application is alive.) Locking the input
    # prevents the user from submitting a second command on top.
    is_input_locked[0] = True
    try:
        app.invalidate()
    except Exception:
        pass

    def _worker() -> None:
        """Thread target: run the command, then always attempt cleanup."""
        try:
            _run()
        finally:
            # Always clear the worker reference first — the Ctrl+C handler
            # uses it to decide between "cancel command" and "exit REPL".
            # Stale references would make a quick second Ctrl+C target
            # a thread that's already finished.
            current_worker[0] = None
            # Cleanup MUST always run, even if _finish itself raises — a
            # broken finish would leave is_running=True and is_input_locked=True
            # forever, making the REPL unusable until restart.
            try:
                _finish()
            except Exception:
                state.is_running = False
                state.running_command = None
                state.running_command_text = None
                state.run_start = None
                is_input_locked[0] = False
                try:
                    app.invalidate()
                except Exception:
                    pass

    worker_thread = threading.Thread(target=_worker, daemon=True)
    current_worker[0] = worker_thread
    worker_thread.start()
    return True


# ── Key bindings ────────────────────────────────────────────────────────
_quit_tokens = {":q", ":quit", "exit", "quit", "q"}

kb = KeyBindings()


@kb.add("enter", filter=has_completions)
def _accept(event):
    """Enter while the completion popup is open: dismiss the completion state."""
    event.current_buffer.complete_state = None


@kb.add("enter", filter=~has_completions)
def _submit(event):
    """Enter with no popup open: submit the buffer (key entry, quit, or command)."""
    text = input_buffer.text
    stripped = text.strip()
    # Whether the user typed/pasted is collapsing back to a single
    # line — once Enter fires we drop out of multi-line mode so
    # the next prompt is single-line again.
    _multiline_visible[0] = False
    if not stripped:
        # ``reset()`` clears the buffer AND the history-navigation
        # cursor (``working_index``). A plain set_document keeps the
        # cursor, so an Up press after an empty Enter would resume
        # whatever the user was previously browsing in history rather
        # than starting fresh from the most recent command.
        input_buffer.reset()
        return
    # First-run API key entry path — text in the buffer is the raw key
    # the user just pasted. Validate against /usage and, on success,
    # persist + transition to normal command mode in place.
    if _first_run_needs_key[0]:
        input_buffer.reset()
        _handle_first_run_key(stripped, text)
        return
    if stripped.lower() in _quit_tokens:
        input_buffer.reset()
        event.app.exit()
        return
    # Multi-line submission (typically a paste of several commands):
    # run the first line immediately, queue the rest. ``_ticker``
    # picks them up one at a time as soon as the input lock clears.
    lines = [s for s in (ln.strip() for ln in stripped.splitlines()) if s]
    if len(lines) > 1:
        stripped = lines[0]
        _pending_commands.extend(lines[1:])
    # Persist the submitted line into the FileHistory before we kick off
    # execution. ``append_string`` is the right call (not
    # ``store_string``): the latter only writes to disk, leaving the
    # in-memory ``_loaded_strings`` stale, so newly-submitted commands
    # don't show up on the next Up press until the REPL restarts and
    # reloads from disk. ``append_string`` does both.
    if history is not None:
        try:
            history.append_string(stripped)
        except Exception:
            pass
    # Don't clear the buffer here — we want the typed command to
    # stay visible if it fails or is cancelled (Ctrl+C), so the user
    # can edit and retry without re-typing. ``_finish`` clears it
    # only when the command succeeded. Shlex parse errors return
    # False from ``_execute`` and the text stays in place naturally.
    _execute(stripped)


@kb.add("c-c")
def _ctrl_c(event):
    """Ctrl+C: drop queued commands, stop a running command, or exit the REPL."""
    # Clear any queued multi-line commands so an aborted paste
    # doesn't keep firing after the user explicitly cancels.
    cleared_queue = False
    if _pending_commands:
        n_dropped = len(_pending_commands)
        _pending_commands.clear()
        cleared_queue = True
        err_console.print(
            f" [{BEE_DIM}]cancelled {n_dropped} queued command"
            f"{'s' if n_dropped != 1 else ''}[/]"
        )
    # If the input buffer is currently in multi-line mode (active
    # paste preview), Ctrl+C clears it and drops back to single-
    # line — treated as "consumed" so we don't fall through to
    # ``event.app.exit()`` below.
    cleared_multiline = False
    if _multiline_visible[0]:
        _multiline_visible[0] = False
        cleared_multiline = True
        try:
            input_buffer.reset()
        except Exception:
            pass
    # If a worker thread is running, Ctrl+C stops that command rather
    # than exiting the REPL. We try two mechanisms in parallel:
    #
    #   1. Cancel all tasks on the worker's asyncio loop via
    #      ``call_soon_threadsafe``. This wakes the selector
    #      immediately and raises ``CancelledError`` on the in-flight
    #      await (e.g. an aiohttp request blocked on socket recv).
    #      This is the only thing that produces a *fast* stop for
    #      network commands — without it, a long ScrapingBee request
    #      would hold the worker until it returns naturally.
    #
    #   2. Inject ``KeyboardInterrupt`` into the worker thread via
    #      ``PyThreadState_SetAsyncExc``. Fires at the next Python
    #      bytecode boundary; covers commands that aren't currently
    #      blocked in asyncio (sync post-processing, slow loops, ...).
    worker = current_worker[0]
    if state.is_running and worker is not None and worker.is_alive():
        loop = _active_worker_loop[0]
        if loop is not None:

            def _cancel_all_tasks() -> None:
                try:
                    for task in _asyncio_mod.all_tasks(loop):
                        if not task.done():
                            task.cancel()
                except Exception:
                    pass

            try:
                loop.call_soon_threadsafe(_cancel_all_tasks)
            except Exception:
                pass

        # If a Scrapy crawl is running, the worker is parked inside
        # Twisted's reactor (epoll/kqueue/select in C code), so
        # neither asyncio cancellation nor PyThreadState_SetAsyncExc
        # reaches it. ``reactor.callFromThread`` wakes the selector
        # via the reactor's self-pipe and runs ``reactor.stop()`` on
        # the reactor thread — the only thread-safe way to stop a
        # running Twisted reactor from outside.
        try:
            from .crawl import stop_running_reactor

            stop_running_reactor()
        except Exception:
            pass

        # If a ``!shell`` command or crawl subprocess is running,
        # signal the child — the worker thread is blocked in a
        # C-level read() on the child's stdout pipe, so a
        # Python-level KeyboardInterrupt won't fire until the read
        # returns. First Ctrl+C sends SIGTERM (lets Scrapy write
        # the manifest, preserves partial output). A SECOND Ctrl+C
        # within 2 s while the child is still running escalates to
        # SIGKILL — useful when a long screenshot fetch keeps
        # Twisted's reactor parked in select() and SIGTERM
        # processing lags behind. Standard Unix Ctrl+C convention.
        proc = current_subprocess[0]
        if proc is not None:
            now = time.monotonic()
            last = _last_ctrl_c_time[0]
            _last_ctrl_c_time[0] = now
            still_running = proc.poll() is None
            try:
                if still_running and (now - last) < 2.0:
                    proc.kill()
                else:
                    proc.terminate()
            except Exception:
                pass

        import ctypes

        tid = worker.ident
        if tid is None:
            # The worker has no thread id to target — fall back to exiting.
            event.app.exit()
            return
        try:
            res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt)
            )
            # If we managed to flip exception state in more than one
            # thread, the docs say to undo it — otherwise we leave a
            # dangling pending exception on an unrelated thread.
            if res > 1:
                ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_ulong(tid), None)
        except Exception:
            # ctypes path failed (PyPy? embedded?) — fall back to
            # exiting; daemon worker dies with the process.
            event.app.exit()
        return
    # No worker running. If we just dropped queued commands OR
    # closed a multi-line paste preview, that was the user's intent
    # for this Ctrl+C — don't also exit the REPL on top of it.
    if cleared_queue or cleared_multiline:
        return
    event.app.exit()


@kb.add("c-d")
def _ctrl_d(event):
    """Ctrl+D: exit the REPL, but only when idle and the input is empty."""
    # Ctrl+D on empty input is "logout from shell" → exit. While a
    # command is running, ignore it to avoid yanking the REPL out from
    # under the user mid-scrape; they have :q or a second Ctrl+C.
    if state.is_running:
        return
    if not input_buffer.text:
        event.app.exit()


# Right arrow / End accept the ghost-text suggestion.
# We're using ``Application`` directly (not ``PromptSession``), so the
# default ``load_auto_suggest_bindings`` are NOT in the merged binding
# set — without these, the ghost text appears but no key consumes it.
# (Ctrl-F is intentionally NOT bound — it would be redundant with Right
# arrow and a small minority of users expect it to mean "find".)
@Condition
def _suggestion_at_eol() -> bool:
    """True when a non-empty ghost suggestion exists and the cursor is at the end."""
    try:
        buf = input_buffer
        return (
            buf.suggestion is not None
            and len(buf.suggestion.text) > 0
            and buf.document.is_cursor_at_the_end
        )
    except Exception:
        return False


def _insert_next_suggestion_word(buf) -> bool:
    """Insert the next word of ``buf``'s ghost suggestion.

    "Word" means everything up to and including the first space; when
    the suggestion has no space the whole remainder is inserted.
    Returns ``False`` (and inserts nothing) when there is no suggestion
    text. Shared by the Right-arrow binding and the Tab fallback so the
    two can never drift apart.
    """
    sug = buf.suggestion
    if not sug or not sug.text:
        return False
    text = sug.text
    space_idx = text.find(" ")
    buf.insert_text(text if space_idx == -1 else text[: space_idx + 1])
    return True


def _do_accept_suggestion(event):
    """Accept the entire ghost-text suggestion (bound to End)."""
    buf = event.current_buffer
    sug = buf.suggestion
    if sug:
        buf.insert_text(sug.text)


def _do_accept_suggestion_word(event):
    """Accept the next word of the ghost-text suggestion (bound to
    Right arrow). Splits at the first space — so on a suggestion
    ``scrape https://www.scrapingbee.com --premium-proxy true``,
    successive Right presses accept ``scrape `` → ``https://… `` →
    ``--premium-proxy `` → ``true``. End remains the shortcut for
    accepting the whole thing in one keystroke.
    """
    _insert_next_suggestion_word(event.current_buffer)


kb.add("right", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion_word)
kb.add("end", filter=_suggestion_at_eol, eager=True)(_do_accept_suggestion)

_not_first_run = Condition(lambda: not _first_run_needs_key[0])


@kb.add("tab", filter=~has_completions & _not_first_run)
def _tab_open(event):
    """Tab with no popup open: complete inline, advance the ghost, or open the popup."""
    # Bash-style Tab behaviour with a ghost-text fallback:
    #   • exactly one completion match → accept inline (no popup),
    #     with a trailing space when the match is a command name.
    #   • multiple matches → open the popup WITHOUT modifying the
    #     buffer. We deliberately don't auto-insert the common
    #     prefix because doing so wipes any active ghost-text
    #     suggestion (the prefix change invalidates the ghost's
    #     attachment point). Users can pick from the popup with
    #     arrow keys, accept the ghost word with Right, or just
    #     keep typing.
    #   • zero matches BUT a ghost-text suggestion is visible →
    #     accept the next word of the suggestion (same as Right
    #     arrow does in our isolated suggestion handler).
    #   • zero matches AND no suggestion → open an empty popup
    #     (visual no-op).
    # Wrapped in try/except so a flaky completer can't kill the
    # binding handler.
    buf = event.current_buffer
    try:
        from prompt_toolkit.completion import CompleteEvent

        cmps = list(buf.completer.get_completions(buf.document, CompleteEvent()))
    except Exception:
        buf.start_completion(select_first=False)
        return

    if len(cmps) == 1:
        c = cmps[0]
        # Is the single match REDUNDANT with what's already typed?
        # E.g. typing 'scrape' then Tab — the completer yields
        # Completion(text='scrape', start_position=-6), which would
        # replace 'scrape' with 'scrape' (net zero text change, just
        # adds a trailing space). When that happens AND a ghost
        # suggestion is showing, prefer advancing into the ghost —
        # that's what the user actually wants progress on.
        typed_before = buf.document.text_before_cursor
        replaced = typed_before[c.start_position :] if c.start_position < 0 else ""
        if c.text == replaced and _insert_next_suggestion_word(buf):
            return
        try:
            if c.start_position < 0:
                buf.delete_before_cursor(count=-c.start_position)
            # Trailing space for command names; flags (start with
            # ``-``) get none so ``--key=value`` is still typable.
            suffix = "" if c.text.startswith("-") else " "
            buf.insert_text(c.text + suffix)
        except Exception:
            buf.start_completion(select_first=False)
        return
    if len(cmps) == 0:
        if _insert_next_suggestion_word(buf):
            return
    buf.start_completion(select_first=False)


@kb.add("tab", filter=has_completions)
def _tab_next(event):
    """Tab while the popup is open: move to the next completion."""
    event.current_buffer.complete_next()


# Shift+Tab — when the completion popup is open, navigate backwards;
# when it's not, toggle Scroll ↔ Select mouse mode.
@kb.add("s-tab", filter=has_completions)
def _shift_tab_in_completions(event):
    event.current_buffer.complete_previous()


@kb.add("s-tab", filter=~has_completions)
def _shift_tab_toggle_mode(event):
    _toggle_mouse_mode(event)


@kb.add("escape", filter=has_completions, eager=True)
def _esc(event):
    """Escape while the popup is open: cancel the completion."""
    event.current_buffer.cancel_completion()


# ── Word-wise backward delete ─────────────────────────────────────────
# Bound to the conventional combos so muscle memory works regardless
# of OS / terminal:
#   • Option+Backspace on macOS Terminal/iTerm sends ``escape``
#     followed by ``c-h`` (most common) or ``backspace`` (a few
#     terminals) — we bind both.
#   • Ctrl+W is the POSIX standard for ``unix-word-rubout``.
# ``find_start_of_previous_word`` returns a negative offset to the
# start of the previous word, or ``None`` when the cursor is at the
# buffer start.
def _word_delete_backward(event):
    """Erase everything between the cursor and the start of the previous word."""
    buffer = event.current_buffer
    offset = buffer.document.find_start_of_previous_word(count=1, WORD=False)
    # ``offset`` is a negative distance, or ``None`` when the cursor already
    # sits at the buffer start — nothing to delete in that case.
    if not offset:
        return
    buffer.delete_before_cursor(count=-offset)


# Same handler for every conventional "delete word" combo (registration
# order preserved: Esc+Backspace, Esc+Ctrl-H, Ctrl-W).
for _combo in (("escape", "backspace"), ("escape", "c-h"), ("c-w",)):
    kb.add(*_combo)(_word_delete_backward)


# ── Disable reverse/forward incremental search ────────────────────────
# The emacs defaults of prompt_toolkit map Ctrl+R / Ctrl+S to incremental
# history search, which types into a hidden search buffer. This layout
# has no SearchToolbar, so the query would be invisible. Up/Down already
# walk the FileHistory, so both keys are swallowed on purpose.
@kb.add("c-r")
@kb.add("c-s")
def _disable_incremental_search(event):
    """Intentionally do nothing: consume the key so the default never fires."""
    pass


# ── Manual newline insertion ───────────────────────────────────────────
# For composing a multi-command batch by hand (instead of pasting one):
# Alt+Enter and Ctrl+J insert a newline and flip the buffer into
# multi-line mode. Most terminals cannot tell Shift+Enter from plain
# Enter, so these are the portable shortcuts. Plain Enter still submits.
def _insert_newline(event):
    """Switch to multi-line mode and add a line break at the cursor."""
    _multiline_visible[0] = True
    target = event.current_buffer
    target.insert_text("\n")


for _combo in (("escape", "enter"), ("c-j",)):
    kb.add(*_combo)(_insert_newline)

# ── Bracketed-paste cleanup ───────────────────────────────────────────
# Pasted text comes in two flavours:
#   1. A single command soft-wrapped by the source (IDE, doc render,
#      etc.) — the wrap inserts whitespace (sometimes CR) and may
#      also insert a newline. We want to flatten those back into a
#      single line so the command parses correctly.
#   2. A genuine multi-command paste (e.g. ``scrape …\ncrawl …``)
#      where the user intends each line to run separately via the
#      ``_pending_commands`` queue.
# Helper for telling the two flavours apart: a line "looks like a
# command" when it starts with a recognized command name or a REPL
# meta-prefix (``:``, ``!``).
# NOTE(review): the paste handler below does not call this helper — any
# paste containing a line break is treated as multi-line. Confirm whether
# another caller exists or whether the heuristic should be wired in.
from prompt_toolkit.keys import Keys as _Keys

_command_name_set = set(command_names)


def _looks_like_command_line(line: str) -> bool:
    """Return True when ``line`` starts with a known command or meta-prefix.

    Blank lines are never commands. A line starting with ``:`` or ``!`` is
    always accepted; otherwise the first whitespace-delimited word must be
    one of ``command_names``.
    """
    s = line.strip()
    if not s:
        return False
    if s.startswith((":", "!")):
        return True
    first = s.split(None, 1)[0]
    return first in _command_name_set


@kb.add(_Keys.BracketedPaste)
def _bracketed_paste(event):
    """Insert pasted text, switching to multi-line mode when it has line breaks.

    Two modes:
      • The paste contains a line break → the buffer is REPLACED by the
        non-blank, stripped lines and multi-line mode is switched on. The
        user can edit any line; Enter submits the batch.
      • No line break → runs of spaces/tabs collapse to one space (undoes
        soft-wrap from the source rendering) and the text is inserted at
        the cursor.
    """
    import re as _re

    # Normalise line endings: CRLF (Windows), CR (classic Mac), and LF all
    # become a single ``\n``. A lone CR is a line break here, not a space —
    # otherwise CR-only sources would collapse into one line.
    text = event.data.replace("\r\n", "\n").replace("\r", "\n")
    if "\n" in text:
        non_empty = [ln.strip() for ln in text.split("\n") if ln.strip()]
        if non_empty:
            _multiline_visible[0] = True
            # Replace any current buffer contents with the pasted lines
            # (no preserving partial input — multi-line paste is the
            # dominant intent).
            event.current_buffer.text = "\n".join(non_empty)
            event.current_buffer.cursor_position = len(event.current_buffer.text)
        # A paste made only of blank lines carries no content: insert
        # nothing rather than pushing raw newlines into a single-line buffer.
        return
    text = _re.sub(r"[ \t]+", " ", text)
    event.current_buffer.insert_text(text)


# ── History navigation ─────────────────────────────────────────────────
# Plain Up/Down navigate the FileHistory at ~/.config/scrapingbee-cli/
# .history. When the completion menu is open these keys instead
# navigate the menu (prompt_toolkit's default behaviour); the
# ``~has_completions`` filter ensures we don't compete.
# Suppressed during the first-run API key prompt — otherwise an Up
# press would inject the previous command into the (masked) API key
# field, with no visible cue that the buffer is no longer empty.
# In multi-line mode (after a multi-line paste) arrow keys must
# navigate within the buffer instead of walking history, otherwise
# the user can't edit lines 2+ after pasting them.
_single_line_buffer = Condition(lambda: not _multiline_visible[0])


@kb.add("up", filter=~has_completions & _not_first_run & _single_line_buffer)
def _history_back(event):
    """Step one entry back in history, seeding the working lines if needed."""
    buf = event.current_buffer
    # prompt_toolkit loads history asynchronously via a background task
    # scheduled at first render. After our ``buffer.reset()`` on submit,
    # ``_working_lines`` is just ``[""]`` until that task re-runs, so
    # ``history_backward`` would have nothing to walk. Seed the working
    # lines synchronously as a fallback.
    try:
        if len(buf._working_lines) <= 1:
            # ``History.get_strings()`` is documented as OLDEST-first (it is
            # ``History.load()`` that yields newest-first). The built-in
            # loader ``appendleft``s items newest-first so the newest entry
            # ends up next to the current-edit slot on the right; iterate
            # in reverse to reproduce that layout, so the first Up shows
            # the freshly submitted command rather than the oldest one.
            strings = list(buf.history.get_strings())
            if strings:
                for s in reversed(strings):
                    buf._working_lines.appendleft(s)
                buf.working_index = len(buf._working_lines) - 1
        elif not buf.text and buf.working_index != len(buf._working_lines) - 1:
            # User has browsed back and erased to empty: jump to the
            # current-edit slot so this Up restarts from the newest entry
            # instead of continuing from the previous browse point.
            buf.working_index = len(buf._working_lines) - 1
    except Exception:
        # Best effort only: this touches private Buffer state, and a
        # failure must not break plain history navigation below.
        pass
    buf.history_backward()


@kb.add(
    "down",
    filter=~has_completions & _not_first_run & _single_line_buffer,
)
def _history_forward(event):
    """Step one entry forward in history."""
    event.current_buffer.history_forward()

# ── Scrollback navigation ──────────────────────────────────────────────
# Keyboard scrolling of the virtual buffer. These keys work in both mouse
# modes, and are the way to scroll when the mouse is in "select" mode.
# Familiar to vim/less/htop users.
#
# ``eager=True`` is critical here: prompt_toolkit's Buffer has its own
# default bindings for PgUp/PgDn (history navigation in some modes) and
# the completion menu also consumes PgUp/PgDn when open. Eager bindings
# fire BEFORE buffer-level handlers, so our scrollback scroll wins
# whenever no completion popup is showing.
def _redraw_quietly():
    """Request a repaint; a failing invalidate must never break a key handler."""
    try:
        app.invalidate()
    except Exception:
        pass


@kb.add("pageup", eager=True, filter=~has_completions)
def _sb_pageup(_e):
    """Scroll the virtual buffer up by ten rows."""
    scrollback.scroll_up(10)
    _redraw_quietly()


@kb.add("pagedown", eager=True, filter=~has_completions)
def _sb_pagedown(_e):
    """Scroll the virtual buffer down by ten rows."""
    scrollback.scroll_down(10)
    _redraw_quietly()


@kb.add("c-up", eager=True)
def _sb_lineup(_e):
    """Scroll the virtual buffer up by a single row."""
    scrollback.scroll_up(1)
    _redraw_quietly()


@kb.add("c-down", eager=True)
def _sb_linedown(_e):
    """Scroll the virtual buffer down by a single row."""
    scrollback.scroll_down(1)
    _redraw_quietly()


@kb.add("c-home", eager=True)
def _sb_top(_e):
    """Jump to the oldest line of the virtual buffer."""
    scrollback.scroll_to_top()
    _redraw_quietly()


@kb.add("c-end", eager=True)
def _sb_bottom(_e):
    """Jump back to the newest line of the virtual buffer."""
    scrollback.scroll_to_bottom()
    _redraw_quietly()


# ── Mouse mode toggle (Alt+S = Esc S in terminal protocol) ─────────────
# Two modes, shown in the toolbar:
#   • "scroll" — mouse_support on: the wheel scrolls our virtual buffer,
#     drag-select needs a per-terminal modifier such as Option/Shift.
#   • "select" — mouse_support off: drag-select works without a modifier
#     on every terminal, scrolling falls back to PgUp/PgDn/Ctrl-arrows.
@kb.add("escape", "s", eager=True)
def _toggle_mouse_mode(_event):
    """Flip ``state.mouse_mode`` and (de)activate terminal mouse reporting."""
    leaving_scroll = state.mouse_mode == "scroll"
    state.mouse_mode = "select" if leaving_scroll else "scroll"
    try:
        if leaving_scroll:
            app.output.disable_mouse_support()
        else:
            app.output.enable_mouse_support()
        app.output.flush()
    except Exception:
        pass
    _redraw_quietly()

# ── Application (full_screen=True: own the alt buffer cleanly) ─────────
# Taking over the alternate screen buffer removes the wrap-fragment /
# orphan-toolbar artifacts seen with full_screen=False (where the
# terminal could reflow content under us on resize).
#
# Mouse support is enabled so trackpad / wheel scroll events reach our
# scrollback handler. prompt_toolkit uses mode 1000 — button events
# only, NO motion tracking — so the terminal still owns drag-selection
# (Mac Terminal / iTerm / kitty all keep native select with mode 1000;
# on a few terminals users may need to hold Option/Shift while
# dragging to bypass mouse capture).
app = Application(
    layout=layout,
    key_bindings=kb,
    style=Style.from_dict(_style_dict_for(keep_bg)),
    full_screen=True,
    mouse_support=True,
)
# 50ms escape-sequence timeout (default 500ms). Snappy Esc for
# cancel-completion etc. — modern terminals deliver escape sequences
# as one read, so 50ms is plenty. Set on the instance because
# ``ttimeoutlen`` isn't a constructor parameter.
app.ttimeoutlen = 0.05


# ── Periodic invalidate while a command is in flight ───────────────────
# The shimmer on the running command line + the elapsed-time counter
# need a tick ~10× per second to feel live. Without this, the live area
# would only redraw on stdout writes (sparse for long-running scrapes).
# When idle, 1Hz is enough — the "Next Update Xs" countdown only changes
# once per second, and the paged toolbar carousel rotates on 5-second
# boundaries.
async def _ticker():
    """Run forever on a 0.1 s cadence: drain queued commands and repaint.

    Each tick (a) starts at most one queued command when input is not
    locked, (b) repaints while progress state exists or a command is
    running, and (c) otherwise repaints once every 10 idle ticks.
    """
    import asyncio

    from .theme import has_progress_state

    idle_counter = 0
    # No manual resize detection here: in full_screen mode prompt_toolkit
    # owns the entire screen, so SIGWINCH is handled by the framework —
    # the next render uses the new size and the alt buffer has no
    # scrollback-vs-logical-row mismatch to worry about.

    while True:
        await asyncio.sleep(0.1)
        # Drain queued commands from a multi-line paste — only when
        # the input lock is clear (previous command done) AND we're
        # not in the API-key prompt. Pop one per tick so each
        # command's footer renders before the next starts.
        if _pending_commands and not is_input_locked[0] and not _first_run_needs_key[0]:
            next_cmd = _pending_commands.pop(0)
            try:
                if history is not None:
                    try:
                        history.append_string(next_cmd)
                    except Exception:
                        # History is a convenience; never block execution on it.
                        pass
                _execute(next_cmd)
            except Exception:
                # NOTE(review): a failure to start the command is dropped
                # silently here — confirm that is intended.
                pass
        # Trigger a frame redraw while progress is reporting so
        # the honeycomb's boundary-hex shimmer animates. The fixed
        # ``crawl_status_window`` reads progress state directly via
        # ``_crawl_status_text`` on each invalidate — no separate
        # scrollback rendering needed.
        if has_progress_state():
            try:
                app.invalidate()
            except Exception:
                pass
        if state.is_running:
            # ``state.tick`` advances only while a command is running.
            state.tick += 1
            try:
                app.invalidate()
            except Exception:
                pass
            idle_counter = 0
        else:
            idle_counter += 1
            if idle_counter >= 10:  # 1Hz idle redraw
                idle_counter = 0
                try:
                    app.invalidate()
                except Exception:
                    pass


# ── Background usage refresher ──────────────────────────────────────────
# Polls the usage API every ``SessionState.USAGE_REFRESH_INTERVAL`` seconds
# (30s per the original notes) so the toolbar's "available",
# "used (session)" and "conc" values stay roughly current. The user can
# force an immediate refresh by signalling _refresh_event (used after the
# `usage` and `auth` commands complete — see _execute). The first call
# happens right after the task starts, so the toolbar populates within a
# beat of REPL startup rather than after a full interval.
import asyncio as _asyncio  # local alias avoids shadowing module-level usage

_refresh_event = _asyncio.Event()


async def _do_usage_refresh() -> None:
    """Refresh the toolbar's usage figures once, preferring the file cache.

    Returns silently (leaving ``state`` untouched) when no API key is
    configured, when the live call does not answer 200, when the body is
    not valid JSON, or when any other error occurs during the live call.
    """
    import hashlib as _hashlib
    import json as _json

    from .batch import read_usage_file_cache, write_usage_file_cache
    from .client import Client, parse_usage
    from .config import BASE_URL, get_api_key

    try:
        key = get_api_key(None)
    except ValueError:
        return  # No key set yet — quietly skip; toolbar stays N/A.
    # Short non-reversible hash of the key — used to detect logout/relogin
    # with the *same* key vs a different one, so the session counter
    # continues for the former and resets for the latter.
    key_hash = _hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]

    # ── Cache-first fast path (REPL only) ──────────────────────────
    # Sibling REPL sessions and batch/crawl pre-flight write to the
    # same file cache. If the cache was refreshed within the TTL we
    # can populate the toolbar without a live call — saving us a slot
    # in the /usage rate limit. ``update_from_usage_response`` reads the
    # same keys ``parse_usage`` writes, so we build a synthetic raw dict
    # from the cache entry. ``current_concurrency`` isn't preserved in
    # the cache, so the toolbar's `0/N` slot will lag by one tick;
    # that's an acceptable trade for the rate-limit headroom.
    cached = read_usage_file_cache(key)
    if cached is not None:
        try:
            max_credit = cached.get("max_api_credit")
            credits = cached.get("credits")
            # The cache holds remaining credits; "used" is derived from it
            # and left as None when either figure is missing/non-numeric.
            used_credit = (
                int(max_credit) - int(credits)
                if isinstance(max_credit, (int, float)) and isinstance(credits, (int, float))
                else None
            )
            synthetic = {
                "max_concurrency": cached.get("max_concurrency"),
                "max_api_credit": max_credit,
                "used_api_credit": used_credit,
            }
            state.update_from_usage_response(synthetic, key_hash=key_hash)
            try:
                app.invalidate()
            except Exception:
                pass
            return
        except Exception:
            # Cache was malformed in some unexpected way — fall
            # through to the live call.
            pass

    try:
        async with Client(key, BASE_URL) as client:
            data, _hdrs, status_code = await client.usage(retries=1, backoff=1.0)
            if status_code != 200:
                return
            try:
                raw = _json.loads(data)
            except Exception:
                return
            state.update_from_usage_response(raw, key_hash=key_hash)
            # Share the fresh result with other cache readers; a failed
            # cache write must not undo the toolbar update above.
            try:
                write_usage_file_cache(key, parse_usage(data))
            except Exception:
                pass
            try:
                app.invalidate()
            except Exception:
                pass
    except Exception:
        # Network errors must not kill the refresher — just skip this
        # tick and try again on the next interval.
        return


async def _usage_refresher() -> None:
    """Loop forever: refresh usage, then sleep until the interval or a signal.

    The refresh itself is skipped while ``state.api_key_set`` is false.
    """
    while True:
        if state.api_key_set:
            await _do_usage_refresh()
        try:
            await _asyncio.wait_for(
                _refresh_event.wait(),
                timeout=SessionState.USAGE_REFRESH_INTERVAL,
            )
            # Woken by an explicit request: re-arm the event for next time.
            _refresh_event.clear()
        except _asyncio.TimeoutError:
            # Regular interval elapsed without a request — just loop.
            pass


def _signal_refresh_from_thread() -> None:
    """Request an immediate usage refresh from a non-loop thread.

    ``asyncio.Event.set`` is not thread-safe, so we hop back onto the
    application's event loop. Used after the worker thread finishes
    ``usage`` (data just arrived) or ``auth`` (api_key may have just
    become set) so the toolbar updates without waiting for the next
    scheduled tick. Does nothing when the app has no loop yet; any error
    is swallowed.
    """
    try:
        loop = app.loop
        if loop is not None:
            loop.call_soon_threadsafe(_refresh_event.set)
    except Exception:
        pass

# Track background tasks so we can cancel them cleanly on shutdown
# instead of letting them run until the process exits (they would keep
# firing app.invalidate() against a dead app and leak the asyncio loop
# if the REPL is ever embedded in a larger program).
_bg_tasks: list[Any] = []


def _pre_run() -> None:
    """Start the ticker and the usage refresher as app background tasks.

    The created tasks are recorded in ``_bg_tasks``.
    """
    _bg_tasks.append(app.create_background_task(_ticker()))
    _bg_tasks.append(app.create_background_task(_usage_refresher()))


# ── Terminal colour restore + output redirection ────────────────────────
def _restore_bg() -> None:
    """Reset terminal colours, but only if ``_set_black_bg`` is truthy.

    Write/flush errors are ignored (best effort).
    """
    if _set_black_bg:
        try:
            sys.stdout.write("\033]111\007")  # reset bg to user default
            sys.stdout.write("\033]110\007")  # reset fg to user default
            sys.stdout.flush()
        except Exception:
            pass


# Pipe every stdout / stderr write into the virtual scrollback buffer.
# The renderer (FormattedTextControl on the output Window) reads from
# the buffer each frame. We don't touch the real terminal at all
# while the app runs — that's the alt buffer's job, and it'll be
# dismissed cleanly on exit.
def _on_buffer_write() -> None:
    """Write callback handed to ``ScrollbackWriter``: request a repaint."""
    # Auto-follow: a write while user is at the bottom keeps them at
    # the bottom (scroll_offset stays 0). A user who's scrolled up
    # stays put — they explicitly asked to read history.
    # NOTE(review): the follow/stay-put logic is not in this function —
    # presumably it lives in the scrollback/renderer; this only repaints.
    try:
        app.invalidate()
    except Exception:
        pass


sb_writer = ScrollbackWriter(scrollback, on_write=_on_buffer_write)
# Keep the real streams in ``original_stdout`` / ``original_stderr``
# before replacing them.
original_stdout, original_stderr = sys.stdout, sys.stderr
sys.stdout = sb_writer
sys.stderr = sb_writer
# Some callers (cli_utils.write_output) call ``sys.stdout.buffer.write(bytes)``.
# Expose a binary-decoding adapter so those routes still land in our
# scrollback as text. Truly binary output is decoded with errors=replace.
if not hasattr(sys.stdout, "buffer"):
    setattr(sys.stdout, "buffer", _BinaryAdapter(sys.stdout))
if not hasattr(sys.stderr, "buffer"):
    setattr(sys.stderr, "buffer", _BinaryAdapter(sys.stderr))
# err_console (rich.Console used by theme.py) caches a file= reference
# at module import time — point it at our buffer too.
+ _orig_err_console_file = err_console.file + setattr(err_console, "file", sb_writer) + try: + app.run(pre_run=_pre_run) + finally: + # Cancel background tasks (ticker + usage refresher) so they stop + # invalidating the now-dead app and release the loop they live on. + for task in _bg_tasks: + try: + task.cancel() + except Exception: + pass + sys.stdout = original_stdout + sys.stderr = original_stderr + try: + err_console.file = _orig_err_console_file + except Exception: + pass + _restore_bg() + set_repl_mode(False) diff --git a/src/scrapingbee_cli/theme.py b/src/scrapingbee_cli/theme.py new file mode 100644 index 0000000..d8c2336 --- /dev/null +++ b/src/scrapingbee_cli/theme.py @@ -0,0 +1,1119 @@ +"""ScrapingBee CLI theme: colours and styled output helpers used by the +REPL renderer.""" + +from __future__ import annotations + +import os +import sys + +from rich.console import Console +from rich.text import Text +from rich.theme import Theme + +# -- ScrapingBee brand colours ----------------------------------------------- + +BEE_YELLOW = "#FFCD23" +BEE_DARK = "#0F0F0E" +BEE_WHITE = "#FFFFFF" +BEE_AMBER = "#E5A800" +BEE_GREEN = "#22C55E" +BEE_RED = "#EF4444" +BEE_DIM = "#888888" + +SCRAPINGBEE_THEME = Theme( + { + "bee": f"bold {BEE_YELLOW}", + "bee.dim": BEE_AMBER, + "info": f"bold {BEE_YELLOW}", + "success": f"bold {BEE_GREEN}", + "error": f"bold {BEE_RED}", + "warn": f"bold {BEE_AMBER}", + "dim": BEE_DIM, + "header": f"bold {BEE_WHITE}", + "key": f"bold {BEE_YELLOW}", + "value": BEE_WHITE, + } +) + + +def _want_color() -> bool | None: + if os.environ.get("NO_COLOR"): + return False + if os.environ.get("FORCE_COLOR"): + return True + return None + + +_color = _want_color() + +err_console = Console(stderr=True, theme=SCRAPINGBEE_THEME, highlight=False, force_terminal=_color) +console = Console(theme=SCRAPINGBEE_THEME, highlight=False, force_terminal=_color) + +# -- REPL mode flag ----------------------------------------------------------- +# When True, fancy 
visuals (panels, honeycomb, personality errors, styled help) +# are enabled. Direct CLI commands (scrapingbee scrape ...) keep plain output. + +_repl_mode = False + + +def set_repl_mode(enabled: bool = True) -> None: + """Enable or disable REPL-mode visuals.""" + global _repl_mode # noqa: PLW0603 + _repl_mode = enabled + + +def is_repl_mode() -> bool: + """Return True when running inside the interactive REPL.""" + return _repl_mode + + +# -- Multi-line progress renderer hook --------------------------------------- +# The REPL installs a renderer here at startup that knows how to replace +# the last N lines of its virtual scrollback in place. Batch operations +# call ``emit_progress_lines`` to update the honeycomb progress bar — +# in REPL mode it overwrites the previous frame; outside the REPL it +# falls back to printing the lines normally. + +_progress_renderer = None + + +def set_progress_renderer(fn) -> None: + """Install a function ``fn(lines)`` where ``lines`` is a list of + ANSI-rendered strings. Called by the REPL to wire up in-place updates. + """ + global _progress_renderer # noqa: PLW0603 + _progress_renderer = fn + + +def emit_progress_lines(lines: list[str]) -> None: + """Emit a multi-line progress update. In REPL mode this overwrites + the previous frame; otherwise it falls back to writing to stderr. + ``lines`` is a list of already-rendered ANSI strings (one per row, + no trailing newlines). + """ + if _progress_renderer is not None: + try: + _progress_renderer(lines) + return + except Exception: + pass + # Fallback: plain stderr append. + for line in lines: + sys.stderr.write(line + "\n") + sys.stderr.flush() + + +# -- Bee facts (rotating trivia shown while a command is in flight) --------- +# Surfaced on the dim row above the input in the REPL. Kept short so they +# fit on a single line even on narrow terminals. + +BEE_FACTS: list[str] = [ + "Did you know? Bees can fly up to 15 mph.", + "Did you know? 
A bee visits 50–100 flowers per trip.", + "Did you know? Bees have 5 eyes — two compound, three simple.", + "Did you know? Honey never spoils — jars from ancient Egypt are still edible.", + "Did you know? Bees communicate by dancing — the famous waggle dance.", + "Did you know? A single hive can house up to 60,000 bees.", + "Did you know? Bees flap their wings about 200 times per second.", + "Did you know? Bees can recognize individual human faces.", + "Did you know? One bee makes about 1/12 of a teaspoon of honey in its life.", + "Did you know? Bees navigate using the sun's position in the sky.", + "Did you know? Bees pollinate about one third of the food we eat.", + "Did you know? A queen bee can lay up to 2,000 eggs per day.", + "Did you know? Worker bees are all female.", + "Did you know? Bees see ultraviolet patterns we can't.", + "Did you know? Honeycomb hexagons tile flat space using the least wax — a property mathematicians proved only in 1999.", + "Did you know? Worker bees in a hive are about 75% genetically related to each other — human siblings are only 50%.", + "Did you know? A bee's brain is the size of a sesame seed.", + "Did you know? Bees have been around for more than 100 million years — older than most flowering plants.", + "Did you know? The buzzing sound is the rapid beat of a bee's wings.", + "Did you know? Bees can sense the Earth's magnetic field.", + "Did you know? In ancient Babylon, newlyweds drank honey-wine for a month — the likely origin of the word 'honeymoon'.", + "Did you know? A queen bee can live up to 5 years; a worker, only 6 weeks in summer.", + "Did you know? Drones (male bees) have no stinger.", + "Did you know? Bees fan their wings to cool the hive on hot days.", + "Did you know? Bees can tell time using internal circadian rhythms.", + "Did you know? A foraging bee can carry nectar weighing nearly half her body weight.", + "Did you know? Bumblebees can fly in the rain.", + "Did you know? 
Honeybees evolved from ancient predatory wasps.", + "Did you know? A swarm of bees can contain over 50,000 individuals.", + "Did you know? Bees regulate hive temperature within a degree of 35°C / 95°F.", + "Did you know? The queen's pheromones hold a colony together.", + "Did you know? Bees can recognize the smell of TNT — they're used in landmine detection.", + "Did you know? Bees make beeswax from special glands on their abdomen.", + "Did you know? Royal jelly is what turns a regular larva into a queen.", + "Did you know? Bees do a 'cleansing flight' after winter to relieve themselves.", + "Did you know? Honey is naturally antibacterial.", + "Did you know? Bees can travel up to 6 miles from their hive in a single trip.", + "Did you know? A bee colony collectively visits about 2 million flowers to make one pound of honey.", + "Did you know? Bees have hair on their eyes to collect more pollen.", + "Did you know? Worker bees switch jobs as they age — nurse, builder, guard, then forager.", + "Did you know? The bee was a heraldic emblem of Napoleon's imperial regime.", + "Did you know? Honey has been found preserved in pharaohs' tombs.", + "Did you know? Bees can be trained to detect cancer in human breath.", + "Did you know? The phrase 'busy as a bee' first appeared in Chaucer's Canterbury Tales.", + "Did you know? Stingless bees exist — about 500 species worldwide.", + "Did you know? The mason bee is a far more efficient pollinator than honeybees.", + "Did you know? Bees produce six different products: honey, beeswax, pollen, propolis, royal jelly, and venom.", + "Did you know? 'Propolis' is Greek for 'before the city' — bees seal the hive entrance with it to keep out invaders.", + "Did you know? Bees prefer flowers with caffeine — it boosts their memory.", + "Did you know? Bees actually build round cells first — surface tension in the warm wax reshapes them into hexagons.", + "Did you know? 
Worker bees flap their wings to evaporate water from nectar, making honey.", + "Did you know? Bumblebees are excellent at 'buzz pollination' — vibrating flowers to release pollen.", + "Did you know? Honey's color depends on which flowers the bees visited.", + "Did you know? A bee's stomach holds 70 mg of nectar — nearly its own weight.", + "Did you know? Africanized 'killer' bees came from a 1957 lab accident in Brazil.", + "Did you know? Honeybees are not native to the Americas — they were brought from Europe.", + "Did you know? A bee's alarm pheromone smells like banana — isoamyl acetate, the very same compound.", + "Did you know? The smallest bee in the world is just 2 mm long (Perdita minima).", + "Did you know? The largest bee is Wallace's giant bee, about the length of a thumb.", + "Did you know? Foraging bees find efficient routes between flowers using simple flight-rule heuristics.", + "Did you know? Honey takes 7 days to ripen from nectar inside the hive.", + "Did you know? Bees were used in ancient warfare — Greeks catapulted hives over castle walls.", + "Did you know? Bees use 'undertakers' — workers whose job is to remove dead bees from the hive.", + "Did you know? Bees can count up to four.", + "Did you know? A single bee can produce only about half a gram of wax in her lifetime.", + "Did you know? Bumblebees can carry a load close to their own body weight in pollen and nectar.", + "Did you know? In Mycenaean Greece, priestesses of the goddess Demeter were called 'Melissai' — the bees.", + "Did you know? Mead — honey wine — may be humanity's oldest fermented drink.", + "Did you know? A worker bee can sting only once; the stinger is barbed.", + "Did you know? Honey contains hydrogen peroxide, produced by an enzyme bees add to nectar.", + "Did you know? Bees can be left-handed or right-handed when entering flowers.", + "Did you know? Beekeeping appears in Egyptian wall art dating back 4,500 years.", + "Did you know? 
The 'Queen of the Hive' is actually selected by worker bees in larval stage.", + "Did you know? Without bees, most almonds, blueberries, and apples wouldn't exist as we know them.", + "Did you know? A bee's wings beat fast enough to generate static electricity, which attracts pollen.", + "Did you know? Bees have two stomachs — one for eating, one for storing nectar.", + "Did you know? Killer bees are not particularly venomous — they're just very aggressive.", + "Did you know? Honey crystallization is normal — gentle warming returns it to liquid.", + "Did you know? Bees prefer blue, purple, and yellow flowers — red appears black to them.", + "Did you know? Nearly 90% of wild plants depend on animal pollinators, mostly bees.", + "Did you know? Bees take orientation flights before becoming foragers, memorizing landmarks.", + "Did you know? Some bee species are solitary — they don't form colonies at all.", + "Did you know? A bee scientist is called a melittologist.", + "Did you know? Bees were the totem of the Egyptian pharaohs.", + "Did you know? The Mayans practiced beekeeping with stingless Melipona bees.", + "Did you know? Bees use propolis to mummify intruders they can't carry out of the hive.", + "Did you know? In rural England, 'telling the bees' of a death in the family was tradition — leave them out and they'd reportedly abandon the hive.", + "Did you know? A queen bee mates with up to 20 drones in a single flight.", + "Did you know? Honey from different regions tastes completely different — manuka, acacia, clover, lavender.", + "Did you know? Bees can teach each other to use tools.", + "Did you know? Some bees sleep — even with their tongues sticking out.", + "Did you know? Honeycomb cells tilt slightly upward — about 13 degrees — so liquid honey doesn't drip out before it ripens.", + "Did you know? Drones die immediately after mating with the queen.", + "Did you know? Bee venom is being researched as a cancer treatment.", + "Did you know? 
In Slovenia, beekeeping is so culturally important it's on UNESCO's heritage list.", + "Did you know? Bees can be tracked individually using tiny radio tags.", + "Did you know? The waggle dance can encode distance, direction, and quality of a food source.", + "Did you know? Bees can perceive flower humidity to estimate nectar quality.", + "Did you know? Hive bees fan their wings in coordinated rows to ventilate the colony.", + "Did you know? Pollen is the bee's only source of protein.", + "Did you know? Bees are the only insects that produce food eaten by humans.", + "Did you know? Some orchids look and smell like female bees to trick males into pollinating them.", + "Did you know? Bees recognize their hive entrance by its exact location, not by smell alone.", + "Did you know? Aristotle wrote one of the earliest scientific treatises on beekeeping.", + "Did you know? The hum of a healthy hive is around 250 Hz.", + "Did you know? Bees prefer warm nectar — they're cold-blooded but warm their flight muscles to 35°C.", + "Did you know? Honey contains pinocembrin, an antioxidant studied for its links to brain health.", + "Did you know? In winter, honeybees cluster tightly and shiver their wing muscles to keep the hive warm.", + "Did you know? A worker bee's lifespan in winter is up to 6 months — much longer than summer bees.", + "Did you know? The queen bee produces over 30 different pheromones to manage the colony.", + "Did you know? A pound of honey requires bees to fly the equivalent of three orbits around Earth.", +] + + +def current_bee_fact(tick: int, period_ticks: int = 50) -> str: + """Pick a bee fact from the list, rotating once every ``period_ticks`` + ticks of the REPL's 10 Hz ticker. Default 50 → a new fact every 5s. 
+ """ + if not BEE_FACTS: + return "" + return BEE_FACTS[(tick // max(1, period_ticks)) % len(BEE_FACTS)] + + +# -- Bee-themed action verbs (rotate in place of the static "running") ------ +# Used as the toolbar status label while a command is in flight. Plain +# -ing verbs so they slot grammatically into `` · 12.3s``. + +BEE_VERBS: list[str] = [ + "pollinating", + "buzzing", + "foraging", + "gathering nectar", + "scouting flowers", + "waggle-dancing", + "tending the hive", + "building combs", + "harvesting honey", + "on the wing", + "working the field", + "humming along", + "fanning the hive", + "guarding the entrance", + "swarming", + "courting flowers", + "loading pollen baskets", + "patrolling petals", + "communing with clover", + "sipping nectar", + "weaving wax", + "circling the queen", + "ferrying nectar", + "cleaning cells", + "warming brood", + "deciphering scent trails", + "navigating by sun", + "feeding the queen", + "polishing the comb", + "humming homeward", + "tasting petals", + "marking flowers", + "scouting territories", + "buzzing through HTML", + "extracting honey", + "pollinating pages", + "harvesting data", + "chasing redirects", + "weaving CSS", + "decoding selectors", + "rendering blossoms", + "sniffing user agents", + "scrubbing trackers", +] + + +def current_bee_verb(tick: int, period_ticks: int = 25) -> str: + """Pick a bee verb from the list, rotating once every ``period_ticks`` + ticks. Default 25 → a new verb every 2.5s on the 10 Hz ticker — fast + enough to feel alive on quick scrapes, slow enough not to flicker. + """ + if not BEE_VERBS: + return "running" + return BEE_VERBS[(tick // max(1, period_ticks)) % len(BEE_VERBS)] + + +def current_bee_blurb(tick: int, period_ticks: int = 50) -> str: + """Pick the dim-row content while a command is in flight, alternating + between a "…" bee verb and a "Did you know? ..." fact every + ``period_ticks`` ticks (default 50 → a 5-second switch on the 10 Hz + ticker). 
def update_crawl_status(
    *,
    current_url: str | None = None,
    fetched: int | None = None,
    queued: int | None = None,
    saved: int | None = None,
    phase: str | None = None,
) -> None:
    """Merge the supplied fields into the module-level crawl status.

    Arguments left as ``None`` are skipped, so a signal handler can pass
    only the field it knows about and every other field keeps its
    previous value. The first call creates the status dict with zeroed
    counters and the phase ``"starting"``.

    After the in-memory update, ``_maybe_mirror_to_status_file`` is
    called. When the env var ``SCRAPINGBEE_CRAWL_STATUS_FILE`` is set
    (subprocess crawl mode, where the child cannot push into the
    parent's in-memory status) that helper also writes the current state
    to the JSON file so the parent's ticker can poll it.
    """
    global _crawl_status  # noqa: PLW0603
    if _crawl_status is None:
        _crawl_status = dict(
            current_url=None,
            fetched=0,
            queued=0,
            saved=0,
            phase="starting",
        )
    supplied = (
        ("current_url", current_url),
        ("fetched", fetched),
        ("queued", queued),
        ("saved", saved),
        ("phase", phase),
    )
    for field_name, new_value in supplied:
        # ``None`` means "not provided", not "reset this field".
        if new_value is not None:
            _crawl_status[field_name] = new_value
    _maybe_mirror_to_status_file()
+ """ + sf = os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") + if not sf: + return + if _crawl_status is None and _progress_state is None: + return + try: + import json as _json + + payload: dict = {} + if _crawl_status is not None: + payload.update(_crawl_status) + if _progress_state is not None: + payload["progress_completed"] = _progress_state.get("completed") + payload["progress_total"] = _progress_state.get("total") + payload["progress_rps"] = _progress_state.get("rps") + payload["progress_eta"] = _progress_state.get("eta") + payload["progress_failure_pct"] = _progress_state.get("failure_pct") + tmp = sf + ".tmp" + with open(tmp, "w", encoding="utf-8") as fh: + _json.dump(payload, fh) + os.replace(tmp, sf) + except Exception: + pass + + +def get_crawl_status() -> dict | None: + return _crawl_status + + +def has_crawl_status() -> bool: + return _crawl_status is not None + + +def clear_crawl_status() -> None: + global _crawl_status # noqa: PLW0603 + _crawl_status = None + sf = os.environ.get("SCRAPINGBEE_CRAWL_STATUS_FILE") + if sf: + try: + os.unlink(sf) + except Exception: + pass + + +def tick_crawl_render() -> None: + """Re-render the dedicated crawl status widget in scrollback. Same + in-place mechanism as the batch honeycomb (``emit_progress_lines`` + replaces the last N lines), but rendering the crawl-specific + content: a status line with ``: (X fetched[/Y])`` + plus, when a total is known (sitemap mode), the honeycomb + progress bar above it. + + Safe to call when no crawl is in flight — early-exits if + ``_crawl_status`` is None. + """ + if _crawl_status is None: + return + import io + + from rich.console import Console + + lines_text: list[Text] = [] + progress = _progress_state + if progress is not None: + # Sitemap-mode batch-style bar, identical to the batch widget. 
def crawl_status_line() -> str | None:
    """Return a one-line plain-text summary of the crawl in flight.

    Returns ``None`` when no crawl status is recorded. Otherwise the
    line reads ``<phase>: <url> (<n> fetched[, <m> saved])``, or
    ``<phase>… (<n> fetched)`` while no URL has been reported yet.
    """
    status = _crawl_status
    if status is None:
        return None

    label = status.get("phase") or "fetching"
    fetched_count = status.get("fetched") or 0
    page_url = status.get("current_url")
    if not page_url:
        return f"{label}… ({fetched_count} fetched)"

    # Shorten very long URLs so the line fits on narrow terminals: keep
    # the first 48 characters (scheme + host + start of path) and the
    # last 25 so the page is still recognisable.
    if len(page_url) > 80:
        page_url = page_url[:48] + "…" + page_url[-25:]

    counts = [f"{fetched_count} fetched"]
    saved_count = status.get("saved") or 0
    if saved_count:
        counts.append(f"{saved_count} saved")
    return f"{label}: {page_url} ({', '.join(counts)})"
def tick_progress_render() -> None:
    """Repaint the batch progress widget from the latest recorded state.

    Does nothing when no progress state is set, so it is safe to call
    from a ticker while no batch is running. The boundary-cell shimmer
    comes from ``format_honeycomb_grid`` (called with ``animate=True``),
    which derives its phase from ``time.monotonic()``.
    """
    state = _progress_state
    if state is None:
        return

    import io

    from rich.console import Console

    def _to_ansi(row) -> str:
        # Render one row into an in-memory truecolor console and return
        # the resulting string, escape sequences included.
        sink = io.StringIO()
        Console(
            file=sink,
            force_terminal=True,
            color_system="truecolor",
            highlight=False,
            width=200,
        ).print(row, end="")
        return sink.getvalue()

    grid_rows = format_honeycomb_grid(
        completed=state["completed"],
        total=state["total"],
        rps=state["rps"],
        eta=state["eta"],
        failure_pct=state["failure_pct"],
        animate=True,
    )
    emit_progress_lines([_to_ansi(row) for row in grid_rows])
def styled_echo(message: str, *, style: str = "info", err: bool = True) -> None:
    """Print ``message`` wrapped in a ``[style]…[/style]`` markup tag.

    Args:
        message: Text to print. It is embedded in the markup string
            as-is. NOTE(review): square brackets in ``message`` will
            presumably be read as markup too; confirm callers never pass
            untrusted text here.
        style: Style name used for the opening and closing tag.
        err: When true (the default) print via ``err_console``,
            otherwise via ``console``.
    """
    if err:
        target = err_console
    else:
        target = console
    opening = f"[{style}]"
    closing = f"[/{style}]"
    target.print(f"{opening}{message}{closing}")
def format_progress_line(
    completed: int,
    total: int,
    *,
    rps: float | None = None,
    eta: str | None = None,
    failure_pct: float | None = None,
) -> Text:
    """Build a single-line block-character progress bar with stats.

    Args:
        completed: Number of finished items.
        total: Number of items overall; ``total <= 0`` renders an empty bar.
        rps: Requests per second, appended when not ``None``.
        eta: Pre-formatted ETA string, appended when not ``None``.
        failure_pct: Failure percentage, appended only when above zero.

    Returns:
        A ``Text`` holding the bar, the ``completed/total`` counter and
        any optional stats.
    """
    width = 20
    if total > 0:
        # Clamp to the bar width: without this, ``completed > total``
        # (or a negative ``completed``) yields a bar that is longer than
        # ``width`` cells. The honeycomb renderers cap the same way.
        filled = max(0, min(width, int(width * completed / total)))
    else:
        filled = 0
    bar = "█" * filled + "░" * (width - filled)

    text = Text()
    text.append(" ")
    text.append(bar, style=f"bold {BEE_YELLOW}")
    text.append(f" {completed}/{total}", style="bold white")
    if rps is not None:
        text.append(f" {rps:.0f} req/s", style="dim")
    if eta is not None:
        text.append(f" ETA {eta}", style="dim")
    if failure_pct is not None and failure_pct > 0:
        text.append(f" Failures: {failure_pct:.0f}%", style=f"bold {BEE_RED}")
    return text
def print_completion_summary(
    *,
    succeeded: int,
    failed: int,
    duration_s: float | None = None,
    credits_used: int | None = None,
    output_path: str | None = None,
    is_crawl: bool = False,
) -> None:
    """Print a styled completion summary panel to stderr.

    Args:
        succeeded: Number of items that completed successfully.
        failed: Number of items that failed.
        duration_s: Wall-clock duration in seconds; adds the "Duration"
            row and, when positive with at least one item, "Avg speed".
        credits_used: Credits consumed; adds the "Credits" row.
        output_path: Where results were written; adds the "Output" row.
        is_crawl: Selects the crawl wording for the title and retry tip.
    """
    from rich.panel import Panel
    from rich.table import Table

    total = succeeded + failed
    table = Table(show_header=False, box=None, padding=(0, 1))
    table.add_column(style=f"bold {BEE_YELLOW}", min_width=12)
    table.add_column(style="bold white")

    # Status line shown as the panel subtitle.
    status = Text()
    if failed == 0:
        status.append(" \\(◉ω◉)/ ", style=f"bold {BEE_YELLOW}")
        status.append("Mission accomplished!", style=f"bold {BEE_GREEN}")
    else:
        status.append(" /(◉ω◉)\\ ", style=f"bold {BEE_YELLOW}")
        status.append(f"{succeeded} succeeded, {failed} failed", style=f"bold {BEE_AMBER}")

    table.add_row(
        "Items",
        f"{succeeded}/{total} succeeded" + (f" ({failed} failed)" if failed else ""),
    )
    if credits_used is not None:
        table.add_row("Credits", f"{credits_used:,} used")
    if duration_s is not None:
        if duration_s < 60:
            dur_str = f"{duration_s:.1f}s"
        else:
            m, s = divmod(int(duration_s), 60)
            dur_str = f"{m}m {s}s"
        table.add_row("Duration", dur_str)
        if total > 0 and duration_s > 0:
            table.add_row("Avg speed", f"{total / duration_s:.1f} req/s")
    if output_path:
        table.add_row("Output", output_path)
    if failed > 0:
        tip = (
            "Tip: Retry failures with --resume"
            if not is_crawl
            else "Tip: Re-run with --resume to retry"
        )
        table.add_row("", Text(tip, style="dim"))

    title = "Crawl Complete" if is_crawl else "Batch Complete"
    panel = Panel(
        table,
        title=f"[bold {BEE_YELLOW}]{title}[/]",
        # Pass the Text itself: ``str(status)`` flattened it to plain
        # text and threw away the colours applied above.
        subtitle=status,
        border_style=BEE_YELLOW,
        padding=(1, 2),
    )
    err_console.print(panel)
+ width = 20 + if total <= 0: + filled = 0 + else: + filled = int(width * completed / total) + filled = min(filled, width) + + filled_style = f"bold {BEE_YELLOW}" + outline_style = f"dim {BEE_YELLOW}" + + # Boundary cell shimmer: the next-to-be-filled cell pulses between a + # mid-bright and a soft yellow so the user can see the batch is alive + # even when no completion has fired in the last few ms. Only active + # when ``animate=True`` (the REPL ticker passes that) and only when + # there is a still-empty cell at the front of the bar. + shimmer_styles: list[str] = [] + if animate and filled < width: + import math + import time as _time + + # 1.2 Hz pulse — slow enough to read, fast enough to feel alive. + phase = 0.5 + 0.5 * math.sin(_time.monotonic() * 2 * math.pi * 1.2) + if phase > 0.55: + shimmer_styles.append(f"bold {BEE_YELLOW}") + else: + shimmer_styles.append(f"{BEE_YELLOW}") + + def _render_row(row_text: Text) -> None: + if filled > 0: + row_text.append("⬢" * filled, style=filled_style) + if filled < width: + if shimmer_styles: + # First empty cell uses the shimmer style; the rest are + # the regular dim-yellow outline. + row_text.append("⬡", style=shimmer_styles[0]) + if (width - filled) > 1: + row_text.append("⬡" * (width - filled - 1), style=outline_style) + else: + row_text.append("⬡" * (width - filled), style=outline_style) + + row_text = Text() + row_text.append(" ") + _render_row(row_text) + # Stats trail directly off the single row. 
def _applescript_quote(value: str) -> str:
    """Escape ``value`` for embedding in a double-quoted AppleScript string."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _powershell_quote(value: str) -> str:
    """Escape ``value`` for embedding in a single-quoted PowerShell string."""
    return value.replace("'", "''")


def notify_completion(title: str, body: str) -> None:
    """Send a desktop notification + terminal bell. Cross-platform.

    Always writes the bell character to stderr, then tries one
    platform-specific notifier: ``osascript`` on macOS, a PowerShell
    toast on Windows, ``notify-send`` elsewhere when it is on ``PATH``.
    Any failure in the notifier is swallowed because the notification is
    best-effort.

    Args:
        title: Notification title.
        body: Notification body text.
    """
    import shutil
    import subprocess

    # Terminal bell
    sys.stderr.write("\a")
    sys.stderr.flush()

    try:
        if sys.platform == "darwin":
            # ``title``/``body`` end up inside AppleScript source, so
            # quotes and backslashes must be escaped; otherwise a stray
            # ``"`` breaks the script or injects extra statements.
            script = (
                f'display notification "{_applescript_quote(body)}" '
                f'with title "{_applescript_quote(title)}"'
            )
            subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                timeout=5,
            )
        elif sys.platform == "win32":
            # Same concern for the single-quoted PowerShell literals.
            ps_title = _powershell_quote(title)
            ps_body = _powershell_quote(body)
            # PowerShell toast notification
            ps_cmd = (
                "[Windows.UI.Notifications.ToastNotificationManager, Windows.UI.Notifications, "
                "ContentType = WindowsRuntime] > $null; "
                "$template = [Windows.UI.Notifications.ToastNotificationManager]::"
                "GetTemplateContent([Windows.UI.Notifications.ToastTemplateType]::ToastText02); "
                "$textNodes = $template.GetElementsByTagName('text'); "
                f"$textNodes.Item(0).AppendChild($template.CreateTextNode('{ps_title}')) > $null; "
                f"$textNodes.Item(1).AppendChild($template.CreateTextNode('{ps_body}')) > $null; "
                "$toast = [Windows.UI.Notifications.ToastNotification]::new($template); "
                "[Windows.UI.Notifications.ToastNotificationManager]::"
                "CreateToastNotifier('ScrapingBee CLI').Show($toast)"
            )
            subprocess.run(
                ["powershell", "-Command", ps_cmd],
                capture_output=True,
                timeout=10,
            )
        elif shutil.which("notify-send"):
            # Arguments are passed as a list (no shell), so no escaping
            # is needed on this path.
            subprocess.run(
                ["notify-send", title, body, "-i", "dialog-information"],
                capture_output=True,
                timeout=5,
            )
    except Exception:
        pass  # Notification is best-effort
def print_welcome_banner(version: str, commands: dict[str, list[tuple[str, str]]]) -> None:
    """Print the branded welcome screen: header, tagline, grouped commands.

    Args:
        version: Version string shown after the product name.
        commands: Maps each group name to its ``(cmd_name, description)``
            pairs; groups and commands print in the mapping's iteration
            order.
    """
    accent = f"bold {BEE_YELLOW}"

    # Header: bee mascot, product name, version.
    title_line = Text()
    title_line.append(" ")
    title_line.append_text(_render_inline_bee(0))
    title_line.append(" ScrapingBee CLI ", style=accent)
    title_line.append(f"v{version}", style="bold white")
    err_console.print(title_line)
    err_console.print(" [dim]Web scraping from the terminal, powered by bees.[/dim]")
    err_console.print()

    # One titled section per group, each followed by a blank line.
    for heading, entries in commands.items():
        err_console.print(f" [{accent}]~~ {heading} ~~[/]")
        for name, summary in entries:
            err_console.print(f" [{accent}]{name:<20}[/] [dim]{summary}[/dim]")
        err_console.print()

    err_console.print(
        " [dim]Run[/dim] [bold white]scrapingbee --help[/] [dim]for details.[/dim]"
    )
    err_console.print()
def echo_bee_error(status_code: int, fallback_msg: str = "") -> None:
    """Print a bee-themed error for ``status_code`` with an actionable tip.

    Status codes listed in ``_ERROR_MESSAGES`` get a headline line
    (prefixed with the bee mascot) followed by a dim "Tip:" line. Any
    other code goes through ``echo_error`` with ``fallback_msg``, or with
    a generic ``Error: HTTP <code>`` message when no fallback is given.
    """
    known = _ERROR_MESSAGES.get(status_code)
    if known is None:
        echo_error(fallback_msg or f"Error: HTTP {status_code}")
        return

    headline, hint = known
    banner = Text()
    banner.append(" ")
    # Frame 2 is the wings-down pose, used for errors.
    banner.append_text(_render_inline_bee(2))
    banner.append(f" {headline}", style=f"bold {BEE_RED}")
    err_console.print(banner)
    err_console.print(f" [dim]Tip: {hint}[/dim]")
from scrapy_scrapingbee import ScrapingBeeRequest - - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"return_page_text": True}, - output_dir=None, - ) - response = self._make_response("https://example.com/page", b"Plain text, no links") - requests = list(spider.parse(response)) - - assert len(requests) == 1 - assert isinstance(requests[0], ScrapingBeeRequest) - assert requests[0].callback == spider._parse_discovery_links_only - assert requests[0].dont_filter is True - - def test_parse_does_not_yield_discovery_when_links_found(self): - """parse() must not yield a discovery request when the body already has links.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={}, - output_dir=None, - ) - spider.seen_urls.add("https://example.com") - - response = self._make_response( - "https://example.com", - b'link1link2', - ) - requests = list(spider.parse(response)) - - # No request should target the discovery callback - for req in requests: - assert req.callback != spider._parse_discovery_links_only - - def test_parse_discovery_links_only_follows_links_but_does_not_save(self, tmp_path): - """_parse_discovery_links_only must yield follow requests but never write files.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"return_page_text": True}, - output_dir=str(tmp_path), - ) - spider.seen_urls.add("https://example.com") - - response = self._make_response( - "https://example.com", - b'p1p2', - ) - requests = list(spider._parse_discovery_links_only(response)) - - # Should yield follow requests (not empty) - assert len(requests) > 0 - # Each follow request must use the main parse callback (not discovery again) - for req in requests: - assert req.callback == 
spider.parse - # Nothing written — discovery does not save - assert list(tmp_path.iterdir()) == [] - - class TestSpiderSaveResponse: """Tests for _save_response manifest field extraction.""" @@ -427,91 +349,6 @@ def test_return_page_markdown_does_not_require_discovery(self): assert _requires_discovery_phase({"return_page_markdown": "true"}) is False -class TestNonHtmlUrlExtensions: - """Tests for the _NON_HTML_URL_EXTENSIONS set and its use in parse().""" - - def test_image_extensions_are_binary(self): - for ext in ("jpg", "jpeg", "png", "gif", "webp", "svg", "ico"): - assert ext in _NON_HTML_URL_EXTENSIONS, f"{ext!r} should be in _NON_HTML_URL_EXTENSIONS" - - def test_download_extensions_are_binary(self): - for ext in ("pdf", "zip"): - assert ext in _NON_HTML_URL_EXTENSIONS - - def test_web_asset_extensions_are_binary(self): - for ext in ("css", "js"): - assert ext in _NON_HTML_URL_EXTENSIONS - - def test_html_like_extensions_not_in_set(self): - # These can contain links and must NOT be skipped - for ext in ("html", "htm", "asp", "aspx", "php", "xml", "md", "txt", "json"): - assert ext not in _NON_HTML_URL_EXTENSIONS, ( - f"{ext!r} must not be in _NON_HTML_URL_EXTENSIONS" - ) - - def _make_response(self, url: str, body: bytes, depth: int = 0): - from scrapy.http import HtmlResponse, Request - - response = HtmlResponse(url, body=body, encoding="utf-8") - response.request = Request(url, meta={"depth": depth}) - return response - - def test_parse_skips_discovery_for_image_url(self): - """parse() must NOT yield a discovery request when the URL is a known binary type.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"extract_rules": '{"price": ".price"}'}, - output_dir=None, - ) - # Simulate fetching a JPEG URL that returns no links (binary body) - response = self._make_response( - "https://example.com/hero.jpg", - b"\xff\xd8\xff\xe0", # JPEG magic bytes - ) - 
requests = list(spider.parse(response)) - # Must yield nothing — no discovery re-request for binary URLs - assert requests == [], f"Expected no requests for binary URL, got {requests}" - - def test_parse_still_fires_discovery_for_html_url_with_no_links(self): - """parse() must still yield a discovery request for HTML-like URLs with no links.""" - from scrapy_scrapingbee import ScrapingBeeRequest - - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={"extract_rules": '{"price": ".price"}'}, - output_dir=None, - ) - # JSON response body (from extract_rules) has no links - response = self._make_response( - "https://example.com/product", # no binary extension → should fire discovery - b'{"price": "$9.99"}', - ) - requests = list(spider.parse(response)) - assert len(requests) == 1 - assert isinstance(requests[0], ScrapingBeeRequest) - assert requests[0].callback == spider._parse_discovery_links_only - - def test_parse_skips_discovery_for_css_url(self): - """CSS files never contain HTML links — discovery must be skipped.""" - from scrapingbee_cli.crawl import GenericScrapingBeeSpider - - spider = GenericScrapingBeeSpider( - start_urls=["https://example.com"], - scrape_params={}, - output_dir=None, - ) - response = self._make_response( - "https://example.com/styles/main.css", - b"body { color: red; }", - ) - requests = list(spider.parse(response)) - assert requests == [] - - class TestExtractHrefsExceptionHandling: """Tests that _extract_hrefs_from_response handles non-HTML gracefully."""