diff --git a/alembic/versions/018_link_edge_position.py b/alembic/versions/018_link_edge_position.py new file mode 100644 index 0000000..14c21e2 --- /dev/null +++ b/alembic/versions/018_link_edge_position.py @@ -0,0 +1,24 @@ +"""Add position column to link_edges for nav/header/content/footer/sidebar classification. + +Revision ID: 018_link_edge_position +Revises: 017_content_drafts +""" +from __future__ import annotations + +from alembic import op + +revision = "018_link_edge_position" +down_revision = "017_content_drafts" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute(""" + ALTER TABLE link_edges + ADD COLUMN IF NOT EXISTS position TEXT NOT NULL DEFAULT 'content' + """) + + +def downgrade() -> None: + op.execute("ALTER TABLE link_edges DROP COLUMN IF EXISTS position") diff --git a/alembic/versions/019_crawl_run_mobile_link.py b/alembic/versions/019_crawl_run_mobile_link.py new file mode 100644 index 0000000..ef31bef --- /dev/null +++ b/alembic/versions/019_crawl_run_mobile_link.py @@ -0,0 +1,24 @@ +"""Add mobile_run_id to crawl_runs for pairing desktop+mobile dual crawls. + +Revision ID: 019_crawl_run_mobile_link +Revises: 018_link_edge_position +""" +from __future__ import annotations + +from alembic import op + +revision = "019_crawl_run_mobile_link" +down_revision = "018_link_edge_position" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute(""" + ALTER TABLE crawl_runs + ADD COLUMN IF NOT EXISTS mobile_run_id INT REFERENCES crawl_runs(id) + """) + + +def downgrade() -> None: + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS mobile_run_id") diff --git a/alembic/versions/020_crawl_run_pause_state.py b/alembic/versions/020_crawl_run_pause_state.py new file mode 100644 index 0000000..49ede24 --- /dev/null +++ b/alembic/versions/020_crawl_run_pause_state.py @@ -0,0 +1,26 @@ +"""Add pause_state JSONB and paused_at to crawl_runs. 
+ +Revision ID: 020_crawl_run_pause_state +Revises: 019_crawl_run_mobile_link +Create Date: 2026-06-18 +""" +from alembic import op + +revision = "020_crawl_run_pause_state" +down_revision = "019_crawl_run_mobile_link" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + "ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS pause_state JSONB" + ) + op.execute( + "ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS paused_at TEXT" + ) + + +def downgrade() -> None: + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS pause_state") + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS paused_at") diff --git a/crawl_results.csv b/crawl_results.csv new file mode 100644 index 0000000..1d05b30 --- /dev/null +++ b/crawl_results.csv @@ -0,0 +1,2 @@ +url,status +https://a.com,200 diff --git a/input.txt.example b/input.txt.example index 14f9f88..d8f5acc 100644 --- a/input.txt.example +++ b/input.txt.example @@ -26,8 +26,9 @@ crawl_exclude_urls = crawl_discovery_mode = spider crawl_url_list = crawl_user_agent_preset = default -crawl_user_agent_custom = -crawl_auth_username = +crawl_user_agent_custom = +compare_mobile_desktop = false +crawl_auth_username = crawl_auth_password = crawl_extra_headers = crawl_cookies = diff --git a/pipeline-config.example.txt b/pipeline-config.example.txt index ab60161..cb1e205 100644 --- a/pipeline-config.example.txt +++ b/pipeline-config.example.txt @@ -27,7 +27,8 @@ crawl_exclude_urls = crawl_discovery_mode = spider crawl_url_list = crawl_user_agent_preset = default -crawl_user_agent_custom = +crawl_user_agent_custom = +compare_mobile_desktop = false crawl_auth_username = crawl_auth_password = crawl_extra_headers = diff --git a/src/website_profiling/commands/config_resolve.py b/src/website_profiling/commands/config_resolve.py index 5468b51..7d91c8b 100644 --- a/src/website_profiling/commands/config_resolve.py +++ b/src/website_profiling/commands/config_resolve.py @@ -364,4 +364,11 @@ def build_parser() -> 
argparse.ArgumentParser: dest="stdin_json", help="For 'chat' command: read JSON payload from stdin and emit NDJSON events.", ) + parser.add_argument( + "--resume-run-id", + type=int, + default=None, + dest="resume_run_id", + help="Resume a paused crawl from the saved frontier of the given crawl_run_id.", + ) return parser diff --git a/src/website_profiling/commands/pipeline_cmd.py b/src/website_profiling/commands/pipeline_cmd.py index d50078f..84a040e 100644 --- a/src/website_profiling/commands/pipeline_cmd.py +++ b/src/website_profiling/commands/pipeline_cmd.py @@ -157,7 +157,12 @@ def run(cfg: dict, args: argparse.Namespace) -> None: phase_results: list[PhaseResult] = [] - if run_crawl: + resume_run_id = getattr(args, "resume_run_id", None) + if resume_run_id is not None: + phase_results.append( + run_pipeline_phase("crawl", lambda: _run_crawl(cfg, use_database, resume_run_id=resume_run_id)) + ) + elif run_crawl: phase_results.append(run_pipeline_phase("crawl", lambda: _run_crawl(cfg, use_database))) if run_content_analysis and use_database: @@ -209,7 +214,7 @@ def _finalize_pipeline_run(phase_results: list[PhaseResult]) -> None: sys.exit(1) -def _run_crawl(cfg: dict, use_database: bool) -> None: +def _run_crawl(cfg: dict, use_database: bool, resume_run_id: int | None = None) -> None: from ..crawl.crawler import run_crawler console_print("[Crawl] Starting...", flush=True) @@ -304,6 +309,7 @@ def _run_crawl(cfg: dict, use_database: bool) -> None: crawl_robots_txt_override=(cfg.get("crawl_robots_txt_override") or "").strip(), custom_extractors=custom_extractors or None, enable_axe=enable_axe, + resume_run_id=resume_run_id, ) console_print("[Crawl] Done.", flush=True) emit_phase_done("crawl") diff --git a/src/website_profiling/crawl/config.py b/src/website_profiling/crawl/config.py index 2acbffe..a84b036 100644 --- a/src/website_profiling/crawl/config.py +++ b/src/website_profiling/crawl/config.py @@ -69,6 +69,7 @@ class CrawlConfig: crawl_robots_txt_override: str = 
# Module-level pause flag shared by the signal handler and the crawl loop.
# It is set by SIGUSR1 where that signal exists; the crawl loop additionally
# polls a PID-keyed flag file, which is the only pause channel when no signal
# handler could be installed.
_PAUSE_EVENT = threading.Event()


def _handle_pause_signal(signum: int, frame: object) -> None:  # pragma: no cover
    """Signal handler: request a pause by setting the module-level event."""
    _PAUSE_EVENT.set()


def _install_pause_handler() -> bool:
    """Try to register the SIGUSR1 pause handler.

    Returns:
        True when the handler was installed, False when it could not be.

    Registration is best-effort and must never break ``import``:

    * ``AttributeError`` - the platform has no ``SIGUSR1`` (Windows).
    * ``OSError`` - the interpreter refused the registration.
    * ``ValueError`` - ``signal.signal`` was called outside the main thread,
      which happens when this module is first imported lazily from a worker
      thread.  The original code did not catch this and the import crashed.
    """
    try:
        signal.signal(signal.SIGUSR1, _handle_pause_signal)
    except (AttributeError, OSError, ValueError):  # pragma: no cover
        return False
    return True


_install_pause_handler()
The template # `self.session` below is only touched on the main thread (sitemap @@ -233,6 +251,8 @@ def __init__( self._hybrid_fetcher = ( self.fetcher if isinstance(self.fetcher, HybridFetcher) else None ) + if pause_state: + self.frontier.restore_from_state(pause_state) self.frontier.seed_initial_urls( discovery_mode=config.discovery_mode, crawl_url_list=config.crawl_url_list, @@ -429,6 +449,7 @@ def crawl( stream_crawl_run_id: Optional[int] = None, stream_batch_size: int = 500, ) -> pd.DataFrame: + _PAUSE_EVENT.clear() start_time = time.time() from ..progress import CrawlProgressTracker, emit_phase_start @@ -509,6 +530,22 @@ def crawl( remaining.append(f) futures = remaining + # Check for pause request (SIGUSR1) or Windows file-based signal. + if not _PAUSE_EVENT.is_set(): + _pause_file = os.path.join( + os.environ.get("TMPDIR", "/tmp"), + f"wp_pause_{os.getpid()}.flag", + ) + if os.path.exists(_pause_file): + try: + os.unlink(_pause_file) + except OSError: + pass + _PAUSE_EVENT.set() + if _PAUSE_EVENT.is_set(): + self.paused = True + break + if self.queue.empty() and not futures: break finally: @@ -582,8 +619,22 @@ def run_crawler( crawl_robots_txt_override: str = "", custom_extractors: Optional[list] = None, enable_axe: bool = False, + compare_mobile_desktop: bool = False, + resume_run_id: Optional[int] = None, ) -> pd.DataFrame: """Run crawler and optionally save to CSV/JSON or PostgreSQL. 
Returns DataFrame.""" + _resume_pause_state: Optional[dict] = None + if resume_run_id is not None: + from ..db import db_session + from ..db.crawl_store import load_pause_state + with db_session() as _conn: + _resume_pause_state = load_pause_state(_conn, resume_run_id) + if _resume_pause_state: + console_print( + f" Resuming from paused run {resume_run_id} " + f"({len(_resume_pause_state.get('pending', []))} URLs pending)...", + flush=True, + ) max_p = max_pages if max_pages is not None else 0 mode_label = (render_mode or "static").strip().lower() disc_label = normalize_discovery_mode(discovery_mode) @@ -634,6 +685,7 @@ def run_crawler( crawl_robots_txt_override=crawl_robots_txt_override, custom_extractors=custom_extractors, enable_axe=enable_axe, + pause_state=_resume_pause_state, ) stream_run_id: Optional[int] = None if output_db: @@ -663,6 +715,35 @@ def run_crawler( show_progress=show_progress, stream_crawl_run_id=stream_run_id, ) + + # ---- Pause handling: save frontier and exit with code 2 ---- + if getattr(crawler, "paused", False): + import sys + from ..db import db_session + from ..db.crawl_store import save_pause_state + + _pause_run_id = stream_run_id + if _pause_run_id is not None: + _frontier_state = crawler.frontier.serialize_state() + _frontier_state["pages_crawled"] = len(crawler.results) + with db_session() as _conn: + save_pause_state(_conn, _pause_run_id, _frontier_state) + console_print( + f"[PAUSE] crawl_run_id={_pause_run_id}", + flush=True, + ) + else: + console_print("[PAUSE] crawl_run_id=none", flush=True) + sys.exit(2) + + # ---- Resume cleanup: clear saved frontier from the resumed run ---- + if resume_run_id is not None and _resume_pause_state is not None and not getattr(crawler, "paused", False): + from ..db import db_session + from ..db.crawl_store import clear_pause_state + + with db_session() as _conn: + clear_pause_state(_conn, resume_run_id) + if output_db and crawler.link_edges_accum: from ..db import db_session from 
..db.crawl_store import write_link_edges @@ -712,6 +793,70 @@ def run_crawler( console_print(" Crawl DB write complete.", flush=True) elif output_db and stream_run_id is not None: console_print(" Crawl streamed to DB during fetch.", flush=True) + + # Second pass: run mobile crawl and pair the two runs via mobile_run_id FK + if compare_mobile_desktop and output_db and run_id is not None: + from ..db import db_session + from ..db.crawl_store import get_latest_crawl_run_id, set_mobile_run_id + + console_print(" Starting mobile second-pass crawl for comparison...", flush=True) + with db_session() as _conn: + _baseline_id = get_latest_crawl_run_id(_conn) or 0 + run_crawler( + start_url=start_url, + max_pages=max_pages, + concurrency=concurrency, + timeout=timeout, + ignore_robots=ignore_robots, + allow_external=allow_external, + max_depth=max_depth, + polite_delay=polite_delay, + store_outlinks=store_outlinks, + output_csv=None, + output_db=True, + show_progress=show_progress, + exclude_urls=exclude_urls, + preserve_crawl_history=True, + store_content_excerpt=store_content_excerpt, + content_excerpt_max_chars=content_excerpt_max_chars, + store_page_html=False, + run_content_analysis=False, + crawl_stream_to_db=crawl_stream_to_db, + property_id=property_id, + render_mode=render_mode, + js_concurrency=js_concurrency, + js_timeout=js_timeout, + js_wait_until=js_wait_until, + js_extra_wait_ms=js_extra_wait_ms, + js_block_resources=js_block_resources, + capture_console=capture_console, + js_console_levels=js_console_levels, + capture_failed_requests=capture_failed_requests, + console_max_per_page=console_max_per_page, + custom_extraction_regex=custom_extraction_regex, + crawl_ignore_params=crawl_ignore_params, + discovery_mode=discovery_mode, + crawl_url_list=crawl_url_list, + crawl_user_agent_preset="mobile", + crawl_user_agent_custom="", + crawl_auth_username=crawl_auth_username, + crawl_auth_password=crawl_auth_password, + crawl_extra_headers=crawl_extra_headers, + 
crawl_cookies=crawl_cookies, + crawl_robots_txt_override=crawl_robots_txt_override, + custom_extractors=custom_extractors, + enable_axe=False, + compare_mobile_desktop=False, + ) + with db_session() as _conn: + mobile_id = get_latest_crawl_run_id(_conn) + if mobile_id is not None and mobile_id != _baseline_id: + set_mobile_run_id(_conn, run_id, mobile_id) + console_print( + f" Mobile crawl complete (run_id={mobile_id}). Linked to desktop run {run_id}.", + flush=True, + ) + elif output_csv and not df.empty: if output_csv.lower().endswith(".json"): df.to_json(output_csv, orient="records", indent=2, date_format="iso", default_handler=str) diff --git a/src/website_profiling/crawl/frontier.py b/src/website_profiling/crawl/frontier.py index cb1e65a..f3882d9 100644 --- a/src/website_profiling/crawl/frontier.py +++ b/src/website_profiling/crawl/frontier.py @@ -154,3 +154,19 @@ def mark_visited(self, url: str) -> bool: def should_skip_dequeued(self, url: str) -> bool: return url_matches_exclude(url, self.exclude_urls) + + def serialize_state(self) -> dict: + """Return a JSON-serialisable snapshot of the frontier for pause/resume.""" + with self.lock: + pending = list(self.queue.queue) + visited = list(self.visited) + depths = dict(self.depths) + return {"pending": pending, "visited": visited, "depths": depths} + + def restore_from_state(self, state: dict) -> None: + """Pre-populate the frontier from a previously serialised state.""" + with self.lock: + for url in state.get("pending", []): + self.queue.put(url) + self.visited.update(state.get("visited", [])) + self.depths.update(state.get("depths", {})) diff --git a/src/website_profiling/db/crawl_store.py b/src/website_profiling/db/crawl_store.py index fc50d32..b86ef57 100644 --- a/src/website_profiling/db/crawl_store.py +++ b/src/website_profiling/db/crawl_store.py @@ -105,6 +105,141 @@ def get_crawl_run_info(conn: Connection, run_id: int) -> Optional[dict[str, Any] return None +def set_mobile_run_id(conn: Connection, 
desktop_run_id: int, mobile_run_id: int) -> None: + """Link a mobile crawl run to its paired desktop run.""" + conn.execute( + "UPDATE crawl_runs SET mobile_run_id = %s WHERE id = %s", + (mobile_run_id, desktop_run_id), + ) + conn.commit() + + +def get_mobile_run_id(conn: Connection, run_id: int) -> Optional[int]: + """Return the mobile_run_id paired with this desktop run, or None.""" + try: + cur = conn.execute( + "SELECT mobile_run_id FROM crawl_runs WHERE id = %s", (run_id,) + ) + row = cur.fetchone() + if row is None: + return None + val = row["mobile_run_id"] + return int(val) if val is not None else None + except Exception: + return None + + +def read_mobile_desktop_delta(conn: Connection, desktop_run_id: int) -> list[dict[str, Any]]: + """Compare desktop vs paired mobile crawl, returning per-URL delta rows. + + Each row has: url, desktop, mobile (each with title/h1/word_count/status), + and boolean flags title_differs, h1_differs, status_differs, plus word_count_delta. + Only URLs present in both runs with at least one meaningful difference are included. 
+ """ + mobile_run_id = get_mobile_run_id(conn, desktop_run_id) + if mobile_run_id is None: + return [] + desktop_df = read_crawl(conn, desktop_run_id) + mobile_df = read_crawl(conn, mobile_run_id) + if desktop_df.empty or mobile_df.empty: + return [] + + def _norm(s: Any) -> str: + return str(s or "").rstrip("/").lower() + + def _int(v: Any) -> int: + try: + return int(v or 0) + except (TypeError, ValueError): + return 0 + + desktop_map = {_norm(r.get("url")): r for r in desktop_df.to_dict("records")} + mobile_map = {_norm(r.get("url")): r for r in mobile_df.to_dict("records")} + + deltas: list[dict[str, Any]] = [] + for url_key, dr in desktop_map.items(): + mr = mobile_map.get(url_key) + if mr is None: + continue + d_title = str(dr.get("title") or "") + m_title = str(mr.get("title") or "") + d_h1 = str(dr.get("h1") or "") + m_h1 = str(mr.get("h1") or "") + d_wc = _int(dr.get("word_count")) + m_wc = _int(mr.get("word_count")) + d_st = _int(dr.get("status")) + m_st = _int(mr.get("status")) + + title_diff = d_title != m_title + h1_diff = d_h1 != m_h1 + wc_diff = abs(d_wc - m_wc) + status_diff = d_st != m_st + + if not (title_diff or h1_diff or wc_diff > 50 or status_diff): + continue + deltas.append({ + "url": str(dr.get("url") or url_key), + "desktop": {"title": d_title, "h1": d_h1, "word_count": d_wc, "status": d_st}, + "mobile": {"title": m_title, "h1": m_h1, "word_count": m_wc, "status": m_st}, + "title_differs": title_diff, + "h1_differs": h1_diff, + "word_count_delta": wc_diff, + "status_differs": status_diff, + }) + + # Sort: status diffs first (mobile indexing risk), then title, then word count delta + deltas.sort( + key=lambda d: -( + (4 if d["status_differs"] else 0) + + (2 if d["title_differs"] else 0) + + (1 if d["h1_differs"] else 0) + + (1 if d["word_count_delta"] > 100 else 0) + ) + ) + return deltas + + +def save_pause_state(conn: Connection, run_id: int, state: dict) -> None: + """Persist frontier state for a paused crawl run.""" + from datetime 
def load_pause_state(conn: Connection, run_id: int) -> Optional[dict]:
    """Load the saved frontier state for a paused crawl run.

    Args:
        conn: Open database connection.
        run_id: ``crawl_runs.id`` of the paused run.

    Returns:
        The stored state as a ``dict``, or ``None`` when the run does not
        exist, has no saved state, the lookup failed, or the stored value is
        not a JSON object.

    Failures are still reported as ``None`` (callers treat that as "nothing
    to resume"), but they are now logged: previously a database or decoding
    error was indistinguishable from "no saved state", so a resume silently
    degraded into a fresh crawl.
    """
    import logging

    log = logging.getLogger(__name__)
    try:
        cur = conn.execute(
            "SELECT pause_state FROM crawl_runs WHERE id = %s", (run_id,)
        )
        row = cur.fetchone()
        if row is None or row["pause_state"] is None:
            return None
        stored = row["pause_state"]
        # The column may come back as raw JSON text or as an already-decoded
        # object, depending on how the driver adapts it.
        state = json.loads(stored) if isinstance(stored, str) else stored
    except Exception:
        log.warning(
            "Could not load pause_state for crawl run %s", run_id, exc_info=True
        )
        return None

    if not isinstance(state, dict):
        # A list/number/string here would blow up later when the caller
        # calls .get() on it; reject it up front.
        log.warning(
            "Ignoring pause_state for crawl run %s: expected a JSON object, got %s",
            run_id,
            type(state).__name__,
        )
        return None
    return dict(state)


def clear_pause_state(conn: Connection, run_id: int) -> None:
    """Clear the saved frontier state after a successful resume.

    Best-effort: a failure never propagates to the caller.  Unlike the
    original silent ``pass``, the failure is logged and the connection is
    rolled back so it is not left in an aborted transaction.
    """
    import logging
    from contextlib import suppress

    try:
        conn.execute(
            "UPDATE crawl_runs SET pause_state = NULL, paused_at = NULL WHERE id = %s",
            (run_id,),
        )
        conn.commit()
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not clear pause_state for crawl run %s", run_id, exc_info=True
        )
        # The rollback itself is best-effort too (the connection may be gone).
        with suppress(Exception):
            conn.rollback()
# Upper bound on how many issues are forwarded to the LLM.
MAX_ISSUES = 80

# Optional numeric fields: (snake_case key, camelCase fallback key).
_OPTIONAL_NUMERIC_FIELDS = (
    ("impact_score", "impactScore"),
    ("gsc_clicks", "gscClicks"),
)


def _as_count(value: Any) -> int:
    """Coerce *value* to an int, returning 0 when it is not numeric."""
    for convert in (int, lambda v: int(float(v))):
        try:
            return convert(value)
        except (TypeError, ValueError, OverflowError):
            continue
    return 0


def _as_url_list(value: Any, limit: int = 5) -> list[str]:
    """Return up to *limit* non-blank, stripped URL strings from *value*.

    A bare string is treated as a single URL (iterating it would yield one
    "URL" per character); anything that is not a list-like container is
    ignored.
    """
    if isinstance(value, str):
        candidates: Any = [value]
    elif isinstance(value, (list, tuple, set, frozenset)):
        candidates = value
    else:
        candidates = []
    cleaned = [text for text in (str(u).strip() for u in candidates) if text]
    return cleaned[:limit]


def _compact_issues(raw: list[Any]) -> list[dict[str, Any]]:
    """Normalise raw issue rows into the compact shape sent to the LLM.

    Rows that are not dicts or have a blank ``message`` are dropped.  Both
    snake_case and camelCase keys are accepted for ``url_count``,
    ``sample_urls``, ``impact_score`` and ``gsc_clicks``.  At most
    ``MAX_ISSUES`` rows are returned, in input order.

    Malformed values no longer raise: a non-numeric ``url_count`` becomes 0
    and a non-numeric optional score is omitted.
    """
    compacted: list[dict[str, Any]] = []
    for row in raw or []:
        if len(compacted) >= MAX_ISSUES:
            # Same result as slicing afterwards, without normalising rows
            # that would be discarded anyway.
            break
        if not isinstance(row, dict):
            continue
        message = str(row.get("message") or "").strip()
        if not message:
            continue

        item: dict[str, Any] = {
            "category": str(row.get("category") or ""),
            "message": message,
            "priority": str(row.get("priority") or "Medium"),
            "url_count": _as_count(row.get("url_count") or row.get("urlCount") or 0),
            "sample_urls": _as_url_list(
                row.get("sample_urls") or row.get("sampleUrls") or []
            ),
        }

        recommendation = row.get("recommendation")
        if recommendation:
            item["recommendation"] = str(recommendation)

        for snake, camel in _OPTIONAL_NUMERIC_FIELDS:
            # The snake_case key wins whenever it is present, even if None.
            value = row.get(snake) if snake in row else row.get(camel)
            if value is None:
                continue
            try:
                item[snake] = float(value)
            except (TypeError, ValueError):
                pass  # non-numeric score: leave the field out

        compacted.append(item)
    return compacted
def _build_user_message(
    domain: str,
    issues: list[dict[str, Any]],
    max_chars: int = 12000,
) -> str:
    """Serialise the LLM user message without exceeding *max_chars*.

    When the full payload fits, the output is exactly the JSON the original
    code produced.  When it does not, whole issues are dropped from the end
    until it fits, so the model always receives well-formed JSON instead of a
    string cut off mid-token.  ``issue_count`` keeps reporting the total and
    ``issues_included`` is added to say how many were actually sent.

    If not even a single issue fits, the full payload is hard-truncated as a
    last resort (the original behaviour).
    """
    total = len(issues)

    def _dump(count: int) -> str:
        payload: dict[str, Any] = {
            "domain": domain,
            "issue_count": total,
            "issues": issues[:count],
        }
        if count < total:
            payload["issues_included"] = count
        return json.dumps(payload, indent=2, default=str)

    for count in range(total, 0, -1):
        text = _dump(count)
        if len(text) <= max_chars:
            return text
    return _dump(total)[:max_chars]


def _plan_response(parsed: dict[str, Any], *, cached: bool) -> dict[str, Any]:
    """Build the success response shared by the cached and fresh paths."""
    return {
        "ok": True,
        "cached": cached,
        "plan": _format_plan_markdown(parsed),
        "summary": parsed.get("summary"),
        "phases": parsed.get("phases"),
        "quick_wins": parsed.get("quick_wins"),
        "notes": parsed.get("notes"),
        "provenance": "AI insights",
    }


def generate_issues_action_plan(
    payload: dict[str, Any],
    *,
    cfg: dict[str, str] | None = None,
    refresh: bool = False,
) -> dict[str, Any]:
    """Produce an LLM remediation plan for a deduplicated issue list.

    Args:
        payload: Must carry ``domain`` and a non-empty ``issues`` list.
        cfg: LLM configuration; loaded from the database when omitted.
        refresh: When True, skip the cache lookup and query the model again.

    Returns:
        ``{"ok": True, "cached": ..., "plan": <markdown>, "summary": ...,
        "phases": ..., "quick_wins": ..., "notes": ..., "provenance": ...}``
        on success, or ``{"ok": False, "error": <message>}`` when the feature
        is disabled, the input is incomplete, or the model call fails.  This
        function does not raise for model/client errors.
    """
    from ..llm_config import load_llm_config_from_db

    cfg = cfg or load_llm_config_from_db()
    if not llm_is_enabled(cfg):
        return {"ok": False, "error": "AI insights are disabled."}
    if not _fix_suggestion_enabled(cfg):
        return {"ok": False, "error": "Issue fix suggestions are disabled in AI task settings."}

    domain = str(payload.get("domain") or "").strip()
    issues = _compact_issues(payload.get("issues") or [])
    if not domain:
        return {"ok": False, "error": "domain required."}
    if not issues:
        return {"ok": False, "error": "issues required."}

    model = (cfg.get("llm_model") or cfg.get("llm_provider") or "unknown").strip()
    # The cache key always covers the full issue list, even if the prompt
    # below has to drop some issues to fit the size budget.
    cache_key = _cache_key(model, domain, issues)

    if not refresh:
        cached = _read_cache(cache_key)
        if cached:
            return _plan_response(cached, cached=True)

    try:
        client = get_llm_client(cfg)
        user = _build_user_message(domain, issues)
        raw = client.complete_json(ISSUES_ACTION_PLAN_SYSTEM, user)
        parsed = raw if isinstance(raw, dict) and raw else parse_json_response(str(raw))
        if not isinstance(parsed, dict):
            # Model answered with something that is not a JSON object:
            # surface its text as the summary rather than failing.
            parsed = {"summary": str(raw or "").strip() or "No plan returned."}
        _write_cache(cache_key, parsed)
        return _plan_response(parsed, cached=False)
    except Exception as e:
        return {"ok": False, "error": str(e)}
_NAV_CLASSES = frozenset({"nav", "menu", "navbar", "navigation"})
_HEADER_CLASSES = frozenset({"header", "site-header", "page-header"})
_FOOTER_CLASSES = frozenset({"footer", "site-footer", "page-footer"})
_SIDEBAR_CLASSES = frozenset({"sidebar", "side", "aside", "widget-area"})

# Semantic HTML5 landmark elements -> position label.
_LANDMARK_TAGS = {
    "nav": "nav",
    "header": "header",
    "footer": "footer",
    "aside": "sidebar",
}

# ARIA landmark roles -> position label.
_LANDMARK_ROLES = {
    "navigation": "nav",
    "menubar": "nav",
    "banner": "header",
    "contentinfo": "footer",
    "complementary": "sidebar",
}

# Class / id naming heuristics, checked in this order.
_CLASS_HINTS = (
    ("nav", _NAV_CLASSES),
    ("header", _HEADER_CLASSES),
    ("footer", _FOOTER_CLASSES),
    ("sidebar", _SIDEBAR_CLASSES),
)


def _classify_position(tag) -> str:
    """Classify where a link sits on the page by walking its ancestor chain.

    Ancestors are inspected nearest-first.  For each one the checks are, in
    order: semantic landmark element, ARIA role, then class/id naming
    heuristics.  The first match wins.

    Returns:
        One of ``"nav"``, ``"header"``, ``"footer"``, ``"sidebar"`` or
        ``"content"`` (the default when nothing matches).

    Only ``<main>`` (or ``role="main"``) ends the walk early with
    ``"content"``.  ``<section>`` and ``<article>`` used to end it as well,
    which mislabelled links in layouts such as ``<footer><section>...`` or
    ``<aside><article>...`` as body content; they are now walked through.
    """
    for parent in tag.parents:
        name = getattr(parent, "name", None)
        if name is None:
            continue

        # Semantic HTML5 landmarks (highest priority - unambiguous).
        if name in _LANDMARK_TAGS:
            return _LANDMARK_TAGS[name]

        # ARIA roles.
        role = str(parent.get("role") or "").strip().lower()
        if role in _LANDMARK_ROLES:
            return _LANDMARK_ROLES[role]

        # Class / id heuristics for common naming conventions.  ``class`` is
        # normally a list of tokens, but tolerate a plain string as well
        # (iterating a string would compare single characters).
        raw_classes = parent.get("class") or []
        if isinstance(raw_classes, str):
            raw_classes = raw_classes.split()
        cls_set = {str(c).lower() for c in raw_classes}
        elem_id = str(parent.get("id") or "").lower()
        for position, names in _CLASS_HINTS:
            if cls_set & names or elem_id in names:
                return position

        if name == "main" or role == "main":
            return "content"
    return "content"
# Unambiguous regex metacharacters that distinguish a regex from a plain path
# prefix: ".*", ".+", backslash escapes (\d \w \s \D \W \S \b \B \A \Z),
# "[", "(", "{", "$" and "|".  A lone "." (as in /api/v1.0) is deliberately
# not treated as regex.  The escape list previously omitted lowercase "s", so
# patterns using \s were mistaken for literal prefixes.
_REGEX_INDICATOR = re.compile(r"\.\*|\.\+|\\[dwsDWSbBAZ]|\[|\(|\{|\$|\|")


def _is_regex(pattern: str) -> bool:
    """Return True when *pattern* contains unambiguous regex metacharacters."""
    return bool(_REGEX_INDICATOR.search(pattern))
build_crawl_segments( - df, + df: Any, categories: list[dict[str, Any]], path_prefixes: list[str], ) -> dict[str, Any] | None: + """Build per-segment health data. + + Each entry in *path_prefixes* may be a plain path prefix ("/blog") or a + regex pattern ("/blog/.*", r"/api/v\\d+"). Regex patterns are detected + automatically by the presence of unambiguous metacharacters. + + Health scores are computed from the segment's own URL subset rather than + inheriting the site-wide average. + """ if not path_prefixes or df is None or getattr(df, "empty", True): return None + # Site-wide overall health (kept for backward compatibility) overall_scores = [ float(c.get("score")) for c in categories @@ -22,23 +96,40 @@ def build_crawl_segments( ] overall = round_half_up(sum(overall_scores) / len(overall_scores)) if overall_scores else None + # Pre-compile patterns once + compiled_patterns: list[tuple[str, bool, Any]] = [] + for raw in path_prefixes: + p = raw if raw.startswith("/") else f"/{raw}" + is_rx = _is_regex(p) + try: + compiled: Any = re.compile(p) if is_rx else p + except re.error: + is_rx = False + compiled = p + compiled_patterns.append((p, is_rx, compiled)) + segments: list[dict[str, Any]] = [] - for prefix in path_prefixes: - p = prefix if prefix.startswith("/") else f"/{prefix}" - urls = [] + for prefix, is_rx, compiled in compiled_patterns: + matching_rows = [] for _, row in df.iterrows(): url = str(row.get("url") or "") try: path = urlparse(url).path or "/" except Exception: path = url - if path == p or path.startswith(p.rstrip("/") + "/"): - urls.append(url) + if _matches_path(path, prefix, is_rx, compiled): + matching_rows.append(row.to_dict() if hasattr(row, "to_dict") else dict(row)) + + seg_df = pd.DataFrame(matching_rows) if matching_rows else pd.DataFrame() + health: int | None = _segment_health(seg_df) if not seg_df.empty else 0 + segments.append( { - "prefix": p, - "url_count": len(urls), - "health_score": overall, + "prefix": prefix, + "url_count": 
len(matching_rows), + "health_score": health, + "pattern_type": "regex" if is_rx else "prefix", } ) + return {"overall_health": overall, "segments": segments} diff --git a/src/website_profiling/reporting/link_edges_report.py b/src/website_profiling/reporting/link_edges_report.py index 6bc98a9..49f54a0 100644 --- a/src/website_profiling/reporting/link_edges_report.py +++ b/src/website_profiling/reporting/link_edges_report.py @@ -18,8 +18,8 @@ def summarize_link_rel(edges: list[dict[str, Any]]) -> dict[str, Any]: def build_inlink_anchor_matrix(edges: list[dict[str, Any]], *, limit: int = 500) -> list[dict[str, Any]]: - """Aggregate inlink anchor text counts per target URL.""" - buckets: dict[tuple[str, str], int] = Counter() + """Aggregate inlink anchor text counts per target URL, including dominant position.""" + buckets: dict[tuple[str, str], Counter] = {} for e in edges: if str(e.get("link_type") or "") != "internal": continue @@ -28,10 +28,14 @@ def build_inlink_anchor_matrix(edges: list[dict[str, Any]], *, limit: int = 500) source = str(e.get("from_url") or "").rstrip("/") if not target or not source: continue - buckets[(target, anchor)] += 1 - rows = [ - {"target_url": t, "anchor_text": a, "inlink_count": c} - for (t, a), c in buckets.items() - ] + key = (target, anchor) + if key not in buckets: + buckets[key] = Counter() + buckets[key][str(e.get("position") or "content")] += 1 + rows = [] + for (t, a), pos_counter in buckets.items(): + total = sum(pos_counter.values()) + top_pos = pos_counter.most_common(1)[0][0] if pos_counter else "content" + rows.append({"target_url": t, "anchor_text": a, "inlink_count": total, "top_position": top_pos}) rows.sort(key=lambda r: (-r["inlink_count"], r["target_url"])) return rows[: max(1, limit)] diff --git a/src/website_profiling/tools/export_crawl_workbook.py b/src/website_profiling/tools/export_crawl_workbook.py index 87bbdb2..6fd46d1 100644 --- a/src/website_profiling/tools/export_crawl_workbook.py +++ 
b/src/website_profiling/tools/export_crawl_workbook.py @@ -67,7 +67,7 @@ def build_crawl_workbook_zip(report_payload: dict[str, Any]) -> bytes: if isinstance(link_edges, list) and link_edges: edge_cols = [ "from_url", "to_url", "anchor_text", "rel", - "is_nofollow", "is_sponsored", "is_ugc", "link_type", + "is_nofollow", "is_sponsored", "is_ugc", "link_type", "position", ] zf.writestr("links.csv", _csv_bytes(link_edges, edge_cols)) diff --git a/tests/reporting/test_crawl_segments.py b/tests/reporting/test_crawl_segments.py index 882272b..d70bb54 100644 --- a/tests/reporting/test_crawl_segments.py +++ b/tests/reporting/test_crawl_segments.py @@ -3,9 +3,148 @@ import pandas as pd -from website_profiling.reporting.crawl_segments import build_crawl_segments +from website_profiling.reporting.crawl_segments import ( + _is_regex, + _matches_path, + _segment_health, + build_crawl_segments, +) +# --------------------------------------------------------------------------- +# _is_regex +# --------------------------------------------------------------------------- + +def test_is_regex_plain_prefix() -> None: + assert _is_regex("/blog") is False + assert _is_regex("/api/v1") is False + assert _is_regex("/products-new") is False + + +def test_is_regex_dotstar_pattern() -> None: + assert _is_regex("/blog/.*") is True + assert _is_regex("/api/.*") is True + + +def test_is_regex_dotplus_pattern() -> None: + assert _is_regex("/api/.+") is True + + +def test_is_regex_shorthand_class() -> None: + assert _is_regex(r"/api/v\d+") is True + assert _is_regex(r"/path/\w+") is True + + +def test_is_regex_character_class() -> None: + assert _is_regex("/api/[v][0-9]") is True + + +def test_is_regex_group() -> None: + assert _is_regex("/(blog|news)/") is True + + +def test_is_regex_dollar_anchor() -> None: + assert _is_regex("/blog$") is True + + +def test_is_regex_single_dot_not_flagged() -> None: + """A plain dot in a path like /api/v1.0 should NOT be treated as regex.""" + assert 
_is_regex("/api/v1.0") is False + + +# --------------------------------------------------------------------------- +# _matches_path +# --------------------------------------------------------------------------- + +def test_matches_path_prefix_exact() -> None: + assert _matches_path("/blog", "/blog", False, "/blog") is True + + +def test_matches_path_prefix_child() -> None: + assert _matches_path("/blog/post-1", "/blog", False, "/blog") is True + + +def test_matches_path_prefix_no_match() -> None: + assert _matches_path("/about", "/blog", False, "/blog") is False + assert _matches_path("/blogger", "/blog", False, "/blog") is False + + +def test_matches_path_regex() -> None: + import re + pattern = "/api/.*" + compiled = re.compile(pattern) + assert _matches_path("/api/v1/users", pattern, True, compiled) is True + assert _matches_path("/about", pattern, True, compiled) is False + + +# --------------------------------------------------------------------------- +# _segment_health +# --------------------------------------------------------------------------- + +def test_segment_health_all_ok() -> None: + df = pd.DataFrame([ + {"url": "https://ex.com/a", "status": 200, "title": "A", "description": "desc"}, + {"url": "https://ex.com/b", "status": 200, "title": "B", "description": "desc"}, + ]) + assert _segment_health(df) == 100 + + +def test_segment_health_empty_df() -> None: + assert _segment_health(pd.DataFrame()) == 0 + + +def test_segment_health_error_status_deduction() -> None: + """50% 4xx → deducts 15 pts (50% of 30).""" + df = pd.DataFrame([ + {"status": 200}, {"status": 200}, + {"status": 404}, {"status": 404}, + ]) + score = _segment_health(df) + assert score == 85 # 100 - round(30 * 0.5) = 85 + + +def test_segment_health_missing_title_deduction() -> None: + """All titles missing → full 20-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": ""} for _ in range(5)]) + score = _segment_health(df) + assert score == 80 # 100 - 20 + + +def 
test_segment_health_missing_description_deduction() -> None: + """All descriptions missing → full 10-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": "T", "description": ""} for _ in range(5)]) + score = _segment_health(df) + assert score == 90 # 100 - 10 + + +def test_segment_health_missing_viewport_deduction() -> None: + """All viewport missing → full 10-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": "T", "viewport_present": False} for _ in range(5)]) + score = _segment_health(df) + assert score == 90 # 100 - 10 + + +def test_segment_health_clamped_to_zero() -> None: + """Multiple deductions stack: 100 - 30(status) - 20(title) - 10(desc) - 10(viewport) = 30.""" + df = pd.DataFrame([ + {"status": 500, "title": "", "description": "", "viewport_present": False} + for _ in range(10) + ]) + assert _segment_health(df) == 30 + + +def test_segment_health_small_missing_rate_no_deduction() -> None: + """Under 10% missing rate triggers no deduction.""" + rows = [{"status": 200, "title": "T", "description": "D"} for _ in range(10)] + rows[0]["title"] = "" # 10% — threshold is > 10%, so no deduction + df = pd.DataFrame(rows) + assert _segment_health(df) == 100 + + +# --------------------------------------------------------------------------- +# build_crawl_segments +# --------------------------------------------------------------------------- + def test_build_crawl_segments_groups_by_prefix() -> None: df = pd.DataFrame([ {"url": "https://example.com/blog/a"}, @@ -19,6 +158,7 @@ def test_build_crawl_segments_groups_by_prefix() -> None: seg = out["segments"][0] assert seg["prefix"] == "/blog" assert seg["url_count"] == 2 + assert seg["pattern_type"] == "prefix" def test_build_crawl_segments_empty_prefixes() -> None: @@ -34,3 +174,93 @@ def test_build_crawl_segments_handles_bad_url() -> None: out = build_crawl_segments(df, [{"id": "x", "score": 80}], ["/not-a-valid-url"]) assert out is not None assert out["segments"][0]["url_count"] == 1 + + +def 
test_build_crawl_segments_regex_pattern() -> None: + """Regex pattern /blog/.* matches /blog/post-1 and /blog/post-2.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/post-1"}, + {"url": "https://example.com/blog/post-2"}, + {"url": "https://example.com/about"}, + ]) + out = build_crawl_segments(df, [], ["/blog/.*"]) + assert out is not None + seg = out["segments"][0] + assert seg["url_count"] == 2 + assert seg["pattern_type"] == "regex" + + +def test_build_crawl_segments_mixed_prefix_and_regex() -> None: + """Mixed literal prefix and regex in the same list.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/a", "status": 200, "title": "T"}, + {"url": "https://example.com/api/v1/users", "status": 200, "title": "T"}, + {"url": "https://example.com/api/v2/items", "status": 200, "title": "T"}, + {"url": "https://example.com/about", "status": 200, "title": "T"}, + ]) + out = build_crawl_segments(df, [], ["/blog", r"/api/v\d+"]) + assert out is not None + assert len(out["segments"]) == 2 + blog_seg = next(s for s in out["segments"] if s["prefix"] == "/blog") + api_seg = next(s for s in out["segments"] if "api" in s["prefix"]) + assert blog_seg["url_count"] == 1 + assert blog_seg["pattern_type"] == "prefix" + assert api_seg["url_count"] == 2 + assert api_seg["pattern_type"] == "regex" + + +def test_build_crawl_segments_per_segment_health_differs() -> None: + """Segments with different URL subsets get different health scores.""" + df = pd.DataFrame([ + # /good: all 200, all have titles + {"url": "https://ex.com/good/a", "status": 200, "title": "A"}, + {"url": "https://ex.com/good/b", "status": 200, "title": "B"}, + # /bad: all 500, no titles + {"url": "https://ex.com/bad/a", "status": 500, "title": ""}, + {"url": "https://ex.com/bad/b", "status": 500, "title": ""}, + ]) + out = build_crawl_segments(df, [], ["/good", "/bad"]) + assert out is not None + segs = {s["prefix"]: s for s in out["segments"]} + assert segs["/good"]["health_score"] == 100 + # 
/bad: 100% 500 status (−30) + 100% missing title (−20) = 50 + assert segs["/bad"]["health_score"] == 50 + + +def test_build_crawl_segments_invalid_regex_falls_back_to_prefix() -> None: + """An invalid regex is silently treated as a literal prefix.""" + df = pd.DataFrame([{"url": "https://example.com/[invalid"}]) + # "[invalid" looks like a regex (contains "[") but won't compile → fallback to prefix + out = build_crawl_segments(df, [], ["[invalid"]) + assert out is not None + seg = out["segments"][0] + assert seg["pattern_type"] == "prefix" + + +def test_build_crawl_segments_no_categories_overall_health_is_none() -> None: + df = pd.DataFrame([{"url": "https://example.com/blog/a"}]) + out = build_crawl_segments(df, [], ["/blog"]) + assert out is not None + assert out["overall_health"] is None + + +def test_build_crawl_segments_prefix_without_leading_slash() -> None: + """Prefixes without a leading slash get one added automatically.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/post"}, + {"url": "https://example.com/about"}, + ]) + out = build_crawl_segments(df, [], ["blog"]) + assert out is not None + seg = out["segments"][0] + assert seg["prefix"] == "/blog" + assert seg["url_count"] == 1 + + +def test_build_crawl_segments_zero_match() -> None: + """Segment with no matching URLs gets health_score of 0.""" + df = pd.DataFrame([{"url": "https://example.com/about"}]) + out = build_crawl_segments(df, [], ["/blog"]) + assert out is not None + assert out["segments"][0]["url_count"] == 0 + assert out["segments"][0]["health_score"] == 0 diff --git a/tests/test_config_schema_keys.py b/tests/test_config_schema_keys.py index bfcd483..f0417dd 100644 --- a/tests/test_config_schema_keys.py +++ b/tests/test_config_schema_keys.py @@ -30,6 +30,7 @@ "crawl_url_list", "crawl_user_agent_preset", "crawl_user_agent_custom", + "compare_mobile_desktop", "crawl_auth_username", "crawl_auth_password", "crawl_extra_headers", diff --git a/tests/test_crawl_pause_resume.py 
b/tests/test_crawl_pause_resume.py new file mode 100644 index 0000000..55f2631 --- /dev/null +++ b/tests/test_crawl_pause_resume.py @@ -0,0 +1,534 @@ +"""Tests for crawl pause/resume: frontier serialisation, pause state DB helpers, +and the pause/resume flow in run_crawler.""" +from __future__ import annotations + +import json +import os +import threading +from queue import Queue +from typing import Any +from unittest.mock import MagicMock, patch, call + +import pandas as pd +import pytest + + +# --------------------------------------------------------------------------- +# CrawlFrontier.serialize_state / restore_from_state +# --------------------------------------------------------------------------- + +def _make_frontier() -> Any: + from website_profiling.crawl.frontier import CrawlFrontier + + with patch("website_profiling.crawl.frontier.load_robots", return_value=None): + f = CrawlFrontier("https://example.com", ignore_robots=True) + return f + + +def test_serialize_state_empty(): + f = _make_frontier() + state = f.serialize_state() + assert state["pending"] == [] + assert state["visited"] == [] + assert state["depths"] == {} + + +def test_serialize_state_captures_pending_and_visited(): + f = _make_frontier() + f.queue.put("https://example.com/a") + f.queue.put("https://example.com/b") + f.depths["https://example.com/a"] = 0 + f.depths["https://example.com/b"] = 1 + f.visited.add("https://example.com/visited") + + state = f.serialize_state() + assert set(state["pending"]) == {"https://example.com/a", "https://example.com/b"} + assert "https://example.com/visited" in state["visited"] + assert state["depths"]["https://example.com/a"] == 0 + assert state["depths"]["https://example.com/b"] == 1 + + +def test_restore_from_state_populates_frontier(): + f = _make_frontier() + state = { + "pending": ["https://example.com/x", "https://example.com/y"], + "visited": ["https://example.com/z"], + "depths": {"https://example.com/x": 0, "https://example.com/y": 1}, + } + 
f.restore_from_state(state) + + assert not f.queue.empty() + items = list(f.queue.queue) + assert set(items) == {"https://example.com/x", "https://example.com/y"} + assert "https://example.com/z" in f.visited + assert f.depths["https://example.com/x"] == 0 + + +def test_restore_from_state_empty_state(): + f = _make_frontier() + f.restore_from_state({}) + assert f.queue.empty() + assert len(f.visited) == 0 + assert len(f.depths) == 0 + + +def test_serialize_restore_roundtrip(): + f = _make_frontier() + f.queue.put("https://example.com/page") + f.depths["https://example.com/page"] = 2 + f.visited.add("https://example.com/done") + + state = f.serialize_state() + serialised = json.dumps(state) # must be JSON-serialisable + + f2 = _make_frontier() + f2.restore_from_state(json.loads(serialised)) + assert list(f2.queue.queue) == ["https://example.com/page"] + assert "https://example.com/done" in f2.visited + + +# --------------------------------------------------------------------------- +# crawl_store: save_pause_state / load_pause_state / clear_pause_state +# --------------------------------------------------------------------------- + +def _mock_conn(): + conn = MagicMock() + conn.execute.return_value = MagicMock() + return conn + + +def test_save_pause_state_executes_update(): + from website_profiling.db.crawl_store import save_pause_state + + conn = _mock_conn() + state = {"pending": ["https://example.com/a"], "visited": [], "depths": {}} + save_pause_state(conn, 42, state) + + args = conn.execute.call_args + sql = args[0][0] + assert "UPDATE crawl_runs SET pause_state" in sql + assert "paused_at" in sql + conn.commit.assert_called_once() + + +def test_load_pause_state_returns_dict(): + from website_profiling.db.crawl_store import load_pause_state + + state = {"pending": ["https://example.com/a"], "visited": [], "depths": {}} + row = MagicMock() + row.__getitem__ = lambda self, k: json.dumps(state) if k == "pause_state" else None + conn = _mock_conn() + 
conn.execute.return_value.fetchone.return_value = row + + result = load_pause_state(conn, 42) + assert result == state + + +def test_load_pause_state_returns_none_when_null(): + from website_profiling.db.crawl_store import load_pause_state + + row = MagicMock() + row.__getitem__ = lambda self, k: None + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = row + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_returns_none_when_no_row(): + from website_profiling.db.crawl_store import load_pause_state + + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = None + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_returns_none_on_exception(): + from website_profiling.db.crawl_store import load_pause_state + + conn = _mock_conn() + conn.execute.side_effect = Exception("db error") + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_accepts_dict_value(): + """Column value already a dict (psycopg JSONB auto-parse).""" + from website_profiling.db.crawl_store import load_pause_state + + state = {"pending": [], "visited": [], "depths": {}} + row = MagicMock() + row.__getitem__ = lambda self, k: state if k == "pause_state" else None + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = row + + result = load_pause_state(conn, 7) + assert result == state + + +def test_clear_pause_state_executes_update(): + from website_profiling.db.crawl_store import clear_pause_state + + conn = _mock_conn() + clear_pause_state(conn, 42) + + args = conn.execute.call_args + sql = args[0][0] + assert "pause_state = NULL" in sql + conn.commit.assert_called_once() + + +def test_clear_pause_state_swallows_exception(): + from website_profiling.db.crawl_store import clear_pause_state + + conn = _mock_conn() + conn.execute.side_effect = Exception("db down") + clear_pause_state(conn, 42) # must not raise + + +# 
--------------------------------------------------------------------------- +# Crawler.__init__ restore_from_state branch (line 255 coverage) +# --------------------------------------------------------------------------- + +def test_crawler_init_restores_pause_state(monkeypatch): + """Passing pause_state to Crawler.__init__ calls frontier.restore_from_state.""" + import website_profiling.crawl.crawler as mod + + restored = {} + + class _FakeFrontier: + queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + pass + + def restore_from_state(self, state): + restored["state"] = state + + def seed_initial_urls(self, **kw): + pass + + pause_state = {"pending": ["https://example.com/p"], "visited": [], "depths": {}} + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=MagicMock()), + ): + c = mod.Crawler("https://example.com", pause_state=pause_state) + + assert restored.get("state") == pause_state + + +# --------------------------------------------------------------------------- +# _PAUSE_EVENT and pause file check in crawl loop +# --------------------------------------------------------------------------- + +def test_pause_event_is_set_by_pause_file(tmp_path, monkeypatch): + """Crawler.crawl() detects a pause file written to TMPDIR and marks paused=True.""" + import website_profiling.crawl.crawler as mod + from website_profiling.crawl.schema import empty_crawl_row + + monkeypatch.setenv("TMPDIR", str(tmp_path)) + mod._PAUSE_EVENT.clear() + + pid = os.getpid() + flag = tmp_path / f"wp_pause_{pid}.flag" + flag.write_text("") # write BEFORE crawl starts + + # Minimal real Crawler setup — mocked frontier with one URL queued. 
+ class _FakeFrontier: + queue: Queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + self.queue.put("https://example.com/") + self.depths["https://example.com/"] = 0 + + def should_skip_dequeued(self, url): + return False + + def mark_visited(self, url): + if url in self.visited: + return False + self.visited.add(url) + return True + + def seed_initial_urls(self, **kw): + pass + + def serialize_state(self): + return {"pending": [], "visited": [], "depths": {}} + + fake_result = empty_crawl_row(status=200) + fake_result["url"] = "https://example.com/" + + mock_fetcher = MagicMock() + mock_fetcher.fetch.return_value = MagicMock( + url="https://example.com/", + final_url="https://example.com/", + status_code=200, + text="", + content_type="text/html", + fetch_method="static", + console_messages=[], + failed_requests=[], + ) + mock_fetcher.close = MagicMock() + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=mock_fetcher), + patch.object(mod.Crawler, "worker", return_value=fake_result), + ): + crawler = mod.Crawler("https://example.com", max_pages=10) + df = crawler.crawl(show_progress=False) + + assert crawler.paused is True + assert not flag.exists() # file was deleted + mod._PAUSE_EVENT.clear() + + +def test_pause_loop_os_unlink_error_is_swallowed(tmp_path, monkeypatch): + """OSError from os.unlink during pause-file cleanup is silently swallowed.""" + import website_profiling.crawl.crawler as mod + from website_profiling.crawl.schema import empty_crawl_row + + monkeypatch.setenv("TMPDIR", str(tmp_path)) + mod._PAUSE_EVENT.clear() + + pid = os.getpid() + flag = tmp_path / f"wp_pause_{pid}.flag" + flag.write_text("") + + class _FakeFrontier: + queue: Queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + self.queue.put("https://example.com/") + 
self.depths["https://example.com/"] = 0 + + def should_skip_dequeued(self, url): + return False + + def mark_visited(self, url): + if url in self.visited: + return False + self.visited.add(url) + return True + + def seed_initial_urls(self, **kw): + pass + + def serialize_state(self): + return {"pending": [], "visited": [], "depths": {}} + + fake_result = empty_crawl_row(status=200) + fake_result["url"] = "https://example.com/" + + mock_fetcher = MagicMock() + mock_fetcher.close = MagicMock() + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=mock_fetcher), + patch.object(mod.Crawler, "worker", return_value=fake_result), + patch("os.unlink", side_effect=OSError("permission denied")), + ): + crawler = mod.Crawler("https://example.com", max_pages=10) + df = crawler.crawl(show_progress=False) + + assert crawler.paused is True + mod._PAUSE_EVENT.clear() + + +# --------------------------------------------------------------------------- +# run_crawler: pause saves state and calls sys.exit(2) +# --------------------------------------------------------------------------- + +def _patch_crawler_paused(monkeypatch, pause_state_to_save=None): + """Return a fake Crawler class whose crawl() immediately marks itself paused.""" + import website_profiling.crawl.crawler as mod + + class _FakeCrawler: + paused = True + results = [{"url": "https://example.com/a"}] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + self.frontier.serialize_state.return_value = pause_state_to_save or { + "pending": ["https://example.com/b"], + "visited": ["https://example.com/a"], + "depths": {"https://example.com/b": 1}, + } + + def crawl(self, **kw): + return pd.DataFrame(self.results) + + return _FakeCrawler + + +def _db_session_cm(conn): + """Return a callable context-manager mock that yields *conn*.""" + from contextlib import contextmanager + + 
@contextmanager + def _cm(): + yield conn + + return _cm + + +def test_run_crawler_pause_saves_state_and_exits(monkeypatch): + import website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + FakeCrawler = _patch_crawler_paused(monkeypatch) + + saved = {} + + def _fake_save(conn, run_id, state): + saved["run_id"] = run_id + saved["state"] = state + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", FakeCrawler), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch.object(db_pkg, "create_crawl_run", return_value=7), + patch.object(db_pkg, "backup_db_if_exists", return_value=None), + patch.object(db_pkg, "read_historical_data", return_value={}), + patch.object(db_pkg, "restore_historical_data", MagicMock()), + patch("website_profiling.db.storage.ensure_crawl_tables_cleared", MagicMock()), + patch("website_profiling.db.crawl_store.save_pause_state", _fake_save), + pytest.raises(SystemExit) as exc_info, + ): + mod.run_crawler( + start_url="https://example.com", + output_db=True, + crawl_stream_to_db=True, + ) + + assert exc_info.value.code == 2 + assert saved.get("run_id") == 7 + assert "pending" in saved.get("state", {}) + + +def test_run_crawler_pause_no_stream_run_id_still_exits(monkeypatch): + """When streaming wasn't used (stream_run_id=None) pause still calls sys.exit(2).""" + import website_profiling.crawl.crawler as mod + + FakeCrawler = _patch_crawler_paused(monkeypatch) + + with ( + patch.object(mod, "Crawler", FakeCrawler), + pytest.raises(SystemExit) as exc_info, + ): + mod.run_crawler( + start_url="https://example.com", + output_db=False, + ) + + assert exc_info.value.code == 2 + + +# --------------------------------------------------------------------------- +# run_crawler: resume loads state and clears it on success +# --------------------------------------------------------------------------- + +def test_run_crawler_resume_loads_and_clears_state(monkeypatch): + import 
website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + pause_state = { + "pending": ["https://example.com/b"], + "visited": ["https://example.com/a"], + "depths": {"https://example.com/b": 1}, + "pages_crawled": 1, + } + cleared = {} + + def _fake_load(conn, run_id): + return pause_state + + def _fake_clear(conn, run_id): + cleared["run_id"] = run_id + + class _FakeCrawlerNotPaused: + paused = False + results = [] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + self._pause_state = kw.get("pause_state") + + def crawl(self, **kw): + return pd.DataFrame() + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", _FakeCrawlerNotPaused), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch("website_profiling.db.crawl_store.load_pause_state", _fake_load), + patch("website_profiling.db.crawl_store.clear_pause_state", _fake_clear), + ): + mod.run_crawler( + start_url="https://example.com", + output_db=False, + resume_run_id=42, + ) + + assert cleared.get("run_id") == 42 + + +def test_run_crawler_resume_with_no_saved_state(monkeypatch): + """If no pause state exists for resume_run_id the crawler starts fresh.""" + import website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + class _FakeCrawlerFresh: + paused = False + results = [] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + assert kw.get("pause_state") is None + + def crawl(self, **kw): + return pd.DataFrame() + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", _FakeCrawlerFresh), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch("website_profiling.db.crawl_store.load_pause_state", return_value=None), + patch("website_profiling.db.crawl_store.clear_pause_state"), + ): + mod.run_crawler( + start_url="https://example.com", + 
output_db=False, + resume_run_id=99, + ) diff --git a/tests/test_crawler_deep.py b/tests/test_crawler_deep.py index 6ad95a4..e51f341 100644 --- a/tests/test_crawler_deep.py +++ b/tests/test_crawler_deep.py @@ -662,3 +662,83 @@ def enqueue_html(self, record: dict) -> None: assert len(writer.records) == 1 assert writer.records[0]["url"] == "https://site.com" + +def test_run_crawler_compare_mobile_desktop_second_pass(monkeypatch): + """compare_mobile_desktop=True triggers a second crawl and links the run IDs.""" + import website_profiling.crawl.crawler as mod + + crawl_calls: list[dict] = [] + + class FakeCrawler: + def __init__(self, **_kwargs): + self.link_edges_accum = [] + self.store_page_html = False + + def crawl(self, **_kwargs): + return pd.DataFrame([{"url": "https://a.com", "status": 200, "title": "ok"}]) + + run_id_seq = iter([7, 8]) + + class _Ctx: + def __init__(self): + self._conn = object() + + def __enter__(self): + return self._conn + + def __exit__(self, _t, _v, _tb): + return False + + linked: list[tuple] = [] + fake_set_mobile = lambda conn, d, m: linked.append((d, m)) + + def fake_get_latest(conn): + return next(run_id_seq) + + fake_db = types.SimpleNamespace( + backup_db_if_exists=lambda: None, + create_crawl_run=lambda *_a, **_k: 7, + db_session=lambda: _Ctx(), + read_historical_data=lambda: {}, + restore_historical_data=lambda *_a, **_k: None, + write_crawl=lambda conn, df, crawl_run_id=None: None, + ) + fake_storage = types.SimpleNamespace(ensure_crawl_tables_cleared=lambda *_a, **_k: None) + monkeypatch.setattr(mod, "Crawler", FakeCrawler) + monkeypatch.setitem(__import__("sys").modules, "website_profiling.db", fake_db) + monkeypatch.setitem(__import__("sys").modules, "website_profiling.db.storage", fake_storage) + + import website_profiling.db.crawl_store as cs_mod + + monkeypatch.setattr(cs_mod, "get_latest_crawl_run_id", fake_get_latest) + monkeypatch.setattr(cs_mod, "set_mobile_run_id", fake_set_mobile) + + # Patch run_crawler itself for 
the recursive call to avoid double setup + second_calls: list[dict] = [] + + original_run = mod.run_crawler + + def patched_run(start_url="", **kwargs): + if kwargs.get("compare_mobile_desktop") is False and kwargs.get("crawl_user_agent_preset") == "mobile": + second_calls.append({"start_url": start_url, **kwargs}) + return pd.DataFrame([{"url": "https://a.com", "status": 200}]) + return original_run(start_url, **kwargs) + + monkeypatch.setattr(mod, "run_crawler", patched_run) + + mod.run_crawler( + "https://a.com", + output_db=True, + crawl_stream_to_db=False, + max_pages=5, + preserve_crawl_history=True, + show_progress=False, + compare_mobile_desktop=True, + ) + + assert len(second_calls) == 1 + assert second_calls[0]["crawl_user_agent_preset"] == "mobile" + assert second_calls[0]["compare_mobile_desktop"] is False + # set_mobile_run_id was called + assert linked and linked[0][0] == 7 + diff --git a/tests/test_link_edges.py b/tests/test_link_edges.py index eca30c8..1d17a8d 100644 --- a/tests/test_link_edges.py +++ b/tests/test_link_edges.py @@ -2,6 +2,7 @@ from __future__ import annotations from website_profiling.common import parse_link_edges, parse_links +from website_profiling.reporting.link_edges_report import build_inlink_anchor_matrix def test_parse_link_edges_anchor_and_rel(): @@ -24,6 +25,124 @@ def test_parse_link_edges_anchor_and_rel(): assert external["link_type"] == "external" +def test_parse_link_edges_position_defaults_to_content(): + html = '