From 80ddd6525ed14c90edd95300fdd006f2217cdc1a Mon Sep 17 00:00:00 2001 From: PrashantUnity Date: Thu, 18 Jun 2026 10:14:12 +0530 Subject: [PATCH 1/3] PPT and bugs hunting --- alembic/versions/018_link_edge_position.py | 24 + alembic/versions/019_crawl_run_mobile_link.py | 24 + alembic/versions/020_crawl_run_pause_state.py | 26 + crawl_results.csv | 2 + input.txt.example | 5 +- pipeline-config.example.txt | 3 +- .../commands/config_resolve.py | 7 + .../commands/pipeline_cmd.py | 10 +- src/website_profiling/crawl/config.py | 1 + src/website_profiling/crawl/crawler.py | 145 +++++ src/website_profiling/crawl/frontier.py | 16 + src/website_profiling/db/crawl_store.py | 144 ++++- src/website_profiling/parsing/links.py | 48 ++ .../reporting/crawl_segments.py | 111 +++- .../reporting/link_edges_report.py | 18 +- .../tools/export_crawl_workbook.py | 2 +- tests/reporting/test_crawl_segments.py | 232 +++++++- tests/test_config_schema_keys.py | 1 + tests/test_crawl_pause_resume.py | 534 ++++++++++++++++++ tests/test_crawler_deep.py | 80 +++ tests/test_link_edges.py | 121 ++++ tests/test_mobile_delta.py | 235 ++++++++ tests/test_pipeline_cmd_run_unit.py | 21 + web/app/api/jobs/[id]/pause/route.ts | 30 + web/app/api/jobs/[id]/resume/route.ts | 29 + web/app/api/report/mobile-delta/route.ts | 21 + web/app/globals.css | 68 ++- web/src/components/LandingShell.tsx | 76 ++- web/src/components/issues/IssueTrendChart.tsx | 132 +++++ .../components/issues/MobileDesktopDelta.tsx | 162 ++++++ .../components/landing/LandingCodeBlock.tsx | 6 +- .../landing/LandingDeckProgress.tsx | 2 +- .../landing/LandingFeatureSpotlight.tsx | 10 +- .../components/landing/LandingFinalCta.tsx | 18 +- web/src/components/landing/LandingFooter.tsx | 34 +- web/src/components/landing/LandingHero.tsx | 16 +- .../components/landing/LandingHeroTopBar.tsx | 12 +- .../components/landing/LandingLimitations.tsx | 24 +- .../components/landing/LandingPathStrip.tsx | 18 +- .../components/landing/LandingProductMock.tsx | 12 +- .../components/landing/LandingQuickStart.tsx | 12 +- .../landing/LandingSectionHeader.tsx | 14 +- .../components/landing/LandingStatsStrip.tsx | 18 +- .../components/landing/LandingUseCases.tsx | 6 +- web/src/components/landing/landingLayout.ts | 10 +- .../components/links/AdvancedLinkFilters.tsx | 153 +++++ web/src/components/links/ColumnPicker.tsx | 81 +++ .../components/links/LinkAttributesPanel.tsx | 22 + .../components/links/SavedCrawlFiltersBar.tsx | 13 +- .../links/explorer/LinksExplorerTableTab.tsx | 90 ++- .../pipeline/PipelineProgressHeader.tsx | 35 +- .../components/pipeline/PipelineRunPanel.tsx | 14 + web/src/hooks/useStageScale.ts | 54 ++ web/src/lib/advancedLinkFilter.test.ts | 151 +++++ web/src/lib/advancedLinkFilter.ts | 246 ++++++++ web/src/lib/columnConfig.ts | 42 ++ web/src/lib/formatPipelineLog.ts | 2 +- web/src/lib/pipelineConfigSchema.ts | 14 +- web/src/lib/savedLinksView.test.ts | 41 ++ web/src/lib/savedLinksView.ts | 37 ++ web/src/server/mobileDeltaDb.ts | 73 +++ web/src/server/pipelineJobs.ts | 137 ++++- web/src/server/pipelineJobsDb.ts | 22 +- web/src/types/api.ts | 3 +- web/src/types/report.ts | 22 + web/src/views/Issues.tsx | 8 + web/src/views/Landing.tsx | 6 +- web/src/views/Links.tsx | 69 ++- web/src/views/SiteStructure.tsx | 7 +- 69 files changed, 3660 insertions(+), 222 deletions(-) create mode 100644 alembic/versions/018_link_edge_position.py create mode 100644 alembic/versions/019_crawl_run_mobile_link.py create mode 100644 alembic/versions/020_crawl_run_pause_state.py create mode 100644 crawl_results.csv create mode 100644 tests/test_crawl_pause_resume.py create mode 100644 tests/test_mobile_delta.py create mode 100644 web/app/api/jobs/[id]/pause/route.ts create mode 100644 web/app/api/jobs/[id]/resume/route.ts create mode 100644 web/app/api/report/mobile-delta/route.ts create mode 100644 web/src/components/issues/IssueTrendChart.tsx create mode 100644 web/src/components/issues/MobileDesktopDelta.tsx create mode 100644 web/src/components/links/AdvancedLinkFilters.tsx create mode 100644 web/src/components/links/ColumnPicker.tsx create mode 100644 web/src/hooks/useStageScale.ts create mode 100644 web/src/lib/advancedLinkFilter.test.ts create mode 100644 web/src/lib/advancedLinkFilter.ts create mode 100644 web/src/lib/columnConfig.ts create mode 100644 web/src/lib/savedLinksView.test.ts create mode 100644 web/src/lib/savedLinksView.ts create mode 100644 web/src/server/mobileDeltaDb.ts diff --git a/alembic/versions/018_link_edge_position.py b/alembic/versions/018_link_edge_position.py new file mode 100644 index 0000000..14c21e2 --- /dev/null +++ b/alembic/versions/018_link_edge_position.py @@ -0,0 +1,24 @@ +"""Add position column to link_edges for nav/header/content/footer/sidebar classification. + +Revision ID: 018_link_edge_position +Revises: 017_content_drafts +""" +from __future__ import annotations + +from alembic import op + +revision = "018_link_edge_position" +down_revision = "017_content_drafts" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute(""" + ALTER TABLE link_edges + ADD COLUMN IF NOT EXISTS position TEXT NOT NULL DEFAULT 'content' + """) + + +def downgrade() -> None: + op.execute("ALTER TABLE link_edges DROP COLUMN IF EXISTS position") diff --git a/alembic/versions/019_crawl_run_mobile_link.py b/alembic/versions/019_crawl_run_mobile_link.py new file mode 100644 index 0000000..ef31bef --- /dev/null +++ b/alembic/versions/019_crawl_run_mobile_link.py @@ -0,0 +1,24 @@ +"""Add mobile_run_id to crawl_runs for pairing desktop+mobile dual crawls. + +Revision ID: 019_crawl_run_mobile_link +Revises: 018_link_edge_position +""" +from __future__ import annotations + +from alembic import op + +revision = "019_crawl_run_mobile_link" +down_revision = "018_link_edge_position" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute(""" + ALTER TABLE crawl_runs + ADD COLUMN IF NOT EXISTS mobile_run_id INT REFERENCES crawl_runs(id) + """) + + +def downgrade() -> None: + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS mobile_run_id") diff --git a/alembic/versions/020_crawl_run_pause_state.py b/alembic/versions/020_crawl_run_pause_state.py new file mode 100644 index 0000000..49ede24 --- /dev/null +++ b/alembic/versions/020_crawl_run_pause_state.py @@ -0,0 +1,26 @@ +"""Add pause_state JSONB and paused_at to crawl_runs. + +Revision ID: 020_crawl_run_pause_state +Revises: 019_crawl_run_mobile_link +Create Date: 2026-06-18 +""" +from alembic import op + +revision = "020_crawl_run_pause_state" +down_revision = "019_crawl_run_mobile_link" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + "ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS pause_state JSONB" + ) + op.execute( + "ALTER TABLE crawl_runs ADD COLUMN IF NOT EXISTS paused_at TEXT" + ) + + +def downgrade() -> None: + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS pause_state") + op.execute("ALTER TABLE crawl_runs DROP COLUMN IF EXISTS paused_at") diff --git a/crawl_results.csv b/crawl_results.csv new file mode 100644 index 0000000..1d05b30 --- /dev/null +++ b/crawl_results.csv @@ -0,0 +1,2 @@ +url,status +https://a.com,200 diff --git a/input.txt.example b/input.txt.example index 14f9f88..d8f5acc 100644 --- a/input.txt.example +++ b/input.txt.example @@ -26,8 +26,9 @@ crawl_exclude_urls = crawl_discovery_mode = spider crawl_url_list = crawl_user_agent_preset = default -crawl_user_agent_custom = -crawl_auth_username = +crawl_user_agent_custom = +compare_mobile_desktop = false +crawl_auth_username = crawl_auth_password = crawl_extra_headers = crawl_cookies = diff --git a/pipeline-config.example.txt b/pipeline-config.example.txt index ab60161..cb1e205 100644 --- a/pipeline-config.example.txt +++ b/pipeline-config.example.txt @@ -27,7 +27,8 @@ crawl_exclude_urls = crawl_discovery_mode = spider crawl_url_list = crawl_user_agent_preset = default -crawl_user_agent_custom = +crawl_user_agent_custom = +compare_mobile_desktop = false crawl_auth_username = crawl_auth_password = crawl_extra_headers = diff --git a/src/website_profiling/commands/config_resolve.py b/src/website_profiling/commands/config_resolve.py index 5468b51..7d91c8b 100644 --- a/src/website_profiling/commands/config_resolve.py +++ b/src/website_profiling/commands/config_resolve.py @@ -364,4 +364,11 @@ def build_parser() -> argparse.ArgumentParser: dest="stdin_json", help="For 'chat' command: read JSON payload from stdin and emit NDJSON events.", ) + parser.add_argument( + "--resume-run-id", + type=int, + default=None, + dest="resume_run_id", + help="Resume a paused crawl from the saved frontier of the given crawl_run_id.", + ) return parser diff --git a/src/website_profiling/commands/pipeline_cmd.py b/src/website_profiling/commands/pipeline_cmd.py index d50078f..84a040e 100644 --- a/src/website_profiling/commands/pipeline_cmd.py +++ b/src/website_profiling/commands/pipeline_cmd.py @@ -157,7 +157,12 @@ def run(cfg: dict, args: argparse.Namespace) -> None: phase_results: list[PhaseResult] = [] - if run_crawl: + resume_run_id = getattr(args, "resume_run_id", None) + if resume_run_id is not None: + phase_results.append( + run_pipeline_phase("crawl", lambda: _run_crawl(cfg, use_database, resume_run_id=resume_run_id)) + ) + elif run_crawl: phase_results.append(run_pipeline_phase("crawl", lambda: _run_crawl(cfg, use_database))) if run_content_analysis and use_database: @@ -209,7 +214,7 @@ def _finalize_pipeline_run(phase_results: list[PhaseResult]) -> None: sys.exit(1) -def _run_crawl(cfg: dict, use_database: bool) -> None: +def _run_crawl(cfg: dict, use_database: bool, resume_run_id: int | None = None) -> None: from ..crawl.crawler import run_crawler console_print("[Crawl] Starting...", flush=True) @@ -304,6 +309,7 @@ def _run_crawl(cfg: dict, use_database: bool) -> None: crawl_robots_txt_override=(cfg.get("crawl_robots_txt_override") or "").strip(), custom_extractors=custom_extractors or None, enable_axe=enable_axe, + resume_run_id=resume_run_id, ) console_print("[Crawl] Done.", flush=True) emit_phase_done("crawl") diff --git a/src/website_profiling/crawl/config.py b/src/website_profiling/crawl/config.py index 2acbffe..a84b036 100644 --- a/src/website_profiling/crawl/config.py +++ b/src/website_profiling/crawl/config.py @@ -69,6 +69,7 @@ class CrawlConfig: crawl_robots_txt_override: str = "" custom_extractors: Optional[list[dict]] = None enable_axe: bool = False + compare_mobile_desktop: bool = False @classmethod def from_kwargs(cls, **kwargs: object) -> CrawlConfig: diff --git a/src/website_profiling/crawl/crawler.py b/src/website_profiling/crawl/crawler.py index 1c68926..3d1c923 100644 --- a/src/website_profiling/crawl/crawler.py +++ b/src/website_profiling/crawl/crawler.py @@ -4,10 +4,26 @@ from __future__ import annotations import json +import os +import signal +import threading import time from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait from typing import Optional +# Module-level pause event — set by SIGUSR1 (Unix) or a PID-keyed file (Windows). +_PAUSE_EVENT = threading.Event() + + +def _handle_pause_signal(signum: int, frame: object) -> None: # pragma: no cover + _PAUSE_EVENT.set() + + +try: + signal.signal(signal.SIGUSR1, _handle_pause_signal) +except (AttributeError, OSError): # pragma: no cover + pass # SIGUSR1 not available on Windows + import pandas as pd import requests from tqdm.auto import tqdm @@ -112,6 +128,7 @@ def __init__( enable_axe: bool = False, *, config: Optional[CrawlConfig] = None, + pause_state: Optional[dict] = None, ): if config is None: config = CrawlConfig.from_kwargs( @@ -206,6 +223,7 @@ def __init__( self.lock = self.frontier.lock self.results: list[dict] = [] + self.paused: bool = False # `requests.Session` is not thread-safe, so worker threads each build # their own session from this factory (see StaticFetcher). The template # `self.session` below is only touched on the main thread (sitemap @@ -233,6 +251,8 @@ def __init__( self._hybrid_fetcher = ( self.fetcher if isinstance(self.fetcher, HybridFetcher) else None ) + if pause_state: + self.frontier.restore_from_state(pause_state) self.frontier.seed_initial_urls( discovery_mode=config.discovery_mode, crawl_url_list=config.crawl_url_list, @@ -429,6 +449,7 @@ def crawl( stream_crawl_run_id: Optional[int] = None, stream_batch_size: int = 500, ) -> pd.DataFrame: + _PAUSE_EVENT.clear() start_time = time.time() from ..progress import CrawlProgressTracker, emit_phase_start @@ -509,6 +530,22 @@ def crawl( remaining.append(f) futures = remaining + # Check for pause request (SIGUSR1) or Windows file-based signal. + if not _PAUSE_EVENT.is_set(): + _pause_file = os.path.join( + os.environ.get("TMPDIR", "/tmp"), + f"wp_pause_{os.getpid()}.flag", + ) + if os.path.exists(_pause_file): + try: + os.unlink(_pause_file) + except OSError: + pass + _PAUSE_EVENT.set() + if _PAUSE_EVENT.is_set(): + self.paused = True + break + if self.queue.empty() and not futures: break finally: @@ -582,8 +619,22 @@ def run_crawler( crawl_robots_txt_override: str = "", custom_extractors: Optional[list] = None, enable_axe: bool = False, + compare_mobile_desktop: bool = False, + resume_run_id: Optional[int] = None, ) -> pd.DataFrame: """Run crawler and optionally save to CSV/JSON or PostgreSQL. Returns DataFrame.""" + _resume_pause_state: Optional[dict] = None + if resume_run_id is not None: + from ..db import db_session + from ..db.crawl_store import load_pause_state + with db_session() as _conn: + _resume_pause_state = load_pause_state(_conn, resume_run_id) + if _resume_pause_state: + console_print( + f" Resuming from paused run {resume_run_id} " + f"({len(_resume_pause_state.get('pending', []))} URLs pending)...", + flush=True, + ) max_p = max_pages if max_pages is not None else 0 mode_label = (render_mode or "static").strip().lower() disc_label = normalize_discovery_mode(discovery_mode) @@ -634,6 +685,7 @@ def run_crawler( crawl_robots_txt_override=crawl_robots_txt_override, custom_extractors=custom_extractors, enable_axe=enable_axe, + pause_state=_resume_pause_state, ) stream_run_id: Optional[int] = None if output_db: @@ -663,6 +715,35 @@ def run_crawler( show_progress=show_progress, stream_crawl_run_id=stream_run_id, ) + + # ---- Pause handling: save frontier and exit with code 2 ---- + if getattr(crawler, "paused", False): + import sys + from ..db import db_session + from ..db.crawl_store import save_pause_state + + _pause_run_id = stream_run_id + if _pause_run_id is not None: + _frontier_state = crawler.frontier.serialize_state() + _frontier_state["pages_crawled"] = len(crawler.results) + with db_session() as _conn: + save_pause_state(_conn, _pause_run_id, _frontier_state) + console_print( + f"[PAUSE] crawl_run_id={_pause_run_id}", + flush=True, + ) + else: + console_print("[PAUSE] crawl_run_id=none", flush=True) + sys.exit(2) + + # ---- Resume cleanup: clear saved frontier from the resumed run ---- + if resume_run_id is not None and _resume_pause_state is not None and not getattr(crawler, "paused", False): + from ..db import db_session + from ..db.crawl_store import clear_pause_state + + with db_session() as _conn: + clear_pause_state(_conn, resume_run_id) + if output_db and crawler.link_edges_accum: from ..db import db_session from ..db.crawl_store import write_link_edges @@ -712,6 +793,70 @@ def run_crawler( console_print(" Crawl DB write complete.", flush=True) elif output_db and stream_run_id is not None: console_print(" Crawl streamed to DB during fetch.", flush=True) + + # Second pass: run mobile crawl and pair the two runs via mobile_run_id FK + if compare_mobile_desktop and output_db and run_id is not None: + from ..db import db_session + from ..db.crawl_store import get_latest_crawl_run_id, set_mobile_run_id + + console_print(" Starting mobile second-pass crawl for comparison...", flush=True) + with db_session() as _conn: + _baseline_id = get_latest_crawl_run_id(_conn) or 0 + run_crawler( + start_url=start_url, + max_pages=max_pages, + concurrency=concurrency, + timeout=timeout, + ignore_robots=ignore_robots, + allow_external=allow_external, + max_depth=max_depth, + polite_delay=polite_delay, + store_outlinks=store_outlinks, + output_csv=None, + output_db=True, + show_progress=show_progress, + exclude_urls=exclude_urls, + preserve_crawl_history=True, + store_content_excerpt=store_content_excerpt, + content_excerpt_max_chars=content_excerpt_max_chars, + store_page_html=False, + run_content_analysis=False, + crawl_stream_to_db=crawl_stream_to_db, + property_id=property_id, + render_mode=render_mode, + js_concurrency=js_concurrency, + js_timeout=js_timeout, + js_wait_until=js_wait_until, + js_extra_wait_ms=js_extra_wait_ms, + js_block_resources=js_block_resources, + capture_console=capture_console, + js_console_levels=js_console_levels, + capture_failed_requests=capture_failed_requests, + console_max_per_page=console_max_per_page, + custom_extraction_regex=custom_extraction_regex, + crawl_ignore_params=crawl_ignore_params, + discovery_mode=discovery_mode, + crawl_url_list=crawl_url_list, + crawl_user_agent_preset="mobile", + crawl_user_agent_custom="", + crawl_auth_username=crawl_auth_username, + crawl_auth_password=crawl_auth_password, + crawl_extra_headers=crawl_extra_headers, + crawl_cookies=crawl_cookies, + crawl_robots_txt_override=crawl_robots_txt_override, + custom_extractors=custom_extractors, + enable_axe=False, + compare_mobile_desktop=False, + ) + with db_session() as _conn: + mobile_id = get_latest_crawl_run_id(_conn) + if mobile_id is not None and mobile_id != _baseline_id: + set_mobile_run_id(_conn, run_id, mobile_id) + console_print( + f" Mobile crawl complete (run_id={mobile_id}). Linked to desktop run {run_id}.", + flush=True, + ) + elif output_csv and not df.empty: if output_csv.lower().endswith(".json"): df.to_json(output_csv, orient="records", indent=2, date_format="iso", default_handler=str) diff --git a/src/website_profiling/crawl/frontier.py b/src/website_profiling/crawl/frontier.py index cb1e65a..f3882d9 100644 --- a/src/website_profiling/crawl/frontier.py +++ b/src/website_profiling/crawl/frontier.py @@ -154,3 +154,19 @@ def mark_visited(self, url: str) -> bool: def should_skip_dequeued(self, url: str) -> bool: return url_matches_exclude(url, self.exclude_urls) + + def serialize_state(self) -> dict: + """Return a JSON-serialisable snapshot of the frontier for pause/resume.""" + with self.lock: + pending = list(self.queue.queue) + visited = list(self.visited) + depths = dict(self.depths) + return {"pending": pending, "visited": visited, "depths": depths} + + def restore_from_state(self, state: dict) -> None: + """Pre-populate the frontier from a previously serialised state.""" + with self.lock: + for url in state.get("pending", []): + self.queue.put(url) + self.visited.update(state.get("visited", [])) + self.depths.update(state.get("depths", {})) diff --git a/src/website_profiling/db/crawl_store.py b/src/website_profiling/db/crawl_store.py index fc50d32..b86ef57 100644 --- a/src/website_profiling/db/crawl_store.py +++ b/src/website_profiling/db/crawl_store.py @@ -105,6 +105,141 @@ def get_crawl_run_info(conn: Connection, run_id: int) -> Optional[dict[str, Any] return None +def set_mobile_run_id(conn: Connection, desktop_run_id: int, mobile_run_id: int) -> None: + """Link a mobile crawl run to its paired desktop run.""" + conn.execute( + "UPDATE crawl_runs SET mobile_run_id = %s WHERE id = %s", + (mobile_run_id, desktop_run_id), + ) + conn.commit() + + +def get_mobile_run_id(conn: Connection, run_id: int) -> Optional[int]: + """Return the mobile_run_id paired with this desktop run, or None.""" + try: + cur = conn.execute( + "SELECT mobile_run_id FROM crawl_runs WHERE id = %s", (run_id,) + ) + row = cur.fetchone() + if row is None: + return None + val = row["mobile_run_id"] + return int(val) if val is not None else None + except Exception: + return None + + +def read_mobile_desktop_delta(conn: Connection, desktop_run_id: int) -> list[dict[str, Any]]: + """Compare desktop vs paired mobile crawl, returning per-URL delta rows. + + Each row has: url, desktop, mobile (each with title/h1/word_count/status), + and boolean flags title_differs, h1_differs, status_differs, plus word_count_delta. + Only URLs present in both runs with at least one meaningful difference are included. + """ + mobile_run_id = get_mobile_run_id(conn, desktop_run_id) + if mobile_run_id is None: + return [] + desktop_df = read_crawl(conn, desktop_run_id) + mobile_df = read_crawl(conn, mobile_run_id) + if desktop_df.empty or mobile_df.empty: + return [] + + def _norm(s: Any) -> str: + return str(s or "").rstrip("/").lower() + + def _int(v: Any) -> int: + try: + return int(v or 0) + except (TypeError, ValueError): + return 0 + + desktop_map = {_norm(r.get("url")): r for r in desktop_df.to_dict("records")} + mobile_map = {_norm(r.get("url")): r for r in mobile_df.to_dict("records")} + + deltas: list[dict[str, Any]] = [] + for url_key, dr in desktop_map.items(): + mr = mobile_map.get(url_key) + if mr is None: + continue + d_title = str(dr.get("title") or "") + m_title = str(mr.get("title") or "") + d_h1 = str(dr.get("h1") or "") + m_h1 = str(mr.get("h1") or "") + d_wc = _int(dr.get("word_count")) + m_wc = _int(mr.get("word_count")) + d_st = _int(dr.get("status")) + m_st = _int(mr.get("status")) + + title_diff = d_title != m_title + h1_diff = d_h1 != m_h1 + wc_diff = abs(d_wc - m_wc) + status_diff = d_st != m_st + + if not (title_diff or h1_diff or wc_diff > 50 or status_diff): + continue + deltas.append({ + "url": str(dr.get("url") or url_key), + "desktop": {"title": d_title, "h1": d_h1, "word_count": d_wc, "status": d_st}, + "mobile": {"title": m_title, "h1": m_h1, "word_count": m_wc, "status": m_st}, + "title_differs": title_diff, + "h1_differs": h1_diff, + "word_count_delta": wc_diff, + "status_differs": status_diff, + }) + + # Sort: status diffs first (mobile indexing risk), then title, then word count delta + deltas.sort( + key=lambda d: -( + (4 if d["status_differs"] else 0) + + (2 if d["title_differs"] else 0) + + (1 if d["h1_differs"] else 0) + + (1 if d["word_count_delta"] > 100 else 0) + ) + ) + return deltas + + +def save_pause_state(conn: Connection, run_id: int, state: dict) -> None: + """Persist frontier state for a paused crawl run.""" + from datetime import datetime, timezone + + now = datetime.now(timezone.utc).isoformat() + conn.execute( + "UPDATE crawl_runs SET pause_state = %s, paused_at = %s WHERE id = %s", + (json.dumps(state), now, run_id), + ) + conn.commit() + + +def load_pause_state(conn: Connection, run_id: int) -> Optional[dict]: + """Load saved frontier state for a paused crawl run.""" + try: + cur = conn.execute( + "SELECT pause_state FROM crawl_runs WHERE id = %s", (run_id,) + ) + row = cur.fetchone() + if row is None or row["pause_state"] is None: + return None + val = row["pause_state"] + if isinstance(val, str): + return json.loads(val) + return dict(val) + except Exception: + return None + + +def clear_pause_state(conn: Connection, run_id: int) -> None: + """Clear saved frontier state after a successful resume.""" + try: + conn.execute( + "UPDATE crawl_runs SET pause_state = NULL, paused_at = NULL WHERE id = %s", + (run_id,), + ) + conn.commit() + except Exception: + pass + + def _df_row_to_crawl_json(row: pd.Series) -> dict[str, Any]: out: dict[str, Any] = {} for col in row.index: @@ -398,14 +533,15 @@ def write_link_edges( bool(e.get("is_sponsored")), bool(e.get("is_ugc")), str(e.get("link_type") or "internal"), + str(e.get("position") or "content"), )) if rows: _executemany( conn, """INSERT INTO link_edges ( crawl_run_id, from_url, to_url, anchor_text, rel, - is_nofollow, is_sponsored, is_ugc, link_type - ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) + is_nofollow, is_sponsored, is_ugc, link_type, position + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING""", rows, ) @@ -424,7 +560,9 @@ def read_link_edges( return [] try: cur = conn.execute( - """SELECT from_url, to_url, anchor_text, rel, is_nofollow, is_sponsored, is_ugc, link_type + """SELECT from_url, to_url, anchor_text, rel, + is_nofollow, is_sponsored, is_ugc, link_type, + COALESCE(position, 'content') AS position FROM link_edges WHERE crawl_run_id = %s LIMIT %s""", (run_id, max(1, int(limit))), ) diff --git a/src/website_profiling/parsing/links.py b/src/website_profiling/parsing/links.py index 6f78fb3..4e82621 100644 --- a/src/website_profiling/parsing/links.py +++ b/src/website_profiling/parsing/links.py @@ -61,6 +61,53 @@ def _parse_rel_flags(rel_raw: str) -> tuple[bool, bool, bool]: return ("nofollow" in parts, "sponsored" in parts, "ugc" in parts) +_NAV_CLASSES = frozenset({"nav", "menu", "navbar", "navigation"}) +_HEADER_CLASSES = frozenset({"header", "site-header", "page-header"}) +_FOOTER_CLASSES = frozenset({"footer", "site-footer", "page-footer"}) +_SIDEBAR_CLASSES = frozenset({"sidebar", "side", "aside", "widget-area"}) + + +def _classify_position(tag) -> str: + """Classify where a link sits on the page by walking its ancestor chain.""" + for parent in tag.parents: + name = getattr(parent, "name", None) + if name is None: + continue + # Semantic HTML5 landmarks (highest priority — unambiguous) + if name == "nav": + return "nav" + if name == "header": + return "header" + if name == "footer": + return "footer" + if name == "aside": + return "sidebar" + # ARIA roles + role = str(parent.get("role") or "").lower() + if role in ("navigation", "menubar"): + return "nav" + if role == "banner": + return "header" + if role == "contentinfo": + return "footer" + if role == "complementary": + return "sidebar" + # Class / ID heuristics for common naming conventions + cls_set = {c.lower() for c in (parent.get("class") or [])} + elem_id = str(parent.get("id") or "").lower() + if cls_set & _NAV_CLASSES or elem_id in _NAV_CLASSES: + return "nav" + if cls_set & _HEADER_CLASSES or elem_id in _HEADER_CLASSES: + return "header" + if cls_set & _FOOTER_CLASSES or elem_id in _FOOTER_CLASSES: + return "footer" + if cls_set & _SIDEBAR_CLASSES or elem_id in _SIDEBAR_CLASSES: + return "sidebar" + if name in ("main", "article", "section"): + return "content" + return "content" + + def _anchor_text_from_tag(a) -> str: parts: list[str] = [] for child in a.children: @@ -103,6 +150,7 @@ def parse_link_edges(base_url: str, html_text: str) -> tuple[str, list[dict]]: "is_sponsored": sponsored, "is_ugc": ugc, "link_type": link_type, + "position": _classify_position(a), }) return title_tag, edges diff --git a/src/website_profiling/reporting/crawl_segments.py b/src/website_profiling/reporting/crawl_segments.py index 4dfa960..c0ae25c 100644 --- a/src/website_profiling/reporting/crawl_segments.py +++ b/src/website_profiling/reporting/crawl_segments.py @@ -1,20 +1,94 @@ -"""Per path-prefix crawl segment health scores.""" +"""Per path-prefix / regex crawl segment health scores.""" from __future__ import annotations +import re from typing import Any from urllib.parse import urlparse +import pandas as pd + from ..scoring import round_half_up +# Unambiguous regex metacharacters that distinguish a regex from a plain path prefix. +# Avoids false-positive on e.g. /api/v1.0 (single dot without quantifier is kept literal). +_REGEX_INDICATOR = re.compile(r"\.\*|\.\+|\\[dwWDSBbAZ]|\[|\(|\{|\$|\|") + + +def _is_regex(pattern: str) -> bool: + """Return True when *pattern* contains unambiguous regex metacharacters.""" + return bool(_REGEX_INDICATOR.search(pattern)) + + +def _matches_path(path: str, pattern: str, is_rx: bool, compiled: Any) -> bool: + """Return True if *path* matches *pattern* (regex search or prefix check).""" + if is_rx: + return bool(compiled.search(path)) + # Literal prefix: exact match or path starts with prefix + "/" + return path == pattern or path.startswith(pattern.rstrip("/") + "/") + + +def _segment_health(seg_df: pd.DataFrame) -> int: + """Lightweight health score computed from the segment's URL subset. + + Uses only columns that are always present in a crawl DataFrame. Deductions: + - up to 30 pts for non-2xx status codes + - up to 20 pts for missing page titles + - up to 10 pts for missing meta descriptions + - up to 10 pts for missing viewport tags + Returns a value in [0, 100]. + """ + n = len(seg_df) + if n == 0: + return 0 + score = 100 + + if "status" in seg_df.columns: + def _is_success(s: Any) -> bool: + return bool(s) and str(s).startswith("2") + ok = seg_df["status"].apply(_is_success).sum() + error_rate = 1.0 - ok / n + if error_rate > 0: + score -= round_half_up(30 * error_rate) + + if "title" in seg_df.columns: + missing = seg_df["title"].apply(lambda t: not t or str(t).strip() == "").sum() + missing_rate = missing / n + if missing_rate > 0.1: + score -= round_half_up(20 * missing_rate) + + if "description" in seg_df.columns: + missing = seg_df["description"].apply(lambda d: not d or str(d).strip() == "").sum() + missing_rate = missing / n + if missing_rate > 0.1: + score -= round_half_up(10 * missing_rate) + + if "viewport_present" in seg_df.columns: + no_vp = (~seg_df["viewport_present"].astype(bool)).sum() + no_vp_rate = no_vp / n + if no_vp_rate > 0.1: + score -= round_half_up(10 * no_vp_rate) + + return max(0, score) + def build_crawl_segments( - df, + df: Any, categories: list[dict[str, Any]], path_prefixes: list[str], ) -> dict[str, Any] | None: + """Build per-segment health data. + + Each entry in *path_prefixes* may be a plain path prefix ("/blog") or a + regex pattern ("/blog/.*", r"/api/v\\d+"). Regex patterns are detected + automatically by the presence of unambiguous metacharacters. + + Health scores are computed from the segment's own URL subset rather than + inheriting the site-wide average. + """ if not path_prefixes or df is None or getattr(df, "empty", True): return None + # Site-wide overall health (kept for backward compatibility) overall_scores = [ float(c.get("score")) for c in categories @@ -22,23 +96,40 @@ def build_crawl_segments( ] overall = round_half_up(sum(overall_scores) / len(overall_scores)) if overall_scores else None + # Pre-compile patterns once + compiled_patterns: list[tuple[str, bool, Any]] = [] + for raw in path_prefixes: + p = raw if raw.startswith("/") else f"/{raw}" + is_rx = _is_regex(p) + try: + compiled: Any = re.compile(p) if is_rx else p + except re.error: + is_rx = False + compiled = p + compiled_patterns.append((p, is_rx, compiled)) + segments: list[dict[str, Any]] = [] - for prefix in path_prefixes: - p = prefix if prefix.startswith("/") else f"/{prefix}" - urls = [] + for prefix, is_rx, compiled in compiled_patterns: + matching_rows = [] for _, row in df.iterrows(): url = str(row.get("url") or "") try: path = urlparse(url).path or "/" except Exception: path = url - if path == p or path.startswith(p.rstrip("/") + "/"): - urls.append(url) + if _matches_path(path, prefix, is_rx, compiled): + matching_rows.append(row.to_dict() if hasattr(row, "to_dict") else dict(row)) + + seg_df = pd.DataFrame(matching_rows) if matching_rows else pd.DataFrame() + health: int | None = _segment_health(seg_df) if not seg_df.empty else 0 + segments.append( { - "prefix": p, - "url_count": len(urls), - "health_score": overall, + "prefix": prefix, + "url_count": len(matching_rows), + "health_score": health, + "pattern_type": "regex" if is_rx else "prefix", } ) + return {"overall_health": overall, "segments": segments} diff --git a/src/website_profiling/reporting/link_edges_report.py b/src/website_profiling/reporting/link_edges_report.py index 6bc98a9..49f54a0 100644 --- a/src/website_profiling/reporting/link_edges_report.py +++ b/src/website_profiling/reporting/link_edges_report.py @@ -18,8 +18,8 @@ def summarize_link_rel(edges: list[dict[str, Any]]) -> dict[str, Any]: def build_inlink_anchor_matrix(edges: list[dict[str, Any]], *, limit: int = 500) -> list[dict[str, Any]]: - """Aggregate inlink anchor text counts per target URL.""" - buckets: dict[tuple[str, str], int] = Counter() + """Aggregate inlink anchor text counts per target URL, including dominant position.""" + buckets: dict[tuple[str, str], Counter] = {} for e in edges: if str(e.get("link_type") or "") != "internal": continue @@ -28,10 +28,14 @@ def build_inlink_anchor_matrix(edges: list[dict[str, Any]], *, limit: int = 500) source = str(e.get("from_url") or "").rstrip("/") if not target or not source: continue - buckets[(target, anchor)] += 1 - rows = [ - {"target_url": t, "anchor_text": a, "inlink_count": c} - for (t, a), c in buckets.items() - ] + key = (target, anchor) + if key not in buckets: + buckets[key] = Counter() + buckets[key][str(e.get("position") or "content")] += 1 + rows = [] + for (t, a), pos_counter in buckets.items(): + total = sum(pos_counter.values()) + top_pos = pos_counter.most_common(1)[0][0] if pos_counter else "content" + rows.append({"target_url": t, "anchor_text": a, "inlink_count": total, "top_position": top_pos}) rows.sort(key=lambda r: (-r["inlink_count"], r["target_url"])) return rows[: max(1, limit)] diff --git a/src/website_profiling/tools/export_crawl_workbook.py b/src/website_profiling/tools/export_crawl_workbook.py index 87bbdb2..6fd46d1 100644 --- a/src/website_profiling/tools/export_crawl_workbook.py +++ b/src/website_profiling/tools/export_crawl_workbook.py @@ -67,7 +67,7 @@ def build_crawl_workbook_zip(report_payload: dict[str, Any]) -> bytes: if isinstance(link_edges, list) and link_edges: edge_cols = [ "from_url", "to_url", "anchor_text", "rel", - "is_nofollow", "is_sponsored", "is_ugc", "link_type", + "is_nofollow", "is_sponsored", "is_ugc", "link_type", "position", ] zf.writestr("links.csv", _csv_bytes(link_edges, edge_cols)) diff --git a/tests/reporting/test_crawl_segments.py b/tests/reporting/test_crawl_segments.py index 882272b..d70bb54 100644 --- a/tests/reporting/test_crawl_segments.py +++ b/tests/reporting/test_crawl_segments.py @@ -3,9 +3,148 @@ import pandas as pd -from website_profiling.reporting.crawl_segments import build_crawl_segments +from website_profiling.reporting.crawl_segments import ( + _is_regex, + _matches_path, + _segment_health, + build_crawl_segments, +) +# --------------------------------------------------------------------------- +# _is_regex +# --------------------------------------------------------------------------- + +def test_is_regex_plain_prefix() -> None: + assert _is_regex("/blog") is False + assert _is_regex("/api/v1") is False + assert _is_regex("/products-new") is False + + +def test_is_regex_dotstar_pattern() -> None: + assert _is_regex("/blog/.*") is True + assert _is_regex("/api/.*") is True + + +def test_is_regex_dotplus_pattern() -> None: + assert _is_regex("/api/.+") is True + + +def test_is_regex_shorthand_class() -> None: + assert _is_regex(r"/api/v\d+") is True + assert _is_regex(r"/path/\w+") is True + + +def test_is_regex_character_class() -> None: + assert _is_regex("/api/[v][0-9]") is True + + +def test_is_regex_group() -> None: + assert _is_regex("/(blog|news)/") is True + + +def test_is_regex_dollar_anchor() -> None: + assert _is_regex("/blog$") is True + + +def test_is_regex_single_dot_not_flagged() -> None: + """A plain dot in a path like /api/v1.0 should NOT be treated as regex.""" + assert _is_regex("/api/v1.0") is False + + +# --------------------------------------------------------------------------- +# _matches_path +# --------------------------------------------------------------------------- + +def test_matches_path_prefix_exact() -> None: + assert _matches_path("/blog", "/blog", False, "/blog") is True + + +def test_matches_path_prefix_child() -> None: + assert _matches_path("/blog/post-1", "/blog", False, "/blog") is True + + +def test_matches_path_prefix_no_match() -> None: + assert _matches_path("/about", "/blog", False, "/blog") is False + assert _matches_path("/blogger", "/blog", False, "/blog") is False + + +def test_matches_path_regex() -> None: + import re + pattern = "/api/.*" + compiled = re.compile(pattern) + assert _matches_path("/api/v1/users", pattern, True, compiled) is True + assert _matches_path("/about", pattern, True, compiled) is False + + +# --------------------------------------------------------------------------- +# _segment_health +# --------------------------------------------------------------------------- + +def test_segment_health_all_ok() -> None: + df = pd.DataFrame([ + {"url": "https://ex.com/a", "status": 200, "title": "A", "description": "desc"}, + {"url": "https://ex.com/b", "status": 200, "title": "B", "description": "desc"}, + ]) + assert _segment_health(df) == 100 + + +def test_segment_health_empty_df() -> None: + assert _segment_health(pd.DataFrame()) == 0 + + +def test_segment_health_error_status_deduction() -> None: + """50% 4xx → deducts 15 pts (50% of 30).""" + df = pd.DataFrame([ + {"status": 200}, {"status": 200}, + {"status": 404}, {"status": 404}, + ]) + score = _segment_health(df) + assert score == 85 # 100 - round(30 * 0.5) = 85 + + +def test_segment_health_missing_title_deduction() -> None: + """All titles missing → full 20-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": ""} for _ in range(5)]) + score = _segment_health(df) + assert score == 80 # 100 - 20 + + +def test_segment_health_missing_description_deduction() -> None: + """All descriptions missing → full 10-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": "T", "description": ""} for _ in range(5)]) + score = _segment_health(df) + assert score == 90 # 100 - 10 + + +def test_segment_health_missing_viewport_deduction() -> None: + """All viewport missing → full 10-pt deduction.""" + df = pd.DataFrame([{"status": 200, "title": "T", "viewport_present": False} for _ in range(5)]) + score = _segment_health(df) + assert score == 90 # 100 - 10 + + +def test_segment_health_clamped_to_zero() -> None: + """Multiple deductions stack: 100 - 30(status) - 20(title) - 10(desc) - 10(viewport) = 30.""" + df = pd.DataFrame([ + {"status": 500, "title": "", "description": "", "viewport_present": False} + for _ in range(10) + ]) + assert _segment_health(df) == 30 + + +def test_segment_health_small_missing_rate_no_deduction() -> None: + """Under 10% missing rate triggers no deduction.""" + rows = [{"status": 200, "title": "T", "description": "D"} for _ in range(10)] + rows[0]["title"] = "" # 10% — threshold is > 10%, so no deduction + df = pd.DataFrame(rows) + assert _segment_health(df) == 100 + + +# --------------------------------------------------------------------------- +# build_crawl_segments +# --------------------------------------------------------------------------- + def test_build_crawl_segments_groups_by_prefix() -> None: df = pd.DataFrame([ {"url": "https://example.com/blog/a"}, @@ -19,6 +158,7 @@ def test_build_crawl_segments_groups_by_prefix() -> None: seg = out["segments"][0] assert seg["prefix"] == "/blog" assert seg["url_count"] == 2 + assert seg["pattern_type"] == "prefix" def test_build_crawl_segments_empty_prefixes() -> None: @@ -34,3 +174,93 @@ def test_build_crawl_segments_handles_bad_url() -> None: out = build_crawl_segments(df, [{"id": "x", "score": 80}], ["/not-a-valid-url"]) assert out is not None assert out["segments"][0]["url_count"] == 1 + + +def test_build_crawl_segments_regex_pattern() -> None: + """Regex pattern /blog/.* matches /blog/post-1 and /blog/post-2.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/post-1"}, + {"url": "https://example.com/blog/post-2"}, + {"url": "https://example.com/about"}, + ]) + out = build_crawl_segments(df, [], ["/blog/.*"]) + assert out is not None + seg = out["segments"][0] + assert seg["url_count"] == 2 + assert seg["pattern_type"] == "regex" + + +def test_build_crawl_segments_mixed_prefix_and_regex() -> None: + """Mixed literal prefix and regex in the same list.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/a", "status": 200, "title": "T"}, + {"url": "https://example.com/api/v1/users", "status": 200, "title": "T"}, + {"url": "https://example.com/api/v2/items", "status": 200, "title": "T"}, + {"url": "https://example.com/about", "status": 200, "title": "T"}, + ]) + out = build_crawl_segments(df, [], ["/blog", r"/api/v\d+"]) + assert out is not None + assert len(out["segments"]) == 2 + blog_seg = next(s for s in out["segments"] if s["prefix"] == "/blog") + api_seg = next(s for s in out["segments"] if "api" in s["prefix"]) + assert blog_seg["url_count"] == 1 + assert blog_seg["pattern_type"] == "prefix" + assert api_seg["url_count"] == 2 + assert api_seg["pattern_type"] == "regex" + + +def test_build_crawl_segments_per_segment_health_differs() -> None: + """Segments with different URL subsets get different health scores.""" + df = pd.DataFrame([ + # /good: all 200, all have titles + {"url": "https://ex.com/good/a", "status": 200, "title": "A"}, + {"url": "https://ex.com/good/b", "status": 200, "title": "B"}, + # /bad: all 500, no titles + {"url": "https://ex.com/bad/a", "status": 500, "title": ""}, + {"url": "https://ex.com/bad/b", "status": 500, "title": ""}, + ]) + out = build_crawl_segments(df, [], ["/good", "/bad"]) + assert out is not None + segs = {s["prefix"]: s for s in out["segments"]} + assert segs["/good"]["health_score"] == 100 + # /bad: 100% 500 status (−30) + 100% missing title (−20) = 50 + assert segs["/bad"]["health_score"] == 50 + + +def test_build_crawl_segments_invalid_regex_falls_back_to_prefix() -> None: + """An invalid regex is silently treated as a literal prefix.""" + df = pd.DataFrame([{"url": "https://example.com/[invalid"}]) + # "[invalid" looks like a regex (contains "[") but won't compile → fallback to prefix + out = build_crawl_segments(df, [], ["[invalid"]) + assert out is not None + seg = out["segments"][0] + assert seg["pattern_type"] == "prefix" + + +def test_build_crawl_segments_no_categories_overall_health_is_none() -> None: + df = pd.DataFrame([{"url": "https://example.com/blog/a"}]) + out = build_crawl_segments(df, [], ["/blog"]) + assert out is not None + assert out["overall_health"] is None + + +def test_build_crawl_segments_prefix_without_leading_slash() -> None: + """Prefixes without a leading slash get one added automatically.""" + df = pd.DataFrame([ + {"url": "https://example.com/blog/post"}, + {"url": "https://example.com/about"}, + ]) + out = build_crawl_segments(df, [], ["blog"]) + assert out is not None + seg = out["segments"][0] + assert seg["prefix"] == "/blog" + assert seg["url_count"] == 1 + + +def test_build_crawl_segments_zero_match() -> None: + """Segment with no matching URLs gets health_score of 0.""" + df = pd.DataFrame([{"url": "https://example.com/about"}]) + out = build_crawl_segments(df, [], ["/blog"]) + assert out is not None + assert out["segments"][0]["url_count"] == 0 + assert out["segments"][0]["health_score"] == 0 diff --git a/tests/test_config_schema_keys.py b/tests/test_config_schema_keys.py index bfcd483..f0417dd 100644 --- a/tests/test_config_schema_keys.py +++ b/tests/test_config_schema_keys.py @@ -30,6 +30,7 @@ "crawl_url_list", "crawl_user_agent_preset", "crawl_user_agent_custom", + "compare_mobile_desktop", "crawl_auth_username", "crawl_auth_password", "crawl_extra_headers", diff --git a/tests/test_crawl_pause_resume.py b/tests/test_crawl_pause_resume.py new file mode 100644 index 0000000..55f2631 --- /dev/null +++ b/tests/test_crawl_pause_resume.py @@ -0,0 +1,534 @@ +"""Tests for crawl pause/resume: frontier serialisation, pause state DB helpers, +and the pause/resume flow in run_crawler.""" +from __future__ import annotations + +import json +import os +import threading +from queue import Queue +from typing import Any +from unittest.mock import MagicMock, patch, call + +import pandas as pd +import pytest + + +# --------------------------------------------------------------------------- +# CrawlFrontier.serialize_state / restore_from_state +# --------------------------------------------------------------------------- + +def _make_frontier() -> Any: + from website_profiling.crawl.frontier import CrawlFrontier + + with patch("website_profiling.crawl.frontier.load_robots", return_value=None): + f = CrawlFrontier("https://example.com", ignore_robots=True) + return f + + +def test_serialize_state_empty(): + f = _make_frontier() + state = f.serialize_state() + assert state["pending"] == [] + assert state["visited"] == [] + assert state["depths"] == {} + + +def test_serialize_state_captures_pending_and_visited(): + f = _make_frontier() + f.queue.put("https://example.com/a") + f.queue.put("https://example.com/b") + f.depths["https://example.com/a"] = 0 + f.depths["https://example.com/b"] = 1 + f.visited.add("https://example.com/visited") + + state = f.serialize_state() + assert set(state["pending"]) == {"https://example.com/a", "https://example.com/b"} + assert "https://example.com/visited" in state["visited"] + assert state["depths"]["https://example.com/a"] == 0 + assert state["depths"]["https://example.com/b"] == 1 + + +def test_restore_from_state_populates_frontier(): + f = _make_frontier() + state = { + "pending": ["https://example.com/x", "https://example.com/y"], + "visited": ["https://example.com/z"], + "depths": {"https://example.com/x": 0, "https://example.com/y": 1}, + } + f.restore_from_state(state) + + assert not f.queue.empty() + items = list(f.queue.queue) + assert set(items) == {"https://example.com/x", "https://example.com/y"} + assert "https://example.com/z" in f.visited + assert f.depths["https://example.com/x"] == 0 + + +def test_restore_from_state_empty_state(): + f = _make_frontier() + f.restore_from_state({}) + assert f.queue.empty() + assert len(f.visited) == 0 + assert len(f.depths) == 0 + + +def test_serialize_restore_roundtrip(): + f = _make_frontier() + f.queue.put("https://example.com/page") + f.depths["https://example.com/page"] = 2 + f.visited.add("https://example.com/done") + + state = f.serialize_state() + serialised = json.dumps(state) # must be JSON-serialisable + + f2 = _make_frontier() + f2.restore_from_state(json.loads(serialised)) + assert list(f2.queue.queue) == ["https://example.com/page"] + assert "https://example.com/done" in f2.visited + + +# --------------------------------------------------------------------------- +# crawl_store: save_pause_state / load_pause_state / clear_pause_state +# --------------------------------------------------------------------------- + +def _mock_conn(): + conn = MagicMock() + conn.execute.return_value = MagicMock() + return conn + + +def test_save_pause_state_executes_update(): + from website_profiling.db.crawl_store import save_pause_state + + conn = _mock_conn() + state = {"pending": ["https://example.com/a"], "visited": [], "depths": {}} + save_pause_state(conn, 42, state) + + args = conn.execute.call_args + sql = args[0][0] + assert "UPDATE crawl_runs SET pause_state" in sql + assert "paused_at" in sql + conn.commit.assert_called_once() + + +def test_load_pause_state_returns_dict(): + from website_profiling.db.crawl_store import load_pause_state + + state = {"pending": ["https://example.com/a"], "visited": [], "depths": {}} + row = MagicMock() + row.__getitem__ = lambda self, k: json.dumps(state) if k == "pause_state" else None + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = row + + result = load_pause_state(conn, 42) + assert result == state + + +def test_load_pause_state_returns_none_when_null(): + from website_profiling.db.crawl_store import load_pause_state + + row = MagicMock() + row.__getitem__ = lambda self, k: None + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = row + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_returns_none_when_no_row(): + from website_profiling.db.crawl_store import load_pause_state + + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = None + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_returns_none_on_exception(): + from website_profiling.db.crawl_store import load_pause_state + + conn = _mock_conn() + conn.execute.side_effect = Exception("db error") + + assert load_pause_state(conn, 42) is None + + +def test_load_pause_state_accepts_dict_value(): + """Column value already a dict (psycopg JSONB auto-parse).""" + from website_profiling.db.crawl_store import load_pause_state + + state = {"pending": [], "visited": [], "depths": {}} + row = MagicMock() + row.__getitem__ = lambda self, k: state if k == "pause_state" else None + conn = _mock_conn() + conn.execute.return_value.fetchone.return_value = row + + result = load_pause_state(conn, 7) + assert result == state + + +def test_clear_pause_state_executes_update(): + from website_profiling.db.crawl_store import clear_pause_state + + conn = _mock_conn() + clear_pause_state(conn, 42) + + args = conn.execute.call_args + sql = args[0][0] + assert "pause_state = NULL" in sql + conn.commit.assert_called_once() + + +def test_clear_pause_state_swallows_exception(): + from website_profiling.db.crawl_store import clear_pause_state + + conn = _mock_conn() + conn.execute.side_effect = Exception("db down") + clear_pause_state(conn, 42) # must not raise + + +# --------------------------------------------------------------------------- +# Crawler.__init__ restore_from_state branch (line 255 coverage) +# --------------------------------------------------------------------------- + +def test_crawler_init_restores_pause_state(monkeypatch): + """Passing pause_state to Crawler.__init__ calls frontier.restore_from_state.""" + import website_profiling.crawl.crawler as mod + + restored = {} + + class _FakeFrontier: + queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + pass + + def restore_from_state(self, state): + restored["state"] = state + + def seed_initial_urls(self, **kw): + pass + + pause_state = {"pending": ["https://example.com/p"], "visited": [], "depths": {}} + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=MagicMock()), + ): + c = mod.Crawler("https://example.com", pause_state=pause_state) + + assert restored.get("state") == pause_state + + +# --------------------------------------------------------------------------- +# _PAUSE_EVENT and pause file check in crawl loop +# --------------------------------------------------------------------------- + +def test_pause_event_is_set_by_pause_file(tmp_path, monkeypatch): + """Crawler.crawl() detects a pause file written to TMPDIR and marks paused=True.""" + import website_profiling.crawl.crawler as mod + from website_profiling.crawl.schema import empty_crawl_row + + monkeypatch.setenv("TMPDIR", str(tmp_path)) + mod._PAUSE_EVENT.clear() + + pid = os.getpid() + flag = tmp_path / f"wp_pause_{pid}.flag" + flag.write_text("") # write BEFORE crawl starts + + # Minimal real Crawler setup — mocked frontier with one URL queued. + class _FakeFrontier: + queue: Queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + self.queue.put("https://example.com/") + self.depths["https://example.com/"] = 0 + + def should_skip_dequeued(self, url): + return False + + def mark_visited(self, url): + if url in self.visited: + return False + self.visited.add(url) + return True + + def seed_initial_urls(self, **kw): + pass + + def serialize_state(self): + return {"pending": [], "visited": [], "depths": {}} + + fake_result = empty_crawl_row(status=200) + fake_result["url"] = "https://example.com/" + + mock_fetcher = MagicMock() + mock_fetcher.fetch.return_value = MagicMock( + url="https://example.com/", + final_url="https://example.com/", + status_code=200, + text="", + content_type="text/html", + fetch_method="static", + console_messages=[], + failed_requests=[], + ) + mock_fetcher.close = MagicMock() + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=mock_fetcher), + patch.object(mod.Crawler, "worker", return_value=fake_result), + ): + crawler = mod.Crawler("https://example.com", max_pages=10) + df = crawler.crawl(show_progress=False) + + assert crawler.paused is True + assert not flag.exists() # file was deleted + mod._PAUSE_EVENT.clear() + + +def test_pause_loop_os_unlink_error_is_swallowed(tmp_path, monkeypatch): + """OSError from os.unlink during pause-file cleanup is silently swallowed.""" + import website_profiling.crawl.crawler as mod + from website_profiling.crawl.schema import empty_crawl_row + + monkeypatch.setenv("TMPDIR", str(tmp_path)) + mod._PAUSE_EVENT.clear() + + pid = os.getpid() + flag = tmp_path / f"wp_pause_{pid}.flag" + flag.write_text("") + + class _FakeFrontier: + queue: Queue = Queue() + visited: set = set() + depths: dict = {} + lock = threading.Lock() + rp = None + + def __init__(self, *a, **kw): + self.queue.put("https://example.com/") + self.depths["https://example.com/"] = 0 + + def should_skip_dequeued(self, url): + return False + + def mark_visited(self, url): + if url in self.visited: + return False + self.visited.add(url) + return True + + def seed_initial_urls(self, **kw): + pass + + def serialize_state(self): + return {"pending": [], "visited": [], "depths": {}} + + fake_result = empty_crawl_row(status=200) + fake_result["url"] = "https://example.com/" + + mock_fetcher = MagicMock() + mock_fetcher.close = MagicMock() + + with ( + patch.object(mod, "CrawlFrontier", _FakeFrontier), + patch.object(mod, "build_fetcher", return_value=mock_fetcher), + patch.object(mod.Crawler, "worker", return_value=fake_result), + patch("os.unlink", side_effect=OSError("permission denied")), + ): + crawler = mod.Crawler("https://example.com", max_pages=10) + df = crawler.crawl(show_progress=False) + + assert crawler.paused is True + mod._PAUSE_EVENT.clear() + + +# --------------------------------------------------------------------------- +# run_crawler: pause saves state and calls sys.exit(2) +# --------------------------------------------------------------------------- + +def _patch_crawler_paused(monkeypatch, pause_state_to_save=None): + """Return a fake Crawler class whose crawl() immediately marks itself paused.""" + import website_profiling.crawl.crawler as mod + + class _FakeCrawler: + paused = True + results = [{"url": "https://example.com/a"}] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + self.frontier.serialize_state.return_value = pause_state_to_save or { + "pending": ["https://example.com/b"], + "visited": ["https://example.com/a"], + "depths": {"https://example.com/b": 1}, + } + + def crawl(self, **kw): + return pd.DataFrame(self.results) + + return _FakeCrawler + + +def _db_session_cm(conn): + """Return a callable context-manager mock that yields *conn*.""" + from contextlib import contextmanager + + @contextmanager + def _cm(): + yield conn + + return _cm + + +def test_run_crawler_pause_saves_state_and_exits(monkeypatch): + import website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + FakeCrawler = _patch_crawler_paused(monkeypatch) + + saved = {} + + def _fake_save(conn, run_id, state): + saved["run_id"] = run_id + saved["state"] = state + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", FakeCrawler), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch.object(db_pkg, "create_crawl_run", return_value=7), + patch.object(db_pkg, "backup_db_if_exists", return_value=None), + patch.object(db_pkg, "read_historical_data", return_value={}), + patch.object(db_pkg, "restore_historical_data", MagicMock()), + patch("website_profiling.db.storage.ensure_crawl_tables_cleared", MagicMock()), + patch("website_profiling.db.crawl_store.save_pause_state", _fake_save), + pytest.raises(SystemExit) as exc_info, + ): + mod.run_crawler( + start_url="https://example.com", + output_db=True, + crawl_stream_to_db=True, + ) + + assert exc_info.value.code == 2 + assert saved.get("run_id") == 7 + assert "pending" in saved.get("state", {}) + + +def test_run_crawler_pause_no_stream_run_id_still_exits(monkeypatch): + """When streaming wasn't used (stream_run_id=None) pause still calls sys.exit(2).""" + import website_profiling.crawl.crawler as mod + + FakeCrawler = _patch_crawler_paused(monkeypatch) + + with ( + patch.object(mod, "Crawler", FakeCrawler), + pytest.raises(SystemExit) as exc_info, + ): + mod.run_crawler( + start_url="https://example.com", + output_db=False, + ) + + assert exc_info.value.code == 2 + + +# --------------------------------------------------------------------------- +# run_crawler: resume loads state and clears it on success +# --------------------------------------------------------------------------- + +def test_run_crawler_resume_loads_and_clears_state(monkeypatch): + import website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + pause_state = { + "pending": ["https://example.com/b"], + "visited": ["https://example.com/a"], + "depths": {"https://example.com/b": 1}, + "pages_crawled": 1, + } + cleared = {} + + def _fake_load(conn, run_id): + return pause_state + + def _fake_clear(conn, run_id): + cleared["run_id"] = run_id + + class _FakeCrawlerNotPaused: + paused = False + results = [] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + self._pause_state = kw.get("pause_state") + + def crawl(self, **kw): + return pd.DataFrame() + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", _FakeCrawlerNotPaused), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch("website_profiling.db.crawl_store.load_pause_state", _fake_load), + patch("website_profiling.db.crawl_store.clear_pause_state", _fake_clear), + ): + mod.run_crawler( + start_url="https://example.com", + output_db=False, + resume_run_id=42, + ) + + assert cleared.get("run_id") == 42 + + +def test_run_crawler_resume_with_no_saved_state(monkeypatch): + """If no pause state exists for resume_run_id the crawler starts fresh.""" + import website_profiling.crawl.crawler as mod + import website_profiling.db as db_pkg + + class _FakeCrawlerFresh: + paused = False + results = [] + link_edges_accum = [] + frontier = MagicMock() + _html_buffer = [] + store_page_html = False + + def __init__(self, *a, **kw): + assert kw.get("pause_state") is None + + def crawl(self, **kw): + return pd.DataFrame() + + mock_conn = MagicMock() + + with ( + patch.object(mod, "Crawler", _FakeCrawlerFresh), + patch.object(db_pkg, "db_session", _db_session_cm(mock_conn)), + patch("website_profiling.db.crawl_store.load_pause_state", return_value=None), + patch("website_profiling.db.crawl_store.clear_pause_state"), + ): + mod.run_crawler( + start_url="https://example.com", + output_db=False, + resume_run_id=99, + ) diff --git a/tests/test_crawler_deep.py b/tests/test_crawler_deep.py index 6ad95a4..e51f341 100644 --- a/tests/test_crawler_deep.py +++ b/tests/test_crawler_deep.py @@ -662,3 +662,83 @@ def enqueue_html(self, record: dict) -> None: assert len(writer.records) == 1 assert writer.records[0]["url"] == "https://site.com" + +def test_run_crawler_compare_mobile_desktop_second_pass(monkeypatch): + """compare_mobile_desktop=True triggers a second crawl and links the run IDs.""" + import website_profiling.crawl.crawler as mod + + crawl_calls: list[dict] = [] + + class FakeCrawler: + def __init__(self, **_kwargs): + self.link_edges_accum = [] + self.store_page_html = False + + def crawl(self, **_kwargs): + return pd.DataFrame([{"url": "https://a.com", "status": 200, "title": "ok"}]) + + run_id_seq = iter([7, 8]) + + class _Ctx: + def __init__(self): + self._conn = object() + + def __enter__(self): + return self._conn + + def __exit__(self, _t, _v, _tb): + return False + + linked: list[tuple] = [] + fake_set_mobile = lambda conn, d, m: linked.append((d, m)) + + def fake_get_latest(conn): + return next(run_id_seq) + + fake_db = types.SimpleNamespace( + backup_db_if_exists=lambda: None, + create_crawl_run=lambda *_a, **_k: 7, + db_session=lambda: _Ctx(), + read_historical_data=lambda: {}, + restore_historical_data=lambda *_a, **_k: None, + write_crawl=lambda conn, df, crawl_run_id=None: None, + ) + fake_storage = types.SimpleNamespace(ensure_crawl_tables_cleared=lambda *_a, **_k: None) + monkeypatch.setattr(mod, "Crawler", FakeCrawler) + monkeypatch.setitem(__import__("sys").modules, "website_profiling.db", fake_db) + monkeypatch.setitem(__import__("sys").modules, "website_profiling.db.storage", fake_storage) + + import website_profiling.db.crawl_store as cs_mod + + monkeypatch.setattr(cs_mod, "get_latest_crawl_run_id", fake_get_latest) + monkeypatch.setattr(cs_mod, "set_mobile_run_id", fake_set_mobile) + + # Patch run_crawler itself for the recursive call to avoid double setup + second_calls: list[dict] = [] + + original_run = mod.run_crawler + + def patched_run(start_url="", **kwargs): + if kwargs.get("compare_mobile_desktop") is False and kwargs.get("crawl_user_agent_preset") == "mobile": + second_calls.append({"start_url": start_url, **kwargs}) + return pd.DataFrame([{"url": "https://a.com", "status": 200}]) + return original_run(start_url, **kwargs) + + monkeypatch.setattr(mod, "run_crawler", patched_run) + + mod.run_crawler( + "https://a.com", + output_db=True, + crawl_stream_to_db=False, + max_pages=5, + preserve_crawl_history=True, + show_progress=False, + compare_mobile_desktop=True, + ) + + assert len(second_calls) == 1 + assert second_calls[0]["crawl_user_agent_preset"] == "mobile" + assert second_calls[0]["compare_mobile_desktop"] is False + # set_mobile_run_id was called + assert linked and linked[0][0] == 7 + diff --git a/tests/test_link_edges.py b/tests/test_link_edges.py index eca30c8..1d17a8d 100644 --- a/tests/test_link_edges.py +++ b/tests/test_link_edges.py @@ -2,6 +2,7 @@ from __future__ import annotations from website_profiling.common import parse_link_edges, parse_links +from website_profiling.reporting.link_edges_report import build_inlink_anchor_matrix def test_parse_link_edges_anchor_and_rel(): @@ -24,6 +25,124 @@ def test_parse_link_edges_anchor_and_rel(): assert external["link_type"] == "external" +def test_parse_link_edges_position_defaults_to_content(): + html = '
Page
' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "content" + + +def test_parse_link_edges_position_nav_element(): + html = '' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "nav" + + +def test_parse_link_edges_position_footer_element(): + html = '' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "footer" + + +def test_parse_link_edges_position_header_element(): + html = '
Home
' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "header" + + +def test_parse_link_edges_position_aside_element(): + html = '' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "sidebar" + + +def test_parse_link_edges_position_aria_role_navigation(): + html = '
Nav
' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "nav" + + +def test_parse_link_edges_position_aria_role_contentinfo(): + html = '
Footer
' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "footer" + + +def test_parse_link_edges_position_class_heuristic_sidebar(): + html = '' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "sidebar" + + +def test_parse_link_edges_position_id_heuristic_footer(): + html = '' + _, edges = parse_link_edges("https://example.com/", html) + assert edges[0]["position"] == "footer" + + +def test_parse_link_edges_position_semantic_beats_class(): + """