diff --git a/input.txt.example b/input.txt.example
index 7494b55..14f9f88 100644
--- a/input.txt.example
+++ b/input.txt.example
@@ -96,6 +96,8 @@ enable_duplicate_detection = true
enable_language_detection = true
analysis_fuzzy_threshold = 92
analysis_simhash_hamming = 0
+analysis_simhash_max_urls = 800
+analysis_fuzzy_max_urls = 600
analysis_dup_max_pages = 2000
# --- Audit steps ---
diff --git a/pipeline-config.example.txt b/pipeline-config.example.txt
index d1957b9..ab60161 100644
--- a/pipeline-config.example.txt
+++ b/pipeline-config.example.txt
@@ -97,6 +97,8 @@ enable_duplicate_detection = true
enable_language_detection = true
analysis_fuzzy_threshold = 92
analysis_simhash_hamming = 0
+analysis_simhash_max_urls = 800
+analysis_fuzzy_max_urls = 600
analysis_dup_max_pages = 2000
# --- Pipeline ---
diff --git a/src/website_profiling/analysis/local.py b/src/website_profiling/analysis/local.py
index c55adf1..807a9b6 100644
--- a/src/website_profiling/analysis/local.py
+++ b/src/website_profiling/analysis/local.py
@@ -97,9 +97,11 @@ def _import_langdetect():
def compute_duplicate_groups(
df: pd.DataFrame,
cfg: dict[str, str] | None,
-) -> tuple[list[dict[str, Any]], dict[str, str]]:
+) -> tuple[list[dict[str, Any]], dict[str, str], list[str]]:
if df.empty or not _cfg_bool(cfg, "enable_duplicate_detection", False):
- return [], {}
+ return [], {}, []
+
+ warnings: list[str] = []
success = df[df["status"].astype(str).str.match(r"2\d{2}", na=False)] if "status" in df.columns else df
if "content_type" in success.columns:
@@ -126,6 +128,8 @@ def compute_duplicate_groups(
fuzz = _import_rapidfuzz()
fuzzy_threshold = _cfg_int(cfg, "analysis_fuzzy_threshold", 92) or 92
hamming_max = _cfg_int(cfg, "analysis_simhash_hamming", 0) or 0
+ simhash_max_urls = _cfg_int(cfg, "analysis_simhash_max_urls", 800) or 800
+ fuzzy_max_urls = _cfg_int(cfg, "analysis_fuzzy_max_urls", 600) or 600
parent: dict[str, str] = {}
@@ -150,20 +154,30 @@ def union(a: str, b: str) -> None:
for m in members[1:]:
union(base, m)
- if hamming_max > 0 and len(urls) <= 800:
+ if hamming_max > 0 and len(urls) <= simhash_max_urls:
sh_list = [(u, url_to_sh[u]) for u in urls]
for i, (u1, h1) in enumerate(sh_list):
for u2, h2 in sh_list[i + 1 :]:
if _hamming(h1, h2) <= hamming_max:
union(u1, u2)
+ elif hamming_max > 0 and len(urls) > simhash_max_urls:
+ warnings.append(
+ f"Duplicate detection: SimHash similarity skipped for {len(urls)} URLs "
+ f"(cap {simhash_max_urls}); results may be incomplete."
+ )
- if len(urls) <= 600:
+ if len(urls) <= fuzzy_max_urls:
for i, u1 in enumerate(urls):
fp1 = url_to_fp.get(u1, "")
for u2 in urls[i + 1 :]:
fp2 = url_to_fp.get(u2, "")
if fp1 and fp2 and fuzz.token_set_ratio(fp1, fp2) >= fuzzy_threshold:
union(u1, u2)
+ elif len(urls) > fuzzy_max_urls:
+ warnings.append(
+ f"Duplicate detection: fuzzy title matching skipped for {len(urls)} URLs "
+ f"(cap {fuzzy_max_urls}); results may be incomplete."
+ )
clusters: dict[str, list[str]] = defaultdict(list)
for u in urls:
@@ -196,7 +210,7 @@ def union(a: str, b: str) -> None:
if gid >= max_groups:
break
- return groups_out[:max_groups], url_to_gid
+ return groups_out[:max_groups], url_to_gid, warnings
def compute_language_signals(df: pd.DataFrame, cfg: dict[str, str] | None) -> tuple[dict[str, str], dict[str, Any]]:
@@ -243,9 +257,10 @@ def run_local_enrichment(df: pd.DataFrame, cfg: dict[str, str] | None) -> dict[s
return bundle
try:
- dups, url_gid = compute_duplicate_groups(df, cfg)
+ dups, url_gid, dup_warnings = compute_duplicate_groups(df, cfg)
bundle["content_duplicates"] = dups
bundle["url_duplicate_group_id"] = url_gid
+ bundle["ml_errors"].extend(dup_warnings)
except ImportError as e:
bundle["ml_errors"].append(str(e))
diff --git a/src/website_profiling/crawl/crawler.py b/src/website_profiling/crawl/crawler.py
index a00b4c9..c3b9076 100644
--- a/src/website_profiling/crawl/crawler.py
+++ b/src/website_profiling/crawl/crawler.py
@@ -5,7 +5,7 @@
import json
import time
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from typing import Optional
import pandas as pd
@@ -449,6 +449,10 @@ def crawl(
continue
futures.append(ex.submit(self.worker, url))
+ if futures and self.queue.empty():
+ # Block until at least one future completes instead of busy-polling.
+ wait(futures, return_when=FIRST_COMPLETED)
+
remaining = []
for f in futures:
if f.done():
@@ -471,7 +475,6 @@ def crawl(
else:
remaining.append(f)
futures = remaining
- time.sleep(0.01)
if self.queue.empty() and not futures:
break
diff --git a/src/website_profiling/crawl/db_writer.py b/src/website_profiling/crawl/db_writer.py
index 302ba7f..a5f108d 100644
--- a/src/website_profiling/crawl/db_writer.py
+++ b/src/website_profiling/crawl/db_writer.py
@@ -21,10 +21,12 @@ def __init__(self, crawl_run_id: int, batch_size: int = 500, *, store_page_html:
self._error: Optional[BaseException] = None
def enqueue(self, record: dict) -> None:
+ if self._error is not None:
+ return
self._queue.put(("crawl", record))
def enqueue_html(self, record: dict) -> None:
- if not self.store_page_html:
+ if not self.store_page_html or self._error is not None:
return
self._queue.put(("html", record))
diff --git a/src/website_profiling/crawl/fetchers/browser.py b/src/website_profiling/crawl/fetchers/browser.py
index b3c3678..4d6a7d6 100644
--- a/src/website_profiling/crawl/fetchers/browser.py
+++ b/src/website_profiling/crawl/fetchers/browser.py
@@ -263,25 +263,26 @@ async def worker() -> None:
workers = [asyncio.create_task(worker()) for _ in range(self.js_concurrency)]
self._ready.set()
- await asyncio.gather(*workers)
-
- for page in pages:
+ try:
+ await asyncio.gather(*workers)
+ finally:
+ for page in pages:
+ try:
+ await page.close()
+ except Exception:
+ pass
try:
- await page.close()
+ await context.close()
+ except Exception:
+ pass
+ try:
+ await browser.close()
+ except Exception:
+ pass
+ try:
+ await playwright.stop()
except Exception:
pass
- try:
- await context.close()
- except Exception:
- pass
- try:
- await browser.close()
- except Exception:
- pass
- try:
- await playwright.stop()
- except Exception:
- pass
def _diagnostics_enabled(self) -> bool:
return self.capture_console or self.capture_failed_requests
diff --git a/src/website_profiling/crawl/fetchers/static.py b/src/website_profiling/crawl/fetchers/static.py
index 8059e56..7477dcc 100644
--- a/src/website_profiling/crawl/fetchers/static.py
+++ b/src/website_profiling/crawl/fetchers/static.py
@@ -49,7 +49,7 @@ def fetch(self, url: str) -> FetchResult:
redirect_chain_length=redirect_chain_length,
fetch_method="static",
)
- except Exception:
+ except requests.RequestException:
return FetchResult(
status=None,
content_type=None,
diff --git a/src/website_profiling/db/historical.py b/src/website_profiling/db/historical.py
index b1e6279..cb71baf 100644
--- a/src/website_profiling/db/historical.py
+++ b/src/website_profiling/db/historical.py
@@ -4,14 +4,17 @@
import json
import os
import subprocess
+import sys
import time
from pathlib import Path
from typing import Any, Optional
import pandas as pd
from psycopg import Connection
+from psycopg.sql import SQL, Identifier
from urllib.parse import urlparse
+from ..console_io import console_print
from ._common import (
_executemany,
_json_val,
@@ -33,17 +36,26 @@ def backup_db_if_exists(skip_in_ci: bool = True) -> Optional[str]:
suffix = time.strftime("%Y%m%d-%H%M%S")
out_path = backup_dir / f"website_profiling-{suffix}.dump"
try:
+ parsed = urlparse(get_database_url())
+ pg_env = {**os.environ}
+ if parsed.hostname:
+ pg_env["PGHOST"] = parsed.hostname
+ if parsed.port:
+ pg_env["PGPORT"] = str(parsed.port)
+ if parsed.username:
+ pg_env["PGUSER"] = parsed.username
+ if parsed.password:
+ pg_env["PGPASSWORD"] = parsed.password
+ dbname = (parsed.path or "").lstrip("/")
+ cmd = ["pg_dump", "-Fc", "-f", str(out_path)]
+ if dbname:
+ cmd.append(dbname)
subprocess.run(
- [
- "pg_dump",
- "-Fc",
- "-f",
- str(out_path),
- get_database_url(),
- ],
+ cmd,
check=True,
capture_output=True,
timeout=300,
+ env=pg_env,
)
return str(out_path)
except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
@@ -72,12 +84,18 @@ def read_historical_data() -> dict[str, list]:
for table in tables:
try:
with conn.cursor() as cur:
- cur.execute(f"SELECT * FROM {table}")
+ cur.execute(SQL("SELECT * FROM {}").format(Identifier(table)))
result[table] = [dict(row) for row in cur.fetchall()]
- except Exception:
- pass
- except Exception:
- pass
+ except Exception as e:
+ console_print(
+ f" Warning: could not read historical table '{table}': {e}",
+ file=sys.stderr,
+ )
+ except Exception as e:
+ console_print(
+ f" Warning: could not read historical data (a DB backup is still taken before any overwrite): {e}",
+ file=sys.stderr,
+ )
return result
diff --git a/src/website_profiling/db/report_store.py b/src/website_profiling/db/report_store.py
index ffd794c..431d9ae 100644
--- a/src/website_profiling/db/report_store.py
+++ b/src/website_profiling/db/report_store.py
@@ -6,6 +6,7 @@
from psycopg import Connection
+from ..scoring import round_half_up
from ._common import _json_val, _now_iso, _parse_row_json, _row_field
from .crawl_store import get_crawl_run_info
@@ -51,7 +52,7 @@ def _write_audit_health_snapshot(
for c in categories
if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
]
- health_score = round(sum(scores) / len(scores)) if scores else None
+ health_score = round_half_up(sum(scores) / len(scores)) if scores else None
category_scores: dict[str, float] = {}
issue_counts = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0}
for cat in categories:
diff --git a/src/website_profiling/integrations/crux/fetch.py b/src/website_profiling/integrations/crux/fetch.py
index a34ef25..df2ea4c 100644
--- a/src/website_profiling/integrations/crux/fetch.py
+++ b/src/website_profiling/integrations/crux/fetch.py
@@ -49,9 +49,18 @@ def fetch_crux_origin_metrics(origin_or_url: str, api_key: str | None = None) ->
lcp = parsed["metrics"].get("largest_contentful_paint", {}).get("p75")
inp = parsed["metrics"].get("interaction_to_next_paint", {}).get("p75")
cls = parsed["metrics"].get("cumulative_layout_shift", {}).get("p75")
+
+ def _pass_threshold(value: Any, limit: float) -> bool:
+ if value is None:
+ return False
+ try:
+ return float(value) <= limit
+ except (TypeError, ValueError):
+ return False
+
parsed["pass"] = {
- "lcp": lcp is not None and float(lcp) <= 2500,
- "inp": inp is not None and float(inp) <= 200,
- "cls": cls is not None and float(cls) <= 0.1,
+ "lcp": _pass_threshold(lcp, 2500),
+ "inp": _pass_threshold(inp, 200),
+ "cls": _pass_threshold(cls, 0.1),
}
return parsed
diff --git a/src/website_profiling/integrations/google/keyword_enrich.py b/src/website_profiling/integrations/google/keyword_enrich.py
index 10a1f77..f592051 100644
--- a/src/website_profiling/integrations/google/keyword_enrich.py
+++ b/src/website_profiling/integrations/google/keyword_enrich.py
@@ -21,6 +21,7 @@
from __future__ import annotations
import json
+import math
import re
from datetime import datetime, timezone
from typing import Any
@@ -34,14 +35,19 @@
def ctr_as_fraction(ctr: Any) -> float:
- """GSC rows use CTR percent (2.8); normalize to fraction for comparisons."""
+ """GSC rows use CTR as percent (e.g. 2.8 for 2.8%); normalize to fraction.
+
+ Invariant: ingest always stores percent — see gsc._to_query_record / _to_page_record (* 100).
+ """
if ctr is None:
return 0.0
try:
v = float(ctr)
except (TypeError, ValueError):
return 0.0
- return v / 100.0 if v > 1 else v
+ if v > 100:
+ return 1.0
+ return v / 100.0
QUESTION_STARTS = re.compile(
r"^(how|what|why|when|where|who|can|does|is|are|should|will|do)\s", re.I
@@ -143,13 +149,15 @@ def estimate_difficulty(kw: str, gsc_row: dict | None, branded: bool = False) ->
# ── CTR curve ─────────────────────────────────────────────────────────────────
def opportunity_clicks(impressions: int, current_pos: float, target_pos: int = 3) -> int:
- cur_ctr = CTR_CURVE.get(round(current_pos), CTR_CURVE_DEFAULT)
+ pos_slot = max(1, math.ceil(current_pos)) if current_pos > 0 else 1
+ cur_ctr = CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)
tgt_ctr = CTR_CURVE.get(target_pos, CTR_CURVE.get(3, 0.103))
return max(0, int((impressions or 0) * (tgt_ctr - cur_ctr)))
def industry_ctr(pos: float) -> float:
- return CTR_CURVE.get(round(pos), CTR_CURVE_DEFAULT)
+ pos_slot = max(1, math.ceil(pos)) if pos > 0 else 1
+ return CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)
# ── Cannibalisation ───────────────────────────────────────────────────────────
diff --git a/src/website_profiling/integrations/serp/estimates.py b/src/website_profiling/integrations/serp/estimates.py
index e344c09..ab81db0 100644
--- a/src/website_profiling/integrations/serp/estimates.py
+++ b/src/website_profiling/integrations/serp/estimates.py
@@ -6,6 +6,10 @@
import urllib.request
from typing import Any
+_MAX_ORGANIC = 10
+_MAX_FEATURES = 4
+_RAW_MAX = _MAX_ORGANIC * 8 + _MAX_FEATURES * 12 # 128
+
def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
"""Fetch SERP metadata from SerpAPI (Estimated competition proxy)."""
@@ -38,13 +42,14 @@ def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
if data.get("top_stories"):
features.append("top_stories")
- competition = min(100, len(organic) * 8 + len(features) * 12)
+ raw_score = len(organic) * 8 + len(features) * 12
+ competition = min(100, round(raw_score / _RAW_MAX * 100))
return {
"ok": True,
"organic_count": len(organic),
"serp_features": features,
"estimated_competition": competition,
- "provenance": "Estimated",
+ "provenance": "Estimated (heuristic-v1)",
}
diff --git a/src/website_profiling/lighthouse/config.py b/src/website_profiling/lighthouse/config.py
index 1a6c7b0..489bf7c 100644
--- a/src/website_profiling/lighthouse/config.py
+++ b/src/website_profiling/lighthouse/config.py
@@ -50,117 +50,4 @@ def _node_cmd() -> str:
return node
-def _build_report_html_content(summary: dict[str, Any]) -> str:
- """Build report.html content (for DB or file). Returns HTML string."""
- import html as html_module
- mm = summary.get("median_metrics") or {}
- cs = summary.get("category_scores") or {}
- failures = summary.get("top_failures") or []
- raw_reports = summary.get("raw_reports") or []
- url = html_module.escape(summary.get("url", ""))
- path_summary = "summary.json"
- path_human = "human_summary.txt"
- path_diag = "diagnostics.json"
- raw_dir = "raw_runs"
- rows_fail = "".join(
- f"
| {html_module.escape(str(f.get('id', '')))} | {html_module.escape(str(f.get('impact', '')))} | {html_module.escape(str(f.get('helpText', ''))[:80])}... |
"
- for f in failures[:10]
- ) or "| None |
"
- raw_links = "".join(f"{os.path.basename(p)} " for p in raw_reports[:5])
- return f"""
-
-Lighthouse Report
-
-Lighthouse Report
-URL: {url}
-Median metrics
-
-| Metric | Value |
-| LCP (ms) | {mm.get('lcp_ms') or '—'} |
-| CLS | {mm.get('cls') or '—'} |
-| TBT (ms) | {mm.get('tbt_ms') or '—'} |
-| FCP (ms) | {mm.get('fcp_ms') or '—'} |
-
-Category scores (0–100)
-
-| Category | Score |
-| performance | {cs.get('performance') or '—'} |
-| accessibility | {cs.get('accessibility') or '—'} |
-| best-practices | {cs.get('best-practices') or '—'} |
-| seo | {cs.get('seo') or '—'} |
-| pwa | {cs.get('pwa') or '—'} |
-
-Top failures
-| Audit | Impact | Help |
{rows_fail}
-Artifacts
-summary.json | human_summary.txt | diagnostics.json
-Raw runs: {raw_links or '—'}
-
-
-"""
-
-
-def _write_report_html(output_dir: str, summary: dict[str, Any]) -> None:
- """Write report.html to output_dir (used when not using DB)."""
- content = summary.get("report_html") or _build_report_html_content(summary)
- report_path = os.path.join(output_dir, "report.html")
- with open(report_path, "w", encoding="utf-8") as f:
- f.write(content)
-
-
-def _url_safe(s: str) -> str:
- """Return a filesystem-safe slug from URL for filenames."""
- return re.sub(r"[^\w\-.]", "_", s.strip().rstrip("/"))[:80]
-
-
-def _lighthouse_cmd() -> list[str]:
- """Return argv prefix: [resolved lighthouse] or [resolved npx, -y, lighthouse]. Paths from shutil.which (portable)."""
- explicit = (os.environ.get("LIGHTHOUSE_PATH") or os.environ.get("LIGHTHOUSE_BIN") or "").strip()
- if explicit and os.path.isfile(explicit) and os.access(explicit, os.X_OK):
- return [explicit]
- lh = shutil.which("lighthouse")
- if lh is not None:
- return [lh]
- npx = shutil.which("npx")
- if npx is not None:
- return [npx, "-y", "lighthouse"]
- raise RuntimeError(_LIGHTHOUSE_INSTALL_MSG)
-
-
-def _uses_npx(cmd: list[str]) -> bool:
- base = os.path.basename(cmd[0]).lower()
- return base in ("npx", "npx.cmd")
-
-
-def is_lighthouse_available() -> bool:
- """Return True if lighthouse or npx is on PATH (so we can run Lighthouse)."""
- try:
- _lighthouse_cmd()
- return True
- except RuntimeError:
- return False
-
-
-def _preset_for_strategy(strategy: str) -> str:
- """Map user strategy 'mobile'|'desktop' to Lighthouse CLI preset. Newer Lighthouse only accepts perf, experimental, desktop."""
- s = (strategy or "mobile").lower()
- if s == "desktop":
- return "desktop"
- return "perf" # mobile -> perf (mobile-like throttling in current Lighthouse)
-
-
-# Valid Lighthouse category IDs for --only-categories
-LIGHTHOUSE_CATEGORY_IDS = {"performance", "accessibility", "best-practices", "seo", "pwa"}
-
-
-def _parse_categories(categories: str | list[str] | None) -> list[str] | None:
- """Return list of valid category IDs, or None to run all categories."""
- if categories is None:
- return None
- if isinstance(categories, str):
- categories = [c.strip().lower() for c in categories.split(",") if c.strip()]
- if not categories:
- return None
- out = [c for c in categories if c in LIGHTHOUSE_CATEGORY_IDS]
- return out if out else None
diff --git a/src/website_profiling/lighthouse/runner.py b/src/website_profiling/lighthouse/runner.py
index 3b0d0ca..39eaf52 100644
--- a/src/website_profiling/lighthouse/runner.py
+++ b/src/website_profiling/lighthouse/runner.py
@@ -21,16 +21,10 @@
_LIGHTHOUSE_INSTALL_MSG,
_LIGHTHOUSE_FLOW_MODES,
_NPX_LIGHTHOUSE_LOCK,
- _lighthouse_cmd,
_lighthouse_flow_script,
_node_cmd,
_normalize_lighthouse_mode,
- _parse_categories,
- _preset_for_strategy,
_repo_root,
- _url_safe,
- _uses_npx,
- is_lighthouse_available,
)
from .result_parser import _evidence_from_audit, extract_from_lighthouse_json, median_or_none
@@ -175,10 +169,8 @@ def run_lighthouse_flow_once(
flow_args.append("--categories=" + ",".join(categories))
if npx is not None:
cmd = [npx, "-y", "-p", "lighthouse", "-p", "puppeteer", "node", script, *flow_args]
- use_lock = True
else:
cmd = [node, script, *flow_args]
- use_lock = False
try:
run_kwargs = {
"capture_output": True,
@@ -187,10 +179,8 @@ def run_lighthouse_flow_once(
"timeout": 300,
"cwd": _repo_root(),
}
- if use_lock:
- with _NPX_LIGHTHOUSE_LOCK:
- return subprocess.run(cmd, **run_kwargs)
- return subprocess.run(cmd, **run_kwargs)
+ with _NPX_LIGHTHOUSE_LOCK:
+ return subprocess.run(cmd, **run_kwargs)
except FileNotFoundError as e:
raise RuntimeError(_LIGHTHOUSE_INSTALL_MSG) from e
@@ -205,6 +195,10 @@ def run_lighthouse_once(
wait_ms: int = 1500,
) -> subprocess.CompletedProcess:
"""Run lighthouse once; navigation uses CLI, snapshot/timespan use User Flow API."""
+ from urllib.parse import urlparse as _urlparse
+ parsed_scheme = _urlparse(url).scheme.lower()
+ if parsed_scheme not in ("http", "https"):
+ raise ValueError(f"Lighthouse URL must use http or https, got: {url!r}")
lh_mode = _normalize_lighthouse_mode(mode)
if lh_mode in _LIGHTHOUSE_FLOW_MODES:
return run_lighthouse_flow_once(
@@ -235,10 +229,8 @@ def run_lighthouse_once(
"errors": "replace",
"timeout": 300,
}
- if _uses_npx(base):
- with _NPX_LIGHTHOUSE_LOCK:
- return subprocess.run(cmd, **run_kwargs)
- return subprocess.run(cmd, **run_kwargs)
+ with _NPX_LIGHTHOUSE_LOCK:
+ return subprocess.run(cmd, **run_kwargs)
except FileNotFoundError as e:
raise RuntimeError(_LIGHTHOUSE_INSTALL_MSG) from e
diff --git a/src/website_profiling/llm/audit_summary.py b/src/website_profiling/llm/audit_summary.py
index 0943f0d..bdb786b 100644
--- a/src/website_profiling/llm/audit_summary.py
+++ b/src/website_profiling/llm/audit_summary.py
@@ -3,6 +3,8 @@
from typing import Any
+from ..scoring import round_half_up
+
def rank_issues_by_traffic(
categories: list[dict[str, Any]],
@@ -51,17 +53,10 @@ def generate_audit_executive_summary(
gsc_pages = gsc.get("pages") if isinstance(gsc, dict) else []
top_issues = rank_issues_by_traffic(categories, gsc_pages)[:5]
- lines = []
scores = [c.get("score") for c in categories if isinstance(c.get("score"), (int, float))]
- if scores:
- avg = round(sum(scores) / len(scores))
- lines.append(f"Overall audit health score: {avg}/100.")
- if top_issues:
- lines.append("Top traffic-impacting issues:")
- for i, iss in enumerate(top_issues[:3], 1):
- lines.append(f"{i}. [{iss.get('priority')}] {iss.get('message')} ({iss.get('url') or 'site-wide'})")
+ avg = round_half_up(sum(scores) / len(scores)) if scores else None
- fallback = "\n".join(lines) if lines else "No major issues detected in this audit run."
+ fallback = _deterministic_summary_text(avg, top_issues)
source = "deterministic"
priorities: list[str] = []
@@ -72,11 +67,13 @@ def generate_audit_executive_summary(
fallback = str(llm_result["summary"])
priorities = llm_result.get("priorities") or []
else:
- lines.append("(LLM summary unavailable — using deterministic summary.)")
- fallback = "\n".join(lines)
+ fallback = _deterministic_summary_text(avg, top_issues, llm_unavailable=True)
elif llm_is_enabled(cfg or {}):
- lines.append("(Enable audit executive summary in AI task settings for LLM narrative.)")
- fallback = "\n".join(lines)
+ fallback = _deterministic_summary_text(
+ avg,
+ top_issues,
+ hint_enable_llm=True,
+ )
return {
"ok": True,
@@ -87,6 +84,30 @@ def generate_audit_executive_summary(
}
+def _deterministic_summary_text(
+ avg: int | None,
+ top_issues: list[dict[str, Any]],
+ *,
+ llm_unavailable: bool = False,
+ hint_enable_llm: bool = False,
+) -> str:
+ """Short narrative for UI; structured score/issues render separately in the app."""
+ if top_issues:
+ msg = "Prioritize fixes below by severity and Search Console traffic impact."
+ elif avg is not None and avg >= 80:
+ msg = "Site health looks strong. Keep monitoring crawl and Search Console trends."
+ elif avg is not None:
+ msg = "Review category scores and address high-priority issues to improve overall health."
+ else:
+ msg = "No major issues detected in this audit run."
+
+ if llm_unavailable:
+ msg = f"{msg} (AI summary unavailable — showing structured overview only.)"
+ elif hint_enable_llm:
+ msg = f"{msg} Enable audit executive summary in AI settings for an AI narrative."
+ return msg
+
+
def _audit_summary_llm_enabled(cfg: dict[str, str]) -> bool:
v = str(cfg.get("llm_enable_audit_summary", "true")).lower()
return v in ("true", "1", "yes")
@@ -104,7 +125,7 @@ def _generate_llm_executive_summary(
categories = report_payload.get("categories") or []
scores = [c.get("score") for c in categories if isinstance(c.get("score"), (int, float))]
- avg = round(sum(scores) / len(scores)) if scores else None
+ avg = round_half_up(sum(scores) / len(scores)) if scores else None
payload = {
"health_score": avg,
"category_scores": [
diff --git a/src/website_profiling/reporting/builder.py b/src/website_profiling/reporting/builder.py
index 24660ee..f2c14ec 100644
--- a/src/website_profiling/reporting/builder.py
+++ b/src/website_profiling/reporting/builder.py
@@ -18,6 +18,7 @@
from ..llm.enrich import cluster_keywords_llm, run_llm_enrichment
from ..llm_config import load_llm_config_from_db, llm_is_enabled
from ..security_scanner import run_security_scan
+from ..scoring import round_half_up
from .categories import build_categories
from .content_analytics import (
_build_content_analytics,
@@ -1079,7 +1080,7 @@ def _bool_col(col):
scores.append(int(float(c.get("score"))))
except (TypeError, ValueError):
continue
- prop_health = round(sum(scores) / len(scores)) if scores else None
+ prop_health = round_half_up(sum(scores) / len(scores)) if scores else None
prop_count = int(portfolio.get("count") or 0)
median = portfolio.get("median_health_score")
bench: dict[str, Any] = {
diff --git a/src/website_profiling/reporting/compare_payload.py b/src/website_profiling/reporting/compare_payload.py
index fc187e6..87112df 100644
--- a/src/website_profiling/reporting/compare_payload.py
+++ b/src/website_profiling/reporting/compare_payload.py
@@ -4,6 +4,8 @@
from typing import Any
from urllib.parse import urlparse
+from ..scoring import round_half_up
+
_PRIORITY_ORDER = {"Critical": 0, "High": 1, "Medium": 2, "Low": 3}
_LH_DELTA_THRESHOLD = 5
_ISSUE_DELTA_CAP = 100
@@ -50,7 +52,7 @@ def _score_from_categories(categories: list[Any]) -> int | None:
for c in categories
if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
]
- return round(sum(scores) / len(scores)) if scores else None
+ return round_half_up(sum(scores) / len(scores)) if scores else None
def _issue_key(url: str, category: str, message: str) -> str:
diff --git a/src/website_profiling/reporting/crawl_segments.py b/src/website_profiling/reporting/crawl_segments.py
index ee0c25d..4dfa960 100644
--- a/src/website_profiling/reporting/crawl_segments.py
+++ b/src/website_profiling/reporting/crawl_segments.py
@@ -4,6 +4,8 @@
from typing import Any
from urllib.parse import urlparse
+from ..scoring import round_half_up
+
def build_crawl_segments(
df,
@@ -18,7 +20,7 @@ def build_crawl_segments(
for c in categories
if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
]
- overall = round(sum(overall_scores) / len(overall_scores)) if overall_scores else None
+ overall = round_half_up(sum(overall_scores) / len(overall_scores)) if overall_scores else None
segments: list[dict[str, Any]] = []
for prefix in path_prefixes:
diff --git a/src/website_profiling/reporting/indexation.py b/src/website_profiling/reporting/indexation.py
index 12f0306..8f8394e 100644
--- a/src/website_profiling/reporting/indexation.py
+++ b/src/website_profiling/reporting/indexation.py
@@ -31,7 +31,8 @@ def _gsc_page_urls(google_data: dict[str, Any] | None) -> list[str]:
if not google_data:
return []
gsc = google_data.get("gsc") if isinstance(google_data.get("gsc"), dict) else {}
- pages = gsc.get("pages") if isinstance(gsc.get("pages"), list) else []
+ raw = gsc.get("top_pages") or gsc.get("pages")
+ pages = raw if isinstance(raw, list) else []
out: list[str] = []
for row in pages:
if isinstance(row, dict):
@@ -45,7 +46,8 @@ def _gsc_by_page(google_data: dict[str, Any] | None) -> dict[str, dict]:
if not google_data:
return {}
gsc = google_data.get("gsc") if isinstance(google_data.get("gsc"), dict) else {}
- pages = gsc.get("pages") if isinstance(gsc.get("pages"), list) else []
+ raw = gsc.get("top_pages") or gsc.get("pages")
+ pages = raw if isinstance(raw, list) else []
out: dict[str, dict] = {}
for row in pages:
if isinstance(row, dict):
diff --git a/src/website_profiling/reporting/optional_audits.py b/src/website_profiling/reporting/optional_audits.py
index 32c79f1..f95c79f 100644
--- a/src/website_profiling/reporting/optional_audits.py
+++ b/src/website_profiling/reporting/optional_audits.py
@@ -4,6 +4,7 @@
import json
import re
import sys
+import threading
from typing import Any, Optional
from urllib.parse import urlparse
@@ -15,6 +16,7 @@
from .categories import _issue, _sort_issues
_WAYBACK_CACHE: dict[str, bool] = {}
+_WAYBACK_LOCK: threading.Lock = threading.Lock()
def _parse_page_analysis(raw: object) -> dict[str, Any]:
@@ -221,8 +223,10 @@ def wayback_issues(df: pd.DataFrame, *, max_lookups: int = 15) -> list[dict]:
if not url:
continue
cache_key = url.rstrip("/")
- if cache_key in _WAYBACK_CACHE:
- if _WAYBACK_CACHE[cache_key]:
+ with _WAYBACK_LOCK:
+ cached = _WAYBACK_CACHE.get(cache_key, None)
+ if cached is not None:
+ if cached:
issues.append(_issue(
"404 URL has Wayback snapshot (Estimated).",
url=url,
@@ -240,7 +244,8 @@ def wayback_issues(df: pd.DataFrame, *, max_lookups: int = 15) -> list[dict]:
data = resp.json()
snap = (data.get("archived_snapshots") or {}).get("closest") or {}
available = bool(snap.get("available"))
- _WAYBACK_CACHE[cache_key] = available
+ with _WAYBACK_LOCK:
+ _WAYBACK_CACHE[cache_key] = available
if available:
ts = snap.get("timestamp") or "unknown"
issues.append(_issue(
@@ -251,7 +256,8 @@ def wayback_issues(df: pd.DataFrame, *, max_lookups: int = 15) -> list[dict]:
))
looked += 1
except Exception:
- _WAYBACK_CACHE[cache_key] = False
+ with _WAYBACK_LOCK:
+ _WAYBACK_CACHE[cache_key] = False
continue
return issues
diff --git a/src/website_profiling/scoring.py b/src/website_profiling/scoring.py
new file mode 100644
index 0000000..dbcb276
--- /dev/null
+++ b/src/website_profiling/scoring.py
@@ -0,0 +1,9 @@
+"""Shared score rounding helpers."""
+from __future__ import annotations
+
+import math
+
+
+def round_half_up(value: float) -> int:
+ """Round to nearest integer, ties toward +infinity (half-up for value >= 0; not banker's rounding)."""
+ return math.floor(value + 0.5)
diff --git a/src/website_profiling/security_scanner.py b/src/website_profiling/security_scanner.py
index f545299..4affb2b 100644
--- a/src/website_profiling/security_scanner.py
+++ b/src/website_profiling/security_scanner.py
@@ -256,7 +256,9 @@ def _passive_html_checks(
evidence=pname,
))
break
- except Exception:
+ except Exception as exc:
+ import sys
+ print(f" security_scanner: skipping {url}: {type(exc).__name__}: {exc}", file=sys.stderr)
continue
return findings
diff --git a/src/website_profiling/tools/audit_tools/geo_list_tools.py b/src/website_profiling/tools/audit_tools/geo_list_tools.py
index 4aaaa24..32d8d7a 100644
--- a/src/website_profiling/tools/audit_tools/geo_list_tools.py
+++ b/src/website_profiling/tools/audit_tools/geo_list_tools.py
@@ -74,7 +74,7 @@ def _aeo_score(rec: dict[str, Any]) -> dict[str, Any]:
def _parse_robots_txt(domain: str) -> str:
if not domain:
return ""
- base = f"https://{domain.lstrip('https://').lstrip('http://').split('/')[0]}"
+ base = f"https://{re.sub(r'^https?://', '', domain).split('/')[0]}"
url = urljoin(base + "/", "robots.txt")
try:
resp = requests.get(url, timeout=8, headers={"User-Agent": "SiteAudit/1.0"})
diff --git a/src/website_profiling/tools/audit_tools/geo_tools.py b/src/website_profiling/tools/audit_tools/geo_tools.py
index 2c4a729..a61a8d7 100644
--- a/src/website_profiling/tools/audit_tools/geo_tools.py
+++ b/src/website_profiling/tools/audit_tools/geo_tools.py
@@ -20,7 +20,7 @@
def _fetch_llms_txt(domain: str) -> dict[str, Any]:
if not domain:
return {"found": False, "error": "domain unknown"}
- base = f"https://{domain.lstrip('https://').lstrip('http://').split('/')[0]}"
+ base = f"https://{re.sub(r'^https?://', '', domain).split('/')[0]}"
paths = ("/llms.txt", "/.well-known/llms.txt")
for path in paths:
url = urljoin(base + "/", path.lstrip("/"))
diff --git a/src/website_profiling/tools/audit_tools/lighthouse.py b/src/website_profiling/tools/audit_tools/lighthouse.py
index 70944b9..0d8a546 100644
--- a/src/website_profiling/tools/audit_tools/lighthouse.py
+++ b/src/website_profiling/tools/audit_tools/lighthouse.py
@@ -234,7 +234,7 @@ def list_lighthouse_poor_best_practices_pages(conn: Connection, ctx: AuditToolCo
def list_lighthouse_cwv_failures(conn: Connection, ctx: AuditToolContext, args: dict[str, Any]) -> dict[str, Any]:
- from ...lighthouse.runner import CLS_GOOD, LCP_GOOD_MS
+ from ...lighthouse.runner import CLS_GOOD, LCP_GOOD_MS, TBT_GOOD_MS
scoped = ctx.with_args(args)
payload = scoped.load_payload(conn)
@@ -263,7 +263,7 @@ def list_lighthouse_cwv_failures(conn: Connection, ctx: AuditToolContext, args:
except (TypeError, ValueError):
pass
try:
- if tbt is not None and float(tbt) > 200:
+ if tbt is not None and float(tbt) > TBT_GOOD_MS:
failed.append("tbt")
except (TypeError, ValueError):
pass
diff --git a/src/website_profiling/tools/audit_tools/report.py b/src/website_profiling/tools/audit_tools/report.py
index d5c25b6..cf38b17 100644
--- a/src/website_profiling/tools/audit_tools/report.py
+++ b/src/website_profiling/tools/audit_tools/report.py
@@ -50,13 +50,16 @@ def _iter_category_issues(payload: dict[str, Any]) -> list[dict[str, Any]]:
return rows
+from ...scoring import round_half_up
+
+
def _health_score(payload: dict[str, Any]) -> int | None:
scores = [
float(c.get("score"))
for c in (payload.get("categories") or [])
if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
]
- return round(sum(scores) / len(scores)) if scores else None
+ return round_half_up(sum(scores) / len(scores)) if scores else None
def _issue_counts(issues: list[dict[str, Any]]) -> dict[str, int]:
diff --git a/src/website_profiling/tools/export_audit_data.py b/src/website_profiling/tools/export_audit_data.py
index 56974a4..e6d28a0 100644
--- a/src/website_profiling/tools/export_audit_data.py
+++ b/src/website_profiling/tools/export_audit_data.py
@@ -6,6 +6,7 @@
from typing import Any, Optional
from ..reporting.terminology import category_display_name
+from ..scoring import round_half_up
_GLOSSARY_ROWS: list[tuple[str, str]] = [
("Crawl", "URLs fetched by the site spider (status codes, titles, inlinks)."),
@@ -191,7 +192,7 @@ def _overall_score(payload: dict[str, Any]) -> Optional[int]:
continue
if not scores:
return None
- return int(round(sum(scores) / len(scores)))
+ return round_half_up(sum(scores) / len(scores))
def _score_band(score: Optional[float]) -> tuple[str, str]:
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index 6227ecf..31d5511 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -3,7 +3,7 @@
import pandas as pd
-from website_profiling.analysis.local import compute_duplicate_groups, simhash_64
+from website_profiling.analysis.local import compute_duplicate_groups, run_local_enrichment, simhash_64
def test_simhash_identical_text_same_hash():
@@ -39,6 +39,40 @@ def test_duplicate_groups_fuzzy_merge():
"analysis_fuzzy_threshold": "90",
"analysis_dup_max_pages": "100",
}
- groups, url_gid = compute_duplicate_groups(df, cfg)
+ groups, url_gid, warnings = compute_duplicate_groups(df, cfg)
assert len(groups) >= 1
assert url_gid.get("https://example.com/a") == url_gid.get("https://example.com/b")
+ assert warnings == []
+
+
+def test_duplicate_groups_emit_warnings_when_url_caps_exceeded(monkeypatch) -> None:
+    """URL caps of 1 on a three-page frame must surface SimHash and fuzzy warnings."""
+    monkeypatch.setattr(
+        "website_profiling.analysis.local._import_rapidfuzz",
+        lambda: type("F", (), {"token_set_ratio": staticmethod(lambda _a, _b: 0)})(),
+    )
+    cfg = {
+        "enable_duplicate_detection": "true",
+        "analysis_simhash_hamming": "3",
+        "analysis_simhash_max_urls": "1",
+        "analysis_fuzzy_max_urls": "1",
+    }
+    pages = [
+        {
+            "url": f"https://example.com/p{i}",
+            "status": "200",
+            "content_type": "text/html",
+            "title": f"Unique page title number {i}",
+            "meta_description": "desc",
+            "h1": "h1",
+            "content_excerpt": " ".join(["content"] * 50),
+        }
+        for i in range(3)
+    ]
+    df = pd.DataFrame(pages)
+    _groups, _url_gid, emitted = compute_duplicate_groups(df, cfg)
+    assert any("SimHash" in msg for msg in emitted)
+    assert any("fuzzy" in msg for msg in emitted)
+
+    enriched = run_local_enrichment(df, cfg)
+    assert any("SimHash" in msg for msg in enriched.get("ml_errors") or [])
diff --git a/tests/test_analysis_crawl_stores_edge_unit.py b/tests/test_analysis_crawl_stores_edge_unit.py
index 3bfc891..584d508 100644
--- a/tests/test_analysis_crawl_stores_edge_unit.py
+++ b/tests/test_analysis_crawl_stores_edge_unit.py
@@ -85,7 +85,7 @@ def test_analysis_local_duplicate_and_language_paths(monkeypatch) -> None:
{"url": "https://a.com/3", "status": "404", "content_type": "text/html", "title": long_text},
]
)
- groups, mapping = local.compute_duplicate_groups(
+ groups, mapping, _warnings = local.compute_duplicate_groups(
df,
{
"enable_duplicate_detection": "true",
@@ -529,7 +529,7 @@ def find_all(self, name=None, **kwargs):
{"url": "https://a.com/3", "status": "200", "content_type": "text/html", "title": "word " * 20},
]
)
- groups, _ = local.compute_duplicate_groups(
+ groups, _mapping, _warnings = local.compute_duplicate_groups(
dup_df, {"enable_duplicate_detection": "true", "analysis_simhash_hamming": "64"}
)
assert groups
@@ -543,7 +543,7 @@ def find_all(self, name=None, **kwargs):
title = f"unique duplicate group title number {i} " * 4
many_rows.append({"url": f"https://a.com/{i}a", "status": "200", "content_type": "text/html", "title": title})
many_rows.append({"url": f"https://a.com/{i}b", "status": "200", "content_type": "text/html", "title": title})
- many_groups, _ = local.compute_duplicate_groups(
+ many_groups, _mapping, _warnings = local.compute_duplicate_groups(
pd.DataFrame(many_rows),
{"enable_duplicate_detection": "true", "analysis_fuzzy_threshold": "90"},
)
diff --git a/tests/test_commands_config_stores_edge_unit.py b/tests/test_commands_config_stores_edge_unit.py
index 0057d6d..ceab5a6 100644
--- a/tests/test_commands_config_stores_edge_unit.py
+++ b/tests/test_commands_config_stores_edge_unit.py
@@ -68,7 +68,7 @@ def test_compute_duplicate_groups_full_paths(monkeypatch) -> None:
},
]
)
- groups, mapping = local.compute_duplicate_groups(
+ groups, mapping, _warnings = local.compute_duplicate_groups(
df, {"enable_duplicate_detection": "true", "analysis_simhash_hamming": "3", "analysis_fuzzy_threshold": "90"}
)
assert len(groups) >= 1
@@ -76,7 +76,7 @@ def test_compute_duplicate_groups_full_paths(monkeypatch) -> None:
# short fingerprint skipped
df_short = pd.DataFrame([{"url": "https://a.com/x", "status": "200", "content_type": "text/html", "title": "hi"}])
- g2, m2 = local.compute_duplicate_groups(df_short, {"enable_duplicate_detection": "true"})
+ g2, m2, _w2 = local.compute_duplicate_groups(df_short, {"enable_duplicate_detection": "true"})
assert g2 == []
diff --git a/tests/test_common_analysis_commands_db_unit.py b/tests/test_common_analysis_commands_db_unit.py
index a61d180..d244466 100644
--- a/tests/test_common_analysis_commands_db_unit.py
+++ b/tests/test_common_analysis_commands_db_unit.py
@@ -361,7 +361,7 @@ def test_compute_duplicate_groups_hamming_and_fuzzy(monkeypatch) -> None:
"analysis_fuzzy_threshold": "90",
"analysis_dup_max_pages": "10",
}
- groups, mapping = local.compute_duplicate_groups(df, cfg)
+ groups, mapping, _warnings = local.compute_duplicate_groups(df, cfg)
assert len(groups) >= 1
assert any(k.startswith("dup_") for k in mapping.values())
@@ -384,7 +384,7 @@ def test_compute_language_signals_enabled(monkeypatch) -> None:
def test_run_local_enrichment_success(monkeypatch) -> None:
from website_profiling.analysis import local
- monkeypatch.setattr(local, "compute_duplicate_groups", lambda *_a, **_k: ([{"id": "dup_0"}], {"https://a.com": "dup_0"}))
+ monkeypatch.setattr(local, "compute_duplicate_groups", lambda *_a, **_k: ([{"id": "dup_0"}], {"https://a.com": "dup_0"}, []))
monkeypatch.setattr(
local,
"compute_language_signals",
@@ -911,7 +911,7 @@ def test_historical_backup_success_and_restore_fallback(monkeypatch, tmp_path) -
from website_profiling.db import historical as h
monkeypatch.setattr(h, "get_data_dir", lambda: str(tmp_path))
- monkeypatch.setattr(h, "get_database_url", lambda: "postgres://u:p@h/db")
+ monkeypatch.setattr(h, "get_database_url", lambda: "postgres://u:p@h:5432/db")
dump_path = tmp_path / "backups" / "out.dump"
diff --git a/tests/test_config_schema_keys.py b/tests/test_config_schema_keys.py
index 9395716..bfcd483 100644
--- a/tests/test_config_schema_keys.py
+++ b/tests/test_config_schema_keys.py
@@ -89,6 +89,8 @@
"enable_language_detection",
"analysis_fuzzy_threshold",
"analysis_simhash_hamming",
+ "analysis_simhash_max_urls",
+ "analysis_fuzzy_max_urls",
"analysis_dup_max_pages",
"run_crawl",
"run_report",
diff --git a/tests/test_crawl_db_writer_imports.py b/tests/test_crawl_db_writer_imports.py
index 17c3459..164bb38 100644
--- a/tests/test_crawl_db_writer_imports.py
+++ b/tests/test_crawl_db_writer_imports.py
@@ -101,6 +101,31 @@ def __exit__(self, _t, _v, _tb):
writer.raise_if_failed()
+def test_crawl_db_writer_enqueue_short_circuits_after_error(monkeypatch: pytest.MonkeyPatch) -> None:
+    """After run() records an error, enqueue/enqueue_html are no-ops (avoids unbounded queue growth)."""
+    from website_profiling.crawl.crawler import _CrawlDbWriter
+
+    class _FailingSession:
+        def __enter__(self):
+            raise RuntimeError("db down")
+
+        def __exit__(self, _exc_type, _exc, _tb):
+            return False
+
+    monkeypatch.setattr("website_profiling.db.db_session", lambda: _FailingSession())
+
+    db_writer = _CrawlDbWriter(crawl_run_id=1, batch_size=50, store_page_html=True)
+    db_writer.enqueue({"url": "https://a.com"})
+    db_writer.finish()
+    db_writer.run()  # sets _error
+
+    # Nothing may remain queued, and later enqueues must not add anything.
+    assert db_writer._queue.qsize() == 0
+    db_writer.enqueue({"url": "https://b.com"})
+    db_writer.enqueue_html({"url": "https://b.com", "html": ""})
+    assert db_writer._queue.qsize() == 0
+
+
def test_crawl_db_writer_run_does_not_import_error() -> None:
"""
Regression: during the db/ split, _CrawlDbWriter.run() imported helpers from
diff --git a/tests/test_crux_fetch.py b/tests/test_crux_fetch.py
new file mode 100644
index 0000000..ae470e7
--- /dev/null
+++ b/tests/test_crux_fetch.py
@@ -0,0 +1,34 @@
+"""Tests for CrUX fetch helpers."""
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock, patch
+
+from website_profiling.integrations.crux.fetch import fetch_crux_origin_metrics
+
+
+def test_fetch_crux_non_numeric_p75_does_not_raise() -> None:
+    """Non-numeric p75 values still give an ok result, with every metric failing."""
+    # One p75 per metric that cannot be read as a number: two strings and a None.
+    unusable_p75 = {
+        "largest_contentful_paint": "n/a",
+        "interaction_to_next_paint": None,
+        "cumulative_layout_shift": "bad",
+    }
+    metrics = {
+        name: {"percentiles": {"p75": p75}, "histogram": []}
+        for name, p75 in unusable_p75.items()
+    }
+    raw = json.dumps({"record": {"metrics": metrics}}).encode("utf-8")
+
+    # The response mock doubles as its own context manager.
+    response = MagicMock()
+    response.read.return_value = raw
+    response.__enter__.return_value = response
+    response.__exit__.return_value = False
+
+    with patch("website_profiling.integrations.crux.fetch.urlopen", return_value=response):
+        result = fetch_crux_origin_metrics("https://example.com", api_key="test-key")
+
+    assert result["ok"] is True
+    assert result["pass"] == {"lcp": False, "inp": False, "cls": False}
diff --git a/tests/test_fetchers_sitemap_config_unit.py b/tests/test_fetchers_sitemap_config_unit.py
index d224b1d..d53511f 100644
--- a/tests/test_fetchers_sitemap_config_unit.py
+++ b/tests/test_fetchers_sitemap_config_unit.py
@@ -39,11 +39,13 @@ def test_fetch_result_as_tuple():
def test_static_fetcher_network_error_returns_empty_result():
+ import requests as req_module
+
class BoomSession:
headers = {}
def get(self, *_a, **_k):
- raise ConnectionError("offline")
+ raise req_module.exceptions.ConnectionError("offline")
def close(self):
pass
diff --git a/tests/test_indexation_coverage.py b/tests/test_indexation_coverage.py
index 1472d6b..a7bc498 100644
--- a/tests/test_indexation_coverage.py
+++ b/tests/test_indexation_coverage.py
@@ -22,22 +22,44 @@ def test_success_urls_filters_non_200() -> None:
assert urls == ["https://example.com/a"]
-def test_gsc_page_urls_extracts_pages() -> None:
- google = {"gsc": {"pages": [{"page": "https://example.com/x"}, {"url": "https://example.com/y"}]}}
+def test_gsc_page_urls_extracts_top_pages() -> None:
+ google = {
+ "gsc": {
+ "top_pages": [
+ {"page": "https://example.com/x"},
+ {"url": "https://example.com/y"},
+ ]
+ }
+ }
assert len(_gsc_page_urls(google)) == 2
+def test_gsc_page_urls_legacy_pages_fallback() -> None:
+    legacy_url = "https://example.com/x"
+    assert _gsc_page_urls({"gsc": {"pages": [{"page": legacy_url}]}}) == [legacy_url]
+
+
@patch("website_profiling.reporting.indexation.discover_sitemap_urls")
def test_build_indexation_coverage_lists(mock_sitemap) -> None:
mock_sitemap.return_value = ["https://example.com/", "https://example.com/sitemap-only"]
df = pd.DataFrame([{"url": "https://example.com/", "status": "200"}])
- google = {"gsc": {"pages": [{"page": "https://example.com/gsc-only"}]}}
+ google = {"gsc": {"top_pages": [{"page": "https://example.com/gsc-only"}]}}
out = build_indexation_coverage(df, "https://example.com/", google)
assert out["counts"]["crawled"] == 1
assert out["counts"]["sitemap_only"] >= 1
assert "sitemap_only" in out["lists"]
+@patch("website_profiling.reporting.indexation.discover_sitemap_urls")
+def test_build_indexation_coverage_gsc_not_crawled(mock_sitemap) -> None:
+    mock_sitemap.return_value = []
+    gsc_only_url = "https://example.com/gsc-only"
+    crawl = pd.DataFrame([{"url": "https://example.com/", "status": "200"}])
+    coverage = build_indexation_coverage(crawl, "https://example.com/", {"gsc": {"top_pages": [{"page": gsc_only_url}]}})
+    assert coverage["counts"]["gsc_pages"] == 1
+    assert gsc_only_url in coverage["lists"]["gsc_not_crawled"]
+
+
def test_success_urls_empty_dataframe() -> None:
assert _success_urls(pd.DataFrame()) == []
diff --git a/tests/test_keyword_enrich.py b/tests/test_keyword_enrich.py
new file mode 100644
index 0000000..f5bd465
--- /dev/null
+++ b/tests/test_keyword_enrich.py
@@ -0,0 +1,42 @@
+"""Tests for keyword enrichment math helpers."""
+from __future__ import annotations
+
+import pytest
+
+from website_profiling.integrations.google.keyword_enrich import (
+    CTR_CURVE,
+    ctr_as_fraction,
+    industry_ctr,
+    opportunity_clicks,
+)
+
+
+def test_ctr_as_fraction_percent_values() -> None:
+    """Percent inputs come back as fractions of one."""
+    # 0.028 and 0.005 have no exact binary form, so compare them with a tolerance.
+    assert ctr_as_fraction(2.8) == pytest.approx(0.028)
+    assert ctr_as_fraction(100) == 1.0
+    assert ctr_as_fraction(0.5) == pytest.approx(0.005)
+
+
+def test_ctr_as_fraction_clamps_above_100_percent() -> None:
+    """Inputs above 100 percent are clamped to exactly 1.0."""
+    assert ctr_as_fraction(150) == 1.0
+
+
+def test_opportunity_clicks_uses_ceil_for_position_slot() -> None:
+    """A fractional position is priced at the next-higher CTR slot."""
+    # Position 4.1 -> slot 5 (conservative), not slot 4 from round().
+    clicks = opportunity_clicks(1000, 4.1, target_pos=3)
+    assert clicks == int(1000 * (CTR_CURVE[3] - CTR_CURVE[5]))
+
+
+def test_opportunity_clicks_boundary_position_three() -> None:
+    """A position equal to the target leaves no click opportunity."""
+    assert opportunity_clicks(1000, 3.0, target_pos=3) == 0
+
+
+def test_industry_ctr_uses_ceil() -> None:
+    """Positions 2.1 and 3.0 both resolve to CTR slot 3."""
+    assert industry_ctr(2.1) == CTR_CURVE[3]
+    assert industry_ctr(3.0) == CTR_CURVE[3]
diff --git a/tests/test_missing_points_batch.py b/tests/test_missing_points_batch.py
index d45ca0b..a223763 100644
--- a/tests/test_missing_points_batch.py
+++ b/tests/test_missing_points_batch.py
@@ -18,7 +18,7 @@ def test_compute_duplicate_groups_disabled_returns_empty() -> None:
from website_profiling.analysis.local import compute_duplicate_groups
df = pd.DataFrame([{"url": "https://a.com", "status": "200", "content_type": "text/html"}])
- groups, mapping = compute_duplicate_groups(df, {"enable_duplicate_detection": "false"})
+ groups, mapping, _warnings = compute_duplicate_groups(df, {"enable_duplicate_detection": "false"})
assert groups == []
assert mapping == {}
@@ -35,7 +35,7 @@ def test_compute_duplicate_groups_basic_cluster(monkeypatch) -> None:
{"url": "https://a.com/2", "status": "200", "content_type": "text/html"},
]
)
- groups, mapping = local.compute_duplicate_groups(df, {"enable_duplicate_detection": "true"})
+ groups, mapping, _warnings = local.compute_duplicate_groups(df, {"enable_duplicate_detection": "true"})
assert len(groups) == 1
assert mapping["https://a.com/1"].startswith("dup_")
diff --git a/tests/test_roadmap_extras.py b/tests/test_roadmap_extras.py
index 3c0ccc8..c2e4889 100644
--- a/tests/test_roadmap_extras.py
+++ b/tests/test_roadmap_extras.py
@@ -34,8 +34,9 @@ def test_executive_summary_deterministic() -> None:
result = generate_audit_executive_summary(payload, {})
assert result["ok"] is True
assert result["source"] == "deterministic"
- assert "80" in result["summary"]
assert len(result["top_issues"]) >= 1
+ assert isinstance(result["summary"], str)
+ assert "Prioritize fixes below" in result["summary"]
def test_executive_summary_empty_payload() -> None:
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
new file mode 100644
index 0000000..ef259da
--- /dev/null
+++ b/tests/test_scoring.py
@@ -0,0 +1,10 @@
+"""Tests for scoring helpers."""
+from __future__ import annotations
+
+from website_profiling.scoring import round_half_up
+
+
+def test_round_half_up_away_from_bankers_rounding() -> None:
+    """Exact halves round upward: 49.5 -> 50 and 50.5 -> 51."""
+    for value, expected in ((49.5, 50), (50.5, 51)):
+        assert round_half_up(value) == expected
diff --git a/tests/test_serp_estimates.py b/tests/test_serp_estimates.py
new file mode 100644
index 0000000..7a08c8f
--- /dev/null
+++ b/tests/test_serp_estimates.py
@@ -0,0 +1,30 @@
+"""Tests for SERP competition estimates."""
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock, patch
+
+from website_profiling.integrations.serp.estimates import fetch_serp_features
+
+
+def test_standard_serp_competition_normalized_to_72() -> None:
+    """Ten organic results plus an answer box give an estimated competition of 72."""
+    serp_json = json.dumps(
+        {
+            "organic_results": [{} for _ in range(10)],
+            "answer_box": {"snippet": "x"},
+        }
+    )
+    # The response mock doubles as its own context manager.
+    response = MagicMock()
+    response.read.return_value = serp_json.encode("utf-8")
+    response.__enter__.return_value = response
+    response.__exit__.return_value = False
+
+    urlopen_target = "website_profiling.integrations.serp.estimates.urllib.request.urlopen"
+    with patch(urlopen_target, return_value=response):
+        features = fetch_serp_features("seo tools", "key")
+
+    assert features["ok"] is True
+    assert features["estimated_competition"] == 72
+    assert features["provenance"] == "Estimated (heuristic-v1)"
diff --git a/web/app/api/ai/fix-suggestion/route.ts b/web/app/api/ai/fix-suggestion/route.ts
index 054df7a..78ed2a8 100644
--- a/web/app/api/ai/fix-suggestion/route.ts
+++ b/web/app/api/ai/fix-suggestion/route.ts
@@ -63,13 +63,22 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise { stdout += c.toString(); });
proc.stdin?.write(JSON.stringify(payload));
proc.stdin?.end();
+ proc.on('error', () => {
+ clearTimeout(timer);
+ resolve(NextResponse.json({ error: 'Fix suggestion failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
+ clearTimeout(timer);
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
resolve(NextResponse.json(parsed));
return;
}
- resolve(NextResponse.json({ error: stdout.trim() || 'Fix suggestion failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Fix suggestion failed' }, { status: 500 }));
});
+ const timer = setTimeout(() => {
+ try { proc.kill(); } catch { /* ignore */ }
+ resolve(NextResponse.json({ error: 'Fix suggestion timed out after 90s' }, { status: 504 }));
+ }, 90_000);
});
};
diff --git a/web/app/api/alerts/check/route.ts b/web/app/api/alerts/check/route.ts
index 0ef27fa..0d3d86e 100644
--- a/web/app/api/alerts/check/route.ts
+++ b/web/app/api/alerts/check/route.ts
@@ -2,7 +2,7 @@ import { NextResponse, type NextRequest } from 'next/server';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { spawn } from 'child_process';
import path from 'path';
-import { resolvePythonExecutable } from '@/server/resolvePython';
+import { resolvePythonExecutable, formatPythonSpawnError } from '@/server/resolvePython';
import { getRepoRoot } from '@/server/pipelineSpawnEnv';
import type { ApiRouteHandler } from '@/types/api';
@@ -50,6 +50,9 @@ print(json.dumps({"alerts": alerts, "webhook_sent": webhook_sent}))
});
let stdout = '';
proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
+ proc.on('error', (err: Error) => {
+ resolve(NextResponse.json({ error: formatPythonSpawnError(err, pythonExe, repoRoot) }, { status: 500 }));
+ });
proc.on('close', (code) => {
try {
const parsed = JSON.parse(stdout.trim() || '{}');
diff --git a/web/app/api/backlinks/competitor-import/route.ts b/web/app/api/backlinks/competitor-import/route.ts
index 404b7dc..151289a 100644
--- a/web/app/api/backlinks/competitor-import/route.ts
+++ b/web/app/api/backlinks/competitor-import/route.ts
@@ -58,13 +58,16 @@ print(json.dumps(build_competitor_domain_gap(our, payload.get("competitor") or "
}),
);
proc.stdin?.end();
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Import failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
resolve(NextResponse.json({ gap: parsed }));
return;
}
- resolve(NextResponse.json({ error: stdout.trim() || 'Import failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Competitor backlink import failed' }, { status: 500 }));
});
});
};
diff --git a/web/app/api/backlinks/third-party-import/route.ts b/web/app/api/backlinks/third-party-import/route.ts
index 363fcd1..fe1ef91 100644
--- a/web/app/api/backlinks/third-party-import/route.ts
+++ b/web/app/api/backlinks/third-party-import/route.ts
@@ -77,18 +77,16 @@ print(json.dumps(result))
}),
);
proc.stdin?.end();
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Import failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
resolve(NextResponse.json(parsed));
return;
}
- resolve(
- NextResponse.json(
- { error: (stderr || stdout).trim() || 'Import failed' },
- { status: 500 },
- ),
- );
+ resolve(NextResponse.json({ error: 'Third-party backlink import failed' }, { status: 500 }));
});
});
};
diff --git a/web/app/api/chat/artifacts/[id]/route.ts b/web/app/api/chat/artifacts/[id]/route.ts
index 942ac29..7b60731 100644
--- a/web/app/api/chat/artifacts/[id]/route.ts
+++ b/web/app/api/chat/artifacts/[id]/route.ts
@@ -3,7 +3,7 @@ import { spawn } from 'child_process';
import path from 'path';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuthForChat } from '@/server/auth';
-import { resolvePythonExecutable } from '@/server/resolvePython';
+import { resolvePythonExecutable, formatPythonSpawnError } from '@/server/resolvePython';
import type { ApiRouteHandlerWithParams } from '@/types/api';
export const runtime = 'nodejs';
@@ -62,6 +62,9 @@ export const GET: ApiRouteHandlerWithParams<{ id: string }> = async (
proc.stderr.on('data', (c) => {
err += c.toString();
});
+ proc.on('error', (spawnErr: Error) => {
+ resolve(NextResponse.json({ error: formatPythonSpawnError(spawnErr, python, REPO_ROOT) }, { status: 500 }));
+ });
proc.on('close', (code) => {
if (code !== 0) {
resolve(NextResponse.json({ error: err.trim() || 'Artifact read failed' }, { status: 500 }));
diff --git a/web/app/api/integrations/bing/sync/route.ts b/web/app/api/integrations/bing/sync/route.ts
index bee7e96..cf04374 100644
--- a/web/app/api/integrations/bing/sync/route.ts
+++ b/web/app/api/integrations/bing/sync/route.ts
@@ -1,7 +1,7 @@
import { NextResponse, type NextRequest } from 'next/server';
import { spawn } from 'child_process';
import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
+import { resolvePythonExecutable, parsePythonJsonStdout, formatPythonSpawnError } from '@/server/resolvePython';
import { loadPipelineConfig } from '@/server/pipelineConfig';
import type { ApiRouteHandler } from '@/types/api';
@@ -46,6 +46,9 @@ print(json.dumps(fetch_bing_backlinks_summary(api_key, site_url)))
});
let stdout = '';
proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
+ proc.on('error', (err: Error) => {
+ resolve(NextResponse.json({ error: formatPythonSpawnError(err, pythonExe, repoRoot) }, { status: 500 }));
+ });
proc.on('close', (code) => {
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
diff --git a/web/app/api/integrations/google/page-compare/route.ts b/web/app/api/integrations/google/page-compare/route.ts
index 9f97134..1efc447 100644
--- a/web/app/api/integrations/google/page-compare/route.ts
+++ b/web/app/api/integrations/google/page-compare/route.ts
@@ -147,6 +147,9 @@ export const GET: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
resolve(
NextResponse.json(
{ ok: false, error: formatPythonSpawnError(err, pythonExe, repoRoot), log },
@@ -73,6 +74,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
try {
const lines = stdout.trim().split('\n').filter(Boolean);
const last = lines[lines.length - 1] || '{}';
@@ -103,7 +105,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ const timer = setTimeout(() => {
try {
proc.kill();
} catch {
diff --git a/web/app/api/integrations/google/test/route.ts b/web/app/api/integrations/google/test/route.ts
index e5bd3a2..f66492e 100644
--- a/web/app/api/integrations/google/test/route.ts
+++ b/web/app/api/integrations/google/test/route.ts
@@ -34,6 +34,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
const message = formatPythonSpawnError(err, pythonExe, repoRoot);
resolve(
NextResponse.json({ ok: false, log, error: message }, { status: 500 }),
@@ -41,11 +42,12 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
resolve(NextResponse.json({ ok: code === 0, log, exitCode: code }));
});
// Safety timeout: 30s
- setTimeout(() => {
+ const timer = setTimeout(() => {
try { proc.kill(); } catch { /* ignore */ }
resolve(
NextResponse.json({ ok: false, log, error: 'Test timed out after 30s' }, { status: 504 }),
diff --git a/web/app/api/issues/fix-suggestion/route.ts b/web/app/api/issues/fix-suggestion/route.ts
index 95a1965..b3b27b6 100644
--- a/web/app/api/issues/fix-suggestion/route.ts
+++ b/web/app/api/issues/fix-suggestion/route.ts
@@ -60,13 +60,22 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise { stdout += c.toString(); });
proc.stdin?.write(JSON.stringify(payload));
proc.stdin?.end();
+ proc.on('error', () => {
+ clearTimeout(timer);
+ resolve(NextResponse.json({ error: 'Fix suggestion failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
+ clearTimeout(timer);
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
resolve(NextResponse.json(parsed));
return;
}
- resolve(NextResponse.json({ error: stdout.trim() || 'Fix suggestion failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Fix suggestion failed' }, { status: 500 }));
});
+ const timer = setTimeout(() => {
+ try { proc.kill(); } catch { /* ignore */ }
+ resolve(NextResponse.json({ error: 'Fix suggestion timed out after 90s' }, { status: 504 }));
+ }, 90_000);
});
};
diff --git a/web/app/api/keywords/competitor-import/route.ts b/web/app/api/keywords/competitor-import/route.ts
index 2936942..b797687 100644
--- a/web/app/api/keywords/competitor-import/route.ts
+++ b/web/app/api/keywords/competitor-import/route.ts
@@ -60,6 +60,9 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise { stderr += c.toString(); });
proc.stdin?.write(JSON.stringify({ propertyId, competitor, csvText }));
proc.stdin?.end();
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Import failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
@@ -73,12 +76,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise { stdout += c.toString(); });
proc.stdin?.write(JSON.stringify({ keyword, rows: body.rows || [], gaps: body.gaps || [] }));
proc.stdin?.end();
+ proc.on('error', () => {
+ clearTimeout(timer);
+ resolve(NextResponse.json({ error: 'Content brief failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
+ clearTimeout(timer);
const parsed = parsePythonJsonStdout(stdout);
if (code === 0 && parsed) {
resolve(NextResponse.json({ brief: parsed }));
return;
}
- resolve(NextResponse.json({ error: stdout.trim() || 'Brief failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Content brief generation failed' }, { status: 500 }));
});
+ const timer = setTimeout(() => {
+ try { proc.kill(); } catch { /* ignore */ }
+ resolve(NextResponse.json({ error: 'Content brief timed out after 90s' }, { status: 504 }));
+ }, 90_000);
});
};
diff --git a/web/app/api/links/page-coach/route.ts b/web/app/api/links/page-coach/route.ts
index 5a1e774..5dc9669 100644
--- a/web/app/api/links/page-coach/route.ts
+++ b/web/app/api/links/page-coach/route.ts
@@ -42,9 +42,6 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise((resolve) => {
let log = '';
@@ -73,6 +70,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
resolve(
NextResponse.json(
{ ok: false, error: formatPythonSpawnError(err, pythonExe, repoRoot), log },
@@ -82,6 +80,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ clearTimeout(timer);
try {
const lines = stdout.trim().split('\n').filter(Boolean);
const last = lines[lines.length - 1] || '{}';
@@ -108,7 +107,7 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
+ const timer = setTimeout(() => {
try {
proc.kill();
} catch {
diff --git a/web/app/api/logs/upload/route.ts b/web/app/api/logs/upload/route.ts
index 14ce9eb..8023632 100644
--- a/web/app/api/logs/upload/route.ts
+++ b/web/app/api/logs/upload/route.ts
@@ -49,13 +49,21 @@ print(json.dumps(analysis))
const meta = JSON.stringify({ start_url: startUrl, crawl_urls: crawlUrls });
const proc = spawn('python3', ['-c', script, meta], { cwd: repoRoot, shell: false });
let out = '';
+ let errOut = '';
proc.stdout?.on('data', (c: Buffer) => { out += c.toString(); });
- proc.stderr?.on('data', (c: Buffer) => { out += c.toString(); });
+ proc.stderr?.on('data', (c: Buffer) => { errOut += c.toString(); });
proc.stdin?.write(text);
proc.stdin?.end();
+ proc.on('error', (e) => reject(e));
proc.on('close', (code) => {
- if (code !== 0) reject(new Error(out || 'parse failed'));
- else resolve(JSON.parse(out.trim() || '{}') as Record);
+ if (code !== 0) reject(new Error(errOut || out || 'parse failed'));
+ else {
+ try {
+ resolve(JSON.parse(out.trim() || '{}') as Record);
+ } catch {
+ reject(new Error('Invalid JSON response from log parser'));
+ }
+ }
});
});
await withDb(async (client) => {
diff --git a/web/app/api/report/export-sitemap/route.ts b/web/app/api/report/export-sitemap/route.ts
index 2b0b130..38a7c51 100644
--- a/web/app/api/report/export-sitemap/route.ts
+++ b/web/app/api/report/export-sitemap/route.ts
@@ -41,9 +41,12 @@ export const GET: ApiRouteHandler = async (request: NextRequest): Promise { out += c.toString(); });
proc.stderr.on('data', (c) => { err += c.toString(); });
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Sitemap export failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
if (code !== 0) {
- resolve(NextResponse.json({ error: err.trim() || 'Sitemap export failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Sitemap export failed' }, { status: 500 }));
return;
}
resolve(
diff --git a/web/app/api/report/export-workbook/route.ts b/web/app/api/report/export-workbook/route.ts
index fc74070..8c99ada 100644
--- a/web/app/api/report/export-workbook/route.ts
+++ b/web/app/api/report/export-workbook/route.ts
@@ -49,9 +49,12 @@ export const GET: ApiRouteHandler = async (request: NextRequest): Promise {
err += c.toString();
});
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Workbook export failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
if (code !== 0) {
- resolve(NextResponse.json({ error: err.trim() || 'Workbook export failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Workbook export failed' }, { status: 500 }));
return;
}
const body = Buffer.concat(chunks);
diff --git a/web/app/api/report/export/route.ts b/web/app/api/report/export/route.ts
index 321f6d8..f8155ab 100644
--- a/web/app/api/report/export/route.ts
+++ b/web/app/api/report/export/route.ts
@@ -75,9 +75,12 @@ export const GET: ApiRouteHandler = async (request: NextRequest): Promise {
err += c.toString();
});
+ proc.on('error', () => {
+ resolve(NextResponse.json({ error: 'Export failed: could not start Python process' }, { status: 500 }));
+ });
proc.on('close', (code) => {
if (code !== 0) {
- resolve(NextResponse.json({ error: err.trim() || 'Export failed' }, { status: 500 }));
+ resolve(NextResponse.json({ error: 'Export failed' }, { status: 500 }));
return;
}
const body = Buffer.concat(chunks);
diff --git a/web/app/api/schedule/check/route.ts b/web/app/api/schedule/check/route.ts
index a83ae89..58be5f9 100644
--- a/web/app/api/schedule/check/route.ts
+++ b/web/app/api/schedule/check/route.ts
@@ -2,6 +2,7 @@ import { NextResponse, type NextRequest } from 'next/server';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { spawn } from 'child_process';
import path from 'path';
+import { resolvePythonExecutable, formatPythonSpawnError } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
@@ -14,17 +15,21 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise {
- const proc = spawn('python3', ['-m', 'src.website_profiling.tools.schedule_runner'], {
+ const proc = spawn(pythonExe, ['-m', 'src.website_profiling.tools.schedule_runner'], {
cwd: repoRoot,
shell: false,
});
let out = '';
proc.stdout?.on('data', (c) => { out += c.toString(); });
proc.stderr?.on('data', (c) => { out += c.toString(); });
+ proc.on('error', (err: Error) => {
+ resolve(NextResponse.json({ error: formatPythonSpawnError(err, pythonExe, repoRoot) }, { status: 500 }));
+ });
proc.on('close', (code) => {
const staleProc = spawn(
- 'python3',
+ pythonExe,
[
'-c',
'from website_profiling.tools.schedule_runner import run_gsc_links_staleness_alerts; import json; print(json.dumps(run_gsc_links_staleness_alerts()))',
@@ -33,6 +38,16 @@ export const POST: ApiRouteHandler = async (request: NextRequest): Promise { staleOut += c.toString(); });
+ staleProc.on('error', () => {
+ // Secondary staleness enrichment failed to spawn — degrade gracefully
+ // rather than hang, returning the primary result with an empty list.
+ resolve(
+ NextResponse.json(
+ { ok: code === 0, output: out.trim(), gscLinksStale: [] },
+ { status: code === 0 ? 200 : 500 },
+ ),
+ );
+ });
staleProc.on('close', () => {
let stale: unknown[] = [];
try {
diff --git a/web/app/globals.css b/web/app/globals.css
index 7cebdcd..fa11f18 100644
--- a/web/app/globals.css
+++ b/web/app/globals.css
@@ -49,6 +49,24 @@
--chat-surface-hover: var(--app-bg-muted);
--chat-glow: rgba(66, 97, 255, 0.08);
--chat-glow-secondary: rgba(138, 79, 255, 0.04);
+
+ /* Warm secondary accent (welcome moments / highlights) — complements the cool blue */
+ --accent-warm: #f97316;
+ --accent-warm-soft: #fb923c;
+ --accent-2: #8b5cf6;
+ --surface-warm: rgba(249, 115, 22, 0.06);
+
+ /* Elevation scale (depth) */
+ --elevation-1: 0 1px 2px rgba(15, 23, 42, 0.06);
+ --elevation-2: 0 6px 16px -4px rgba(15, 23, 42, 0.1);
+ --elevation-3: 0 16px 36px -8px rgba(15, 23, 42, 0.18);
+
+ /* Motion tokens (theme-independent) */
+ --ease-out: cubic-bezier(0.16, 1, 0.3, 1);
+ --ease-spring: cubic-bezier(0.34, 1.56, 0.64, 1);
+ --dur-fast: 120ms;
+ --dur-base: 220ms;
+ --dur-slow: 420ms;
}
html.dark {
@@ -91,6 +109,17 @@ html.dark {
--chat-surface-hover: #28292a;
--chat-glow: rgba(66, 97, 255, 0.12);
--chat-glow-secondary: rgba(138, 79, 255, 0.06);
+
+ /* Warm secondary accent — brighter on dark surfaces */
+ --accent-warm: #fb923c;
+ --accent-warm-soft: #fdba74;
+ --accent-2: #a78bfa;
+ --surface-warm: rgba(251, 146, 60, 0.08);
+
+ /* Elevation scale — deeper shadows on dark */
+ --elevation-1: 0 1px 2px rgba(0, 0, 0, 0.4);
+ --elevation-2: 0 6px 16px -4px rgba(0, 0, 0, 0.45);
+ --elevation-3: 0 16px 36px -8px rgba(0, 0, 0, 0.55);
}
@theme {
@@ -113,7 +142,14 @@ html.dark {
--color-link: var(--app-link);
--color-link-soft: var(--app-link-soft);
- --radius-card: 0.75rem;
+ --color-accent-warm: var(--accent-warm);
+ --color-accent-warm-soft: var(--accent-warm-soft);
+ --color-accent-2: var(--accent-2);
+
+ --radius-sm: 0.5rem;
+ --radius-card: 1rem;
+ --radius-lg: 1.25rem;
+ --radius-xl: 1.5rem;
--spacing-page-x: 1.5rem;
--spacing-page-y: 1.5rem;
@@ -177,7 +213,7 @@ body {
}
.fade-in {
- animation: fadeIn 0.3s ease-in-out;
+ animation: fadeIn var(--dur-base, 0.3s) var(--ease-out, ease-in-out);
}
@keyframes fadeIn {
from {
@@ -190,6 +226,12 @@ body {
}
}
+@media (prefers-reduced-motion: reduce) {
+ .fade-in {
+ animation: none;
+ }
+}
+
.tab-active {
background-color: rgba(37, 99, 235, 0.12);
border-color: rgba(37, 99, 235, 0.35);
@@ -527,7 +569,7 @@ select:focus-visible {
}
.landing-gradient-text {
- background: linear-gradient(135deg, var(--app-link) 0%, #8b5cf6 55%, var(--app-link-soft) 100%);
+ background: linear-gradient(135deg, var(--app-link) 0%, var(--accent-2) 52%, var(--accent-warm-soft) 100%);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
@@ -575,3 +617,131 @@ select:focus-visible {
.landing-section-alt {
background: color-mix(in srgb, var(--app-bg-muted) 35%, transparent);
}
+
+/* ─────────────────────────────────────────────────────────────
+ Reusable redesign utilities — depth, motion, warmth
+ ───────────────────────────────────────────────────────────── */
+
+/* Ambient "aurora" backdrop — generalizes the hand-rolled blob layers
+   previously inlined on Landing/Home. Drop an empty `.aurora-bg` element
+   into any `relative overflow-hidden` container. */
+.aurora-bg {
+ position: absolute;
+ inset: 0;
+ z-index: -10;
+ overflow: hidden;
+ pointer-events: none;
+}
+.aurora-bg::before {
+ content: "";
+ position: absolute;
+ inset: -12%;
+ background:
+ radial-gradient(38% 38% at 18% 20%, color-mix(in srgb, var(--app-link) 22%, transparent), transparent 70%),
+ radial-gradient(42% 42% at 84% 10%, color-mix(in srgb, var(--accent-2) 20%, transparent), transparent 70%),
+ radial-gradient(40% 40% at 62% 96%, color-mix(in srgb, var(--accent-warm) 16%, transparent), transparent 72%);
+ filter: blur(64px);
+ opacity: 0.85;
+}
+
+/* Reusable gradient hairline border (generalizes .landing-mock-glow).
+ Apply to an element with a border-radius; ::before paints a 1px ring. */
+.gradient-border {
+ position: relative;
+}
+.gradient-border::before {
+ content: "";
+ position: absolute;
+ inset: 0;
+ border-radius: inherit;
+ padding: 1px;
+ background: linear-gradient(
+ 135deg,
+ color-mix(in srgb, var(--app-link) 45%, transparent),
+ color-mix(in srgb, var(--accent-2) 35%, transparent),
+ color-mix(in srgb, var(--accent-warm) 22%, transparent)
+ );
+ -webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+ mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
+ -webkit-mask-composite: xor;
+ mask-composite: exclude;
+ pointer-events: none;
+}
+
+@media (prefers-reduced-motion: no-preference) {
+ /* Enter animation — single element */
+ .animate-in {
+ animation: fadeRise var(--dur-slow) var(--ease-out) both;
+ }
+
+  /* Scroll-reveal (driven by the `data-reveal` attribute + useInView). Guarded by the
+     reduced-motion query, so motion-averse users always see content regardless of scroll state. */
+ [data-reveal='hidden'] {
+ opacity: 0;
+ transform: translateY(12px);
+ }
+ [data-reveal='shown'] {
+ animation: fadeRise var(--dur-slow) var(--ease-out) both;
+ }
+
+ /* Staggered children — set style={{ '--i': index }} on each child */
+ .stagger > * {
+ animation: fadeRise var(--dur-slow) var(--ease-out) both;
+ animation-delay: calc(var(--i, 0) * 60ms);
+ }
+
+ @keyframes fadeRise {
+ from {
+ opacity: 0;
+ transform: translateY(10px);
+ }
+ to {
+ opacity: 1;
+ transform: none;
+ }
+ }
+
+ /* Hover elevation */
+ .hover-lift {
+ transition:
+ transform var(--dur-base) var(--ease-out),
+ box-shadow var(--dur-base) var(--ease-out),
+ border-color var(--dur-base) var(--ease-out);
+ }
+ .hover-lift:hover {
+ transform: translateY(-3px);
+ box-shadow: var(--elevation-3);
+ }
+
+ /* Tactile press feedback */
+ .press {
+ transition: transform var(--dur-fast) var(--ease-out);
+ }
+ .press:active {
+ transform: scale(0.97);
+ }
+
+ /* Skeleton shimmer sweep (replaces animate-pulse) */
+ .shimmer {
+ position: relative;
+ overflow: hidden;
+ }
+ .shimmer::after {
+ content: "";
+ position: absolute;
+ inset: 0;
+ transform: translateX(-100%);
+ background: linear-gradient(
+ 90deg,
+ transparent,
+ color-mix(in srgb, var(--app-text-subtle) 16%, transparent),
+ transparent
+ );
+ animation: shimmerSweep 1.6s ease-in-out infinite;
+ }
+ @keyframes shimmerSweep {
+ 100% {
+ transform: translateX(100%);
+ }
+ }
+}
diff --git a/web/src/components/AppShell.tsx b/web/src/components/AppShell.tsx
index deede26..bbf8a6a 100644
--- a/web/src/components/AppShell.tsx
+++ b/web/src/components/AppShell.tsx
@@ -4,6 +4,8 @@ import { useEffect, useState, type ReactNode } from 'react';
import Link from 'next/link';
import { usePathname, useRouter, useSearchParams } from 'next/navigation';
import {
+ ChevronLeft,
+ ChevronRight,
ExternalLink,
Menu,
Search,
@@ -41,6 +43,20 @@ interface ReportCategoryWithIssues {
issues?: unknown[];
}
+const SIDEBAR_COLLAPSED_KEY = 'app-sidebar-collapsed';
+
+function navItemBadgeCount(
+  itemId: NavItemId,
+  issueCount: number,
+  securityCount: number,
+  jsErrorPageCount: number,
+): number {
+  // Only three nav ids have a dedicated counter; every other id yields 0.
+  return itemId === 'issues' ? issueCount
+    : itemId === 'security' ? securityCount
+    : itemId === 'javascript-errors' ? jsErrorPageCount : 0;
+}
+
export interface AppShellProps {
children: ReactNode;
showSidebar?: boolean;
@@ -62,6 +78,7 @@ export default function AppShell({
const pathname = usePathname();
const searchParams = useSearchParams();
const [sidebarOpen, setSidebarOpen] = useState(false);
+ const [sidebarCollapsed, setSidebarCollapsed] = useState(false);
const [integrationsOpen, setIntegrationsOpen] = useState(false);
const [integrationsToast, setIntegrationsToast] = useState(null);
const { data, startUrlByRunId } = useReport();
@@ -70,6 +87,28 @@ export default function AppShell({
const trailing = searchParams.toString() ? `?${searchParams.toString()}` : '';
const closeSidebar = () => setSidebarOpen(false);
+  const toggleSidebarCollapsed = () => {
+    // Persist outside the state updater: React requires updater functions to be
+    // pure and invokes them twice in Strict Mode, so side effects belong here.
+    const next = !sidebarCollapsed;
+    setSidebarCollapsed(next);
+    try {
+      localStorage.setItem(SIDEBAR_COLLAPSED_KEY, next ? '1' : '0');
+    } catch {
+      /* ignore storage errors (e.g. storage disabled or quota exceeded) */
+    }
+  };
+
+ useEffect(() => {
+ try {
+ if (localStorage.getItem(SIDEBAR_COLLAPSED_KEY) === '1') {
+ setSidebarCollapsed(true);
+ }
+ } catch {
+ /* ignore storage errors */
+ }
+ }, []);
+
useEffect(() => {
const intParam = searchParams.get('integrations');
const authParam = searchParams.get('auth');
@@ -144,14 +183,25 @@ export default function AppShell({
{showSidebar ? (