Skip to content
Merged

UI #30

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions input.txt.example
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ enable_duplicate_detection = true
enable_language_detection = true
analysis_fuzzy_threshold = 92
analysis_simhash_hamming = 0
analysis_simhash_max_urls = 800
analysis_fuzzy_max_urls = 600
analysis_dup_max_pages = 2000

# --- Audit steps ---
Expand Down
2 changes: 2 additions & 0 deletions pipeline-config.example.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ enable_duplicate_detection = true
enable_language_detection = true
analysis_fuzzy_threshold = 92
analysis_simhash_hamming = 0
analysis_simhash_max_urls = 800
analysis_fuzzy_max_urls = 600
analysis_dup_max_pages = 2000

# --- Pipeline ---
Expand Down
27 changes: 21 additions & 6 deletions src/website_profiling/analysis/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,11 @@ def _import_langdetect():
def compute_duplicate_groups(
df: pd.DataFrame,
cfg: dict[str, str] | None,
) -> tuple[list[dict[str, Any]], dict[str, str]]:
) -> tuple[list[dict[str, Any]], dict[str, str], list[str]]:
if df.empty or not _cfg_bool(cfg, "enable_duplicate_detection", False):
return [], {}
return [], {}, []

warnings: list[str] = []

success = df[df["status"].astype(str).str.match(r"2\d{2}", na=False)] if "status" in df.columns else df
if "content_type" in success.columns:
Expand All @@ -126,6 +128,8 @@ def compute_duplicate_groups(
fuzz = _import_rapidfuzz()
fuzzy_threshold = _cfg_int(cfg, "analysis_fuzzy_threshold", 92) or 92
hamming_max = _cfg_int(cfg, "analysis_simhash_hamming", 0) or 0
simhash_max_urls = _cfg_int(cfg, "analysis_simhash_max_urls", 800) or 800
fuzzy_max_urls = _cfg_int(cfg, "analysis_fuzzy_max_urls", 600) or 600

parent: dict[str, str] = {}

Expand All @@ -150,20 +154,30 @@ def union(a: str, b: str) -> None:
for m in members[1:]:
union(base, m)

if hamming_max > 0 and len(urls) <= 800:
if hamming_max > 0 and len(urls) <= simhash_max_urls:
sh_list = [(u, url_to_sh[u]) for u in urls]
for i, (u1, h1) in enumerate(sh_list):
for u2, h2 in sh_list[i + 1 :]:
if _hamming(h1, h2) <= hamming_max:
union(u1, u2)
elif hamming_max > 0 and len(urls) > simhash_max_urls:
warnings.append(
f"Duplicate detection: SimHash similarity skipped for {len(urls)} URLs "
f"(cap {simhash_max_urls}); results may be incomplete."
)

if len(urls) <= 600:
if len(urls) <= fuzzy_max_urls:
for i, u1 in enumerate(urls):
fp1 = url_to_fp.get(u1, "")
for u2 in urls[i + 1 :]:
fp2 = url_to_fp.get(u2, "")
if fp1 and fp2 and fuzz.token_set_ratio(fp1, fp2) >= fuzzy_threshold:
union(u1, u2)
elif len(urls) > fuzzy_max_urls:
warnings.append(
f"Duplicate detection: fuzzy title matching skipped for {len(urls)} URLs "
f"(cap {fuzzy_max_urls}); results may be incomplete."
)

clusters: dict[str, list[str]] = defaultdict(list)
for u in urls:
Expand Down Expand Up @@ -196,7 +210,7 @@ def union(a: str, b: str) -> None:
if gid >= max_groups:
break

return groups_out[:max_groups], url_to_gid
return groups_out[:max_groups], url_to_gid, warnings


def compute_language_signals(df: pd.DataFrame, cfg: dict[str, str] | None) -> tuple[dict[str, str], dict[str, Any]]:
Expand Down Expand Up @@ -243,9 +257,10 @@ def run_local_enrichment(df: pd.DataFrame, cfg: dict[str, str] | None) -> dict[s
return bundle

try:
dups, url_gid = compute_duplicate_groups(df, cfg)
dups, url_gid, dup_warnings = compute_duplicate_groups(df, cfg)
bundle["content_duplicates"] = dups
bundle["url_duplicate_group_id"] = url_gid
bundle["ml_errors"].extend(dup_warnings)
except ImportError as e:
bundle["ml_errors"].append(str(e))

Expand Down
7 changes: 5 additions & 2 deletions src/website_profiling/crawl/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import json
import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from typing import Optional

import pandas as pd
Expand Down Expand Up @@ -449,6 +449,10 @@ def crawl(
continue
futures.append(ex.submit(self.worker, url))

if futures and self.queue.empty():
# Block until at least one future completes instead of busy-polling.
wait(futures, return_when=FIRST_COMPLETED)

remaining = []
for f in futures:
if f.done():
Expand All @@ -471,7 +475,6 @@ def crawl(
else:
remaining.append(f)
futures = remaining
time.sleep(0.01)

if self.queue.empty() and not futures:
break
Expand Down
4 changes: 3 additions & 1 deletion src/website_profiling/crawl/db_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ def __init__(self, crawl_run_id: int, batch_size: int = 500, *, store_page_html:
self._error: Optional[BaseException] = None

def enqueue(self, record: dict) -> None:
if self._error is not None:
return
self._queue.put(("crawl", record))

def enqueue_html(self, record: dict) -> None:
if not self.store_page_html:
if not self.store_page_html or self._error is not None:
return
self._queue.put(("html", record))

Expand Down
33 changes: 17 additions & 16 deletions src/website_profiling/crawl/fetchers/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,25 +263,26 @@ async def worker() -> None:

workers = [asyncio.create_task(worker()) for _ in range(self.js_concurrency)]
self._ready.set()
await asyncio.gather(*workers)

for page in pages:
try:
await asyncio.gather(*workers)
finally:
for page in pages:
try:
await page.close()
except Exception:
pass
try:
await page.close()
await context.close()
except Exception:
pass
try:
await browser.close()
except Exception:
pass
try:
await playwright.stop()
except Exception:
pass
try:
await context.close()
except Exception:
pass
try:
await browser.close()
except Exception:
pass
try:
await playwright.stop()
except Exception:
pass

def _diagnostics_enabled(self) -> bool:
return self.capture_console or self.capture_failed_requests
Expand Down
2 changes: 1 addition & 1 deletion src/website_profiling/crawl/fetchers/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def fetch(self, url: str) -> FetchResult:
redirect_chain_length=redirect_chain_length,
fetch_method="static",
)
except Exception:
except requests.RequestException:
return FetchResult(
status=None,
content_type=None,
Expand Down
42 changes: 30 additions & 12 deletions src/website_profiling/db/historical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
import json
import os
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Optional

import pandas as pd
from psycopg import Connection
from psycopg.sql import SQL, Identifier
from urllib.parse import urlparse

from ..console_io import console_print
from ._common import (
_executemany,
_json_val,
Expand All @@ -33,17 +36,26 @@ def backup_db_if_exists(skip_in_ci: bool = True) -> Optional[str]:
suffix = time.strftime("%Y%m%d-%H%M%S")
out_path = backup_dir / f"website_profiling-{suffix}.dump"
try:
parsed = urlparse(get_database_url())
pg_env = {**os.environ}
if parsed.hostname:
pg_env["PGHOST"] = parsed.hostname
if parsed.port:
pg_env["PGPORT"] = str(parsed.port)
if parsed.username:
pg_env["PGUSER"] = parsed.username
if parsed.password:
pg_env["PGPASSWORD"] = parsed.password
dbname = (parsed.path or "").lstrip("/")
cmd = ["pg_dump", "-Fc", "-f", str(out_path)]
if dbname:
cmd.append(dbname)
subprocess.run(
[
"pg_dump",
"-Fc",
"-f",
str(out_path),
get_database_url(),
],
cmd,
check=True,
capture_output=True,
timeout=300,
env=pg_env,
)
return str(out_path)
except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
Expand Down Expand Up @@ -72,12 +84,18 @@ def read_historical_data() -> dict[str, list]:
for table in tables:
try:
with conn.cursor() as cur:
cur.execute(f"SELECT * FROM {table}")
cur.execute(SQL("SELECT * FROM {}").format(Identifier(table)))
result[table] = [dict(row) for row in cur.fetchall()]
except Exception:
pass
except Exception:
pass
except Exception as e:
console_print(
f" Warning: could not read historical table '{table}': {e}",
file=sys.stderr,
)
except Exception as e:
console_print(
f" Warning: could not read historical data (a DB backup is still taken before any overwrite): {e}",
file=sys.stderr,
)
return result


Expand Down
3 changes: 2 additions & 1 deletion src/website_profiling/db/report_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from psycopg import Connection

from ..scoring import round_half_up
from ._common import _json_val, _now_iso, _parse_row_json, _row_field
from .crawl_store import get_crawl_run_info

Expand Down Expand Up @@ -51,7 +52,7 @@ def _write_audit_health_snapshot(
for c in categories
if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
]
health_score = round(sum(scores) / len(scores)) if scores else None
health_score = round_half_up(sum(scores) / len(scores)) if scores else None
category_scores: dict[str, float] = {}
issue_counts = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0}
for cat in categories:
Expand Down
15 changes: 12 additions & 3 deletions src/website_profiling/integrations/crux/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,18 @@ def fetch_crux_origin_metrics(origin_or_url: str, api_key: str | None = None) ->
lcp = parsed["metrics"].get("largest_contentful_paint", {}).get("p75")
inp = parsed["metrics"].get("interaction_to_next_paint", {}).get("p75")
cls = parsed["metrics"].get("cumulative_layout_shift", {}).get("p75")

def _pass_threshold(value: Any, limit: float) -> bool:
if value is None:
return False
try:
return float(value) <= limit
except (TypeError, ValueError):
return False

parsed["pass"] = {
"lcp": lcp is not None and float(lcp) <= 2500,
"inp": inp is not None and float(inp) <= 200,
"cls": cls is not None and float(cls) <= 0.1,
"lcp": _pass_threshold(lcp, 2500),
"inp": _pass_threshold(inp, 200),
"cls": _pass_threshold(cls, 0.1),
}
return parsed
16 changes: 12 additions & 4 deletions src/website_profiling/integrations/google/keyword_enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from __future__ import annotations

import json
import math
import re
from datetime import datetime, timezone
from typing import Any
Expand All @@ -34,14 +35,19 @@


def ctr_as_fraction(ctr: Any) -> float:
"""GSC rows use CTR percent (2.8); normalize to fraction for comparisons."""
"""GSC rows use CTR as percent (e.g. 2.8 for 2.8%); normalize to fraction.

Invariant: ingest always stores percent — see gsc._to_query_record / _to_page_record (* 100).
"""
if ctr is None:
return 0.0
try:
v = float(ctr)
except (TypeError, ValueError):
return 0.0
return v / 100.0 if v > 1 else v
if v > 100:
return 1.0
return v / 100.0

QUESTION_STARTS = re.compile(
r"^(how|what|why|when|where|who|can|does|is|are|should|will|do)\s", re.I
Expand Down Expand Up @@ -143,13 +149,15 @@ def estimate_difficulty(kw: str, gsc_row: dict | None, branded: bool = False) ->
# ── CTR curve ─────────────────────────────────────────────────────────────────

def opportunity_clicks(impressions: int, current_pos: float, target_pos: int = 3) -> int:
cur_ctr = CTR_CURVE.get(round(current_pos), CTR_CURVE_DEFAULT)
pos_slot = max(1, math.ceil(current_pos)) if current_pos > 0 else 1
cur_ctr = CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)
tgt_ctr = CTR_CURVE.get(target_pos, CTR_CURVE.get(3, 0.103))
return max(0, int((impressions or 0) * (tgt_ctr - cur_ctr)))


def industry_ctr(pos: float) -> float:
return CTR_CURVE.get(round(pos), CTR_CURVE_DEFAULT)
pos_slot = max(1, math.ceil(pos)) if pos > 0 else 1
return CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)


# ── Cannibalisation ───────────────────────────────────────────────────────────
Expand Down
9 changes: 7 additions & 2 deletions src/website_profiling/integrations/serp/estimates.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import urllib.request
from typing import Any

_MAX_ORGANIC = 10
_MAX_FEATURES = 4
_RAW_MAX = _MAX_ORGANIC * 8 + _MAX_FEATURES * 12 # 128


def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
"""Fetch SERP metadata from SerpAPI (Estimated competition proxy)."""
Expand Down Expand Up @@ -38,13 +42,14 @@ def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
if data.get("top_stories"):
features.append("top_stories")

competition = min(100, len(organic) * 8 + len(features) * 12)
raw_score = len(organic) * 8 + len(features) * 12
competition = min(100, round(raw_score / _RAW_MAX * 100))
return {
"ok": True,
"organic_count": len(organic),
"serp_features": features,
"estimated_competition": competition,
"provenance": "Estimated",
"provenance": "Estimated (heuristic-v1)",
}


Expand Down
Loading
Loading