codefrydev · PrashantUnity · Jun 15, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/input.txt.example b/input.txt.example
@@ -96,6 +96,8 @@ enable_duplicate_detection = true
 enable_language_detection = true
 analysis_fuzzy_threshold = 92
 analysis_simhash_hamming = 0
+analysis_simhash_max_urls = 800
+analysis_fuzzy_max_urls = 600
 analysis_dup_max_pages = 2000
 
 # --- Audit steps ---

diff --git a/pipeline-config.example.txt b/pipeline-config.example.txt
@@ -97,6 +97,8 @@ enable_duplicate_detection = true
 enable_language_detection = true
 analysis_fuzzy_threshold = 92
 analysis_simhash_hamming = 0
+analysis_simhash_max_urls = 800
+analysis_fuzzy_max_urls = 600
 analysis_dup_max_pages = 2000
 
 # --- Pipeline ---

diff --git a/src/website_profiling/analysis/local.py b/src/website_profiling/analysis/local.py
@@ -97,9 +97,11 @@ def _import_langdetect():
 def compute_duplicate_groups(
     df: pd.DataFrame,
     cfg: dict[str, str] | None,
-) -> tuple[list[dict[str, Any]], dict[str, str]]:
+) -> tuple[list[dict[str, Any]], dict[str, str], list[str]]:
     if df.empty or not _cfg_bool(cfg, "enable_duplicate_detection", False):
-        return [], {}
+        return [], {}, []
+
+    warnings: list[str] = []
 
     success = df[df["status"].astype(str).str.match(r"2\d{2}", na=False)] if "status" in df.columns else df
     if "content_type" in success.columns:
@@ -126,6 +128,8 @@ def compute_duplicate_groups(
     fuzz = _import_rapidfuzz()
     fuzzy_threshold = _cfg_int(cfg, "analysis_fuzzy_threshold", 92) or 92
     hamming_max = _cfg_int(cfg, "analysis_simhash_hamming", 0) or 0
+    simhash_max_urls = _cfg_int(cfg, "analysis_simhash_max_urls", 800) or 800
+    fuzzy_max_urls = _cfg_int(cfg, "analysis_fuzzy_max_urls", 600) or 600
 
     parent: dict[str, str] = {}
 
@@ -150,20 +154,30 @@ def union(a: str, b: str) -> None:
         for m in members[1:]:
             union(base, m)
 
-    if hamming_max > 0 and len(urls) <= 800:
+    if hamming_max > 0 and len(urls) <= simhash_max_urls:
         sh_list = [(u, url_to_sh[u]) for u in urls]
         for i, (u1, h1) in enumerate(sh_list):
             for u2, h2 in sh_list[i + 1 :]:
                 if _hamming(h1, h2) <= hamming_max:
                     union(u1, u2)
+    elif hamming_max > 0 and len(urls) > simhash_max_urls:
+        warnings.append(
+            f"Duplicate detection: SimHash similarity skipped for {len(urls)} URLs "
+            f"(cap {simhash_max_urls}); results may be incomplete."
+        )
 
-    if len(urls) <= 600:
+    if len(urls) <= fuzzy_max_urls:
         for i, u1 in enumerate(urls):
             fp1 = url_to_fp.get(u1, "")
             for u2 in urls[i + 1 :]:
                 fp2 = url_to_fp.get(u2, "")
                 if fp1 and fp2 and fuzz.token_set_ratio(fp1, fp2) >= fuzzy_threshold:
                     union(u1, u2)
+    elif len(urls) > fuzzy_max_urls:
+        warnings.append(
+            f"Duplicate detection: fuzzy title matching skipped for {len(urls)} URLs "
+            f"(cap {fuzzy_max_urls}); results may be incomplete."
+        )
 
     clusters: dict[str, list[str]] = defaultdict(list)
     for u in urls:
@@ -196,7 +210,7 @@ def union(a: str, b: str) -> None:
         if gid >= max_groups:
             break
 
-    return groups_out[:max_groups], url_to_gid
+    return groups_out[:max_groups], url_to_gid, warnings
 
 
 def compute_language_signals(df: pd.DataFrame, cfg: dict[str, str] | None) -> tuple[dict[str, str], dict[str, Any]]:
@@ -243,9 +257,10 @@ def run_local_enrichment(df: pd.DataFrame, cfg: dict[str, str] | None) -> dict[s
         return bundle
 
     try:
-        dups, url_gid = compute_duplicate_groups(df, cfg)
+        dups, url_gid, dup_warnings = compute_duplicate_groups(df, cfg)
         bundle["content_duplicates"] = dups
         bundle["url_duplicate_group_id"] = url_gid
+        bundle["ml_errors"].extend(dup_warnings)
     except ImportError as e:
         bundle["ml_errors"].append(str(e))
 

diff --git a/src/website_profiling/crawl/crawler.py b/src/website_profiling/crawl/crawler.py
@@ -5,7 +5,7 @@
 
 import json
 import time
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from typing import Optional
 
 import pandas as pd
@@ -449,6 +449,10 @@ def crawl(
                             continue
                         futures.append(ex.submit(self.worker, url))
 
+                    if futures and self.queue.empty():
+                        # Block until at least one future completes instead of busy-polling.
+                        wait(futures, return_when=FIRST_COMPLETED)
+
                     remaining = []
                     for f in futures:
                         if f.done():
@@ -471,7 +475,6 @@ def crawl(
                         else:
                             remaining.append(f)
                     futures = remaining
-                    time.sleep(0.01)
 
                     if self.queue.empty() and not futures:
                         break

diff --git a/src/website_profiling/crawl/db_writer.py b/src/website_profiling/crawl/db_writer.py
@@ -21,10 +21,12 @@ def __init__(self, crawl_run_id: int, batch_size: int = 500, *, store_page_html:
         self._error: Optional[BaseException] = None
 
     def enqueue(self, record: dict) -> None:
+        if self._error is not None:
+            return
         self._queue.put(("crawl", record))
 
     def enqueue_html(self, record: dict) -> None:
-        if not self.store_page_html:
+        if not self.store_page_html or self._error is not None:
             return
         self._queue.put(("html", record))
 

diff --git a/src/website_profiling/crawl/fetchers/browser.py b/src/website_profiling/crawl/fetchers/browser.py
@@ -263,25 +263,26 @@ async def worker() -> None:
 
         workers = [asyncio.create_task(worker()) for _ in range(self.js_concurrency)]
         self._ready.set()
-        await asyncio.gather(*workers)
-
-        for page in pages:
+        try:
+            await asyncio.gather(*workers)
+        finally:
+            for page in pages:
+                try:
+                    await page.close()
+                except Exception:
+                    pass
             try:
-                await page.close()
+                await context.close()
+            except Exception:
+                pass
+            try:
+                await browser.close()
+            except Exception:
+                pass
+            try:
+                await playwright.stop()
             except Exception:
                 pass
-        try:
-            await context.close()
-        except Exception:
-            pass
-        try:
-            await browser.close()
-        except Exception:
-            pass
-        try:
-            await playwright.stop()
-        except Exception:
-            pass
 
     def _diagnostics_enabled(self) -> bool:
         return self.capture_console or self.capture_failed_requests

diff --git a/src/website_profiling/crawl/fetchers/static.py b/src/website_profiling/crawl/fetchers/static.py
@@ -49,7 +49,7 @@ def fetch(self, url: str) -> FetchResult:
                 redirect_chain_length=redirect_chain_length,
                 fetch_method="static",
             )
-        except Exception:
+        except requests.RequestException:
             return FetchResult(
                 status=None,
                 content_type=None,

diff --git a/src/website_profiling/db/historical.py b/src/website_profiling/db/historical.py
@@ -4,14 +4,17 @@
 import json
 import os
 import subprocess
+import sys
 import time
 from pathlib import Path
 from typing import Any, Optional
 
 import pandas as pd
 from psycopg import Connection
+from psycopg.sql import SQL, Identifier
 from urllib.parse import urlparse
 
+from ..console_io import console_print
 from ._common import (
     _executemany,
     _json_val,
@@ -33,17 +36,26 @@ def backup_db_if_exists(skip_in_ci: bool = True) -> Optional[str]:
     suffix = time.strftime("%Y%m%d-%H%M%S")
     out_path = backup_dir / f"website_profiling-{suffix}.dump"
     try:
+        parsed = urlparse(get_database_url())
+        pg_env = {**os.environ}
+        if parsed.hostname:
+            pg_env["PGHOST"] = parsed.hostname
+        if parsed.port:
+            pg_env["PGPORT"] = str(parsed.port)
+        if parsed.username:
+            pg_env["PGUSER"] = parsed.username
+        if parsed.password:
+            pg_env["PGPASSWORD"] = parsed.password
+        dbname = (parsed.path or "").lstrip("/")
+        cmd = ["pg_dump", "-Fc", "-f", str(out_path)]
+        if dbname:
+            cmd.append(dbname)
         subprocess.run(
-            [
-                "pg_dump",
-                "-Fc",
-                "-f",
-                str(out_path),
-                get_database_url(),
-            ],
+            cmd,
             check=True,
             capture_output=True,
             timeout=300,
+            env=pg_env,
         )
         return str(out_path)
     except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
@@ -72,12 +84,18 @@ def read_historical_data() -> dict[str, list]:
             for table in tables:
                 try:
                     with conn.cursor() as cur:
-                        cur.execute(f"SELECT * FROM {table}")
+                        cur.execute(SQL("SELECT * FROM {}").format(Identifier(table)))
                         result[table] = [dict(row) for row in cur.fetchall()]
-                except Exception:
-                    pass
-    except Exception:
-        pass
+                except Exception as e:
+                    console_print(
+                        f"  Warning: could not read historical table '{table}': {e}",
+                        file=sys.stderr,
+                    )
+    except Exception as e:
+        console_print(
+            f"  Warning: could not read historical data (a DB backup is still taken before any overwrite): {e}",
+            file=sys.stderr,
+        )
     return result
 
 

diff --git a/src/website_profiling/db/report_store.py b/src/website_profiling/db/report_store.py
@@ -6,6 +6,7 @@
 
 from psycopg import Connection
 
+from ..scoring import round_half_up
 from ._common import _json_val, _now_iso, _parse_row_json, _row_field
 from .crawl_store import get_crawl_run_info
 
@@ -51,7 +52,7 @@ def _write_audit_health_snapshot(
         for c in categories
         if isinstance(c, dict) and isinstance(c.get("score"), (int, float))
     ]
-    health_score = round(sum(scores) / len(scores)) if scores else None
+    health_score = round_half_up(sum(scores) / len(scores)) if scores else None
     category_scores: dict[str, float] = {}
     issue_counts = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0}
     for cat in categories:

diff --git a/src/website_profiling/integrations/crux/fetch.py b/src/website_profiling/integrations/crux/fetch.py
@@ -49,9 +49,18 @@ def fetch_crux_origin_metrics(origin_or_url: str, api_key: str | None = None) ->
     lcp = parsed["metrics"].get("largest_contentful_paint", {}).get("p75")
     inp = parsed["metrics"].get("interaction_to_next_paint", {}).get("p75")
     cls = parsed["metrics"].get("cumulative_layout_shift", {}).get("p75")
+
+    def _pass_threshold(value: Any, limit: float) -> bool:
+        if value is None:
+            return False
+        try:
+            return float(value) <= limit
+        except (TypeError, ValueError):
+            return False
+
     parsed["pass"] = {
-        "lcp": lcp is not None and float(lcp) <= 2500,
-        "inp": inp is not None and float(inp) <= 200,
-        "cls": cls is not None and float(cls) <= 0.1,
+        "lcp": _pass_threshold(lcp, 2500),
+        "inp": _pass_threshold(inp, 200),
+        "cls": _pass_threshold(cls, 0.1),
     }
     return parsed
diff --git a/src/website_profiling/integrations/google/keyword_enrich.py b/src/website_profiling/integrations/google/keyword_enrich.py
@@ -21,6 +21,7 @@
 from __future__ import annotations
 
 import json
+import math
 import re
 from datetime import datetime, timezone
 from typing import Any
@@ -34,14 +35,19 @@
 
 
 def ctr_as_fraction(ctr: Any) -> float:
-    """GSC rows use CTR percent (2.8); normalize to fraction for comparisons."""
+    """GSC rows use CTR as percent (e.g. 2.8 for 2.8%); normalize to fraction.
+
+    Invariant: ingest always stores percent — see gsc._to_query_record / _to_page_record (* 100).
+    """
     if ctr is None:
         return 0.0
     try:
         v = float(ctr)
     except (TypeError, ValueError):
         return 0.0
-    return v / 100.0 if v > 1 else v
+    if v > 100:
+        return 1.0
+    return v / 100.0
 
 QUESTION_STARTS = re.compile(
     r"^(how|what|why|when|where|who|can|does|is|are|should|will|do)\s", re.I
@@ -143,13 +149,15 @@ def estimate_difficulty(kw: str, gsc_row: dict | None, branded: bool = False) ->
 # ── CTR curve ─────────────────────────────────────────────────────────────────
 
 def opportunity_clicks(impressions: int, current_pos: float, target_pos: int = 3) -> int:
-    cur_ctr = CTR_CURVE.get(round(current_pos), CTR_CURVE_DEFAULT)
+    pos_slot = max(1, math.ceil(current_pos)) if current_pos > 0 else 1
+    cur_ctr = CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)
     tgt_ctr = CTR_CURVE.get(target_pos, CTR_CURVE.get(3, 0.103))
     return max(0, int((impressions or 0) * (tgt_ctr - cur_ctr)))
 
 
 def industry_ctr(pos: float) -> float:
-    return CTR_CURVE.get(round(pos), CTR_CURVE_DEFAULT)
+    pos_slot = max(1, math.ceil(pos)) if pos > 0 else 1
+    return CTR_CURVE.get(pos_slot, CTR_CURVE_DEFAULT)
 
 
 # ── Cannibalisation ───────────────────────────────────────────────────────────

diff --git a/src/website_profiling/integrations/serp/estimates.py b/src/website_profiling/integrations/serp/estimates.py
@@ -6,6 +6,10 @@
 import urllib.request
 from typing import Any
 
+_MAX_ORGANIC = 10
+_MAX_FEATURES = 4
+_RAW_MAX = _MAX_ORGANIC * 8 + _MAX_FEATURES * 12  # 128
+
 
 def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
     """Fetch SERP metadata from SerpAPI (Estimated competition proxy)."""
@@ -38,13 +42,14 @@ def fetch_serp_features(keyword: str, api_key: str) -> dict[str, Any]:
     if data.get("top_stories"):
         features.append("top_stories")
 
-    competition = min(100, len(organic) * 8 + len(features) * 12)
+    raw_score = len(organic) * 8 + len(features) * 12
+    competition = min(100, round(raw_score / _RAW_MAX * 100))
     return {
         "ok": True,
         "organic_count": len(organic),
         "serp_features": features,
         "estimated_competition": competition,
-        "provenance": "Estimated",
+        "provenance": "Estimated (heuristic-v1)",
     }