codefrydev · PrashantUnity · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/src/website_profiling/crawl/crawler.py b/src/website_profiling/crawl/crawler.py
@@ -302,6 +302,8 @@ def worker(self, url: str) -> dict:
             )
 
         status = result.status
+        is_success = isinstance(status, int) and 200 <= status < 300
+        is_redirect = isinstance(status, int) and 300 <= status < 400
         ct = result.content_type
         text = result.text
         response_time_ms = result.response_time_ms
@@ -371,7 +373,15 @@ def worker(self, url: str) -> dict:
                 if self.store_outlinks:
                     outlink_list.append(link)
                     self.link_edges_accum.append({"from_url": url, **edge})
-                self.frontier.try_enqueue_link(link, url)
+                # Only crawl links discovered on successful (2xx) pages; links
+                # parsed from custom 4xx/5xx error pages should not be followed.
+                if is_success:
+                    self.frontier.try_enqueue_link(link, url)
+
+        # A redirect (3xx) has no crawlable body; enqueue its target so the
+        # destination is fetched and recorded as its own row (per-hop chain).
+        if is_redirect and final_url and final_url.rstrip("/") != url.rstrip("/"):
+            self.frontier.try_enqueue_link(final_url, url)
 
         ext["response_time_ms"] = response_time_ms if response_time_ms is not None else ""
         ext["content_length"] = content_length or 0

diff --git a/src/website_profiling/crawl/fetchers/browser.py b/src/website_profiling/crawl/fetchers/browser.py
@@ -299,6 +299,21 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
                 max_per_page=self.console_max_per_page,
             )
             collector.attach(page)
+
+        # Record main-frame navigation responses in order so that (a) a response
+        # that was received is not lost when goto raises, and (b) we can report
+        # the URL's OWN status (e.g. a 301) instead of the followed destination.
+        nav_responses: list[Any] = []
+
+        def _on_response(resp: Any) -> None:
+            try:
+                req = resp.request
+                if req.is_navigation_request() and resp.frame == page.main_frame:
+                    nav_responses.append(resp)
+            except Exception:
+                pass
+
+        page.on("response", _on_response)
         try:
             try:
                 response = await page.goto(
@@ -312,15 +327,21 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
             if self.extra_wait_ms and response is not None:
                 await asyncio.sleep(self.extra_wait_ms / 1000.0)
         finally:
+            try:
+                page.remove_listener("response", _on_response)
+            except Exception:
+                pass
             if collector is not None:
                 collector.detach(page)
 
         response_time_ms = int((time.perf_counter() - t0) * 1000)
         final_url = page.url or url
-        redirect_chain_length = 1 if final_url.rstrip("/") != url.rstrip("/") else 0
         browser_diagnostics = collector.build() if collector is not None else None
 
-        if response is None:
+        # Prefer the first observed main-frame response: this is the URL's own
+        # response (a 3xx redirect or an error status), not the final hop.
+        own_response = nav_responses[0] if nav_responses else response
+        if own_response is None:
             return FetchResult(
                 status=None,
                 content_type=None,
@@ -329,20 +350,27 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
                 content_length=0,
                 final_url=final_url,
                 headers_dict={},
-                redirect_chain_length=redirect_chain_length,
+                redirect_chain_length=1 if final_url.rstrip("/") != url.rstrip("/") else 0,
                 fetch_method="rendered",
                 browser_diagnostics=browser_diagnostics,
             )
 
-        status = response.status
-        headers = response.headers or {}
+        status = own_response.status
+        redirect_chain_length = sum(
+            1 for r in nav_responses if 300 <= int(getattr(r, "status", 0) or 0) < 400
+        )
+        headers = own_response.headers or {}
         lower_headers = {str(k).lower(): v for k, v in headers.items()}
         ct = lower_headers.get("content-type", "")
         headers_dict = {
             k: (headers.get(k) or lower_headers.get(k.lower(), "")) for k in HEADER_KEYS
         }
 
-        is_html = status == 200 and ("text/html" in ct or "application/xhtml+xml" in ct)
+        is_redirect = 300 <= status < 400
+        # Capture body for 2xx and error (4xx/5xx) HTML pages; skip redirects.
+        is_html = (not is_redirect) and (
+            "text/html" in ct or "application/xhtml+xml" in ct
+        )
         text: Optional[str] = None
         content_length = 0
         if is_html:

diff --git a/src/website_profiling/crawl/fetchers/static.py b/src/website_profiling/crawl/fetchers/static.py
@@ -5,6 +5,7 @@
 import threading
 import time
 from typing import Callable, Optional
+from urllib.parse import urljoin
 
 import requests
 
@@ -71,16 +72,30 @@ def fetch(self, url: str) -> FetchResult:
         session = self.session
         try:
             t0 = time.perf_counter()
-            resp = session.get(url, timeout=self.timeout, allow_redirects=True)
+            # Do NOT auto-follow redirects: we want to record the URL's own
+            # response (e.g. 301/308) rather than collapsing the chain into the
+            # final 200. The crawler enqueues the Location target so each hop is
+            # crawled and recorded as its own row.
+            resp = session.get(url, timeout=self.timeout, allow_redirects=False)
             response_time_ms = int((time.perf_counter() - t0) * 1000)
             ct = resp.headers.get("Content-Type", "")
-            is_html = resp.status_code == 200 and (
+            location = resp.headers.get("Location") or resp.headers.get("location") or ""
+            # A redirect is a 301/302/303/307/308 with a Location header (matches
+            # requests' own definition; excludes 300 and 304 Not Modified).
+            is_redirect = resp.status_code in (301, 302, 303, 307, 308) and bool(location)
+            # Capture the body for 2xx and error (4xx/5xx) HTML pages so custom
+            # error pages can be analysed; redirects have no meaningful body.
+            is_html = (not is_redirect) and (
                 "text/html" in ct or "application/xhtml+xml" in ct
             )
             text = resp.text if is_html else None
             content_length = len(resp.content) if resp.content is not None else 0
-            final_url = resp.url or url
-            redirect_chain_length = len(resp.history)
+            if is_redirect:
+                final_url = urljoin(url, location)
+                redirect_chain_length = 1
+            else:
+                final_url = resp.url or url
+                redirect_chain_length = len(resp.history)
             headers_dict = {k: (resp.headers.get(k) or "") for k in HEADER_KEYS}
             return FetchResult(
                 status=resp.status_code,

diff --git a/src/website_profiling/reporting/seo_summary.py b/src/website_profiling/reporting/seo_summary.py
@@ -10,10 +10,29 @@
 META_DESC_LEN_MAX = 160
 THIN_CONTENT_CHARS = 300
 
+
+def _status_text(value: object) -> str:
+    """Normalize a status value to a clean string (e.g. 400.0 -> "400").
+
+    Numeric statuses can arrive as ints, strings, or floats (when pandas coerces
+    a column containing NaN); non-numeric markers like "error"/"blocked_by_robots"
+    pass through stripped. None, NaN and +/-infinity all normalize to "".
+    """
+    if value is None:
+        return ""
+    try:
+        f = float(value)  # type: ignore[arg-type]
+    except (TypeError, ValueError, OverflowError):  # OverflowError: huge ints
+        return str(value).strip()
+    if f != f or abs(f) == float("inf"):  # non-finite: int(f) would raise
+        return ""
+    return str(int(f))
+
+
 def _compute_summary_seo_issues(df: pd.DataFrame) -> dict:
     """Compute crawl summary, SEO health metrics, issues list, and recommendations from crawl DataFrame."""
     total = len(df)
-    status_str = df["status"].astype(str) if "status" in df.columns else pd.Series(["unknown"] * len(df))
+    status_str = df["status"].map(_status_text) if "status" in df.columns else pd.Series(["unknown"] * len(df))
     count_2xx = int((status_str.str.match(r"2\d{2}").fillna(False)).sum())
     count_3xx = int((status_str.str.match(r"3\d{2}").fillna(False)).sum())
     count_4xx = int((status_str.str.match(r"4\d{2}").fillna(False)).sum())
@@ -76,7 +95,7 @@ def _compute_summary_seo_issues(df: pd.DataFrame) -> dict:
         if pd.isna(u) or not u:
             continue
         u = str(u).strip()
-        st = str(row.get("status", "")).strip()
+        st = _status_text(row.get("status", ""))
         if st.startswith("4") or st.startswith("5") or st in ("error", "blocked_by_robots"):
             issues["broken"].append({"url": u, "status": st})
         elif st.startswith("3"):

diff --git a/tests/reporting/test_reporting_builder_modules.py b/tests/reporting/test_reporting_builder_modules.py
@@ -122,6 +122,40 @@ def test_compute_summary_seo_issues() -> None:
     assert out["recommendations"]
 
 
+def test_status_text_normalization() -> None:
+    """Check _status_text on int, float, string, None and NaN inputs."""
+    normalize = seo_summary._status_text
+    cases = [(400, "400"), (400.0, "400"), ("301", "301"), ("error", "error")]
+    cases += [(None, ""), (float("nan"), "")]
+    for raw, expected in cases:
+        assert normalize(raw) == expected
+
+
+def test_compute_summary_classifies_numeric_and_float_statuses() -> None:
+    """Int and float status codes are counted per class and listed as issues."""
+    rows = [
+        {"url": "https://example.com/ok", "status": 200.0},
+        {"url": "https://example.com/redir", "status": 301, "final_url": "https://example.com/dest"},
+        {"url": "https://example.com/bad", "status": 400.0},
+        {"url": "https://example.com/boom", "status": 500},
+    ]
+    result = seo_summary._compute_summary_seo_issues(pd.DataFrame(rows))
+
+    # One row per status class, whether the code arrived as int or float.
+    counts = result["summary"]
+    for bucket in ("count_2xx", "count_3xx", "count_4xx", "count_5xx"):
+        assert counts[bucket] == 1
+
+    broken_urls = {entry["url"] for entry in result["issues"]["broken"]}
+    assert broken_urls >= {"https://example.com/bad", "https://example.com/boom"}
+
+    redirect = {entry["url"]: entry for entry in result["issues"]["redirects"]}.get(
+        "https://example.com/redir"
+    )
+    assert redirect is not None
+    assert (redirect["status"], redirect["final_url"]) == ("301", "https://example.com/dest")
+
+
 def test_content_analytics_helpers() -> None:
     df = _crawl_df()
     content = content_analytics._build_content_analytics(df)