Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/website_profiling/crawl/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ def worker(self, url: str) -> dict:
)

status = result.status
is_success = isinstance(status, int) and 200 <= status < 300
is_redirect = isinstance(status, int) and 300 <= status < 400
ct = result.content_type
text = result.text
response_time_ms = result.response_time_ms
Expand Down Expand Up @@ -371,7 +373,15 @@ def worker(self, url: str) -> dict:
if self.store_outlinks:
outlink_list.append(link)
self.link_edges_accum.append({"from_url": url, **edge})
self.frontier.try_enqueue_link(link, url)
# Only crawl links discovered on successful (2xx) pages; links
# parsed from custom 4xx/5xx error pages should not be followed.
if is_success:
self.frontier.try_enqueue_link(link, url)

# A redirect (3xx) has no crawlable body; enqueue its target so the
# destination is fetched and recorded as its own row (per-hop chain).
if is_redirect and final_url and final_url.rstrip("/") != url.rstrip("/"):
self.frontier.try_enqueue_link(final_url, url)

ext["response_time_ms"] = response_time_ms if response_time_ms is not None else ""
ext["content_length"] = content_length or 0
Expand Down
40 changes: 34 additions & 6 deletions src/website_profiling/crawl/fetchers/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,21 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
max_per_page=self.console_max_per_page,
)
collector.attach(page)

# Record main-frame navigation responses in order so that (a) a response
# that was received is not lost when goto raises, and (b) we can report
# the URL's OWN status (e.g. a 301) instead of the followed destination.
nav_responses: list[Any] = []

def _on_response(resp: Any) -> None:
    """Collect main-frame navigation responses in arrival order.

    Appends ``resp`` to ``nav_responses`` when its request is a navigation
    request and its frame is the page's main frame. Any exception raised
    while inspecting the response is deliberately ignored so that this
    listener can never break the fetch itself.
    """
    try:
        request = resp.request
        if not request.is_navigation_request():
            return
        if not (resp.frame == page.main_frame):
            return
        nav_responses.append(resp)
    except Exception:
        # Best-effort bookkeeping only: drop responses we cannot inspect.
        pass

page.on("response", _on_response)
try:
try:
response = await page.goto(
Expand All @@ -312,15 +327,21 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
if self.extra_wait_ms and response is not None:
await asyncio.sleep(self.extra_wait_ms / 1000.0)
finally:
try:
page.remove_listener("response", _on_response)
except Exception:
pass
if collector is not None:
collector.detach(page)

response_time_ms = int((time.perf_counter() - t0) * 1000)
final_url = page.url or url
redirect_chain_length = 1 if final_url.rstrip("/") != url.rstrip("/") else 0
browser_diagnostics = collector.build() if collector is not None else None

if response is None:
# Prefer the first observed main-frame response: this is the URL's own
# response (a 3xx redirect or an error status), not the final hop.
own_response = nav_responses[0] if nav_responses else response
if own_response is None:
return FetchResult(
status=None,
content_type=None,
Expand All @@ -329,20 +350,27 @@ async def _fetch_page(self, page: Any, url: str) -> FetchResult:
content_length=0,
final_url=final_url,
headers_dict={},
redirect_chain_length=redirect_chain_length,
redirect_chain_length=1 if final_url.rstrip("/") != url.rstrip("/") else 0,
fetch_method="rendered",
browser_diagnostics=browser_diagnostics,
)

status = response.status
headers = response.headers or {}
status = own_response.status
redirect_chain_length = sum(
1 for r in nav_responses if 300 <= int(getattr(r, "status", 0) or 0) < 400
)
headers = own_response.headers or {}
lower_headers = {str(k).lower(): v for k, v in headers.items()}
ct = lower_headers.get("content-type", "")
headers_dict = {
k: (headers.get(k) or lower_headers.get(k.lower(), "")) for k in HEADER_KEYS
}

is_html = status == 200 and ("text/html" in ct or "application/xhtml+xml" in ct)
is_redirect = 300 <= status < 400
# Capture body for 2xx and error (4xx/5xx) HTML pages; skip redirects.
is_html = (not is_redirect) and (
"text/html" in ct or "application/xhtml+xml" in ct
)
text: Optional[str] = None
content_length = 0
if is_html:
Expand Down
23 changes: 19 additions & 4 deletions src/website_profiling/crawl/fetchers/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import threading
import time
from typing import Callable, Optional
from urllib.parse import urljoin

import requests

Expand Down Expand Up @@ -71,16 +72,30 @@ def fetch(self, url: str) -> FetchResult:
session = self.session
try:
t0 = time.perf_counter()
resp = session.get(url, timeout=self.timeout, allow_redirects=True)
# Do NOT auto-follow redirects: we want to record the URL's own
# response (e.g. 301/308) rather than collapsing the chain into the
# final 200. The crawler enqueues the Location target so each hop is
# crawled and recorded as its own row.
resp = session.get(url, timeout=self.timeout, allow_redirects=False)
response_time_ms = int((time.perf_counter() - t0) * 1000)
ct = resp.headers.get("Content-Type", "")
is_html = resp.status_code == 200 and (
location = resp.headers.get("Location") or resp.headers.get("location") or ""
# A redirect is a 3xx with a Location header (matches requests' own
# definition; excludes 304 Not Modified).
is_redirect = resp.status_code in (301, 302, 303, 307, 308) and bool(location)
# Capture the body for 2xx and error (4xx/5xx) HTML pages so custom
# error pages can be analysed; redirects have no meaningful body.
is_html = (not is_redirect) and (
"text/html" in ct or "application/xhtml+xml" in ct
)
text = resp.text if is_html else None
content_length = len(resp.content) if resp.content is not None else 0
final_url = resp.url or url
redirect_chain_length = len(resp.history)
if is_redirect:
final_url = urljoin(url, location)
redirect_chain_length = 1
else:
final_url = resp.url or url
redirect_chain_length = len(resp.history)
headers_dict = {k: (resp.headers.get(k) or "") for k in HEADER_KEYS}
return FetchResult(
status=resp.status_code,
Expand Down
23 changes: 21 additions & 2 deletions src/website_profiling/reporting/seo_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,29 @@
META_DESC_LEN_MAX = 160
THIN_CONTENT_CHARS = 300


def _status_text(value: object) -> str:
"""Normalize a status value to a clean string (e.g. 400.0 -> "400").

Numeric statuses can arrive as ints, strings, or floats (when pandas coerces
a column containing NaN); non-numeric markers like "error"/"blocked_by_robots"
pass through unchanged. Keeps status-code matching robust across all of them.
"""
if value is None:
return ""
try:
f = float(value) # type: ignore[arg-type]
except (TypeError, ValueError):
return str(value).strip()
if f != f: # NaN
return ""
return str(int(f))


def _compute_summary_seo_issues(df: pd.DataFrame) -> dict:
"""Compute crawl summary, SEO health metrics, issues list, and recommendations from crawl DataFrame."""
total = len(df)
status_str = df["status"].astype(str) if "status" in df.columns else pd.Series(["unknown"] * len(df))
status_str = df["status"].map(_status_text) if "status" in df.columns else pd.Series(["unknown"] * len(df))
count_2xx = int((status_str.str.match(r"2\d{2}").fillna(False)).sum())
count_3xx = int((status_str.str.match(r"3\d{2}").fillna(False)).sum())
count_4xx = int((status_str.str.match(r"4\d{2}").fillna(False)).sum())
Expand Down Expand Up @@ -76,7 +95,7 @@ def _compute_summary_seo_issues(df: pd.DataFrame) -> dict:
if pd.isna(u) or not u:
continue
u = str(u).strip()
st = str(row.get("status", "")).strip()
st = _status_text(row.get("status", ""))
if st.startswith("4") or st.startswith("5") or st in ("error", "blocked_by_robots"):
issues["broken"].append({"url": u, "status": st})
elif st.startswith("3"):
Expand Down
34 changes: 34 additions & 0 deletions tests/reporting/test_reporting_builder_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,40 @@ def test_compute_summary_seo_issues() -> None:
assert out["recommendations"]


def test_status_text_normalization() -> None:
    """Check ``_status_text`` against int, float, string, None and NaN inputs."""
    expectations = [
        (400, "400"),
        (400.0, "400"),
        ("301", "301"),
        ("error", "error"),
        (None, ""),
        (float("nan"), ""),
    ]
    for raw, expected in expectations:
        assert seo_summary._status_text(raw) == expected


def test_compute_summary_classifies_numeric_and_float_statuses() -> None:
    """Statuses given as ints or floats land in the right summary buckets.

    Builds a four-row crawl frame (one 2xx, one 3xx, one 4xx, one 5xx) and
    checks the per-class counts, the broken-URL list and the redirect entry.
    """
    rows = [
        {"url": "https://example.com/ok", "status": 200.0},
        {"url": "https://example.com/redir", "status": 301, "final_url": "https://example.com/dest"},
        {"url": "https://example.com/bad", "status": 400.0},
        {"url": "https://example.com/boom", "status": 500},
    ]
    result = seo_summary._compute_summary_seo_issues(pd.DataFrame(rows))

    # Exactly one row per status class.
    counts = result["summary"]
    for bucket in ("count_2xx", "count_3xx", "count_4xx", "count_5xx"):
        assert counts[bucket] == 1

    # Both the 4xx and the 5xx URL must be reported as broken.
    broken_urls = {entry["url"] for entry in result["issues"]["broken"]}
    assert broken_urls.issuperset(
        {"https://example.com/bad", "https://example.com/boom"}
    )

    # The 3xx row is reported as a redirect with a normalized status string.
    redirects_by_url = {entry["url"]: entry for entry in result["issues"]["redirects"]}
    assert "https://example.com/redir" in redirects_by_url
    redirect_entry = redirects_by_url["https://example.com/redir"]
    assert redirect_entry["status"] == "301"
    assert redirect_entry["final_url"] == "https://example.com/dest"


def test_content_analytics_helpers() -> None:
df = _crawl_df()
content = content_analytics._build_content_analytics(df)
Expand Down
Loading
Loading