From e91873d214d0c13d19c7b90431dc9afc78b06699 Mon Sep 17 00:00:00 2001 From: PrashantUnity Date: Wed, 17 Jun 2026 00:43:54 +0530 Subject: [PATCH 01/11] Race Conditon --- src/website_profiling/crawl/crawler.py | 9 +- .../crawl/fetchers/factory.py | 6 +- .../integrations/google/keyword_enrich.py | 3 +- src/website_profiling/llm/enrich.py | 12 ++- src/website_profiling/llm/providers/openai.py | 5 +- tests/test_browser_auth_from_session.py | 39 +++++++++ tests/test_crawler_deep.py | 42 ++++++++++ tests/test_keyword_intent.py | 25 ++++++ tests/test_llm_enrich_batches.py | 54 ++++++++++++ tests/test_llm_provider_openai.py | 63 ++++++++++++++ web/next.config.mjs | 5 ++ web/src/ReportShell.tsx | 18 +++- web/src/components/chat/ChatSidebar.tsx | 49 ++++++----- .../contentStudio/WriteStudioSidebar.tsx | 22 ++--- web/src/context/PortfolioContext.tsx | 20 ++++- web/src/context/ReportContext.tsx | 6 ++ web/src/context/portfolioLoadPlan.test.ts | 18 ++++ web/src/context/portfolioLoadPlan.ts | 12 +++ web/src/context/reportContextTypes.ts | 2 + web/src/hooks/usePortfolioWidget.ts | 9 +- web/src/lib/appNav.ts | 61 ++++++++++++++ web/src/lib/chatUrlState.test.ts | 2 + web/src/lib/chatUrlState.ts | 3 +- web/src/routes.test.ts | 83 ++++++++++++++++--- web/src/routes.ts | 2 - web/src/strings.json | 8 -- web/src/views/ContentStudio.tsx | 21 ----- 27 files changed, 500 insertions(+), 99 deletions(-) create mode 100644 tests/test_browser_auth_from_session.py create mode 100644 tests/test_keyword_intent.py create mode 100644 tests/test_llm_enrich_batches.py create mode 100644 tests/test_llm_provider_openai.py create mode 100644 web/src/context/portfolioLoadPlan.test.ts create mode 100644 web/src/context/portfolioLoadPlan.ts delete mode 100644 web/src/views/ContentStudio.tsx diff --git a/src/website_profiling/crawl/crawler.py b/src/website_profiling/crawl/crawler.py index c3b9076..1757e49 100644 --- a/src/website_profiling/crawl/crawler.py +++ b/src/website_profiling/crawl/crawler.py @@ -449,8 +449,15 @@ def crawl( continue futures.append(ex.submit(self.worker, url)) - if futures and self.queue.empty(): + can_submit_more = ( + not self.queue.empty() + and len(futures) < self.concurrency + and (len(self.results) + len(futures)) < self.max_pages + ) + if futures and not can_submit_more: # Block until at least one future completes instead of busy-polling. + # Covers both an empty frontier and a saturated worker pool; wait() + # returns immediately if a future is already done. wait(futures, return_when=FIRST_COMPLETED) remaining = [] diff --git a/src/website_profiling/crawl/fetchers/factory.py b/src/website_profiling/crawl/fetchers/factory.py index 5f0bb8d..ff4f1d0 100644 --- a/src/website_profiling/crawl/fetchers/factory.py +++ b/src/website_profiling/crawl/fetchers/factory.py @@ -29,8 +29,10 @@ def _browser_auth_from_session( headers[str(key)] = str(value) credentials: Optional[dict[str, str]] = None auth = getattr(session, "auth", None) - if auth and auth[0]: - credentials = {"username": str(auth[0]), "password": str(auth[1] or "")} + # requests also allows a callable auth handler; only basic (user, pass) tuples map here. + if isinstance(auth, (tuple, list)) and len(auth) >= 1 and auth[0]: + password = str(auth[1] or "") if len(auth) > 1 else "" + credentials = {"username": str(auth[0]), "password": password} return headers, credentials diff --git a/src/website_profiling/integrations/google/keyword_enrich.py b/src/website_profiling/integrations/google/keyword_enrich.py index f592051..49a2310 100644 --- a/src/website_profiling/integrations/google/keyword_enrich.py +++ b/src/website_profiling/integrations/google/keyword_enrich.py @@ -73,7 +73,8 @@ def _normalize_kw(kw: str) -> str: # ── Intent ──────────────────────────────────────────────────────────────────── def classify_intent(kw: str, brand_name: str = "") -> str: - if brand_name and brand_name.lower().split()[0] in kw.lower(): + brand_tokens = brand_name.lower().split() + if brand_tokens and brand_tokens[0] in kw.lower(): return "navigational" if QUESTION_STARTS.match(kw): return "informational" diff --git a/src/website_profiling/llm/enrich.py b/src/website_profiling/llm/enrich.py index b2f068b..1b0d4d8 100644 --- a/src/website_profiling/llm/enrich.py +++ b/src/website_profiling/llm/enrich.py @@ -12,6 +12,7 @@ import pandas as pd from ..analysis.text import normalize_fingerprint_text +from ..console_io import console_print from ..llm_config import llm_is_enabled from .base import get_llm_client from .prompts import ( @@ -147,8 +148,11 @@ def _one(item: tuple[str, dict[str, Any]]) -> tuple[str, dict[str, Any], dict[st if workers <= 1 or len(pending) <= 1: for item in pending: - _, payload, result = _one(item) - apply_batch(payload, result) + try: + _, payload, result = _one(item) + apply_batch(payload, result) + except Exception as exc: # noqa: BLE001 - one batch failing must not abort the rest + console_print(f" LLM enrichment batch failed: {exc}", flush=True) return with ThreadPoolExecutor(max_workers=workers) as pool: @@ -157,8 +161,8 @@ def _one(item: tuple[str, dict[str, Any]]) -> tuple[str, dict[str, Any], dict[st try: _, payload, result = future.result() apply_batch(payload, result) - except Exception: - pass + except Exception as exc: # noqa: BLE001 - one batch failing must not abort the rest + console_print(f" LLM enrichment batch failed: {exc}", flush=True) def _call_cached( diff --git a/src/website_profiling/llm/providers/openai.py b/src/website_profiling/llm/providers/openai.py index 29170eb..583e592 100644 --- a/src/website_profiling/llm/providers/openai.py +++ b/src/website_profiling/llm/providers/openai.py @@ -38,7 +38,10 @@ def complete_json(self, system: str, user: str) -> dict[str, Any]: r = client.post(url, headers=headers, json=payload) r.raise_for_status() data = r.json() - content = data["choices"][0]["message"]["content"] + choice = (data.get("choices") or [{}])[0] + content = (choice.get("message") or {}).get("content") + if content is None: + raise RuntimeError("OpenAI response contained no content.") return parse_json_response(content if isinstance(content, str) else json.dumps(content)) def chat_with_tools( diff --git a/tests/test_browser_auth_from_session.py b/tests/test_browser_auth_from_session.py new file mode 100644 index 0000000..7215e23 --- /dev/null +++ b/tests/test_browser_auth_from_session.py @@ -0,0 +1,39 @@ +"""Regression tests for mapping a requests Session's auth onto browser context options. + +`_browser_auth_from_session` must not assume `session.auth` is a 2-tuple — requests +also allows a callable auth handler and 1-element credentials. +""" +from __future__ import annotations + +import requests + +from website_profiling.crawl.fetchers.factory import _browser_auth_from_session + + +def test_none_session_returns_empty_options() -> None: + assert _browser_auth_from_session(None) == ({}, None) + + +def test_basic_two_tuple_auth_maps_to_credentials() -> None: + session = requests.Session() + session.headers["X-Custom"] = "1" + session.auth = ("user", "secret") + headers, credentials = _browser_auth_from_session(session) + assert credentials == {"username": "user", "password": "secret"} + assert headers.get("X-Custom") == "1" + # User-Agent is intentionally filtered out (the browser sets its own). + assert "User-Agent" not in headers + + +def test_single_element_auth_defaults_password_to_empty() -> None: + session = requests.Session() + session.auth = ("user-only",) + _, credentials = _browser_auth_from_session(session) + assert credentials == {"username": "user-only", "password": ""} + + +def test_callable_auth_handler_is_ignored_without_raising() -> None: + session = requests.Session() + session.auth = lambda request: request # e.g. HTTPDigestAuth / custom handler + _, credentials = _browser_auth_from_session(session) + assert credentials is None diff --git a/tests/test_crawler_deep.py b/tests/test_crawler_deep.py index 1f8c58e..6ad95a4 100644 --- a/tests/test_crawler_deep.py +++ b/tests/test_crawler_deep.py @@ -180,6 +180,48 @@ def _slow_worker(url): assert len(df) == 2 +def test_crawl_waits_when_pool_saturated_instead_of_busy_spinning(monkeypatch): + # Regression: when every worker slot is busy AND the frontier still has URLs, + # the loop must block on wait() rather than busy-spin. We record the queue size + # at each wait() call; on the buggy version wait() only fired once the queue was + # empty (qsize == 0), so observing a wait() with a non-empty queue proves the fix. + import time + import website_profiling.crawl.crawler as mod + + monkeypatch.setattr( + "website_profiling.crawl.sitemap.discover_sitemap_urls", + lambda *_a, **_k: [], + ) + c = mod.Crawler( + start_url="https://site.com", + ignore_robots=True, + use_wappalyzer=False, + concurrency=2, + max_pages=4, + ) + for path in ("/a", "/b", "/c"): + c.queue.put(f"https://site.com{path}") + + qsizes_at_wait: list[int] = [] + real_wait = mod.wait + + def _recording_wait(fs, **kwargs): + qsizes_at_wait.append(c.queue.qsize()) + return real_wait(fs, **kwargs) + + monkeypatch.setattr(mod, "wait", _recording_wait) + + def _slow_worker(url): + time.sleep(0.02) + return {"url": url, "status": 200, "content_type": "text/html", "title": "ok", "outlinks": 0} + + monkeypatch.setattr(c, "worker", _slow_worker) + df = c.crawl(show_progress=False) + + assert len(df) == 4 + assert any(q > 0 for q in qsizes_at_wait) + + def test_crawl_runs_and_handles_done_futures(monkeypatch): import website_profiling.crawl.crawler as mod diff --git a/tests/test_keyword_intent.py b/tests/test_keyword_intent.py new file mode 100644 index 0000000..7b220b8 --- /dev/null +++ b/tests/test_keyword_intent.py @@ -0,0 +1,25 @@ +"""Regression tests for keyword intent classification. + +`classify_intent` must not raise when the brand name is empty or whitespace-only. +""" +from __future__ import annotations + +import pytest + +from website_profiling.integrations.google.keyword_enrich import classify_intent + +_VALID = {"navigational", "informational", "transactional", "commercial"} + + +@pytest.mark.parametrize("brand", ["", " ", "\t\n"]) +def test_blank_or_whitespace_brand_does_not_raise(brand: str) -> None: + # Whitespace-only brand is truthy but splits to [], which used to IndexError. + assert classify_intent("some search query", brand_name=brand) in _VALID + + +def test_brand_token_match_is_navigational() -> None: + assert classify_intent("acme login", brand_name="Acme") == "navigational" + + +def test_no_brand_falls_through_to_other_intents() -> None: + assert classify_intent("how to bake bread") == "informational" diff --git a/tests/test_llm_enrich_batches.py b/tests/test_llm_enrich_batches.py new file mode 100644 index 0000000..8b885df --- /dev/null +++ b/tests/test_llm_enrich_batches.py @@ -0,0 +1,54 @@ +"""Regression tests for `_run_llm_batches` failure handling. + +A failing batch must not abort the run, and the failure must be logged (observable) +in BOTH the sequential and the concurrent code paths — they used to diverge +(sequential propagated; concurrent silently swallowed). +""" +from __future__ import annotations + +import pytest + +from website_profiling.llm import enrich + + +class _BoomClient: + """LLM client whose every call fails.""" + + def complete_json(self, system: str, user: str) -> dict: + raise RuntimeError("api unavailable") + + +def test_sequential_path_logs_and_continues_on_failure( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(enrich, "_llm_concurrency", lambda _cfg: 1) + applied: list = [] + enrich._run_llm_batches( + _BoomClient(), + "task", + "system", + [{"k": 1}], + {}, + lambda payload, result: applied.append(result), + ) + out = capsys.readouterr().out + assert "LLM enrichment batch failed" in out + assert applied == [] # nothing applied, but no exception escaped + + +def test_concurrent_path_logs_and_continues_on_failure( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + monkeypatch.setattr(enrich, "_llm_concurrency", lambda _cfg: 4) + applied: list = [] + enrich._run_llm_batches( + _BoomClient(), + "task", + "system", + [{"k": 1}, {"k": 2}], # >1 batch + workers>1 -> concurrent path + {}, + lambda payload, result: applied.append(result), + ) + out = capsys.readouterr().out + assert out.count("LLM enrichment batch failed") >= 1 + assert applied == [] diff --git a/tests/test_llm_provider_openai.py b/tests/test_llm_provider_openai.py new file mode 100644 index 0000000..e46e91f --- /dev/null +++ b/tests/test_llm_provider_openai.py @@ -0,0 +1,63 @@ +"""Regression tests for the OpenAI JSON-completion client. + +`complete_json` must defensively handle a 200 response whose body lacks the +expected ``choices``/``message`` structure instead of raising KeyError/IndexError. +""" +from __future__ import annotations + +import sys +import types + +import pytest + +from website_profiling.llm.providers.openai import OpenAIClient + + +class _FakeResponse: + def __init__(self, payload: dict) -> None: + self._payload = payload + + def raise_for_status(self) -> None: + return None + + def json(self) -> dict: + return self._payload + + +class _FakeClient: + def __init__(self, payload: dict) -> None: + self._payload = payload + + def __call__(self, *args, **kwargs): # httpx.Client(...) constructor + return self + + def __enter__(self): + return self + + def __exit__(self, *args) -> bool: + return False + + def post(self, *args, **kwargs) -> _FakeResponse: + return _FakeResponse(self._payload) + + +def _install_fake_httpx(monkeypatch: pytest.MonkeyPatch, payload: dict) -> None: + fake = types.ModuleType("httpx") + fake.Client = _FakeClient(payload) + monkeypatch.setitem(sys.modules, "httpx", fake) + + +def test_complete_json_missing_choices_raises_clean_error(monkeypatch: pytest.MonkeyPatch) -> None: + _install_fake_httpx(monkeypatch, {}) # no "choices" + client = OpenAIClient({"llm_api_key": "sk-test"}) + with pytest.raises(RuntimeError, match="no content"): + client.complete_json("system", "user") + + +def test_complete_json_parses_content_on_well_formed_response(monkeypatch: pytest.MonkeyPatch) -> None: + _install_fake_httpx( + monkeypatch, + {"choices": [{"message": {"content": '{"ok": true}'}}]}, + ) + client = OpenAIClient({"llm_api_key": "sk-test"}) + assert client.complete_json("system", "user") == {"ok": True} diff --git a/web/next.config.mjs b/web/next.config.mjs index a19b5a1..dc804c2 100644 --- a/web/next.config.mjs +++ b/web/next.config.mjs @@ -22,6 +22,11 @@ const nextConfig = { destination: '/dashboard?tab=charts', permanent: false, }, + { + source: '/content-studio', + destination: '/write', + permanent: true, + }, ]; }, }; diff --git a/web/src/ReportShell.tsx b/web/src/ReportShell.tsx index 5983f44..ed1e508 100644 --- a/web/src/ReportShell.tsx +++ b/web/src/ReportShell.tsx @@ -31,13 +31,13 @@ import { Globe2, Contact2, TextSearch, - PenLine, } from 'lucide-react'; import { UrlInspectorProvider } from './context/UrlInspectorContext'; import AppShell from './components/AppShell'; import { useReport } from './context/useReport'; import { strings } from './lib/strings'; import { canonicalDomainFromPayload, slugifyDomain } from './lib/domainSlug'; +import { REPORT_VIEW_IDS } from './lib/appNav'; import { pathSlugToViewId, viewIdToPathSlug, type ViewId } from './routes'; import { dispatchOpenIntegrations } from './lib/pipelineJobEvents'; import ReportShellSkeleton from './components/ReportShellSkeleton'; @@ -82,7 +82,6 @@ const Indexation = dynamic(() => import('./views/Indexation'), { loading: () => const Backlinks = dynamic(() => import('./views/Backlinks'), { loading: () => viewLoading() }); const Traffic = dynamic(() => import('./views/Traffic'), { loading: () => viewLoading() }); const KeywordsExplorer = dynamic(() => import('./views/KeywordsExplorer'), { loading: () => viewLoading() }); -const ContentStudio = dynamic(() => import('./views/ContentStudio'), { loading: () => viewLoading() }); const ExportReport = dynamic(() => import('./views/ExportReport'), { loading: () => viewLoading() }); const LogAnalyzer = dynamic(() => import('./views/LogAnalyzer'), { loading: () => viewLoading() }); const Subdomains = dynamic(() => import('./views/Subdomains'), { loading: () => viewLoading() }); @@ -146,9 +145,22 @@ const VIEW_CONFIG: ViewConfigEntry[] = [ { id: 'backlinks', component: Backlinks as ComponentType, icon: Link2 }, { id: 'traffic', component: Traffic as ComponentType, icon: BarChart2 }, { id: 'keywords-explorer', component: KeywordsExplorer as ComponentType, icon: Key }, - { id: 'content-studio', component: ContentStudio as ComponentType, icon: PenLine }, ]; +if (process.env.NODE_ENV !== 'production') { + const configIds = new Set(VIEW_CONFIG.map((entry) => entry.id)); + for (const id of REPORT_VIEW_IDS) { + if (!configIds.has(id)) { + throw new Error(`ReportShell VIEW_CONFIG missing view id: ${id}`); + } + } + for (const entry of VIEW_CONFIG) { + if (!REPORT_VIEW_IDS.includes(entry.id)) { + throw new Error(`ReportShell VIEW_CONFIG has unknown view id: ${entry.id}`); + } + } +} + const VIEWS = VIEW_CONFIG.map((v) => ({ ...v, label: strings.nav[v.id].label, diff --git a/web/src/components/chat/ChatSidebar.tsx b/web/src/components/chat/ChatSidebar.tsx index c287ee5..df06f00 100644 --- a/web/src/components/chat/ChatSidebar.tsx +++ b/web/src/components/chat/ChatSidebar.tsx @@ -2,23 +2,24 @@ import { useEffect, useRef, useState, type ReactNode } from 'react'; import Link from 'next/link'; +import { usePathname } from 'next/navigation'; import { ChevronLeft, History, - Home, - Link as LinkIcon, MessageSquarePlus, PanelLeft, - PenLine, Settings, - Terminal, Trash2, - TrendingUp, } from 'lucide-react'; import AppLogo from '@/components/AppLogo'; import ThemeToggle from '@/components/ThemeToggle'; import type { ChatLayoutState } from '@/components/chat/ChatShell'; import { formatChatPropertyOption } from '@/lib/chatPropertyLabel'; +import { + CHAT_SIDEBAR_NAV_IDS, + isMiniNavLinkActive, + miniNavLinks, +} from '@/lib/appNav'; import { strings } from '@/lib/strings'; const c = strings.components.chat; @@ -46,13 +47,7 @@ export interface ChatSidebarProps extends ChatLayoutState { loading?: boolean; } -const NAV_LINKS = [ - { href: '/home', label: c.navHome, icon: Home }, - { href: '/search-performance', label: c.navGsc, icon: TrendingUp }, - { href: '/links', label: c.navLinks, icon: LinkIcon }, - { href: '/pipeline', label: c.navPipeline, icon: Terminal }, - { href: '/write', label: strings.nav.write.label, icon: PenLine }, -] as const; +const NAV_LINKS = miniNavLinks(CHAT_SIDEBAR_NAV_IDS); function RailButton({ label, @@ -115,6 +110,7 @@ export default function ChatSidebar({ toggle, setExpanded, }: ChatSidebarProps) { + const pathname = usePathname(); const [settingsOpen, setSettingsOpen] = useState(false); const settingsRef = useRef(null); @@ -276,17 +272,24 @@ export default function ChatSidebar({ diff --git a/web/src/components/contentStudio/WriteStudioSidebar.tsx b/web/src/components/contentStudio/WriteStudioSidebar.tsx index 548f670..da0a17d 100644 --- a/web/src/components/contentStudio/WriteStudioSidebar.tsx +++ b/web/src/components/contentStudio/WriteStudioSidebar.tsx @@ -7,21 +7,20 @@ import { ChevronLeft, FileText, History, - Home, - Link as LinkIcon, - MessageSquarePlus, PanelLeft, - PenLine, Plus, Settings, - Terminal, Trash2, - TrendingUp, } from 'lucide-react'; import AppLogo from '@/components/AppLogo'; import ThemeToggle from '@/components/ThemeToggle'; import type { WriteLayoutState } from '@/components/contentStudio/WriteStudioShell'; import { formatChatPropertyOption } from '@/lib/chatPropertyLabel'; +import { + isMiniNavLinkActive, + miniNavLinks, + WRITE_SIDEBAR_NAV_IDS, +} from '@/lib/appNav'; import { strings } from '@/lib/strings'; import type { ContentDraftListItem } from '@/types/contentStudio'; @@ -47,14 +46,7 @@ export interface WriteStudioSidebarProps extends WriteLayoutState { readOnly?: boolean; } -const NAV_LINKS = [ - { href: '/home', label: c.navHome, icon: Home }, - { href: '/search-performance', label: c.navGsc, icon: TrendingUp }, - { href: '/links', label: c.navLinks, icon: LinkIcon }, - { href: '/pipeline', label: c.navPipeline, icon: Terminal }, - { href: '/chat', label: c.pageTitle, icon: MessageSquarePlus }, - { href: '/write', label: strings.nav.write.label, icon: PenLine }, -] as const; +const NAV_LINKS = miniNavLinks(WRITE_SIDEBAR_NAV_IDS); function RailButton({ label, @@ -284,7 +276,7 @@ export default function WriteStudioSidebar({