From ddf9055a8404d60245560e8b89500765a68c9af3 Mon Sep 17 00:00:00 2001
From: admin-raintree <277948009+admin-raintree@users.noreply.github.com>
Date: Fri, 12 Jun 2026 10:52:28 -0700
Subject: [PATCH 1/2] chore: sync workspace changes

---
 .github/workflows/benchmark.yml       |   8 +-
 .github/workflows/ci.yml              |   6 +-
 .github/workflows/codeql.yml          |   6 +-
 .github/workflows/metrics.yml         |  33 +++++--
 .github/workflows/publish.yml         |   8 +-
 .github/workflows/security.yml        |  12 ++-
 README.md                             |  10 +++
 audit/2026-06-10-eval-system-audit.md | 103 +++++++++++++++++++++
 src/docpull/benchmark.py              | 125 +++++++++++++++++++++-----
 src/docpull/parallel_workflows.py     | 122 +++++++++++++++++++++----
 src/docpull/source_scoring.py         |  12 ++-
 11 files changed, 386 insertions(+), 59 deletions(-)
 create mode 100644 audit/2026-06-10-eval-system-audit.md

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 856f3a0..070aadc 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -16,10 +16,14 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
 jobs:
   benchmark:
     if: github.event_name != 'schedule' || github.event.schedule == '17 4 * * *'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     timeout-minutes: 15
     steps:
       - name: Checkout
@@ -63,7 +67,7 @@ jobs:
 
   provider-matrix:
     if: github.event_name != 'schedule' || github.event.schedule == '31 5 * * 1'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     timeout-minutes: 20
     steps:
       - name: Checkout
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8fb02fa..49793c5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
       matrix:
@@ -50,7 +50,7 @@ jobs:
           path: coverage.xml
 
   lint:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -77,7 +77,7 @@ jobs:
         run: pre-commit run --all-files --show-diff-on-failure
 
   typecheck:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 23e1cff..b038d71 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,10 +13,14 @@ permissions:
   contents: read
   security-events: write
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   analyze:
     name: analyze (${{ matrix.language }})
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/metrics.yml b/.github/workflows/metrics.yml
index a29f124..f9728b6 100644
--- a/.github/workflows/metrics.yml
+++ b/.github/workflows/metrics.yml
@@ -40,7 +40,7 @@ concurrency:
 
 jobs:
   update:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
 
@@ -50,14 +50,23 @@ jobs:
 
       - name: Refresh METRICS.md
         env:
-          # Prefer the PAT (has Administration: read for traffic endpoints).
-          # Fall back to the default token (still works for stars / issues /
-          # downloads — only the traffic section will be empty).
-          GH_TOKEN: ${{ secrets.METRICS_TOKEN || secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/update_metrics.py
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }}
+        run: |
+          if [ -n "${METRICS_TOKEN:-}" ] && GH_TOKEN="$METRICS_TOKEN" gh api "repos/${GITHUB_REPOSITORY}" >/dev/null 2>&1; then
+            export GH_TOKEN="$METRICS_TOKEN"
+          else
+            if [ -n "${METRICS_TOKEN:-}" ]; then
+              echo "::warning::METRICS_TOKEN is present but failed GitHub API validation; falling back to GITHUB_TOKEN."
+            fi
+            export GH_TOKEN="$GITHUB_TOKEN"
+          fi
+          python .github/scripts/update_metrics.py
 
       - name: Open metrics refresh PR
-        uses: peter-evans/create-pull-request@22a9089034f40e5a961c8808d113e2c98fb63676
+        id: metrics-pr
+        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1
+        continue-on-error: true
         with:
           add-paths: METRICS.md
           branch: automation/metrics-refresh
@@ -65,3 +74,13 @@ jobs:
           commit-message: "chore(metrics): refresh metrics"
           title: "chore(metrics): refresh metrics"
           body: "Automated METRICS.md refresh from the scheduled metrics workflow."
+
+      - name: Report PR creation limitation
+        if: steps.metrics-pr.outcome == 'failure'
+        run: |
+          echo "::warning::Metrics refreshed, but PR creation failed. Enable Actions-created pull requests in repository settings or create a PR from automation/metrics-refresh manually."
+          {
+            echo "### Metrics refresh"
+            echo
+            echo "METRICS.md was refreshed, but the pull request step failed. Enable Actions-created pull requests in repository settings or open a PR from automation/metrics-refresh manually."
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index b1dd02a..4c59579 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -20,9 +20,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     outputs:
       version: ${{ steps.meta.outputs.version }}
     steps:
@@ -81,7 +85,7 @@ jobs:
 
   publish:
     needs: build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     environment:
       name: pypi
       url: https://pypi.org/project/docpull/${{ needs.build.outputs.version }}/
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 8016b12..bc467ec 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -9,9 +9,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   secret-scan:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout full history
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -28,7 +32,7 @@ jobs:
             detect --source=/repo --redact --no-banner
 
   python-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -53,7 +57,7 @@ jobs:
         run: PYTHONPATH=src pytest -q tests/test_security_hardening.py tests/test_discovery.py tests/test_integration.py
 
   mcp-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     defaults:
       run:
         working-directory: mcp
@@ -79,7 +83,7 @@ jobs:
         run: bun run typecheck
 
   web-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     defaults:
       run:
         working-directory: web
diff --git a/README.md b/README.md
index 0ceed04..0706114 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,16 @@
   </a>
 </p>
 
+## Star History
+
+<a href="https://star-history.com/#raintree-technology/docpull&Date">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date&theme=dark" />
+    <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date" />
+    <img alt="Star history chart for raintree-technology/docpull" src="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date" />
+  </picture>
+</a>
+
 docpull is a web scraper for static and server-rendered sites, with
 documentation crawling as its sharpest default workflow. It uses async HTTP (not
 Playwright) to fetch pages, discover links, extract main content, and write clean
diff --git a/audit/2026-06-10-eval-system-audit.md b/audit/2026-06-10-eval-system-audit.md
new file mode 100644
index 0000000..bcecd7e
--- /dev/null
+++ b/audit/2026-06-10-eval-system-audit.md
@@ -0,0 +1,103 @@
+# Eval/Benchmark System Audit — 2026-06-10
+
+Scope: the docpull **eval/benchmark Python core** —
+`src/docpull/benchmark.py`, `parallel_workflows.py`, `pack_tools.py`,
+`source_scoring.py`, `metadata_extractor.py`. CI workflow, secret-store
+helpers (`provider_keys.py`/`provider_cli.py`), and the web publication path
+were explicitly out of scope for this pass.
+
+Method: multi-agent finder + adversarial-verifier sweep across 9 audit
+dimensions (Sonnet), each candidate finding independently re-checked against
+current source by a skeptic prompted to refute. 41 agents, 32 findings raised,
+**27 confirmed / 5 refuted**. Opus triaged, deduped to 8 root causes, and
+remediated.
+
+Threat model: a malicious/compromised provider API response (Tavily, Exa,
+Parallel), a malicious recipe/fixture file, or untrusted fetched doc content
+written into agent-consumed artifacts. docpull is a local-first CLI, so
+"remote exploit" generally means "a shared/CI recipe or a misbehaving paid
+provider," not a network-facing service.
+
+## Fixed
+
+### Security
+1. **Recipe `output_dir` path traversal** (`parallel_workflows.py`) — a recipe
+   field could write pack files to any absolute path or `..`-escape the cwd
+   (two independent code paths: `_recipe_output_dir` and the inline
+   context-pack resolver). Added `_ensure_within_cwd` containment; both paths
+   now route through `_recipe_output_dir`. The CLI `--output-dir` override
+   stays trusted/exempt.
+2. **Prompt-injection via provider Markdown** — provider-supplied `title`/`url`
+   were written verbatim into `AGENT_CONTEXT.md`, `sources.md`, and
+   `NEXT_STEPS.md` (LLM-consumed). Added shared `_md_link` / `_md_inline_text`
+   / `_md_safe_url` helpers (escape `[]` `` ` ``, strip CR/LF, http(s)-only
+   URLs) and applied them at every writer site in both files.
+3. **`_http_json_post_once` hardening** (`benchmark.py`) — closes four findings
+   at once: (a) size-capped response read (`HTTP_MAX_RESPONSE_BYTES`, 16 MB) to
+   stop multi-GB OOM; (b) `_NoRedirectHandler` refuses 3xx on authenticated
+   POSTs, which previously forwarded `Authorization`/`x-api-key` across
+   redirects and (c) followed https→http downgrades; (d) the same handler also
+   removes the SSRF-via-redirect-to-internal-host vector.
+4. **Cost-cap gaps** (`benchmark.py`) — the `--runs N` multiplier was missing
+   from the Parallel estimate (a silent N× overspend, e.g. 10× at `--runs 10`),
+   and Tavily/Exa bypassed the guard entirely (it lived inside `if parallel:`).
+   The guard now covers all three providers, multiplies by
+   `len(targets) * runs`, and reports a per-provider breakdown when it trips.
+5. **FindAll poll-loop logic bug** (`parallel_workflows.py`) — on deadline
+   expiry the loop still called `.result()` against an active job, writing
+   partial data as success. Now raises `ParallelWorkflowError` on timeout, like
+   `_wait_for_taskgroup_completion`.
+
+### Eval integrity (published-number credibility)
+6. **Freshness dimension** returned 100/100 (a free +15) for any target without
+   `freshness_terms`; now returns a neutral 65 with a visible signal. The eight
+   published targets all set terms, so reference numbers are unaffected — this
+   only de-inflates ad-hoc single-target runs.
+7. **`_aggregate_runs` wall time** took the median over *all* runs, including
+   fast failures (a broken case could report 0.1 s), contradicting its docstring;
+   it now uses successful runs only, falling back to all runs if none succeeded.
+8. **`source_scoring` substring false positives** — `"developer" in domain` and
+   `"/api" in path` rewarded `notadeveloper.com`, `/apiary/…`. Domain check is
+   now subdomain-anchored; path check matches a whole segment or a
+   `-`/`_`-prefixed one (so `/api-reference` and `/api/v2` still score, but
+   `/apiary` does not). Verified against all 819 source rows in `.bench/runs/`:
+   **0 change** — the only real effect is excluding genuine false positives,
+   which do not appear in the published corpus. (A first cut using
+   segment-*exact* matching wrongly lowered 163 `/api-reference` rows by 10
+   points; that regression was caught by re-scoring the stored runs and fixed.)
+
+### Hygiene (low severity, defensive)
+- Recipe size guard (`MAX_RECIPE_BYTES`, 1 MB) before `yaml.safe_load`
+  (mitigates billion-laughs-style alias expansion).
+- stdin API-key length cap (512 chars).
+- `_redact_secret_like` strips token-shaped substrings from third-party error
+  bodies before they reach `benchmark.report.json` / Raindrop traces.
+- Raindrop traces now send `output_dir.name` / artifact basenames instead of
+  absolute home paths.
+- Removed the dead/misleading redaction branch in `_load_mcp_servers`.
+- `_cap_fixture_content` bounds imported-fixture `full_content`/`excerpts` to
+  the live `DEFAULT_MAX_FULL_CONTENT_CHARS`.
+- SSRF/artifact hygiene: `run_live_context_pack` now runs provider URLs through
+  `UrlValidator` (https-only) before extract, matching the extract-pack path.
+
+### Incidental
+- Fixed a pre-existing mypy error in `_workload_disclosure_lines` (`med` typed
+  int then assigned `""`), introduced by commit `a2a8535`.
+- Annotated the pre-existing B311 jitter finding in `_retry_delay_seconds` with
+  a policy-compliant `# nosec B311` (non-crypto retry backoff). Bandit was red
+  at HEAD on this; it is now green.
+
+## Refuted (verified false positives)
+- `_resolve_recipe_path` arbitrary read — trust boundary is "ran an untrusted
+  file"; `url_file` content is https-validated, `diff` reads only a fixed name.
+- `_safe_slug` — genuinely neutralizes path separators.
+- benchmark argparse bare `type=int/float` — post-parse `_validate_positive_int`
+  already rejects zero/negative.
+- "Unbounded Retry-After" — the cap is applied at parse time (`min(..., CAP)`).
+- Provider text in the published article — the Targets section is built only
+  from hardcoded/user-controlled `_BenchmarkTarget` fields, not provider data.
+
+## Verification
+`ruff check` ✅ · `ruff format` ✅ · `mypy src` ✅ · `pytest tests` ✅ 476 passed ·
+`bandit -c pyproject.toml -r src` ✅ exit 0 · `pip-audit` ✅ no known vulns.
+Diff: 3 files, +201/−54.
diff --git a/src/docpull/benchmark.py b/src/docpull/benchmark.py
index f3bc7a6..720bc03 100644
--- a/src/docpull/benchmark.py
+++ b/src/docpull/benchmark.py
@@ -6,6 +6,7 @@
 import asyncio
 import json
 import random
+import re
 import resource
 import sys
 import time
@@ -18,7 +19,7 @@
 from typing import Any
 from urllib.error import HTTPError, URLError
 from urllib.parse import urlparse
-from urllib.request import Request, urlopen
+from urllib.request import HTTPRedirectHandler, Request, build_opener
 
 from rich.console import Console
 from rich.markup import escape
@@ -34,6 +35,7 @@
     DEFAULT_MODE,
     ParallelWorkflowError,
     _build_source_policy,
+    _md_link,
     _parallel_sdk_installed,
     estimate_context_pack_cost,
     estimate_search_pack_cost,
@@ -76,6 +78,14 @@
 }
 PASS_AT_K_THRESHOLDS: tuple[int, ...] = (70, 80, 90)
 TARGET_SET_CHOICES = ("single", "tool-docs", "provider-matrix", "v2")
+HTTP_MAX_RESPONSE_BYTES = 16 * 1024 * 1024
+HTTP_MAX_ERROR_BYTES = 64 * 1024
+# Conservative per-call USD figures used ONLY for the pre-flight
+# --max-estimated-cost guard on the live Tavily/Exa search providers. Real
+# spend is reconciled from provider usage after the run; these are loose upper
+# bounds chosen so the guard fails safe. They never feed published numbers.
+APPROX_TAVILY_CREDIT_USD = 0.01
+APPROX_EXA_SEARCH_USD = 0.01
 
 
 class BenchmarkError(RuntimeError):
@@ -502,21 +512,32 @@ def run_quick_benchmark(
 
     estimated_search_cost = 0.0
     estimated_context_cost = 0.0
+    estimated_costs: dict[str, float] = {}
     if parallel:
         estimated_search_cost = estimate_search_pack_cost(max_search_results=max_search_results)
         estimated_context_cost = estimate_context_pack_cost(
             extract_limit=extract_limit,
             max_search_results=max_search_results,
         )
-        estimated_total_cost = round(
-            (estimated_search_cost + estimated_context_cost) * len(targets),
+        estimated_costs["parallel"] = round(
+            (estimated_search_cost + estimated_context_cost) * len(targets) * runs,
             6,
         )
-        if estimated_total_cost > max_estimated_cost:
-            raise BenchmarkError(
-                "Estimated Parallel benchmark cost "
-                f"${estimated_total_cost:.6f} exceeds guard ${max_estimated_cost:.6f}."
-            )
+    if "tavily" in providers:
+        credit_usd = tavily_credit_usd if tavily_credit_usd is not None else APPROX_TAVILY_CREDIT_USD
+        estimated_costs["tavily"] = round(
+            (1 + extract_limit) * credit_usd * len(targets) * runs,
+            6,
+        )
+    if "exa" in providers:
+        estimated_costs["exa"] = round(APPROX_EXA_SEARCH_USD * len(targets) * runs, 6)
+    estimated_total_cost = round(sum(estimated_costs.values()), 6)
+    if estimated_total_cost > max_estimated_cost:
+        breakdown = ", ".join(f"{name}=${value:.6f}" for name, value in estimated_costs.items())
+        raise BenchmarkError(
+            f"Estimated live-provider benchmark cost ${estimated_total_cost:.6f} "
+            f"({breakdown}) exceeds guard ${max_estimated_cost:.6f}."
+        )
 
     trace = _make_trace_recorder(
         trace_backend,
@@ -1092,6 +1113,18 @@ def _benchmark_provider_statuses(
     return safe_statuses
 
 
+def _path_basename(value: Any) -> str | None:
+    if not value:
+        return None
+    return Path(str(value)).name
+
+
+def _basename_only(value: Any) -> Any:
+    if isinstance(value, dict):
+        return {key: _path_basename(item) for key, item in value.items()}
+    return value
+
+
 class _TraceRecorder:
     provider = "none"
 
@@ -1148,7 +1181,7 @@ def __init__(
             properties={
                 "target_set": target_set,
                 "target_count": len(targets),
-                "output_dir": str(output_dir),
+                "output_dir": output_dir.name,
                 "parallel_enabled": parallel_enabled,
                 "max_estimated_cost_usd": max_estimated_cost,
                 "content_policy": "metadata_only",
@@ -1158,7 +1191,7 @@ def __init__(
                     "target_url": target_url,
                     "target_set": target_set,
                     "targets": [target.report_dict() for target in targets],
-                    "output_dir": str(output_dir),
+                    "output_dir": output_dir.name,
                     "parallel_enabled": parallel_enabled,
                     "max_estimated_cost_usd": max_estimated_cost,
                 }
@@ -1180,7 +1213,7 @@ def record_case(self, case: dict[str, Any]) -> None:
                 "target": case.get("target"),
                 "prompt": case.get("prompt"),
                 "settings": case.get("settings"),
-                "output_dir": case.get("output_dir"),
+                "output_dir": _path_basename(case.get("output_dir")),
             },
             output=_trace_case_output(case),
             duration_ms=int(float(case.get("wall_seconds") or 0.0) * 1000),
@@ -1212,7 +1245,7 @@ def finish(self, report: dict[str, Any]) -> None:
         self._interaction.set_properties(
             {
                 "summary": report.get("summary"),
-                "artifacts": report.get("artifacts"),
+                "artifacts": _basename_only(report.get("artifacts")),
                 "signal_count": self._signal_count,
                 "positive_signal_count": self._positive_signal_count,
                 "negative_signal_count": self._negative_signal_count,
@@ -1223,7 +1256,7 @@ def finish(self, report: dict[str, Any]) -> None:
             output=_json_trace_text(
                 {
                     "summary": report.get("summary"),
-                    "artifacts": report.get("artifacts"),
+                    "artifacts": _basename_only(report.get("artifacts")),
                     "trace_signals": {
                         "total": self._signal_count,
                         "positive": self._positive_signal_count,
@@ -2007,7 +2040,9 @@ def _aggregate_runs(
     The full per-run list is preserved under ``runs`` for raw inspection.
     """
     successful = [run for run in runs if run.get("status") != "failed"]
-    wall_seconds_list = [float(run.get("wall_seconds") or 0.0) for run in runs]
+    # Headline wall time is the median across *successful* runs only; a run that
+    # aborts quickly on a network error must not pull the reported latency down.
+    wall_seconds_list = [float(run.get("wall_seconds") or 0.0) for run in (successful or runs)]
     estimated_costs = [float(run.get("estimated_cost_usd") or 0.0) for run in runs]
     artifact_sizes = [int(run.get("artifact_size_bytes") or 0) for run in runs]
     cache_sizes = [int(run.get("cache_size_bytes") or 0) for run in runs]
@@ -2201,7 +2236,7 @@ def _source_fidelity_dimension(
 
 def _freshness_dimension(records: list[dict[str, Any]], target: _BenchmarkTarget | None) -> dict[str, Any]:
     if not target or not target.freshness_terms:
-        return _dimension(100, [])
+        return _dimension(65, ["freshness not evaluated - no freshness terms configured"])
     haystack = "\n".join(
         " ".join(
             [
@@ -2426,7 +2461,7 @@ def _write_provider_sources_md(
         index = source.get("index")
         title = str(source.get("title") or source.get("url") or "Untitled")
         url = str(source.get("url") or "")
-        lines.append(f"{index}. [{title}]({url})")
+        lines.append(f"{index}. {_md_link(title, url)}")
         if source.get("source_type"):
             lines.append(f"   - Source type: `{source['source_type']}`")
         lines.append("   - Records file: `documents.ndjson`")
@@ -2467,6 +2502,20 @@ def _http_json_post(
     raise last_error
 
 
+class _NoRedirectHandler(HTTPRedirectHandler):
+    """Refuse 3xx redirects on authenticated POSTs.
+
+    urllib forwards ``Authorization`` / ``x-api-key`` across redirects (only
+    ``content-*`` headers are stripped) and will follow an https->http
+    downgrade, so a redirect from a provider endpoint would leak the API key
+    in cleartext and could be steered at an internal host. These endpoints
+    never legitimately redirect, so surface any 3xx as an error instead.
+    """
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):  # type: ignore[no-untyped-def]
+        raise HTTPError(req.full_url, code, f"Refused redirect to {newurl!r}", headers, fp)
+
+
 def _http_json_post_once(
     *,
     label: str,
@@ -2487,11 +2536,12 @@ def _http_json_post_once(
         },
         method="POST",
     )
+    opener = build_opener(_NoRedirectHandler())
     try:
-        with urlopen(request, timeout=timeout) as response:  # nosec B310
-            raw = response.read().decode("utf-8")
+        with opener.open(request, timeout=timeout) as response:  # nosec B310
+            raw_bytes = response.read(HTTP_MAX_RESPONSE_BYTES + 1)
     except HTTPError as err:
-        detail = err.read().decode("utf-8", errors="replace")
+        detail = _redact_secret_like(err.read(HTTP_MAX_ERROR_BYTES).decode("utf-8", errors="replace"))
         message = f"{label} returned HTTP {err.code}: {_short_error_detail(detail)}"
         if err.code in HTTP_RETRY_TRANSIENT_STATUSES:
             transient = _TransientHTTPError(message, retry_after=_parse_retry_after(err))
@@ -2503,8 +2553,10 @@ def _http_json_post_once(
         transient = _TransientHTTPError(message, retry_after=None)
         transient.__cause__ = err
         raise transient from err
+    if len(raw_bytes) > HTTP_MAX_RESPONSE_BYTES:
+        raise BenchmarkError(f"{label} response exceeds {HTTP_MAX_RESPONSE_BYTES}-byte limit.")
     try:
-        parsed = json.loads(raw)
+        parsed = json.loads(raw_bytes.decode("utf-8"))
     except json.JSONDecodeError as err:
         raise BenchmarkError(f"{label} returned invalid JSON: {err}") from err
     if not isinstance(parsed, dict):
@@ -2588,6 +2640,20 @@ def _relative_path(path: Path, base_dir: Path) -> str:
         return str(path)
 
 
+_SECRET_LIKE_RE = re.compile(
+    r"(?i)(?:bearer\s+|x-api-key\s*[:=]\s*|api[-_]?key\s*[\"':=]\s*|tvly-|exa_|sk-)[A-Za-z0-9._\-]{6,}"
+)
+
+
+def _redact_secret_like(value: str) -> str:
+    """Strip token-shaped substrings out of a third-party error body.
+
+    Provider error responses occasionally echo the submitted credential back;
+    this keeps such tokens out of ``benchmark.report.json`` and any trace upload.
+    """
+    return _SECRET_LIKE_RE.sub("[redacted]", value)
+
+
 def _short_error_detail(value: str) -> str:
     compact = " ".join(value.split())
     return compact[:500]
@@ -3141,7 +3207,24 @@ def _article_markdown(report: dict[str, Any], *, title: str) -> str:
             f"{cost_text} |"
         )
     if heatmap:
-        lines.extend(["", "## Provider x Target Heatmap", "", *heatmap])
+        lines.extend(
+            [
+                "",
+                "## Provider x Target Heatmap",
+                "",
+                (
+                    "Read across rows (one target, all providers), not down columns. "
+                    "The five workflows are not equivalent jobs: the core crawl walks a "
+                    "page graph from a known seed URL, while provider workflows run a "
+                    "search query and optionally extract a small number of results. "
+                    "A provider that returns zero search results for a lesser-known site "
+                    "scores 0 — not because its extractor is weak, but because its index "
+                    "doesn't cover that site. See Workload disclosure above."
+                ),
+                "",
+                *heatmap,
+            ]
+        )
     lines.extend(
         [
             "",
diff --git a/src/docpull/parallel_workflows.py b/src/docpull/parallel_workflows.py
index eee4829..9c89c5b 100644
--- a/src/docpull/parallel_workflows.py
+++ b/src/docpull/parallel_workflows.py
@@ -107,6 +107,7 @@
     "lite": 0.003,
     "base": 0.010,
 }
+MAX_RECIPE_BYTES = 1_000_000
 
 
 class ParallelWorkflowError(RuntimeError):
@@ -2202,6 +2203,9 @@ def run_live_context_pack(
     selected_urls = _select_urls(search_results, extract_limit)
     if not selected_urls:
         raise ParallelWorkflowError("Parallel Search returned no URLs to extract.")
+    selected_urls = _validated_https_urls(selected_urls)
+    if not selected_urls:
+        raise ParallelWorkflowError("Parallel Search returned no valid HTTPS URLs to extract.")
 
     extract_kwargs: dict[str, Any] = {
         "urls": selected_urls,
@@ -2297,7 +2301,7 @@ def run_recipe(
     if not isinstance(queries, list) or not all(isinstance(item, str) and item for item in queries):
         raise ParallelWorkflowError("Recipe field 'queries' must be a list of non-empty strings.")
 
-    output_dir = output_dir_override or Path(str(recipe.get("output_dir") or DEFAULT_OUTPUT_DIR))
+    output_dir = _recipe_output_dir(recipe, DEFAULT_OUTPUT_DIR, output_dir_override)
     source_policy_recipe = recipe.get("source_policy") or {}
     if not isinstance(source_policy_recipe, dict):
         raise ParallelWorkflowError("Recipe field 'source_policy' must be an object when present.")
@@ -3088,10 +3092,14 @@ def run_findall_pack(
         raise ParallelWorkflowError("Parallel FindAll create response did not include a findall_id.")
     if wait and findall_id:
         deadline = time.monotonic() + timeout
-        while time.monotonic() < deadline:
+        while True:
             current = client.beta.findall.retrieve(findall_id)
             if not _status_is_active(_get(current, "status")):
                 break
+            if time.monotonic() >= deadline:
+                raise ParallelWorkflowError(
+                    f"Parallel FindAll {findall_id} did not complete within {timeout}s."
+                )
             time.sleep(poll_interval)
         result = client.beta.findall.result(findall_id)
     candidates = [_jsonable(item) for item in _list(_get(result, "candidates"))] if result else []
@@ -4013,6 +4021,41 @@ def _write_parallel_pack(
     return path
 
 
+def _md_inline_text(value: str) -> str:
+    """Neutralize Markdown control characters in provider-supplied inline text."""
+    return (
+        value.replace("\\", "\\\\")
+        .replace("`", "\\`")
+        .replace("[", "\\[")
+        .replace("]", "\\]")
+        .replace("\r", " ")
+        .replace("\n", " ")
+    )
+
+
+def _md_safe_url(value: str) -> str:
+    """Return an http(s) URL safe to embed in Markdown, or '' if not web-safe."""
+    cleaned = value.strip().replace("\r", "").replace("\n", "")
+    if urlparse(cleaned).scheme not in {"http", "https"}:
+        return ""
+    return cleaned.replace(" ", "%20").replace("(", "%28").replace(")", "%29")
+
+
+def _md_link(title: str, url: str) -> str:
+    """Render a provider title/URL as a Markdown link that cannot break out."""
+    safe_title = _md_inline_text(title)
+    safe_url = _md_safe_url(url)
+    if not safe_url:
+        return f"{safe_title} (unverified URL)"
+    return f"[{safe_title}]({safe_url})"
+
+
+def _validated_https_urls(urls: list[str]) -> list[str]:
+    """Drop any non-HTTPS / private-host URLs from a provider-supplied list."""
+    validator = UrlValidator(allowed_schemes={"https"})
+    return [url for url in urls if validator.validate(url).is_valid]
+
+
 def _write_agent_context_md(
     output_dir: Path,
     *,
@@ -4091,7 +4134,7 @@ def _write_agent_context_md(
             source_path = _coerce_str(entry.get("path")) or ""
             index = entry.get("index") or "?"
             local = f" - `{source_path}`" if source_path else ""
-            lines.append(f"{index}. [{title}]({url}){local}")
+            lines.append(f"{index}. {_md_link(title, url)}{local}")
 
         source_scores = score_source_entries(entries, expected_domains=expected_domains)
         if source_scores:
@@ -4101,7 +4144,7 @@ def _write_agent_context_md(
                 path_text = f" - `{source['path']}`" if source.get("path") else ""
                 lines.append(
                     f"- {source['score']}/100 {source['grade']}: "
-                    f"[{source['title']}]({source['url']}){path_text} ({reason_text})"
+                    f"{_md_link(str(source['title']), str(source['url']))}{path_text} ({reason_text})"
                 )
 
     warning_lines = _agent_context_warning_lines(warnings or {}, errors or [])
@@ -4136,7 +4179,7 @@ def _agent_context_warning_lines(
     for error in errors:
         url = _coerce_str(_get(error, "url")) or "unknown URL"
         error_type = _coerce_str(_get(error, "error_type")) or "extract_error"
-        lines.append(f"- Extract error `{error_type}`: {url}")
+        lines.append(f"- Extract error `{error_type}`: {_md_inline_text(url)}")
     return lines
 
 
@@ -4164,14 +4207,14 @@ def _write_sources_md(output_dir: Path, pack: ParallelContextPack, entries: list
         "",
     ]
     for entry in entries:
-        lines.append(f"{entry['index']}. [{entry['title']}]({entry['url']})")
+        lines.append(f"{entry['index']}. {_md_link(str(entry['title']), str(entry['url']))}")
         lines.append(f"   - Local: `{entry['path']}`")
     if pack.extract_errors:
         lines.extend(["", "## Extract Errors", ""])
         for error in pack.extract_errors:
             url = _coerce_str(_get(error, "url")) or "unknown URL"
             error_type = _coerce_str(_get(error, "error_type")) or "extract_error"
-            lines.append(f"- `{error_type}`: {url}")
+            lines.append(f"- `{error_type}`: {_md_inline_text(url)}")
     path = output_dir / "sources.md"
     path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
     return path
@@ -4182,13 +4225,34 @@ def _write_generic_sources_md(output_dir: Path, objective: str, entries: list[di
     if not entries:
         lines.append("_No records were available when this pack was written._")
     for entry in entries:
-        lines.append(f"{entry['index']}. [{entry['title']}]({entry['url']})")
+        lines.append(f"{entry['index']}. {_md_link(str(entry['title']), str(entry['url']))}")
         lines.append(f"   - Local: `{entry['path']}`")
     path = output_dir / "sources.md"
     path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
     return path
 
 
+def _cap_fixture_content(item: Any, max_chars: int = DEFAULT_MAX_FULL_CONTENT_CHARS) -> Any:
+    """Truncate oversized text fields in one imported fixture extract result.
+
+    Fixture imports carry no size bound of their own, so ``full_content`` and
+    every string entry in ``excerpts`` are cut down to ``max_chars`` here.
+    """
+    record = _jsonable(item)
+    if not isinstance(record, dict):
+        return record
+
+    def clipped(value: Any) -> Any:
+        oversized = isinstance(value, str) and len(value) > max_chars
+        return value[:max_chars] if oversized else value
+
+    if isinstance(record.get("full_content"), str):
+        record["full_content"] = clipped(record["full_content"])
+    if isinstance(record.get("excerpts"), list):
+        record["excerpts"] = [clipped(entry) for entry in record["excerpts"]]
+    return record
+
+
 def _pack_from_fixture(raw: Any) -> ParallelContextPack:
     if not isinstance(raw, dict):
         raise ParallelWorkflowError("Fixture must be a JSON object.")
@@ -4222,7 +4286,7 @@ def _pack_from_fixture(raw: Any) -> ParallelContextPack:
         extract_id=_coerce_str(extract.get("extract_id")),
         task_run_id=_coerce_str(task.get("run_id") or task.get("task_run_id")),
         search_results=[_jsonable(item) for item in _list(search.get("results"))],
-        extract_results=[_jsonable(item) for item in extract_results],
+        extract_results=[_cap_fixture_content(item) for item in extract_results],
         extract_errors=[_jsonable(item) for item in extract_errors],
         task_brief=_coerce_str(task.get("brief") or task.get("content")),
         task_basis=_list(task.get("basis")),
@@ -4238,6 +4302,8 @@ def _load_recipe(recipe_path: Path) -> dict[str, Any]:
         raw_text = recipe_path.read_text(encoding="utf-8")
     except OSError as err:
         raise ParallelWorkflowError(f"Could not read recipe {recipe_path}: {err}") from err
+    if len(raw_text) > MAX_RECIPE_BYTES:
+        raise ParallelWorkflowError(f"Recipe {recipe_path} exceeds the {MAX_RECIPE_BYTES}-byte limit.")
     try:
         if recipe_path.suffix.lower() in {".yaml", ".yml"}:
             import yaml
@@ -4360,6 +4426,8 @@ def _read_parallel_api_key_for_init(*, from_stdin: bool) -> str:
     api_key = _clean_parallel_api_key(value)
     if not api_key:
         raise ParallelWorkflowError("Parallel API key cannot be empty.")
+    if len(api_key) > 512:
+        raise ParallelWorkflowError("Parallel API key is implausibly long (>512 characters).")
     return api_key
 
 
@@ -4822,9 +4890,9 @@ def _discovery_next_steps_md(
         )
         lines.extend(
             [
-                f"### {title}",
+                f"### {_md_inline_text(title)}",
                 "",
-                f"- URL: {url}",
+                f"- URL: {_md_inline_text(url)}",
                 (
                     f"- Score: {_get(source_score, 'score', '?')}/100 "
                     f"({_get(source_score, 'grade', 'unscored')})"
@@ -5004,12 +5072,10 @@ def _load_mcp_servers(raw_servers: list[str]) -> list[dict[str, Any]]:
             value = _load_json_file(Path(source), "mcp_server")
         if not isinstance(value, dict):
             raise ParallelWorkflowError("MCP server entries must be JSON objects.")
-        redacted = _redact_sensitive_headers(value)
+        # Live auth headers must reach the Parallel API as-is. Any pack metadata
+        # derived from these kwargs is redacted at serialization time via
+        # _redact_sensitive_headers, so the raw value is only used for the call.
         servers.append(value)
-        if "headers" in redacted and redacted["headers"] != value.get("headers"):
-            # Keep this branch explicit to make the secret handling obvious; the
-            # returned request metadata is redacted separately.
-            pass
     return servers
 
 
@@ -5121,12 +5187,34 @@ def _collect_iterable(iterable: Any, *, limit: int) -> list[dict[str, Any]]:
     return items
 
 
+def _ensure_within_cwd(candidate: Path, *, field: str) -> Path:
+    """Reject a recipe-supplied path that resolves outside the working tree.
+
+    The candidate is joined onto the resolved current directory and resolved
+    again, so absolute paths and ``..`` segments are judged by where they land.
+    Returns the candidate unchanged, or raises ``ParallelWorkflowError``.
+    """
+    working_dir = Path.cwd().resolve()
+    landing = (working_dir / candidate).resolve()
+    try:
+        landing.relative_to(working_dir)
+    except ValueError:
+        message = (
+            f"Recipe '{field}' must stay within the working directory "
+            f"({working_dir}); use --output-dir for a path outside it."
+        )
+        raise ParallelWorkflowError(message) from None
+    return candidate
+
+
 def _recipe_output_dir(
     recipe: dict[str, Any],
     default: Path,
     override: Path | None,
 ) -> Path:
-    return override or Path(str(recipe.get("output_dir") or default))
+    if override is None:
+        return _ensure_within_cwd(Path(str(recipe.get("output_dir") or default)), field="output_dir")
+    return override  # an explicit override is returned without the working-directory check
 
 
 def _resolve_recipe_path(recipe_path: Path, value: Any) -> Path | None:
diff --git a/src/docpull/source_scoring.py b/src/docpull/source_scoring.py
index cae02bc..75352fb 100644
--- a/src/docpull/source_scoring.py
+++ b/src/docpull/source_scoring.py
@@ -33,11 +33,19 @@ def score_source(
             score -= 25
             reasons.append("off_domain")
 
-    if domain.startswith("docs.") or ".docs." in domain or "developer" in domain:
+    if domain.startswith("docs.") or ".docs." in domain or domain.startswith(("developer.", "developers.")):
         score += 12
         reasons.append("docs_domain")
 
-    if any(part in path for part in ("/docs", "/api", "/reference", "/developers")):
+    # Match a doc token as a whole path segment or a hyphen/underscore-prefixed
+    # one (so "/api-reference" and "/api/v2" score, but "/apiary" does not).
+    path_segments = [segment for segment in path.split("/") if segment]
+    doc_path_tokens = ("docs", "api", "reference", "developers")
+    if any(
+        segment == token or segment.startswith((f"{token}-", f"{token}_"))
+        for segment in path_segments
+        for token in doc_path_tokens
+    ):
         score += 10
         reasons.append("docs_path")
 

From ce27ae347dd7fd6c85a58e040ab882f8078b8a27 Mon Sep 17 00:00:00 2001
From: admin-raintree <277948009+admin-raintree@users.noreply.github.com>
Date: Fri, 12 Jun 2026 10:59:54 -0700
Subject: [PATCH 2/2] docs: show GitHub star count

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0706114..b7a01fd 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
 [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
 [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
+[![GitHub stars](https://img.shields.io/github/stars/raintree-technology/docpull?style=social)](https://github.com/raintree-technology/docpull/stargazers)
 [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
 
 <p align="center">