From ddf9055a8404d60245560e8b89500765a68c9af3 Mon Sep 17 00:00:00 2001 From: admin-raintree <277948009+admin-raintree@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:52:28 -0700 Subject: [PATCH 1/2] chore: sync workspace changes --- .github/workflows/benchmark.yml | 8 +- .github/workflows/ci.yml | 6 +- .github/workflows/codeql.yml | 6 +- .github/workflows/metrics.yml | 33 +++++-- .github/workflows/publish.yml | 8 +- .github/workflows/security.yml | 12 ++- README.md | 10 +++ audit/2026-06-10-eval-system-audit.md | 103 +++++++++++++++++++++ src/docpull/benchmark.py | 125 +++++++++++++++++++++----- src/docpull/parallel_workflows.py | 122 +++++++++++++++++++++---- src/docpull/source_scoring.py | 12 ++- 11 files changed, 386 insertions(+), 59 deletions(-) create mode 100644 audit/2026-06-10-eval-system-audit.md diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 856f3a0..070aadc 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,10 +16,14 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + jobs: benchmark: if: github.event_name != 'schedule' || github.event.schedule == '17 4 * * *' - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 timeout-minutes: 15 steps: - name: Checkout @@ -63,7 +67,7 @@ jobs: provider-matrix: if: github.event_name != 'schedule' || github.event.schedule == '31 5 * * 1' - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 timeout-minutes: 20 steps: - name: Checkout diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8fb02fa..49793c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ concurrency: jobs: test: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: @@ -50,7 +50,7 @@ jobs: path: coverage.xml lint: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -77,7 +77,7 @@ jobs: run: pre-commit run --all-files --show-diff-on-failure typecheck: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 23e1cff..b038d71 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -13,10 +13,14 @@ permissions: contents: read security-events: write +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: analyze: name: analyze (${{ matrix.language }}) - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: diff --git a/.github/workflows/metrics.yml b/.github/workflows/metrics.yml index a29f124..f9728b6 100644 --- a/.github/workflows/metrics.yml +++ b/.github/workflows/metrics.yml @@ -40,7 +40,7 @@ concurrency: jobs: update: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -50,14 +50,23 @@ jobs: - name: Refresh METRICS.md env: - # Prefer the PAT (has Administration: read for traffic endpoints). - # Fall back to the default token (still works for stars / issues / - # downloads — only the traffic section will be empty). - GH_TOKEN: ${{ secrets.METRICS_TOKEN || secrets.GITHUB_TOKEN }} - run: python .github/scripts/update_metrics.py + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }} + run: | + if [ -n "${METRICS_TOKEN:-}" ] && GH_TOKEN="$METRICS_TOKEN" gh api "repos/${GITHUB_REPOSITORY}" >/dev/null 2>&1; then + export GH_TOKEN="$METRICS_TOKEN" + else + if [ -n "${METRICS_TOKEN:-}" ]; then + echo "::warning::METRICS_TOKEN is present but failed GitHub API validation; falling back to GITHUB_TOKEN." + fi + export GH_TOKEN="$GITHUB_TOKEN" + fi + python .github/scripts/update_metrics.py - name: Open metrics refresh PR - uses: peter-evans/create-pull-request@22a9089034f40e5a961c8808d113e2c98fb63676 + id: metrics-pr + uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1 + continue-on-error: true with: add-paths: METRICS.md branch: automation/metrics-refresh @@ -65,3 +74,13 @@ jobs: commit-message: "chore(metrics): refresh metrics" title: "chore(metrics): refresh metrics" body: "Automated METRICS.md refresh from the scheduled metrics workflow." + + - name: Report PR creation limitation + if: steps.metrics-pr.outcome == 'failure' + run: | + echo "::warning::Metrics refreshed, but PR creation failed. Enable Actions-created pull requests in repository settings or create a PR from automation/metrics-refresh manually." + { + echo "### Metrics refresh" + echo + echo "METRICS.md was refreshed, but the pull request step failed. Enable Actions-created pull requests in repository settings or open a PR from automation/metrics-refresh manually." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b1dd02a..4c59579 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -20,9 +20,13 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 outputs: version: ${{ steps.meta.outputs.version }} steps: @@ -81,7 +85,7 @@ jobs: publish: needs: build - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 environment: name: pypi url: https://pypi.org/project/docpull/${{ needs.build.outputs.version }}/ diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 8016b12..bc467ec 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -9,9 +9,13 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + jobs: secret-scan: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout full history uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -28,7 +32,7 @@ jobs: detect --source=/repo --redact --no-banner python-security: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -53,7 +57,7 @@ jobs: run: PYTHONPATH=src pytest -q tests/test_security_hardening.py tests/test_discovery.py tests/test_integration.py mcp-security: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 defaults: run: working-directory: mcp @@ -79,7 +83,7 @@ jobs: run: bun run typecheck web-security: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 defaults: run: working-directory: web diff --git a/README.md b/README.md index 0ceed04..0706114 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,16 @@

+## Star History + + + + + + Star history chart for raintree-technology/docpull + + + docpull is a web scraper for static and server-rendered sites, with documentation crawling as its sharpest default workflow. It uses async HTTP (not Playwright) to fetch pages, discover links, extract main content, and write clean diff --git a/audit/2026-06-10-eval-system-audit.md b/audit/2026-06-10-eval-system-audit.md new file mode 100644 index 0000000..bcecd7e --- /dev/null +++ b/audit/2026-06-10-eval-system-audit.md @@ -0,0 +1,103 @@ +# Eval/Benchmark System Audit — 2026-06-10 + +Scope: the docpull **eval/benchmark Python core** — +`src/docpull/benchmark.py`, `parallel_workflows.py`, `pack_tools.py`, +`source_scoring.py`, `metadata_extractor.py`. CI workflow, secret-store +helpers (`provider_keys.py`/`provider_cli.py`), and the web publication path +were explicitly out of scope for this pass. + +Method: multi-agent finder + adversarial-verifier sweep across 9 audit +dimensions (Sonnet), each candidate finding independently re-checked against +current source by a skeptic prompted to refute. 41 agents, 32 findings raised, +**27 confirmed / 5 refuted**. Opus triaged, deduped to 8 root causes, and +remediated. + +Threat model: a malicious/compromised provider API response (Tavily, Exa, +Parallel), a malicious recipe/fixture file, or untrusted fetched doc content +written into agent-consumed artifacts. docpull is a local-first CLI, so +"remote exploit" generally means "a shared/CI recipe or a misbehaving paid +provider," not a network-facing service. + +## Fixed + +### Security +1. **Recipe `output_dir` path traversal** (`parallel_workflows.py`) — a recipe + field could write pack files to any absolute path or `..`-escape the cwd + (two independent code paths: `_recipe_output_dir` and the inline + context-pack resolver). Added `_ensure_within_cwd` containment; both paths + now route through `_recipe_output_dir`. The CLI `--output-dir` override + stays trusted/exempt. +2. **Prompt-injection via provider Markdown** — provider-supplied `title`/`url` + were written verbatim into `AGENT_CONTEXT.md`, `sources.md`, and + `NEXT_STEPS.md` (LLM-consumed). Added shared `_md_link` / `_md_inline_text` + / `_md_safe_url` helpers (escape `[]` `` ` ``, strip CR/LF, http(s)-only + URLs) and applied them at every writer site in both files. +3. **`_http_json_post_once` hardening** (`benchmark.py`) — closes four findings + at once: (a) size-capped response read (`HTTP_MAX_RESPONSE_BYTES`, 16 MB) to + stop multi-GB OOM; (b) `_NoRedirectHandler` refuses 3xx on authenticated + POSTs, which previously forwarded `Authorization`/`x-api-key` across + redirects and followed https→http downgrades; (c) the same handler removes + the SSRF-via-redirect-to-internal-host vector. +4. **Cost-cap gaps** (`benchmark.py`) — the `--runs N` multiplier was missing + from the Parallel estimate (10× silent overspend), and Tavily/Exa bypassed + the guard entirely (it lived inside `if parallel:`). The guard now covers + all three providers, multiplies by `len(targets) * runs`, and reports a + per-provider breakdown on trip. +5. **FindAll poll-loop logic bug** (`parallel_workflows.py`) — on deadline + expiry the loop still called `.result()` against an active job, writing + partial data as success. Now raises `ParallelWorkflowError` on timeout, like + `_wait_for_taskgroup_completion`. + +### Eval integrity (published-number credibility) +6. **Freshness dimension** returned 100/100 (a free +15) for any target without + `freshness_terms`; now returns a neutral 65 with a visible signal. The eight + published targets all set terms, so reference numbers are unaffected — this + only de-inflates ad-hoc single-target runs. +7. **`_aggregate_runs` wall time** took the median over *all* runs including + fast failures (a broken case could report 0.1 s), contradicting its own + docstring; now medians over successful runs only. +8. **`source_scoring` substring false positives** — `"developer" in domain` and + `"/api" in path` rewarded `notadeveloper.com`, `/apiary/…`. Domain check is + now subdomain-anchored; path check matches a whole segment or a + `-`/`_`-prefixed one (so `/api-reference` and `/api/v2` still score, but + `/apiary` does not). Verified against all 819 source rows in `.bench/runs/`: + **0 change** — the only real effect is excluding genuine false positives, + which do not appear in the published corpus. (A first cut using + segment-*exact* matching wrongly dropped 163 `/api-reference` rows by −10; + that regression was caught by re-scoring the stored runs and corrected.) + +### Hygiene (low severity, defensive) +- Recipe size guard (`MAX_RECIPE_BYTES`, 1 MB) before `yaml.safe_load` + (billion-laughs). +- stdin API-key length cap (512 chars). +- `_redact_secret_like` strips token-shaped substrings from third-party error + bodies before they reach `benchmark.report.json` / Raindrop traces. +- Raindrop traces now send `output_dir.name` / artifact basenames instead of + absolute home paths. +- Removed the dead/misleading redaction branch in `_load_mcp_servers`. +- `_cap_fixture_content` bounds imported-fixture `full_content`/`excerpts` to + the live `DEFAULT_MAX_FULL_CONTENT_CHARS`. +- SSRF/artifact hygiene: `run_live_context_pack` now runs provider URLs through + `UrlValidator` (https-only) before extract, matching the extract-pack path. + +### Incidental +- Fixed a pre-existing mypy error in `_workload_disclosure_lines` (`med` typed + int then assigned `""`), introduced by commit `a2a8535`. +- Annotated the pre-existing B311 jitter finding in `_retry_delay_seconds` with + a policy-compliant `# nosec B311` (non-crypto retry backoff). Bandit was red + at HEAD on this; it is now green. + +## Refuted (verified false positives) +- `_resolve_recipe_path` arbitrary read — trust boundary is "ran an untrusted + file"; `url_file` content is https-validated, `diff` reads only a fixed name. +- `_safe_slug` — genuinely neutralizes path separators. +- benchmark argparse bare `type=int/float` — post-parse `_validate_positive_int` + already rejects zero/negative. +- "Unbounded Retry-After" — the cap is applied at parse time (`min(..., CAP)`). +- Provider text in the published article — the Targets section is built only + from hardcoded/user-controlled `_BenchmarkTarget` fields, not provider data. + +## Verification +`ruff check` ✅ · `ruff format` ✅ · `mypy src` ✅ · `pytest tests` ✅ 476 passed · +`bandit -c pyproject.toml -r src` ✅ exit 0 · `pip-audit` ✅ no known vulns. +Diff: 3 files, +201/−54. diff --git a/src/docpull/benchmark.py b/src/docpull/benchmark.py index f3bc7a6..720bc03 100644 --- a/src/docpull/benchmark.py +++ b/src/docpull/benchmark.py @@ -6,6 +6,7 @@ import asyncio import json import random +import re import resource import sys import time @@ -18,7 +19,7 @@ from typing import Any from urllib.error import HTTPError, URLError from urllib.parse import urlparse -from urllib.request import Request, urlopen +from urllib.request import HTTPRedirectHandler, Request, build_opener from rich.console import Console from rich.markup import escape @@ -34,6 +35,7 @@ DEFAULT_MODE, ParallelWorkflowError, _build_source_policy, + _md_link, _parallel_sdk_installed, estimate_context_pack_cost, estimate_search_pack_cost, @@ -76,6 +78,14 @@ } PASS_AT_K_THRESHOLDS: tuple[int, ...] = (70, 80, 90) TARGET_SET_CHOICES = ("single", "tool-docs", "provider-matrix", "v2") +HTTP_MAX_RESPONSE_BYTES = 16 * 1024 * 1024 +HTTP_MAX_ERROR_BYTES = 64 * 1024 +# Conservative per-call USD figures used ONLY for the pre-flight +# --max-estimated-cost guard on the live Tavily/Exa search providers. Real +# spend is reconciled from provider usage after the run; these are loose upper +# bounds chosen so the guard fails safe. They never feed published numbers. +APPROX_TAVILY_CREDIT_USD = 0.01 +APPROX_EXA_SEARCH_USD = 0.01 class BenchmarkError(RuntimeError): @@ -502,21 +512,32 @@ def run_quick_benchmark( estimated_search_cost = 0.0 estimated_context_cost = 0.0 + estimated_costs: dict[str, float] = {} if parallel: estimated_search_cost = estimate_search_pack_cost(max_search_results=max_search_results) estimated_context_cost = estimate_context_pack_cost( extract_limit=extract_limit, max_search_results=max_search_results, ) - estimated_total_cost = round( - (estimated_search_cost + estimated_context_cost) * len(targets), + estimated_costs["parallel"] = round( + (estimated_search_cost + estimated_context_cost) * len(targets) * runs, 6, ) - if estimated_total_cost > max_estimated_cost: - raise BenchmarkError( - "Estimated Parallel benchmark cost " - f"${estimated_total_cost:.6f} exceeds guard ${max_estimated_cost:.6f}." - ) + if "tavily" in providers: + credit_usd = tavily_credit_usd if tavily_credit_usd is not None else APPROX_TAVILY_CREDIT_USD + estimated_costs["tavily"] = round( + (1 + extract_limit) * credit_usd * len(targets) * runs, + 6, + ) + if "exa" in providers: + estimated_costs["exa"] = round(APPROX_EXA_SEARCH_USD * len(targets) * runs, 6) + estimated_total_cost = round(sum(estimated_costs.values()), 6) + if estimated_total_cost > max_estimated_cost: + breakdown = ", ".join(f"{name}=${value:.6f}" for name, value in estimated_costs.items()) + raise BenchmarkError( + f"Estimated live-provider benchmark cost ${estimated_total_cost:.6f} " + f"({breakdown}) exceeds guard ${max_estimated_cost:.6f}." + ) trace = _make_trace_recorder( trace_backend, @@ -1092,6 +1113,18 @@ def _benchmark_provider_statuses( return safe_statuses +def _path_basename(value: Any) -> str | None: + if not value: + return None + return Path(str(value)).name + + +def _basename_only(value: Any) -> Any: + if isinstance(value, dict): + return {key: _path_basename(item) for key, item in value.items()} + return value + + class _TraceRecorder: provider = "none" @@ -1148,7 +1181,7 @@ def __init__( properties={ "target_set": target_set, "target_count": len(targets), - "output_dir": str(output_dir), + "output_dir": output_dir.name, "parallel_enabled": parallel_enabled, "max_estimated_cost_usd": max_estimated_cost, "content_policy": "metadata_only", @@ -1158,7 +1191,7 @@ def __init__( "target_url": target_url, "target_set": target_set, "targets": [target.report_dict() for target in targets], - "output_dir": str(output_dir), + "output_dir": output_dir.name, "parallel_enabled": parallel_enabled, "max_estimated_cost_usd": max_estimated_cost, } @@ -1180,7 +1213,7 @@ def record_case(self, case: dict[str, Any]) -> None: "target": case.get("target"), "prompt": case.get("prompt"), "settings": case.get("settings"), - "output_dir": case.get("output_dir"), + "output_dir": _path_basename(case.get("output_dir")), }, output=_trace_case_output(case), duration_ms=int(float(case.get("wall_seconds") or 0.0) * 1000), @@ -1212,7 +1245,7 @@ def finish(self, report: dict[str, Any]) -> None: self._interaction.set_properties( { "summary": report.get("summary"), - "artifacts": report.get("artifacts"), + "artifacts": _basename_only(report.get("artifacts")), "signal_count": self._signal_count, "positive_signal_count": self._positive_signal_count, "negative_signal_count": self._negative_signal_count, @@ -1223,7 +1256,7 @@ def finish(self, report: dict[str, Any]) -> None: output=_json_trace_text( { "summary": report.get("summary"), - "artifacts": report.get("artifacts"), + "artifacts": _basename_only(report.get("artifacts")), "trace_signals": { "total": self._signal_count, "positive": self._positive_signal_count, @@ -2007,7 +2040,9 @@ def _aggregate_runs( The full per-run list is preserved under ``runs`` for raw inspection. """ successful = [run for run in runs if run.get("status") != "failed"] - wall_seconds_list = [float(run.get("wall_seconds") or 0.0) for run in runs] + # Headline wall time is the median across *successful* runs only; a run that + # aborts quickly on a network error must not pull the reported latency down. + wall_seconds_list = [float(run.get("wall_seconds") or 0.0) for run in (successful or runs)] estimated_costs = [float(run.get("estimated_cost_usd") or 0.0) for run in runs] artifact_sizes = [int(run.get("artifact_size_bytes") or 0) for run in runs] cache_sizes = [int(run.get("cache_size_bytes") or 0) for run in runs] @@ -2201,7 +2236,7 @@ def _source_fidelity_dimension( def _freshness_dimension(records: list[dict[str, Any]], target: _BenchmarkTarget | None) -> dict[str, Any]: if not target or not target.freshness_terms: - return _dimension(100, []) + return _dimension(65, ["freshness not evaluated - no freshness terms configured"]) haystack = "\n".join( " ".join( [ @@ -2426,7 +2461,7 @@ def _write_provider_sources_md( index = source.get("index") title = str(source.get("title") or source.get("url") or "Untitled") url = str(source.get("url") or "") - lines.append(f"{index}. [{title}]({url})") + lines.append(f"{index}. {_md_link(title, url)}") if source.get("source_type"): lines.append(f" - Source type: `{source['source_type']}`") lines.append(" - Records file: `documents.ndjson`") @@ -2467,6 +2502,20 @@ def _http_json_post( raise last_error +class _NoRedirectHandler(HTTPRedirectHandler): + """Refuse 3xx redirects on authenticated POSTs. + + urllib forwards ``Authorization`` / ``x-api-key`` across redirects (only + ``content-*`` headers are stripped) and will follow an https->http + downgrade, so a redirect from a provider endpoint would leak the API key + in cleartext and could be steered at an internal host. These endpoints + never legitimately redirect, so surface any 3xx as an error instead. + """ + + def redirect_request(self, req, fp, code, msg, headers, newurl): # type: ignore[no-untyped-def] + raise HTTPError(req.full_url, code, f"Refused redirect to {newurl!r}", headers, fp) + + def _http_json_post_once( *, label: str, @@ -2487,11 +2536,12 @@ def _http_json_post_once( }, method="POST", ) + opener = build_opener(_NoRedirectHandler()) try: - with urlopen(request, timeout=timeout) as response: # nosec B310 - raw = response.read().decode("utf-8") + with opener.open(request, timeout=timeout) as response: # nosec B310 + raw_bytes = response.read(HTTP_MAX_RESPONSE_BYTES + 1) except HTTPError as err: - detail = err.read().decode("utf-8", errors="replace") + detail = _redact_secret_like(err.read(HTTP_MAX_ERROR_BYTES).decode("utf-8", errors="replace")) message = f"{label} returned HTTP {err.code}: {_short_error_detail(detail)}" if err.code in HTTP_RETRY_TRANSIENT_STATUSES: transient = _TransientHTTPError(message, retry_after=_parse_retry_after(err)) @@ -2503,8 +2553,10 @@ def _http_json_post_once( transient = _TransientHTTPError(message, retry_after=None) transient.__cause__ = err raise transient from err + if len(raw_bytes) > HTTP_MAX_RESPONSE_BYTES: + raise BenchmarkError(f"{label} response exceeds {HTTP_MAX_RESPONSE_BYTES}-byte limit.") try: - parsed = json.loads(raw) + parsed = json.loads(raw_bytes.decode("utf-8")) except json.JSONDecodeError as err: raise BenchmarkError(f"{label} returned invalid JSON: {err}") from err if not isinstance(parsed, dict): @@ -2588,6 +2640,20 @@ def _relative_path(path: Path, base_dir: Path) -> str: return str(path) +_SECRET_LIKE_RE = re.compile( + r"(?i)(?:bearer\s+|x-api-key\s*[:=]\s*|api[-_]?key\s*[\"':=]\s*|tvly-|exa_|sk-)[A-Za-z0-9._\-]{6,}" +) + + +def _redact_secret_like(value: str) -> str: + """Strip token-shaped substrings out of a third-party error body. + + Provider error responses occasionally echo the submitted credential back; + this keeps such tokens out of ``benchmark.report.json`` and any trace upload. + """ + return _SECRET_LIKE_RE.sub("[redacted]", value) + + def _short_error_detail(value: str) -> str: compact = " ".join(value.split()) return compact[:500] @@ -3141,7 +3207,24 @@ def _article_markdown(report: dict[str, Any], *, title: str) -> str: f"{cost_text} |" ) if heatmap: - lines.extend(["", "## Provider x Target Heatmap", "", *heatmap]) + lines.extend( + [ + "", + "## Provider x Target Heatmap", + "", + ( + "Read across rows (one target, all providers), not down columns. " + "The five workflows are not equivalent jobs: the core crawl walks a " + "page graph from a known seed URL, while provider workflows run a " + "search query and optionally extract a small number of results. " + "A provider that returns zero search results for a lesser-known site " + "scores 0 — not because its extractor is weak, but because its index " + "doesn't cover that site. See Workload disclosure above." + ), + "", + *heatmap, + ] + ) lines.extend( [ "", diff --git a/src/docpull/parallel_workflows.py b/src/docpull/parallel_workflows.py index eee4829..9c89c5b 100644 --- a/src/docpull/parallel_workflows.py +++ b/src/docpull/parallel_workflows.py @@ -107,6 +107,7 @@ "lite": 0.003, "base": 0.010, } +MAX_RECIPE_BYTES = 1_000_000 class ParallelWorkflowError(RuntimeError): @@ -2202,6 +2203,9 @@ def run_live_context_pack( selected_urls = _select_urls(search_results, extract_limit) if not selected_urls: raise ParallelWorkflowError("Parallel Search returned no URLs to extract.") + selected_urls = _validated_https_urls(selected_urls) + if not selected_urls: + raise ParallelWorkflowError("Parallel Search returned no valid HTTPS URLs to extract.") extract_kwargs: dict[str, Any] = { "urls": selected_urls, @@ -2297,7 +2301,7 @@ def run_recipe( if not isinstance(queries, list) or not all(isinstance(item, str) and item for item in queries): raise ParallelWorkflowError("Recipe field 'queries' must be a list of non-empty strings.") - output_dir = output_dir_override or Path(str(recipe.get("output_dir") or DEFAULT_OUTPUT_DIR)) + output_dir = _recipe_output_dir(recipe, DEFAULT_OUTPUT_DIR, output_dir_override) source_policy_recipe = recipe.get("source_policy") or {} if not isinstance(source_policy_recipe, dict): raise ParallelWorkflowError("Recipe field 'source_policy' must be an object when present.") @@ -3088,10 +3092,14 @@ def run_findall_pack( raise ParallelWorkflowError("Parallel FindAll create response did not include a findall_id.") if wait and findall_id: deadline = time.monotonic() + timeout - while time.monotonic() < deadline: + while True: current = client.beta.findall.retrieve(findall_id) if not _status_is_active(_get(current, "status")): break + if time.monotonic() >= deadline: + raise ParallelWorkflowError( + f"Parallel FindAll {findall_id} did not complete within {timeout}s." + ) time.sleep(poll_interval) result = client.beta.findall.result(findall_id) candidates = [_jsonable(item) for item in _list(_get(result, "candidates"))] if result else [] @@ -4013,6 +4021,41 @@ def _write_parallel_pack( return path +def _md_inline_text(value: str) -> str: + """Neutralize Markdown control characters in provider-supplied inline text.""" + return ( + value.replace("\\", "\\\\") + .replace("`", "\\`") + .replace("[", "\\[") + .replace("]", "\\]") + .replace("\r", " ") + .replace("\n", " ") + ) + + +def _md_safe_url(value: str) -> str: + """Return an http(s) URL safe to embed in Markdown, or '' if not web-safe.""" + cleaned = value.strip().replace("\r", "").replace("\n", "") + if urlparse(cleaned).scheme not in {"http", "https"}: + return "" + return cleaned.replace(" ", "%20").replace("(", "%28").replace(")", "%29") + + +def _md_link(title: str, url: str) -> str: + """Render a provider title/URL as a Markdown link that cannot break out.""" + safe_title = _md_inline_text(title) + safe_url = _md_safe_url(url) + if not safe_url: + return f"{safe_title} (unverified URL)" + return f"[{safe_title}]({safe_url})" + + +def _validated_https_urls(urls: list[str]) -> list[str]: + """Drop any non-HTTPS / private-host URLs from a provider-supplied list.""" + validator = UrlValidator(allowed_schemes={"https"}) + return [url for url in urls if validator.validate(url).is_valid] + + def _write_agent_context_md( output_dir: Path, *, @@ -4091,7 +4134,7 @@ def _write_agent_context_md( source_path = _coerce_str(entry.get("path")) or "" index = entry.get("index") or "?" local = f" - `{source_path}`" if source_path else "" - lines.append(f"{index}. [{title}]({url}){local}") + lines.append(f"{index}. {_md_link(title, url)}{local}") source_scores = score_source_entries(entries, expected_domains=expected_domains) if source_scores: @@ -4101,7 +4144,7 @@ def _write_agent_context_md( path_text = f" - `{source['path']}`" if source.get("path") else "" lines.append( f"- {source['score']}/100 {source['grade']}: " - f"[{source['title']}]({source['url']}){path_text} ({reason_text})" + f"{_md_link(str(source['title']), str(source['url']))}{path_text} ({reason_text})" ) warning_lines = _agent_context_warning_lines(warnings or {}, errors or []) @@ -4136,7 +4179,7 @@ def _agent_context_warning_lines( for error in errors: url = _coerce_str(_get(error, "url")) or "unknown URL" error_type = _coerce_str(_get(error, "error_type")) or "extract_error" - lines.append(f"- Extract error `{error_type}`: {url}") + lines.append(f"- Extract error `{error_type}`: {_md_inline_text(url)}") return lines @@ -4164,14 +4207,14 @@ def _write_sources_md(output_dir: Path, pack: ParallelContextPack, entries: list "", ] for entry in entries: - lines.append(f"{entry['index']}. [{entry['title']}]({entry['url']})") + lines.append(f"{entry['index']}. {_md_link(str(entry['title']), str(entry['url']))}") lines.append(f" - Local: `{entry['path']}`") if pack.extract_errors: lines.extend(["", "## Extract Errors", ""]) for error in pack.extract_errors: url = _coerce_str(_get(error, "url")) or "unknown URL" error_type = _coerce_str(_get(error, "error_type")) or "extract_error" - lines.append(f"- `{error_type}`: {url}") + lines.append(f"- `{error_type}`: {_md_inline_text(url)}") path = output_dir / "sources.md" path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") return path @@ -4182,13 +4225,34 @@ def _write_generic_sources_md(output_dir: Path, objective: str, entries: list[di if not entries: lines.append("_No records were available when this pack was written._") for entry in entries: - lines.append(f"{entry['index']}. [{entry['title']}]({entry['url']})") + lines.append(f"{entry['index']}. {_md_link(str(entry['title']), str(entry['url']))}") lines.append(f" - Local: `{entry['path']}`") path = output_dir / "sources.md" path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") return path +def _cap_fixture_content(item: Any, max_chars: int = DEFAULT_MAX_FULL_CONTENT_CHARS) -> Any: + """Bound full_content/excerpts from an imported fixture to the live cap. + + The live Extract path truncates server-side; fixture import has no such + bound, so a crafted fixture could otherwise write an unbounded source file. + """ + data = _jsonable(item) + if not isinstance(data, dict): + return data + content = data.get("full_content") + if isinstance(content, str) and len(content) > max_chars: + data["full_content"] = content[:max_chars] + excerpts = data.get("excerpts") + if isinstance(excerpts, list): + data["excerpts"] = [ + value[:max_chars] if isinstance(value, str) and len(value) > max_chars else value + for value in excerpts + ] + return data + + def _pack_from_fixture(raw: Any) -> ParallelContextPack: if not isinstance(raw, dict): raise ParallelWorkflowError("Fixture must be a JSON object.") @@ -4222,7 +4286,7 @@ def _pack_from_fixture(raw: Any) -> ParallelContextPack: extract_id=_coerce_str(extract.get("extract_id")), task_run_id=_coerce_str(task.get("run_id") or task.get("task_run_id")), search_results=[_jsonable(item) for item in _list(search.get("results"))], - extract_results=[_jsonable(item) for item in extract_results], + extract_results=[_cap_fixture_content(item) for item in extract_results], extract_errors=[_jsonable(item) for item in extract_errors], task_brief=_coerce_str(task.get("brief") or task.get("content")), task_basis=_list(task.get("basis")), @@ -4238,6 +4302,8 @@ def _load_recipe(recipe_path: Path) -> dict[str, Any]: raw_text = recipe_path.read_text(encoding="utf-8") except OSError as err: raise ParallelWorkflowError(f"Could not read recipe {recipe_path}: {err}") from err + if len(raw_text) > MAX_RECIPE_BYTES: + raise ParallelWorkflowError(f"Recipe {recipe_path} exceeds the {MAX_RECIPE_BYTES}-byte limit.") try: if recipe_path.suffix.lower() in {".yaml", ".yml"}: import yaml @@ -4360,6 +4426,8 @@ def _read_parallel_api_key_for_init(*, from_stdin: bool) -> str: api_key = _clean_parallel_api_key(value) if not api_key: raise ParallelWorkflowError("Parallel API key cannot be empty.") + if len(api_key) > 512: + raise ParallelWorkflowError("Parallel API key is implausibly long (>512 characters).") return api_key @@ -4822,9 +4890,9 @@ def _discovery_next_steps_md( ) lines.extend( [ - f"### {title}", + f"### {_md_inline_text(title)}", "", - f"- URL: {url}", + f"- URL: {_md_inline_text(url)}", ( f"- Score: {_get(source_score, 'score', '?')}/100 " f"({_get(source_score, 'grade', 'unscored')})" @@ -5004,12 +5072,10 @@ def _load_mcp_servers(raw_servers: list[str]) -> list[dict[str, Any]]: value = _load_json_file(Path(source), "mcp_server") if not isinstance(value, dict): raise ParallelWorkflowError("MCP server entries must be JSON objects.") - redacted = _redact_sensitive_headers(value) + # Live auth headers must reach the Parallel API as-is. Any pack metadata + # derived from these kwargs is redacted at serialization time via + # _redact_sensitive_headers, so the raw value is only used for the call. servers.append(value) - if "headers" in redacted and redacted["headers"] != value.get("headers"): - # Keep this branch explicit to make the secret handling obvious; the - # returned request metadata is redacted separately. - pass return servers @@ -5121,12 +5187,34 @@ def _collect_iterable(iterable: Any, *, limit: int) -> list[dict[str, Any]]: return items +def _ensure_within_cwd(candidate: Path, *, field: str) -> Path: + """Confine a recipe-sourced path to the working tree. + + The CLI ``--output-dir`` override is trusted and exempt; a recipe field must + not redirect writes outside the current directory via an absolute path or + ``..`` traversal (e.g. into ~/.ssh or /etc). Returns the original candidate + on success so existing relative-path behavior is preserved. + """ + cwd = Path.cwd().resolve() + resolved = (cwd / candidate).resolve() + try: + resolved.relative_to(cwd) + except ValueError: + raise ParallelWorkflowError( + f"Recipe '{field}' must stay within the working directory " + f"({cwd}); use --output-dir for a path outside it." + ) from None + return candidate + + def _recipe_output_dir( recipe: dict[str, Any], default: Path, override: Path | None, ) -> Path: - return override or Path(str(recipe.get("output_dir") or default)) + if override is not None: + return override + return _ensure_within_cwd(Path(str(recipe.get("output_dir") or default)), field="output_dir") def _resolve_recipe_path(recipe_path: Path, value: Any) -> Path | None: diff --git a/src/docpull/source_scoring.py b/src/docpull/source_scoring.py index cae02bc..75352fb 100644 --- a/src/docpull/source_scoring.py +++ b/src/docpull/source_scoring.py @@ -33,11 +33,19 @@ def score_source( score -= 25 reasons.append("off_domain") - if domain.startswith("docs.") or ".docs." in domain or "developer" in domain: + if domain.startswith("docs.") or ".docs." in domain or domain.startswith(("developer.", "developers.")): score += 12 reasons.append("docs_domain") - if any(part in path for part in ("/docs", "/api", "/reference", "/developers")): + # Match a doc token as a whole path segment or a hyphen/underscore-prefixed + # one (so "/api-reference" and "/api/v2" score, but "/apiary" does not). + path_segments = [segment for segment in path.split("/") if segment] + doc_path_tokens = ("docs", "api", "reference", "developers") + if any( + segment == token or segment.startswith((f"{token}-", f"{token}_")) + for segment in path_segments + for token in doc_path_tokens + ): score += 10 reasons.append("docs_path") From ce27ae347dd7fd6c85a58e040ab882f8078b8a27 Mon Sep 17 00:00:00 2001 From: admin-raintree <277948009+admin-raintree@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:59:54 -0700 Subject: [PATCH 2/2] docs: show GitHub star count --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0706114..b7a01fd 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull) [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull) +[![GitHub stars](https://img.shields.io/github/stars/raintree-technology/docpull?style=social)](https://github.com/raintree-technology/docpull/stargazers) [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)