raintree-technology · admin-raintree · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -16,10 +16,14 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
 jobs:
   benchmark:
     if: github.event_name != 'schedule' || github.event.schedule == '17 4 * * *'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     timeout-minutes: 15
     steps:
       - name: Checkout
@@ -63,7 +67,7 @@ jobs:
 
   provider-matrix:
     if: github.event_name != 'schedule' || github.event.schedule == '31 5 * * 1'
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     timeout-minutes: 20
     steps:
       - name: Checkout

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
       matrix:
@@ -50,7 +50,7 @@ jobs:
           path: coverage.xml
 
   lint:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -77,7 +77,7 @@ jobs:
         run: pre-commit run --all-files --show-diff-on-failure
 
   typecheck:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -13,10 +13,14 @@ permissions:
   contents: read
   security-events: write
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   analyze:
     name: analyze (${{ matrix.language }})
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     strategy:
       fail-fast: false
       matrix:

diff --git a/.github/workflows/metrics.yml b/.github/workflows/metrics.yml
@@ -40,7 +40,7 @@ concurrency:
 
 jobs:
   update:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
 
@@ -50,18 +50,37 @@ jobs:
 
       - name: Refresh METRICS.md
         env:
-          # Prefer the PAT (has Administration: read for traffic endpoints).
-          # Fall back to the default token (still works for stars / issues /
-          # downloads — only the traffic section will be empty).
-          GH_TOKEN: ${{ secrets.METRICS_TOKEN || secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/update_metrics.py
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          METRICS_TOKEN: ${{ secrets.METRICS_TOKEN }}
+        run: |
+          if [ -n "${METRICS_TOKEN:-}" ] && GH_TOKEN="$METRICS_TOKEN" gh api "repos/${GITHUB_REPOSITORY}" >/dev/null 2>&1; then
+            export GH_TOKEN="$METRICS_TOKEN"
+          else
+            if [ -n "${METRICS_TOKEN:-}" ]; then
+              echo "::warning::METRICS_TOKEN is present but failed GitHub API validation; falling back to GITHUB_TOKEN."
+            fi
+            export GH_TOKEN="$GITHUB_TOKEN"
+          fi
+          python .github/scripts/update_metrics.py
 
       - name: Open metrics refresh PR
-        uses: peter-evans/create-pull-request@22a9089034f40e5a961c8808d113e2c98fb63676
+        id: metrics-pr
+        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1 # v8.1.1
+        continue-on-error: true
         with:
           add-paths: METRICS.md
           branch: automation/metrics-refresh
           delete-branch: true
           commit-message: "chore(metrics): refresh metrics"
           title: "chore(metrics): refresh metrics"
           body: "Automated METRICS.md refresh from the scheduled metrics workflow."
+
+      - name: Report PR creation limitation
+        if: steps.metrics-pr.outcome == 'failure'
+        run: |
+          echo "::warning::Metrics refreshed, but PR creation failed. Enable Actions-created pull requests in repository settings or create a PR from automation/metrics-refresh manually."
+          {
+            echo "### Metrics refresh"
+            echo
+            echo "METRICS.md was refreshed, but the pull request step failed. Enable Actions-created pull requests in repository settings or open a PR from automation/metrics-refresh manually."
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -20,9 +20,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: false
+
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     outputs:
       version: ${{ steps.meta.outputs.version }}
     steps:
@@ -81,7 +85,7 @@ jobs:
 
   publish:
     needs: build
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     environment:
       name: pypi
       url: https://pypi.org/project/docpull/${{ needs.build.outputs.version }}/

diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
@@ -9,9 +9,13 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   secret-scan:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout full history
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -28,7 +32,7 @@ jobs:
             detect --source=/repo --redact --no-banner
 
   python-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -53,7 +57,7 @@ jobs:
         run: PYTHONPATH=src pytest -q tests/test_security_hardening.py tests/test_discovery.py tests/test_integration.py
 
   mcp-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     defaults:
       run:
         working-directory: mcp
@@ -79,7 +83,7 @@ jobs:
         run: bun run typecheck
 
   web-security:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     defaults:
       run:
         working-directory: web

diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@
 [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
 [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
 [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
+[![GitHub stars](https://img.shields.io/github/stars/raintree-technology/docpull?style=social)](https://github.com/raintree-technology/docpull/stargazers)
 [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
 
 <p align="center">
@@ -13,6 +14,16 @@
   </a>
 </p>
 
+## Star History
+
+<a href="https://star-history.com/#raintree-technology/docpull&Date">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date&theme=dark" />
+    <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date" />
+    <img alt="Star history chart for raintree-technology/docpull" src="https://api.star-history.com/svg?repos=raintree-technology/docpull&type=Date" />
+  </picture>
+</a>
+
 docpull is a web scraper for static and server-rendered sites, with
 documentation crawling as its sharpest default workflow. It uses async HTTP (not
 Playwright) to fetch pages, discover links, extract main content, and write clean

diff --git a/audit/2026-06-10-eval-system-audit.md b/audit/2026-06-10-eval-system-audit.md
@@ -0,0 +1,103 @@
+# Eval/Benchmark System Audit — 2026-06-10
+
+Scope: the docpull **eval/benchmark Python core** —
+`src/docpull/benchmark.py`, `parallel_workflows.py`, `pack_tools.py`,
+`source_scoring.py`, `metadata_extractor.py`. CI workflow, secret-store
+helpers (`provider_keys.py`/`provider_cli.py`), and the web publication path
+were explicitly out of scope for this pass.
+
+Method: multi-agent finder + adversarial-verifier sweep across 9 audit
+dimensions (Sonnet), each candidate finding independently re-checked against
+current source by a skeptic prompted to refute. 41 agents, 32 findings raised,
+**27 confirmed / 5 refuted**. Opus triaged, deduped to 8 root causes, and
+remediated.
+
+Threat model: a malicious/compromised provider API response (Tavily, Exa,
+Parallel), a malicious recipe/fixture file, or untrusted fetched doc content
+written into agent-consumed artifacts. docpull is a local-first CLI, so
+"remote exploit" generally means "a shared/CI recipe or a misbehaving paid
+provider," not a network-facing service.
+
+## Fixed
+
+### Security
+1. **Recipe `output_dir` path traversal** (`parallel_workflows.py`) — a recipe
+   field could write pack files to any absolute path or `..`-escape the cwd
+   (two independent code paths: `_recipe_output_dir` and the inline
+   context-pack resolver). Added `_ensure_within_cwd` containment; both paths
+   now route through `_recipe_output_dir`. The CLI `--output-dir` override
+   stays trusted/exempt.
+2. **Prompt-injection via provider Markdown** — provider-supplied `title`/`url`
+   were written verbatim into `AGENT_CONTEXT.md`, `sources.md`, and
+   `NEXT_STEPS.md` (LLM-consumed). Added shared `_md_link` / `_md_inline_text`
+   / `_md_safe_url` helpers (escape `[]` `` ` ``, strip CR/LF, http(s)-only
+   URLs) and applied them at every writer site in both files.
+3. **`_http_json_post_once` hardening** (`benchmark.py`) — closes four findings
+   at once: (a) size-capped response read (`HTTP_MAX_RESPONSE_BYTES`, 16 MB) to
+   stop multi-GB OOM; (b) `_NoRedirectHandler` refuses 3xx on authenticated
+   POSTs, which previously forwarded `Authorization`/`x-api-key` across
+   redirects and (c) followed https→http downgrades; (d) the same handler
+   removes the SSRF-via-redirect-to-internal-host vector.
+4. **Cost-cap gaps** (`benchmark.py`) — the `--runs N` multiplier was missing
+   from the Parallel estimate (10× silent overspend), and Tavily/Exa bypassed
+   the guard entirely (it lived inside `if parallel:`). The guard now covers
+   all three providers, multiplies by `len(targets) * runs`, and reports a
+   per-provider breakdown on trip.
+5. **FindAll poll-loop logic bug** (`parallel_workflows.py`) — on deadline
+   expiry the loop still called `.result()` against an active job, writing
+   partial data as success. Now raises `ParallelWorkflowError` on timeout, like
+   `_wait_for_taskgroup_completion`.
+
+### Eval integrity (published-number credibility)
+6. **Freshness dimension** returned 100/100 (a free +15) for any target without
+   `freshness_terms`; now returns a neutral 65 with a visible signal. The eight
+   published targets all set terms, so reference numbers are unaffected — this
+   only de-inflates ad-hoc single-target runs.
+7. **`_aggregate_runs` wall time** took the median over *all* runs including
+   fast failures (a broken case could report 0.1 s), contradicting its own
+   docstring; it now takes the median over successful runs only.
+8. **`source_scoring` substring false positives** — `"developer" in domain` and
+   `"/api" in path` rewarded `notadeveloper.com`, `/apiary/…`. Domain check is
+   now subdomain-anchored; path check matches a whole segment or a
+   `-`/`_`-prefixed one (so `/api-reference` and `/api/v2` still score, but
+   `/apiary` does not). Verified against all 819 source rows in `.bench/runs/`:
+   **0 change** — the only real effect is excluding genuine false positives,
+   which do not appear in the published corpus. (A first cut using
+   segment-*exact* matching wrongly lowered 163 `/api-reference` rows by 10 points;
+   that regression was caught by re-scoring the stored runs and corrected.)
+
+### Hygiene (low severity, defensive)
+- Recipe size guard (`MAX_RECIPE_BYTES`, 1 MB) before `yaml.safe_load`
+  (billion-laughs).
+- stdin API-key length cap (512 chars).
+- `_redact_secret_like` strips token-shaped substrings from third-party error
+  bodies before they reach `benchmark.report.json` / Raindrop traces.
+- Raindrop traces now send `output_dir.name` / artifact basenames instead of
+  absolute home paths.
+- Removed the dead/misleading redaction branch in `_load_mcp_servers`.
+- `_cap_fixture_content` bounds imported-fixture `full_content`/`excerpts` to
+  the live `DEFAULT_MAX_FULL_CONTENT_CHARS`.
+- SSRF/artifact hygiene: `run_live_context_pack` now runs provider URLs through
+  `UrlValidator` (https-only) before extract, matching the extract-pack path.
+
+### Incidental
+- Fixed a pre-existing mypy error in `_workload_disclosure_lines` (`med` typed
+  int then assigned `""`), introduced by commit `a2a8535`.
+- Annotated the pre-existing B311 jitter finding in `_retry_delay_seconds` with
+  a policy-compliant `# nosec B311` (non-crypto retry backoff). Bandit was red
+  at HEAD on this; it is now green.
+
+## Refuted (verified false positives)
+- `_resolve_recipe_path` arbitrary read — trust boundary is "ran an untrusted
+  file"; `url_file` content is https-validated, `diff` reads only a fixed name.
+- `_safe_slug` — genuinely neutralizes path separators.
+- benchmark argparse bare `type=int/float` — post-parse `_validate_positive_int`
+  already rejects zero/negative.
+- "Unbounded Retry-After" — the cap is applied at parse time (`min(..., CAP)`).
+- Provider text in the published article — the Targets section is built only
+  from hardcoded/user-controlled `_BenchmarkTarget` fields, not provider data.
+
+## Verification
+`ruff check` ✅ · `ruff format` ✅ · `mypy src` ✅ · `pytest tests` ✅ 476 passed ·
+`bandit -c pyproject.toml -r src` ✅ exit 0 · `pip-audit` ✅ no known vulns.
+Diff: 3 files, +201/−54.