From 834c7d3b2dd441f2894f946308f4faa719eb5aac Mon Sep 17 00:00:00 2001 From: admin-raintree <277948009+admin-raintree@users.noreply.github.com> Date: Fri, 5 Jun 2026 21:54:02 -0700 Subject: [PATCH 1/2] Prepare docpull release updates --- .claude-plugin/marketplace.json | 11 +- .cursor/mcp.json | 9 + .cursor/rules/docpull-research.mdc | 25 ++ .editorconfig | 6 +- .gitattributes | 23 +- .github/CODEOWNERS | 2 +- .github/CONTRIBUTING.md | 9 +- .../actions/setup-python-docpull/action.yml | 35 +++ .github/labeler.yml | 20 +- .github/scripts/update_metrics.py | 206 ++++++++---- .github/workflows/benchmark.yml | 29 +- .github/workflows/ci.yml | 51 +-- .github/workflows/publish.yml | 26 +- .github/workflows/security.yml | 18 +- .gitignore | 34 ++ .mcp.json | 8 + .pre-commit-config.yaml | 2 + .vercel/project.json | 15 - AGENTS.md | 67 ++++ CLAUDE.md | 5 + Makefile | 65 +++- README.md | 133 ++++++-- docs/README.md | 65 ++++ docs/examples/README.md | 84 +++-- docs/examples/deduplication-strategies.yaml | 54 ++-- docs/examples/format-conversion.yaml | 62 ++-- docs/examples/incremental-updates.yaml | 37 +-- docs/examples/multi-source-optimized.yaml | 76 +++-- docs/examples/selective-crawling.yaml | 44 ++- docs/examples/simple-optimization.yaml | 27 +- docs/mcp-pgvector-setup.md | 78 +++++ mcp/.env.example | 8 + mcp/.gitignore | 4 - mcp/README.md | 52 +++- mcp/migrations/001_harden_embeddings.down.sql | 7 +- mcp/migrations/001_harden_embeddings.up.sql | 7 +- mcp/package.json | 5 + mcp/schema.sql | 3 +- mcp/src/db.test.ts | 20 +- mcp/src/db.ts | 86 +++-- mcp/src/embeddings.test.ts | 29 ++ mcp/src/embeddings.ts | 68 ++-- mcp/src/env.test.ts | 24 ++ mcp/src/env.ts | 2 +- mcp/src/ingest.test.ts | 18 ++ mcp/src/ingest.ts | 20 +- mcp/src/migrate.test.ts | 170 ++++++++++ mcp/src/migrate.ts | 269 ++++++++++++++++ mcp/src/server.ts | 293 +++++++++--------- plugin/.claude-plugin/plugin.json | 10 +- plugin/.codex-plugin/plugin.json | 6 + plugin/README.md | 86 ++++- 
plugin/commands/docs-add.md | 54 ---- plugin/commands/docs-list.md | 23 -- plugin/commands/docs-refresh.md | 31 -- plugin/commands/docs-remove.md | 42 --- plugin/commands/docs-search.md | 44 --- plugin/skills/docpull-research/SKILL.md | 33 +- pyproject.toml | 4 +- scripts/sync_agent_host_configs.py | 71 +++++ scripts/sync_claude_plugin.py | 124 ++++++++ src/docpull/__init__.py | 4 +- src/docpull/cache/manager.py | 155 +++++++-- src/docpull/cli.py | 91 +++--- src/docpull/conversion/chunking.py | 35 ++- src/docpull/conversion/special_cases.py | 11 +- src/docpull/core/__init__.py | 4 +- src/docpull/core/fetcher.py | 256 +++++++++++---- src/docpull/discovery/crawler.py | 34 +- .../discovery/link_extractors/static.py | 6 + src/docpull/discovery/sitemap.py | 11 +- src/docpull/http/client.py | 261 +++++++++------- src/docpull/http/protocols.py | 2 + src/docpull/http/rate_limiter.py | 74 ++++- src/docpull/mcp/prompts.py | 163 ++++++++++ src/docpull/mcp/server.py | 90 +++++- src/docpull/mcp/sources.py | 21 +- src/docpull/mcp/tools.py | 108 +++++-- src/docpull/metadata_extractor.py | 51 ++- src/docpull/models/__init__.py | 2 + src/docpull/models/config.py | 60 +++- src/docpull/models/events.py | 8 + src/docpull/models/profiles.py | 3 +- src/docpull/models/run.py | 67 ++++ src/docpull/pipeline/base.py | 78 ++++- src/docpull/pipeline/steps/convert.py | 4 +- src/docpull/pipeline/steps/dedup.py | 12 +- src/docpull/pipeline/steps/fetch.py | 20 +- src/docpull/pipeline/steps/save.py | 33 +- src/docpull/pipeline/steps/save_json.py | 37 ++- src/docpull/pipeline/steps/save_ndjson.py | 1 + src/docpull/pipeline/steps/save_sqlite.py | 57 ++-- src/docpull/pipeline/steps/validate.py | 19 +- src/docpull/security/robots.py | 50 +-- src/docpull/security/url_validator.py | 38 ++- tests/benchmarks/test_10k_pages.py | 21 +- tests/benchmarks/test_performance.py | 20 +- tests/conftest.py | 6 +- tests/test_cache_conditional_get.py | 41 ++- tests/test_cache_manager.py | 144 +++++++++ 
tests/test_chunking.py | 19 ++ tests/test_ci_policy.py | 49 +++ tests/test_claude_plugin_bundle.py | 150 +++++++++ tests/test_cli.py | 31 ++ tests/test_discovery.py | 103 ++++++ tests/test_integration.py | 267 ++++++++++++++++ tests/test_link_extractors.py | 7 + tests/test_mcp_server.py | 25 ++ tests/test_mcp_tools.py | 159 +++++++++- tests/test_metadata_extractor.py | 65 ++++ tests/test_naming.py | 16 + tests/test_pipeline.py | 96 +++++- tests/test_save_ndjson.py | 48 +++ tests/test_security_hardening.py | 232 ++++++++++++++ tests/test_special_cases.py | 37 +++ tests/test_update_metrics_script.py | 98 ++++++ web/.gitignore | 37 --- web/README.md | 4 +- .../.well-known/agent-skills.json/route.ts | 33 ++ web/app/.well-known/security.txt/route.ts | 19 ++ .../agent-skills/docpull-research.md/route.ts | 56 ++++ web/app/globals.css | 140 ++++++++- web/app/layout.tsx | 47 +-- web/app/llms-full.txt/route.ts | 124 ++++++++ web/app/llms.txt/route.ts | 62 ++++ web/app/not-found.tsx | 6 +- web/app/page.tsx | 4 +- web/app/robots.ts | 6 +- web/app/rss.xml/route.ts | 34 ++ web/app/sitemap.ts | 28 +- web/components/AsciiBackground.tsx | 43 ++- web/components/CodeExamples.tsx | 126 ++------ web/components/FAQ.tsx | 26 +- web/components/Features.tsx | 118 ++++--- web/components/Footer.tsx | 221 ++++++------- web/components/Header.tsx | 49 ++- web/components/Hero.tsx | 274 +++++++++------- web/components/HostBadge.tsx | 96 ++++++ web/components/HowItWorks.tsx | 42 +-- web/components/Install.tsx | 73 ++--- web/components/McpSetup.tsx | 174 +++++++++++ web/components/Profiles.tsx | 121 +++++--- web/components/StructuredData.tsx | 26 +- web/components/ThemeProvider.tsx | 8 +- web/components/ThemeToggle.tsx | 1 + web/components/faq-content.tsx | 186 ----------- web/lib/content/faqs.tsx | 175 +++++++++++ web/lib/content/home.ts | 176 +++++++++++ web/lib/content/install.ts | 53 ++++ web/lib/hooks/use-copy-to-clipboard.ts | 44 +++ web/lib/site.ts | 30 ++ web/lib/use-reduced-motion.ts | 28 
+- web/lib/utils.ts | 9 + web/next.config.ts | 24 ++ web/package.json | 5 +- web/public/brands/anthropic-dark.svg | 86 +++++ web/public/brands/anthropic-light.svg | 86 +++++ web/public/brands/anthropic-symbol-dark.svg | 8 + web/public/brands/anthropic-symbol-light.svg | 8 + web/public/brands/cursor-dark.svg | 1 + web/public/brands/cursor-light.svg | 1 + web/public/brands/cursor-symbol.svg | 12 + web/public/brands/openai-dark.svg | 1 + web/public/brands/openai-light.svg | 1 + web/public/brands/openai-symbol-dark.svg | 15 + web/public/brands/openai-symbol-light.svg | 15 + web/public/og-image.png | Bin 602053 -> 67915 bytes web/tsconfig.json | 1 + 168 files changed, 7281 insertions(+), 2121 deletions(-) create mode 100644 .cursor/mcp.json create mode 100644 .cursor/rules/docpull-research.mdc create mode 100644 .github/actions/setup-python-docpull/action.yml create mode 100644 .mcp.json delete mode 100644 .vercel/project.json create mode 100644 AGENTS.md create mode 100644 CLAUDE.md create mode 100644 docs/README.md create mode 100644 docs/mcp-pgvector-setup.md delete mode 100644 mcp/.gitignore create mode 100644 mcp/src/embeddings.test.ts create mode 100644 mcp/src/env.test.ts create mode 100644 mcp/src/ingest.test.ts create mode 100644 mcp/src/migrate.test.ts create mode 100644 mcp/src/migrate.ts create mode 100644 plugin/.codex-plugin/plugin.json delete mode 100644 plugin/commands/docs-add.md delete mode 100644 plugin/commands/docs-list.md delete mode 100644 plugin/commands/docs-refresh.md delete mode 100644 plugin/commands/docs-remove.md delete mode 100644 plugin/commands/docs-search.md create mode 100644 scripts/sync_agent_host_configs.py create mode 100644 scripts/sync_claude_plugin.py create mode 100644 src/docpull/mcp/prompts.py create mode 100644 src/docpull/models/run.py create mode 100644 tests/test_cache_manager.py create mode 100644 tests/test_claude_plugin_bundle.py create mode 100644 tests/test_metadata_extractor.py create mode 100644 
tests/test_update_metrics_script.py delete mode 100644 web/.gitignore create mode 100644 web/app/.well-known/agent-skills.json/route.ts create mode 100644 web/app/.well-known/security.txt/route.ts create mode 100644 web/app/agent-skills/docpull-research.md/route.ts create mode 100644 web/app/llms-full.txt/route.ts create mode 100644 web/app/llms.txt/route.ts create mode 100644 web/app/rss.xml/route.ts create mode 100644 web/components/HostBadge.tsx create mode 100644 web/components/McpSetup.tsx delete mode 100644 web/components/faq-content.tsx create mode 100644 web/lib/content/faqs.tsx create mode 100644 web/lib/content/home.ts create mode 100644 web/lib/content/install.ts create mode 100644 web/lib/hooks/use-copy-to-clipboard.ts create mode 100644 web/lib/site.ts create mode 100644 web/public/brands/anthropic-dark.svg create mode 100644 web/public/brands/anthropic-light.svg create mode 100644 web/public/brands/anthropic-symbol-dark.svg create mode 100644 web/public/brands/anthropic-symbol-light.svg create mode 100644 web/public/brands/cursor-dark.svg create mode 100644 web/public/brands/cursor-light.svg create mode 100644 web/public/brands/cursor-symbol.svg create mode 100644 web/public/brands/openai-dark.svg create mode 100644 web/public/brands/openai-light.svg create mode 100644 web/public/brands/openai-symbol-dark.svg create mode 100644 web/public/brands/openai-symbol-light.svg diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 0faf922..ca450bc 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -3,14 +3,14 @@ "owner": { "name": "Raintree Technology", "email": "support@raintree.technology", - "url": "https://raintree.technology" + "url": "https://github.com/raintree-technology/docpull" }, "plugins": [ { "name": "docpull", "source": "./plugin", - "description": "Pull docs from any URL into Claude Code. 
Local, fast, no API keys.", - "version": "0.2.0", + "description": "Pull server-rendered web content from any URL into Claude Code. Local, fast, no API keys.", + "version": "4.0.0", "author": { "name": "Raintree Technology", "email": "support@raintree.technology" @@ -20,11 +20,10 @@ "license": "MIT", "category": "documentation", "keywords": [ - "documentation", - "docs", + "web", + "crawler", "fetch", "markdown", - "rag", "indexing", "mcp", "local-first" diff --git a/.cursor/mcp.json b/.cursor/mcp.json new file mode 100644 index 0000000..905d636 --- /dev/null +++ b/.cursor/mcp.json @@ -0,0 +1,9 @@ +{ + "mcpServers": { + "docpull": { + "type": "stdio", + "command": "docpull", + "args": ["mcp"] + } + } +} diff --git a/.cursor/rules/docpull-research.mdc b/.cursor/rules/docpull-research.mdc new file mode 100644 index 0000000..5d476cc --- /dev/null +++ b/.cursor/rules/docpull-research.mdc @@ -0,0 +1,25 @@ +--- +description: Use docpull MCP tools to ground library, framework, SDK, API, docs-backed tooling, or pasted-docs questions in fetched documentation. +alwaysApply: false +--- + +# docpull research + +Use docpull MCP tools when the user asks about a specific library, framework, SDK, API surface, docs-backed tool ecosystem, version-sensitive behavior, or pasted documentation URL. + +## Workflow + +1. Check cached sources with `mcp__docpull__list_indexed`. +2. If the requested library is cached, search it with `mcp__docpull__grep_docs`. +3. Use `mcp__docpull__read_doc` for line-level follow-up context. +4. If the library is not cached: + - use `mcp__docpull__ensure_docs` for a built-in alias + - use `mcp__docpull__fetch_url` for one pasted page + - otherwise ask once for the docs URL +5. Answer with attribution to the fetched source. + +For Vercel skills, `skills.sh`, `npx skills`, agent skill installation, or `SKILL.md` questions, treat the docs as version-sensitive. 
Search cached docs first for exact commands and flags such as `skills add`, `--agent`, `--skill`, `--copy`, `--yes`, `skills use`, `skills list`, `skills find`, `skills update`, and `skills remove`. If no cached source exists, use a pasted skills.sh URL with `mcp__docpull__fetch_url`; otherwise prefer `https://www.skills.sh/docs` for quick one-page answers and the official Vercel Labs Skills README for CLI option details. + +Do not use docpull for general programming explanations, the user's own codebase, or stable standard-library APIs. + +Built-in aliases include `react`, `nextjs`, `tailwindcss`, `vite`, `hono`, `fastapi`, `express`, `anthropic`, `openai`, `langchain`, `supabase`, `drizzle`, and `prisma`. diff --git a/.editorconfig b/.editorconfig index 2680b6b..5f66d1f 100644 --- a/.editorconfig +++ b/.editorconfig @@ -12,7 +12,6 @@ trim_trailing_whitespace = true [*.{py,pyi}] indent_style = space indent_size = 4 -max_line_length = 110 [*.{yml,yaml}] indent_style = space @@ -22,9 +21,12 @@ indent_size = 2 indent_style = space indent_size = 2 +[*.{js,cjs,mjs,ts,tsx}] +indent_style = space +indent_size = 2 + [*.md] trim_trailing_whitespace = false -max_line_length = off [Makefile] indent_style = tab diff --git a/.gitattributes b/.gitattributes index 8b2dcc6..b437b36 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,24 +1,6 @@ # Auto detect text files and normalize line endings to LF * text=auto eol=lf -# Source files -*.py text eol=lf -*.pyi text eol=lf - -# Configuration files -*.toml text eol=lf -*.yaml text eol=lf -*.yml text eol=lf -*.json text eol=lf -*.txt text eol=lf -*.md text eol=lf -*.rst text eol=lf - -# Scripts -*.sh text eol=lf -*.bash text eol=lf -Makefile text eol=lf - # Windows-specific files *.bat text eol=crlf *.cmd text eol=crlf @@ -42,14 +24,15 @@ Makefile text eol=lf *.gz binary *.bz2 binary -# Exclude from exports (for git archive / PyPI sdist) +# Exclude from git archives .gitattributes export-ignore .gitignore export-ignore 
+.claude-plugin/ export-ignore .github/ export-ignore .pre-commit-config.yaml export-ignore .editorconfig export-ignore Makefile export-ignore -CONTRIBUTING.md export-ignore +.github/CONTRIBUTING.md export-ignore tests/ export-ignore test*.py export-ignore .venv/ export-ignore diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b3069bd..36d79b6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,7 +11,7 @@ /src/docpull/ @raintree-technology /mcp/ @raintree-technology /web/ @raintree-technology -/security/ @raintree-technology +/src/docpull/security/ @raintree-technology # Documentation *.md @raintree-technology diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index dd9ab04..5f83b9d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -6,7 +6,7 @@ git clone https://github.com/YOUR_USERNAME/docpull.git cd docpull python -m venv .venv && source .venv/bin/activate -pip install -e ".[dev]" +pip install -e ".[all,dev]" pre-commit install ``` @@ -15,7 +15,9 @@ pre-commit install ```bash git checkout -b feature/your-feature # Make changes, add tests -make test && make lint +make pre-commit-check +make typecheck +make test git commit -m "feat: description" # Use conventional commits git push origin feature/your-feature # Open PR on GitHub @@ -25,7 +27,8 @@ git push origin feature/your-feature - Type hints required - Tests required for new features -- Pre-commit hooks enforce formatting (Black, Ruff) +- Pre-commit hooks enforce formatting and linting (Ruff) plus type checks (mypy) +- If you touch `web/` or `mcp/`, run that workspace's own install/test/build commands too ## Commit Types diff --git a/.github/actions/setup-python-docpull/action.yml b/.github/actions/setup-python-docpull/action.yml new file mode 100644 index 0000000..a30e5e4 --- /dev/null +++ b/.github/actions/setup-python-docpull/action.yml @@ -0,0 +1,35 @@ +name: Setup Python for docpull +description: Set up Python, optionally enable pip cache, and install 
docpull extras. +inputs: + python-version: + description: Python version to install. + required: true + dependency-groups: + description: Extras expression to install, e.g. all,dev or dev. + required: true + enable-cache: + description: Whether to enable actions/setup-python pip caching. + required: false + default: "false" +runs: + using: composite + steps: + - name: Set up Python + if: inputs.enable-cache != 'true' + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + with: + python-version: ${{ inputs.python-version }} + + - name: Set up Python with pip cache + if: inputs.enable-cache == 'true' + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + with: + python-version: ${{ inputs.python-version }} + cache: pip + cache-dependency-path: pyproject.toml + + - name: Install Python dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install -e ".[${{ inputs.dependency-groups }}]" diff --git a/.github/labeler.yml b/.github/labeler.yml index 0ba9a21..1e9a86f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -10,15 +10,25 @@ tests: dependencies: - 'pyproject.toml' - - 'requirements.txt' + - 'requirements*.txt' + - 'web/package.json' + - 'web/package-lock.json' + - 'mcp/package.json' + - 'mcp/bun.lock' ci-cd: - '.github/workflows/**/*' - - '.github/**/*' + - '.github/actions/**/*' fetchers: - - 'docpull/fetchers/**/*' + - 'src/docpull/core/fetcher.py' + - 'src/docpull/discovery/_fetch.py' + - 'src/docpull/http/**/*' + - 'src/docpull/pipeline/steps/fetch.py' core: - - 'docpull/**/*.py' - - '!docpull/fetchers/**/*' + - 'src/docpull/**/*.py' + - '!src/docpull/core/fetcher.py' + - '!src/docpull/discovery/_fetch.py' + - '!src/docpull/http/**/*' + - '!src/docpull/pipeline/steps/fetch.py' diff --git a/.github/scripts/update_metrics.py b/.github/scripts/update_metrics.py index 08d7052..087d4c8 100644 --- a/.github/scripts/update_metrics.py +++ b/.github/scripts/update_metrics.py @@ -25,6 +25,7 @@ REPO = 
os.environ.get("GITHUB_REPOSITORY", "raintree-technology/docpull") PKG = os.environ.get("PYPI_PACKAGE", "docpull") OUTPUT = Path(os.environ.get("METRICS_OUTPUT", "METRICS.md")) +BLANK = "" def gh(path: str) -> dict | list: @@ -71,6 +72,101 @@ def fmt(n: int | float) -> str: return f"{int(n):,}" +def append_table(lines: list[str], headers: list[str], rows: list[list[str]]) -> None: + """Append a markdown table to ``lines``.""" + lines.append("| " + " | ".join(headers) + " |") + lines.append("|" + "|".join(["---"] * len(headers)) + "|") + for row in rows: + lines.append("| " + " | ".join(row) + " |") + + +def append_section_with_table( + lines: list[str], + title: str, + headers: list[str], + rows: list[list[str]], + *, + empty_message: str, +) -> None: + """Append a heading followed by either a table or an empty-state message.""" + lines.extend([title, BLANK]) + if rows: + append_table(lines, headers, rows) + else: + lines.append(empty_message) + lines.append(BLANK) + + +def append_table_or_empty( + lines: list[str], + headers: list[str], + rows: list[list[str]], + *, + empty_message: str, +) -> None: + """Append either a table or an empty-state message to the current section.""" + if rows: + append_table(lines, headers, rows) + else: + lines.append(empty_message) + lines.append(BLANK) + + +def trim_repo_path(path: str) -> str: + """Trim the repo prefix from GitHub traffic paths for readability.""" + return path.replace(f"/{REPO}", "") or "/" + + +def build_snapshot_rows( + *, + recent: dict, + repo: dict, + open_issues: int, + open_prs: int, + clones: dict, + views: dict, +) -> list[list[str]]: + """Return the summary table rows for the Snapshot section.""" + return [ + ["PyPI downloads (last 24h)", fmt(recent.get("last_day", 0))], + ["PyPI downloads (last 7d)", fmt(recent.get("last_week", 0))], + ["PyPI downloads (last 30d)", fmt(recent.get("last_month", 0))], + ["GitHub stars", fmt(repo.get("stargazers_count", 0))], + ["GitHub forks", fmt(repo.get("forks_count", 
0))], + ["GitHub watchers", fmt(repo.get("subscribers_count", 0))], + ["Open issues", fmt(open_issues)], + ["Open PRs", fmt(open_prs)], + ["Repo clones (last 14d)", fmt(clones.get("count", 0))], + ["Unique cloners (last 14d)", fmt(clones.get("uniques", 0))], + ["Repo views (last 14d)", fmt(views.get("count", 0))], + ["Unique visitors (last 14d)", fmt(views.get("uniques", 0))], + ] + + +def build_clone_rows(clones: dict) -> list[list[str]]: + """Return daily clone rows, newest first.""" + return [ + [row.get("timestamp", "")[:10], fmt(row.get("count", 0)), fmt(row.get("uniques", 0))] + for row in reversed(clones.get("clones", [])) + ] + + +def build_referrer_rows(referrers: list[dict]) -> list[list[str]]: + """Return the top referrer rows.""" + return [ + [ref.get("referrer", "?"), fmt(ref.get("count", 0)), fmt(ref.get("uniques", 0))] + for ref in referrers[:10] + ] + + +def build_path_rows(paths: list[dict]) -> list[list[str]]: + """Return the top path rows.""" + return [ + [f"`{trim_repo_path(path.get('path', '?'))}`", fmt(path.get("count", 0)), fmt(path.get("uniques", 0))] + for path in paths[:10] + ] + + def safe_get(fn, default, *, on_error: list[str] | None = None): """Best-effort wrapper — never let a transient API hiccup blank METRICS.md. @@ -123,25 +219,17 @@ def main() -> int: recent = safe_get(lambda: pypistats("/recent")["data"], {}, on_error=pypi_errors) pypi_blocked = bool(pypi_errors) - stars = repo.get("stargazers_count", 0) - forks = repo.get("forks_count", 0) - watchers = repo.get("subscribers_count", 0) - - last_day = recent.get("last_day", 0) - last_week = recent.get("last_week", 0) - last_month = recent.get("last_month", 0) - lines: list[str] = [] push = lines.append push(f"# {PKG} metrics") - push("") + push(BLANK) push( f"_Last updated: {timestamp}. 
Auto-generated by `.github/workflows/metrics.yml`; " "do not edit by hand._" ) - push("") + push(BLANK) push("## Snapshot") - push("") + push(BLANK) if pypi_blocked: push( "> **PyPI download counts unavailable this run.** pypistats.org " @@ -149,29 +237,27 @@ def main() -> int: "IPs). Showing the last successful values would be misleading; " "the next run should recover automatically." ) - push("") - push("| Metric | Value |") - push("|---|---|") - push(f"| PyPI downloads (last 24h) | {fmt(last_day)} |") - push(f"| PyPI downloads (last 7d) | {fmt(last_week)} |") - push(f"| PyPI downloads (last 30d) | {fmt(last_month)} |") - push(f"| GitHub stars | {fmt(stars)} |") - push(f"| GitHub forks | {fmt(forks)} |") - push(f"| GitHub watchers | {fmt(watchers)} |") - push(f"| Open issues | {fmt(open_issues)} |") - push(f"| Open PRs | {fmt(open_prs)} |") - push(f"| Repo clones (last 14d) | {fmt(clones.get('count', 0))} |") - push(f"| Unique cloners (last 14d) | {fmt(clones.get('uniques', 0))} |") - push(f"| Repo views (last 14d) | {fmt(views.get('count', 0))} |") - push(f"| Unique visitors (last 14d) | {fmt(views.get('uniques', 0))} |") - push("") + push(BLANK) + append_table( + lines, + ["Metric", "Value"], + build_snapshot_rows( + recent=recent, + repo=repo, + open_issues=open_issues, + open_prs=open_prs, + clones=clones, + views=views, + ), + ) + push(BLANK) push("## Plugin install proxy: daily clones (last 14d)") - push("") + push(BLANK) push( f"`/plugin marketplace add {REPO}` is a git clone " "under the hood, so daily clone counts approximate plugin installs." ) - push("") + push(BLANK) if traffic_blocked: push( "> **Traffic data unavailable.** The workflow's token is missing " @@ -182,51 +268,35 @@ def main() -> int: "[`.github/workflows/metrics.yml`](.github/workflows/metrics.yml) " "for full setup." 
) - push("") - daily = clones.get("clones", []) - if daily: - push("| Date | Clones | Unique cloners |") - push("|---|---|---|") - # GitHub returns oldest-first; show newest-first for readability. - for row in reversed(daily): - ts = row.get("timestamp", "")[:10] - push(f"| {ts} | {fmt(row.get('count', 0))} | {fmt(row.get('uniques', 0))} |") - else: - push("_No clones recorded in the last 14 days._") - push("") - push("## Top referrers (last 14d)") - push("") - if referrers: - push("| Source | Views | Unique |") - push("|---|---|---|") - for ref in referrers[:10]: - push( - f"| {ref.get('referrer', '?')} | {fmt(ref.get('count', 0))} | {fmt(ref.get('uniques', 0))} |" - ) - else: - push("_No referrers recorded in the last 14 days._") - push("") - push("## Top paths (last 14d)") - push("") - if paths: - push("| Path | Views | Unique |") - push("|---|---|---|") - for p in paths[:10]: - label = p.get("path", "?") - # Paths are full URLs; trim the repo prefix for readability. - label = label.replace(f"/{REPO}", "") or "/" - push(f"| `{label}` | {fmt(p.get('count', 0))} | {fmt(p.get('uniques', 0))} |") - else: - push("_No path traffic recorded in the last 14 days._") - push("") + push(BLANK) + append_table_or_empty( + lines, + ["Date", "Clones", "Unique cloners"], + build_clone_rows(clones), + empty_message="_No clones recorded in the last 14 days._", + ) + append_section_with_table( + lines, + "## Top referrers (last 14d)", + ["Source", "Views", "Unique"], + build_referrer_rows(referrers), + empty_message="_No referrers recorded in the last 14 days._", + ) + append_section_with_table( + lines, + "## Top paths (last 14d)", + ["Path", "Views", "Unique"], + build_path_rows(paths), + empty_message="_No path traffic recorded in the last 14 days._", + ) push("## Drill deeper") - push("") + push(BLANK) push(f"- [PyPI page](https://pypi.org/project/{PKG}/)") push(f"- [pepy.tech graphs](https://pepy.tech/project/{PKG})") push(f"- [pypistats daily 
history](https://pypistats.org/packages/{PKG})") push(f"- [GitHub Insights → Traffic](https://github.com/{REPO}/graphs/traffic)") push(f"- [Star history](https://star-history.com/#{REPO}&Date)") - push("") + push(BLANK) OUTPUT.write_text("\n".join(lines)) print(f"Wrote {OUTPUT} ({len(lines)} lines)") diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6def7c5..4e59900 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -14,12 +14,12 @@ on: branches: - main paths: - # Re-run on changes that could plausibly move the numbers. - - "src/docpull/core/fetcher.py" - - "src/docpull/pipeline/**" - - "src/docpull/conversion/**" - - "src/docpull/cache/**" - - "tests/benchmarks/test_10k_pages.py" + # Re-run on docpull implementation, benchmark harness, or packaging + # changes that can shift perf or memory characteristics. + - "src/docpull/**" + - "tests/benchmarks/**" + - "pyproject.toml" + - ".github/workflows/benchmark.yml" permissions: contents: read @@ -32,22 +32,17 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + - name: Set up Python and benchmark deps + uses: ./.github/actions/setup-python-docpull with: python-version: "3.11" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" + dependency-groups: dev - name: Run 10k-page benchmark - env: - DOCPULL_BENCHMARK_10K: "1" + shell: bash run: | - PYTHONPATH=src pytest -v -s tests/benchmarks/test_10k_pages.py \ - | tee benchmark.log + set -o pipefail + make benchmark-10k | tee benchmark.log - name: Surface report in summary if: always() diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8fb02fa..b2fac27 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,28 +19,20 @@ jobs: strategy: fail-fast: false matrix: - # pyproject.toml 
claims 3.10-3.14 support, but actions/setup-python - # may not have stable 3.14 in its release manifest yet, so leave 3.14 - # out for now. Add it back once setup-python ships a stable 3.14. - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + uses: ./.github/actions/setup-python-docpull with: python-version: ${{ matrix.python-version }} - cache: pip - cache-dependency-path: pyproject.toml - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[all,dev]" + dependency-groups: all,dev + enable-cache: "true" - name: Run tests with coverage - run: pytest --cov=docpull --cov-report=xml --cov-report=term -q + run: make test-cov - name: Upload coverage XML if: matrix.python-version == '3.11' @@ -56,25 +48,17 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + uses: ./.github/actions/setup-python-docpull with: python-version: "3.11" - cache: pip - cache-dependency-path: pyproject.toml - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[all,dev]" + dependency-groups: all,dev + enable-cache: "true" - - name: Run ruff - run: ruff check . - - - name: Check formatting - run: ruff format --check . 
+ - name: Run lint checks + run: make lint-check - name: Run pre-commit - run: pre-commit run --all-files --show-diff-on-failure + run: make pre-commit-check typecheck: runs-on: ubuntu-latest @@ -83,16 +67,11 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + uses: ./.github/actions/setup-python-docpull with: python-version: "3.11" - cache: pip - cache-dependency-path: pyproject.toml - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[all,dev]" + dependency-groups: all,dev + enable-cache: "true" - name: Run mypy - run: mypy src + run: make typecheck diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b1dd02a..00c3ee9 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -27,12 +27,15 @@ jobs: version: ${{ steps.meta.outputs.version }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + with: + fetch-depth: 0 - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + - uses: ./.github/actions/setup-python-docpull with: python-version: "3.11" + dependency-groups: all,dev - - name: Read pyproject.toml version (and verify tag match on tag push) + - name: Read pyproject.toml version and verify tag match id: meta run: | set -e @@ -51,21 +54,22 @@ jobs: fi echo "version=$PROJECT_VERSION" >> "$GITHUB_OUTPUT" - - name: Install release dependencies + - name: Verify tagged commit is on main + run: | + git fetch --no-tags origin main + if ! git merge-base --is-ancestor "$GITHUB_SHA" "origin/main"; then + echo "::error::Release tags must point to a commit reachable from origin/main" + exit 1 + fi + + - name: Install pinned release tooling run: | # Pinned (see requirements-release.txt) so a compromised *latest* release # of pip/build/twine cannot be auto-pulled into the wheel-building job. 
python -m pip install -r requirements-release.txt - pip install -e ".[all,dev]" - name: Run release gates - run: | - ruff check . - ruff format --check . - mypy src - pytest -q - pip-audit - bandit -q -c pyproject.toml -r src + run: make release-gates - name: Build distributions run: | diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 8016b12..43513a7 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -34,23 +34,13 @@ jobs: uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 + uses: ./.github/actions/setup-python-docpull with: python-version: "3.11" - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -e ".[dev]" - - - name: Audit Python dependencies - run: pip-audit - - - name: Run Bandit - run: bandit -q -c pyproject.toml -r src + dependency-groups: all,dev - name: Run Python security regression tests - run: PYTHONPATH=src pytest -q tests/test_security_hardening.py tests/test_discovery.py tests/test_integration.py + run: make python-security mcp-security: runs-on: ubuntu-latest @@ -90,7 +80,7 @@ jobs: - name: Set up Node.js uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e with: - node-version: "22" + node-version: "24" cache: npm cache-dependency-path: web/package-lock.json diff --git a/.gitignore b/.gitignore index c94a488..84b1840 100644 --- a/.gitignore +++ b/.gitignore @@ -111,6 +111,8 @@ celerybeat.pid .env .env.* !.env.example +.env.local +.env.*.local .venv # Key material / certificates (defense-in-depth) @@ -156,6 +158,7 @@ cython_debug/ .vscode/ .idea/ .claude/ +.vercel/ *.swp *.swo *~ @@ -165,6 +168,14 @@ cython_debug/ Thumbs.db # Project-specific +# Node / web app artifacts +node_modules/ +.next/ +out/ +next-env.d.ts +*.tsbuildinfo +npm-debug.log* + # Downloaded documentation (output from docpull) # Note: docs/ is NOT 
ignored - it contains project documentation *-docs/ @@ -194,3 +205,26 @@ screenshots/ # Application cache .docpull-cache/ + +# Generated Claude Code plugin bundle; source of truth lives in plugin/ +.claude-plugin/ + +# Codex/skills installer artifacts. Keep the installer spillover ignored, but +# allow the intentionally checked-in repo-scoped Codex host config/skill. +/.agents/* +!/.agents/skills/ +!/.agents/skills/docpull-research/ +!/.agents/skills/docpull-research/** +!/.agents/plugins/ +!/.agents/plugins/marketplace.json +/.crush/ +/.goose/ +/.pi/ +/skills/ +/skills-lock.json +/mcp/.agents/ +/mcp/.crush/ +/mcp/.goose/ +/mcp/.pi/ +/mcp/skills/ +/mcp/skills-lock.json diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 0000000..68cf2b3 --- /dev/null +++ b/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "docpull": { + "command": "docpull", + "args": ["mcp"] + } + } +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d5dc9fb..6fc35f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,6 +4,8 @@ repos: hooks: - id: trailing-whitespace - id: end-of-file-fixer + - id: check-json + - id: check-toml - id: check-yaml - id: check-added-large-files - id: check-merge-conflict diff --git a/.vercel/project.json b/.vercel/project.json deleted file mode 100644 index 6feb555..0000000 --- a/.vercel/project.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "projectId": "prj_pWB2G4Mef4CDHdU3HQibqlRpRyOO", - "orgId": "team_FFnZfrvD19xX7UdEzrIq4swD", - "projectName": "docpull", - "settings": { - "framework": "nextjs", - "devCommand": null, - "installCommand": null, - "buildCommand": null, - "outputDirectory": null, - "rootDirectory": "web", - "directoryListing": false, - "nodeVersion": "24.x" - } -} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..ccdbb13 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,67 @@ +# Project Agent Instructions + +## docpull MCP + +This repo ships the `docpull mcp` stdio server for agent clients. 
Claude Code, Cursor, and Codex should all use the same server command: + +```bash +docpull mcp +``` + +Install the MCP extra before relying on the server: + +```bash +pip install 'docpull[mcp]' +docpull mcp --help +``` + +Codex can add the server to the shared CLI/IDE MCP config: + +```bash +codex mcp add docpull -- docpull mcp +``` + +In trusted projects, Codex also supports project-scoped `.codex/config.toml`: + +```toml +[mcp_servers.docpull] +command = "docpull" +args = ["mcp"] +``` + +For a repo-scoped reusable skill, Codex discovers skills in `.agents/skills` from the current directory up to the repo root. The equivalent skill path is `.agents/skills/docpull-research/SKILL.md`. + +## docpull Research Behavior + +Use docpull MCP tools when the user asks about a specific library, framework, SDK, API surface, docs-backed tool ecosystem, version-sensitive behavior, or pasted documentation URL. + +1. Check cached sources with `mcp__docpull__list_indexed`. +2. If the requested library is cached, search it with `mcp__docpull__grep_docs`. +3. Use `mcp__docpull__read_doc` for line-level follow-up context. +4. If the library is not cached: + - use `mcp__docpull__ensure_docs` for a built-in alias + - use `mcp__docpull__fetch_url` for one pasted page + - otherwise ask once for the docs URL +5. Answer with attribution to the fetched source. + +For Vercel skills, `skills.sh`, `npx skills`, agent skill installation, or `SKILL.md` questions, treat the docs as version-sensitive. Search cached docs first for exact commands and flags such as `skills add`, `--agent`, `--skill`, `--copy`, `--yes`, `skills use`, `skills list`, `skills find`, `skills update`, and `skills remove`. If no cached source exists, use a pasted skills.sh URL with `mcp__docpull__fetch_url`; otherwise prefer `https://www.skills.sh/docs` for quick one-page answers and the official Vercel Labs Skills README for CLI option details. 
+ +Do not use docpull for general programming explanations, the user's own codebase, or stable standard-library APIs. + +Built-in aliases include `react`, `nextjs`, `tailwindcss`, `vite`, `hono`, `fastapi`, `express`, `anthropic`, `openai`, `langchain`, `supabase`, `drizzle`, and `prisma`. + +## Skills Installer + +When installing skills with `npx skills add <source>`, always pass these flags: + +```bash +npx -y skills add <source> --skill '*' --agent codex --copy --yes +``` + +After install, the CLI still creates `.agents/` and `skills-lock.json` at project root. Delete both to keep the layout flat: + +```bash +rm -rf .agents skills-lock.json +``` + +Keep installer artifacts ignored: `.agents/`, `.crush/`, `.goose/`, `.pi/`, top-level `skills/`, and `skills-lock.json`. If this repo intentionally checks in Codex repo skills or a Codex plugin marketplace, allow only the specific `.agents/skills/...` or `.agents/plugins/...` files needed for that setup. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..69ca9e5 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,5 @@ +@AGENTS.md + +## Claude Code + +Claude Code can read the project MCP server from `.mcp.json`. The Claude marketplace plugin in `plugin/` registers the same server and adds the same research behavior as a Claude skill. 
diff --git a/Makefile b/Makefile index 70974a0..eba6c88 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,11 @@ -.PHONY: clean clean-pyc clean-build clean-test help +.PHONY: benchmark benchmark-fast benchmark-10k clean clean-pyc clean-build clean-test help +.PHONY: lint lint-check format pre-commit-check typecheck +.PHONY: test test-cov python-security release-gates sync-agent-host-configs sync-claude-plugin + +PYTHONPATH ?= src +PYTEST = PYTHONPATH=$(PYTHONPATH) pytest +RUFF = ruff +MYPY = mypy help: @echo "clean - remove all build, test, coverage and Python artifacts" @@ -6,8 +13,19 @@ help: @echo "clean-pyc - remove Python file artifacts" @echo "clean-test - remove test and coverage artifacts" @echo "test - run tests with pytest" + @echo "test-cov - run tests with coverage" + @echo "benchmark - run the default benchmark suite" + @echo "benchmark-fast - run lightweight performance benchmarks" @echo "lint - check style with ruff" + @echo "lint-check - check style and formatting with ruff" @echo "format - format code with ruff" + @echo "pre-commit-check - run pre-commit across the repo" + @echo "typecheck - run mypy" + @echo "python-security - run Python dependency and source security checks" + @echo "release-gates - run the full Python release gate suite" + @echo "sync-agent-host-configs - regenerate project-local agent host config files" + @echo "sync-claude-plugin - regenerate the self-contained Claude plugin bundle" + @echo "benchmark-10k - run the 10k-page benchmark" clean: clean-build clean-pyc clean-test @@ -39,13 +57,50 @@ clean-test: # clean it manually or pick a different -o. 
test: - pytest + $(PYTEST) + +test-cov: + $(PYTEST) --cov=docpull --cov-report=xml --cov-report=term -q benchmark: - DOCPULL_BENCHMARK_10K=1 pytest tests/benchmarks/test_10k_pages.py -v -s + $(MAKE) benchmark-10k + +benchmark-fast: + DOCPULL_BENCHMARKS=1 $(PYTEST) tests/benchmarks/test_performance.py -v -s + +benchmark-10k: + DOCPULL_BENCHMARK_10K=1 $(PYTEST) tests/benchmarks/test_10k_pages.py -v -s lint: - ruff check . + $(RUFF) check . + +lint-check: + $(RUFF) check . + $(RUFF) format --check . format: - ruff format . + $(RUFF) format . + +pre-commit-check: + pre-commit run --all-files --show-diff-on-failure + +typecheck: + $(MYPY) src + +python-security: + pip-audit + bandit -q -c pyproject.toml -r src + $(PYTEST) -q tests/test_security_hardening.py tests/test_discovery.py tests/test_integration.py + +release-gates: + $(MAKE) lint-check + $(MAKE) typecheck + $(PYTEST) -q + pip-audit + bandit -q -c pyproject.toml -r src + +sync-claude-plugin: + python scripts/sync_claude_plugin.py + +sync-agent-host-configs: + python scripts/sync_agent_host_configs.py diff --git a/README.md b/README.md index 0d2e0e7..c9b0919 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # docpull -**Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.** +**Security-hardened, browser-free web puller that turns server-rendered sites into clean, AI-ready Markdown — fast.** [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull) @@ -15,13 +15,14 @@ docpull uses async HTTP (not Playwright) to fetch server-rendered pages, extracts main content, and writes clean Markdown with source-URL frontmatter — -in seconds, with a small install footprint. 
It won't render JavaScript, but for -the large class of docs that don't need it (API references, Python/Go stdlib, -most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a -fast, auditable, sandbox-friendly way to pipe documentation into an LLM context, -a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and -CRLF-injection protections are on by default — a necessity when an AI agent -is choosing the URLs. +in seconds, with a small install footprint. It will not render JavaScript, but +for the large class of pages that arrive as HTML without a browser +(documentation, blogs, help centers, knowledge bases, changelogs, policy pages, +marketing pages, and many framework-built sites), it is a fast, auditable, +sandbox-friendly way to pull web content into an LLM context, a RAG index, a +local archive, or an agent workflow. SSRF, XXE, DNS-rebinding, and +CRLF-injection protections are on by default — a necessity when an AI agent is +choosing the URLs. ## Install @@ -39,18 +40,28 @@ pip install 'docpull[all]' # everything above ```bash # Crawl and save Markdown -docpull https://docs.example.com +docpull https://example.com # One page, no crawl — the fast path for agents -docpull https://docs.example.com/guide --single +docpull https://example.com/pricing --single # LLM-ready NDJSON with 4k-token chunks streamed to stdout -docpull https://docs.example.com --profile llm --stream | jq . +docpull https://example.com --profile llm --stream | jq . # Mirror a site for offline use -docpull https://docs.example.com --profile mirror --cache +docpull https://example.com --profile mirror --cache + +# Generate a docs-backed agent skill +docpull https://docs.example.com --skill example-docs --max-pages 100 ``` +## What it is best at + +docpull is strongest on server-rendered sites where the HTML already contains +the content you care about. 
Documentation is the most common use case, but it +also works well for many blogs, company sites, release notes, help centers, and +other content-heavy sections of the web. + ## Framework-aware extraction docpull inspects each page before running the generic extractor and can pull @@ -77,6 +88,8 @@ can route elsewhere). - **`--emit-chunks`** — write one file or record per chunk instead of per page. - **`--strict-js-required`** — hard-fail on JS-only pages instead of silently skipping. +- **`--skill NAME`** — write a hierarchical docs snapshot plus a `SKILL.md` + manifest under `.claude/skills/NAME` by default. - **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/) for sites where the default heuristics struggle. @@ -130,6 +143,36 @@ docpull https://site.com --profile mirror # Full archive, polite, cached. docpull https://site.com --profile quick # Sampling: 50 pages, depth 2. ``` +## Configuration files + +The public config model is `DocpullConfig`. It accepts one target URL per +config; for multiple sites, run the CLI once per URL, load several configs in +Python, or use the MCP alias workflow. + +```yaml +profile: rag +url: https://docs.example.com +crawl: + max_pages: 200 + max_depth: 3 +output: + directory: ./docs/example + format: markdown +content_filter: + streaming_dedup: true +cache: + enabled: true +``` + +```python +from pathlib import Path +from docpull import DocpullConfig + +cfg = DocpullConfig.from_yaml(Path("docpull.yaml").read_text()) +``` + +See [docs/](docs/) and [docs/examples/](docs/examples/) for current examples. 
+ ## MCP server docpull ships an MCP (Model Context Protocol) server so AI agents can call it @@ -143,10 +186,12 @@ docpull mcp # starts the stdio server Claude Code: ```bash -claude mcp add --transport stdio docpull -- docpull mcp +claude mcp add --transport stdio --scope user docpull -- docpull mcp ``` -Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally): +This repo also includes a project `.mcp.json` with the same server command. + +Cursor (`.cursor/mcp.json` in this project, or `~/.cursor/mcp.json` globally): ```json { @@ -160,13 +205,44 @@ Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally): } ``` +Codex: + +```bash +codex mcp add docpull -- docpull mcp +``` + +For project-scoped Codex setup in a trusted repo, use `.codex/config.toml`: + +```toml +[mcp_servers.docpull] +command = "docpull" +args = ["mcp"] +``` + +Regenerate the project-local Codex config, repo-scoped skill, and local plugin +marketplace entry with: + +```bash +make sync-agent-host-configs +``` + Claude Desktop uses the same `mcpServers` shape in `claude_desktop_config.json`. -Or, if you use Claude Code, install the plugin instead — it bundles the MCP -server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`, -`/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude -when to reach for docpull automatically: +Project-local agent guidance is included for all supported coding agents: + +- Claude Code: `CLAUDE.md` and `plugin/skills/docpull-research/SKILL.md` +- Cursor: `.cursor/rules/docpull-research.mdc` +- Codex: `AGENTS.md`; Codex repo-scoped skills can use `.agents/skills/docpull-research/SKILL.md` + +Claude Code also surfaces docpull's MCP prompts as commands, including +`/mcp__docpull__docs_add`, `/mcp__docpull__docs_search`, +`/mcp__docpull__docs_list`, `/mcp__docpull__docs_refresh`, and +`/mcp__docpull__docs_remove`. + +If you specifically want marketplace distribution in Claude Code, install the +minimal plugin. 
It registers the same MCP server and adds a meta-skill that +teaches Claude when to reach for docpull automatically: ```bash # 1. Install docpull with the MCP extra (required for the plugin) @@ -180,6 +256,14 @@ pip install 'docpull[mcp]' ``` See [plugin/README.md](plugin/README.md) for details. +The `plugin/` directory is the source of truth. The marketplace catalog lives +at `.claude-plugin/marketplace.json`; the copied plugin payload under +`.claude-plugin/plugin/` is generated on demand via +`python scripts/sync_claude_plugin.py`. +The same `plugin/` folder also includes `.codex-plugin/plugin.json` so it can +be packaged as a Codex plugin with the shared `docpull-research` skill. +Use `make sync-agent-host-configs` after editing `plugin/skills/docpull-research` +to refresh Codex's repo-scoped `.agents/skills` copy and local plugin marketplace. Tools exposed (8 total — read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt): @@ -208,6 +292,10 @@ sources: maxPages: 200 ``` +Fetched MCP docs are cached for seven days under +`~/.local/share/docpull-mcp/docs` by default. Override that location with +`DOCPULL_DOCS_DIR` or `DOCS_DIR`. + ### About the `mcp/` directory in this repo The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP @@ -217,7 +305,10 @@ the Python MCP server shipped in the `docpull` package described above with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp); unless you specifically need pgvector-backed semantic search, ignore it -and use `docpull mcp`. +and use `docpull mcp`. Advanced users who do need vector search should run +`bun run db:setup` inside `mcp/` after configuring `DATABASE_URL`. +See [docs/mcp-pgvector-setup.md](docs/mcp-pgvector-setup.md) for the focused +setup guide. ## Output @@ -259,8 +350,9 @@ Run `docpull --help` for the full list. 
Highlights: ``` Core: - --profile {rag,mirror,quick,llm,custom} + --profile {rag,mirror,quick,llm} --single Fetch one URL (no crawl) + --skill NAME Generate a docs-backed agent skill --format {markdown,json,ndjson,sqlite} --stream Stream NDJSON to stdout @@ -278,6 +370,7 @@ Cache: --cache Enable incremental updates --cache-dir DIR --cache-ttl DAYS + --resume Resume an interrupted cached run ``` ## Performance diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..505829c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,65 @@ +# docpull Documentation + +This directory contains maintained docs for the current `docpull` package. +The root [README](../README.md) is the canonical quick-start and feature +overview; files here provide focused setup notes and copy-pasteable examples. + +## Current Version + +The docs in this directory are aligned with docpull 4.0.0: + +- Python 3.10+ +- CLI entry point: `docpull` +- MCP server entry point: `docpull mcp` +- Supported output formats: `markdown`, `json`, `ndjson`, `sqlite` +- Supported profiles: `rag`, `mirror`, `quick`, `llm` +- Config files use the `DocpullConfig` shape from `src/docpull/models/config.py` + +## Files + +| Path | Purpose | +|---|---| +| [examples/](examples/) | Valid YAML config examples and equivalent CLI commands | +| [mcp-pgvector-setup.md](mcp-pgvector-setup.md) | Optional TypeScript MCP server with PostgreSQL + pgvector semantic search | +| [CHANGELOG.md](CHANGELOG.md) | Historical release notes | + +## Configuration Shape + +docpull 4.x accepts one target URL per `DocpullConfig`. For multiple sites, +run the CLI once per URL, create one config per source, or use the MCP +`add_source` / `ensure_docs` alias workflow. 
+ +Valid YAML mirrors the Python model: + +```yaml +profile: rag +url: https://docs.example.com +crawl: + max_pages: 200 + max_depth: 3 +output: + directory: ./docs/example + format: markdown +content_filter: + streaming_dedup: true +cache: + enabled: true +``` + +Removed 2.x fields such as `language`, `deduplicate`, `exclude_sections`, +`create_index`, `incremental`, `update_only_changed`, and top-level +`sources:` are intentionally not used in current examples. Pydantic forbids +unknown fields so stale configs fail loudly instead of being ignored. + +## MCP Choices + +Most users should run the Python stdio server that ships with the package: + +```bash +pip install 'docpull[mcp]' +docpull mcp +``` + +The root-level `mcp/` directory is a separate TypeScript server for users who +specifically need PostgreSQL, pgvector, and OpenAI embeddings for semantic +search. See [mcp-pgvector-setup.md](mcp-pgvector-setup.md). diff --git a/docs/examples/README.md b/docs/examples/README.md index 29719c4..2a54bfc 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -1,44 +1,64 @@ # Configuration Examples -Example YAML configurations for docpull. +These examples use the current `DocpullConfig` YAML shape. They are meant for +Python callers, tests, and agent workflows that construct `DocpullConfig` from +YAML: + +```python +from pathlib import Path +from docpull import DocpullConfig + +config = DocpullConfig.from_yaml(Path("docs/examples/simple-optimization.yaml").read_text()) +``` + +The CLI does not currently accept a config-file flag. Each example also includes +an equivalent `docpull ...` command in comments. Files that start with a YAML +list contain one valid `DocpullConfig` payload per list item. 
## Files | File | Description | |------|-------------| -| `simple-optimization.yaml` | Single source with language filter + index | -| `multi-source-optimized.yaml` | Multiple sources with full optimization | -| `incremental-updates.yaml` | Resume downloads, update only changed files | -| `format-conversion.yaml` | TOON, JSON, SQLite output formats | -| `deduplication-strategies.yaml` | Different dedup strategies (mainnet, shortest, etc.) | -| `selective-crawling.yaml` | Include/exclude path patterns | +| `simple-optimization.yaml` | RAG-oriented Markdown crawl with rich metadata and streaming dedup | +| `multi-source-optimized.yaml` | Sequential single-source configs for a multi-site docs refresh | +| `incremental-updates.yaml` | Cache + resume configuration for changed-page refreshes | +| `format-conversion.yaml` | JSON, NDJSON, and SQLite output examples | +| `deduplication-strategies.yaml` | Current streaming dedup behavior and alternatives | +| `selective-crawling.yaml` | Include/exclude path patterns for scoped crawls | -## Usage - -```bash -docpull --sources-file docs/examples/simple-optimization.yaml -``` - -## Configuration Reference +## Current Field Reference ```yaml -# Global settings -output_dir: ./docs -rate_limit: 0.5 - -# Per-source settings -sources: - my-docs: - url: https://example.com - language: en - deduplicate: true - keep_variant: mainnet - max_file_size: 200kb - include_paths: ["guides/*"] - exclude_paths: ["*/changelog"] - exclude_sections: ["Examples"] - format: markdown - create_index: true +profile: rag +url: https://docs.example.com +crawl: + max_pages: 200 + max_depth: 3 + max_concurrent: 20 + rate_limit: 0.5 + include_paths: ["*/guides/*"] + exclude_paths: ["*/changelog"] +content_filter: + streaming_dedup: true + max_file_size: 200kb + extractor: default + enable_special_cases: true + strict_js_required: false +output: + directory: ./docs/example + format: markdown + naming_strategy: full + rich_metadata: true +cache: + enabled: 
true + directory: .docpull-cache + ttl_days: 30 ``` -See `docpull --help` for all options. +Removed fields from older examples are intentionally absent: top-level +`sources:`, `language`, `deduplicate`, `keep_variant`, `exclude_sections`, +`create_index`, `incremental`, and `update_only_changed` are not accepted by +docpull 4.x. + +For the optional TypeScript MCP server backed by PostgreSQL, pgvector, and +OpenAI embeddings, see [pgvector MCP Setup](../mcp-pgvector-setup.md). diff --git a/docs/examples/deduplication-strategies.yaml b/docs/examples/deduplication-strategies.yaml index 38856b8..0496444 100644 --- a/docs/examples/deduplication-strategies.yaml +++ b/docs/examples/deduplication-strategies.yaml @@ -1,29 +1,29 @@ -# Deduplication strategies example -# Remove duplicate files with different keep strategies +# Deduplication behavior for docpull 4.x +# +# Current dedup is streaming content-hash deduplication: the first successfully +# saved copy of a duplicate page wins, and later duplicate pages are skipped. +# The old keep_variant strategies were removed with the deprecated 2.x config +# fields. 
+# +# CLI equivalent: +# docpull https://aptos.dev --profile rag --streaming-dedup --output-dir ./docs-aptos --include-paths "*/build/guides/*" "*/build/apis/*" -sources: - aptos-mainnet: - url: https://aptos.dev - deduplicate: true - keep_variant: mainnet # Keep mainnet version, skip testnet/devnet - max_file_size: 200kb - create_index: true - - # Alternative: keep shortest variant - aptos-shortest: - url: https://aptos.dev - deduplicate: true - keep_variant: shortest # Keep shortest file - max_file_size: 200kb - output_dir: ./docs-shortest - - # Alternative: keep first encountered - aptos-first: - url: https://aptos.dev - deduplicate: true - keep_variant: first # Keep first file encountered - max_file_size: 200kb - output_dir: ./docs-first - -rate_limit: 0.5 +profile: rag +url: https://aptos.dev +crawl: + max_pages: 300 + rate_limit: 0.5 + include_paths: + - "*/build/guides/*" + - "*/build/apis/*" + exclude_paths: + - "*/changelog" + - "*/release-notes" +content_filter: + streaming_dedup: true + max_file_size: 200kb +output: + directory: ./docs-aptos + format: markdown + rich_metadata: true log_level: INFO diff --git a/docs/examples/format-conversion.yaml b/docs/examples/format-conversion.yaml index f856913..c99cf9d 100644 --- a/docs/examples/format-conversion.yaml +++ b/docs/examples/format-conversion.yaml @@ -1,25 +1,43 @@ -# Format conversion examples -# Convert documentation to different output formats +# Format conversion examples for docpull 4.x +# +# Supported formats are markdown, json, ndjson, and sqlite. 
+# CLI equivalents: +# docpull https://docs.anthropic.com --format json --output-dir ./docs-json +# docpull https://docs.anthropic.com --profile llm --stream > documents.ndjson +# docpull https://docs.anthropic.com --format sqlite --output-dir ./docs-db -sources: - anthropic-toon: - url: https://docs.anthropic.com - language: en - format: toon # 40-60% size reduction, optimized for LLMs - naming_strategy: hierarchical - output_dir: ./docs-toon +- profile: rag + url: https://docs.anthropic.com + crawl: + max_pages: 100 + rate_limit: 0.5 + output: + format: json + directory: ./docs-json + rich_metadata: true + log_level: INFO - anthropic-json: - url: https://docs.anthropic.com - language: en - format: json # Structured JSON with sections - output_dir: ./docs-json +- profile: llm + url: https://docs.anthropic.com + crawl: + max_pages: 100 + rate_limit: 0.5 + output: + format: ndjson + directory: ./docs-ndjson + ndjson_filename: documents.ndjson + max_tokens_per_file: 4000 + emit_chunks: true + rich_metadata: true + log_level: INFO - anthropic-sqlite: - url: https://docs.anthropic.com - language: en - format: sqlite # Searchable database with FTS5 - output_dir: ./docs-db - -rate_limit: 0.5 -log_level: INFO +- profile: rag + url: https://docs.anthropic.com + crawl: + max_pages: 100 + rate_limit: 0.5 + output: + format: sqlite + directory: ./docs-db + rich_metadata: true + log_level: INFO diff --git a/docs/examples/incremental-updates.yaml b/docs/examples/incremental-updates.yaml index 3707032..dfabb50 100644 --- a/docs/examples/incremental-updates.yaml +++ b/docs/examples/incremental-updates.yaml @@ -1,22 +1,19 @@ -# Incremental update configuration -# Only downloads changed files, resume capability +# Incremental update configuration for docpull 4.x +# CLI equivalent: +# docpull https://docs.anthropic.com --profile mirror --cache --resume --output-dir ./docs/anthropic -sources: - anthropic: - url: https://docs.anthropic.com - language: en - create_index: true - - 
claude-code: - url: https://code.claude.com/docs - language: en - create_index: true - -output_dir: ./docs -rate_limit: 0.5 +profile: mirror +url: https://docs.anthropic.com +crawl: + max_pages: 500 + rate_limit: 0.5 +output: + directory: ./docs/anthropic + format: markdown +cache: + enabled: true + directory: .docpull-cache + ttl_days: 30 + skip_unchanged: true + resume: true log_level: INFO - -# Incremental features -incremental: true -update_only_changed: true -cache_dir: .docpull-cache diff --git a/docs/examples/multi-source-optimized.yaml b/docs/examples/multi-source-optimized.yaml index 082ccca..90873ba 100644 --- a/docs/examples/multi-source-optimized.yaml +++ b/docs/examples/multi-source-optimized.yaml @@ -1,41 +1,55 @@ -# Multi-source configuration with full optimization -# Real-world example: 31 MB → 13 MB (58% reduction) +# Multi-source refresh plan for docpull 4.x +# +# docpull accepts one URL per DocpullConfig. Keep a multi-source refresh as a +# list of single-source configs and run each target sequentially, or use the +# MCP add_source / ensure_docs workflow for named aliases. 
+# +# CLI equivalents: +# docpull https://docs.anthropic.com --profile rag --output-dir ./optimized-docs/anthropic --max-pages 200 +# docpull https://docs.anthropic.com/en/docs/claude-code --profile rag --output-dir ./optimized-docs/claude-code --max-pages 200 +# docpull https://aptos.dev --profile rag --output-dir ./optimized-docs/aptos --include-paths "*/build/guides/*" "*/build/apis/*" --exclude-paths "*/changelog" "*/release-notes" -sources: - anthropic: - url: https://docs.anthropic.com - language: en +- profile: rag + url: https://docs.anthropic.com + crawl: + max_pages: 200 + rate_limit: 0.5 + content_filter: + streaming_dedup: true max_file_size: 200kb - create_index: true - exclude_sections: - - "Changelog" - - "Examples" + output: + directory: ./optimized-docs/anthropic + rich_metadata: true + log_level: INFO - claude-code: - url: https://code.claude.com/docs - language: en # Skips 352 translation files! - create_index: true +- profile: rag + url: https://docs.anthropic.com/en/docs/claude-code + crawl: + max_pages: 200 + rate_limit: 0.5 + content_filter: + streaming_dedup: true max_file_size: 200kb + output: + directory: ./optimized-docs/claude-code + rich_metadata: true + log_level: INFO - aptos: - url: https://aptos.dev - deduplicate: true - keep_variant: mainnet # Skips 304 duplicates! 
- max_file_size: 200kb +- profile: rag + url: https://aptos.dev + crawl: + max_pages: 300 + rate_limit: 0.5 include_paths: - - "build/guides/*" - - "build/apis/*" + - "*/build/guides/*" + - "*/build/apis/*" exclude_paths: - "*/changelog" - "*/release-notes" - - shelby: - url: https://docs.shelby.xyz - create_index: true + content_filter: + streaming_dedup: true max_file_size: 200kb - -# Global settings -output_dir: ./optimized-docs -rate_limit: 0.5 -log_level: INFO -extract_metadata: true + output: + directory: ./optimized-docs/aptos + rich_metadata: true + log_level: INFO diff --git a/docs/examples/selective-crawling.yaml b/docs/examples/selective-crawling.yaml index 9299bab..398d978 100644 --- a/docs/examples/selective-crawling.yaml +++ b/docs/examples/selective-crawling.yaml @@ -1,26 +1,22 @@ -# Selective crawling example -# Only fetch specific sections of documentation +# Selective crawling example for docpull 4.x +# CLI equivalent: +# docpull https://docs.anthropic.com --include-paths "*/api/*" "*/reference/*" --exclude-paths "*/changelog" "*/release-notes" --output-dir ./docs-selective/anthropic-api -sources: - aptos-guides-only: - url: https://aptos.dev - include_paths: - - "build/guides/*" - - "build/tutorials/*" - exclude_paths: - - "*/changelog" - - "*/release-notes" - - "*/api-reference" - create_index: true - - anthropic-api-only: - url: https://docs.anthropic.com - include_paths: - - "*/api/*" - - "*/reference/*" - language: en - create_index: true - -output_dir: ./docs-selective -rate_limit: 0.5 +profile: rag +url: https://docs.anthropic.com +crawl: + max_pages: 200 + rate_limit: 0.5 + include_paths: + - "*/api/*" + - "*/reference/*" + exclude_paths: + - "*/changelog" + - "*/release-notes" +content_filter: + streaming_dedup: true +output: + directory: ./docs-selective/anthropic-api + format: markdown + rich_metadata: true log_level: INFO diff --git a/docs/examples/simple-optimization.yaml b/docs/examples/simple-optimization.yaml index 852cb5b..fc0470f 
100644 --- a/docs/examples/simple-optimization.yaml +++ b/docs/examples/simple-optimization.yaml @@ -1,14 +1,17 @@ -# Simple optimization example -# Fetches documentation with basic optimizations +# Simple optimization example for docpull 4.x +# CLI equivalent: +# docpull https://docs.anthropic.com --profile rag --output-dir ./docs/anthropic --max-pages 200 --rate-limit 0.5 --streaming-dedup -sources: - anthropic: - url: https://docs.anthropic.com - language: en - max_file_size: 200kb - create_index: true - extract_metadata: true - -output_dir: ./docs -rate_limit: 0.5 +profile: rag +url: https://docs.anthropic.com +crawl: + max_pages: 200 + rate_limit: 0.5 +content_filter: + streaming_dedup: true + max_file_size: 200kb +output: + directory: ./docs/anthropic + format: markdown + rich_metadata: true log_level: INFO diff --git a/docs/mcp-pgvector-setup.md b/docs/mcp-pgvector-setup.md new file mode 100644 index 0000000..f0c95c3 --- /dev/null +++ b/docs/mcp-pgvector-setup.md @@ -0,0 +1,78 @@ +# pgvector MCP Setup + +Most users should use the Python MCP server: + +```bash +pip install 'docpull[mcp]' +docpull mcp +``` + +That path is local, requires no database, and powers the Claude plugin and +standard `docpull` MCP setup. + +The separate TypeScript server in `mcp/` is for users who specifically want +persistent semantic search backed by PostgreSQL, pgvector, and OpenAI +embeddings. It is a different server from `docpull mcp`. + +## When to Use It + +Use the pgvector server when you want: + +- persistent indexed documentation across sessions +- semantic search with `search_docs` +- exact DB-backed search with `grep_docs` +- a shared documentation index for a team or long-running agent setup + +Skip it when you only need the normal `docpull` CLI or the default +`docpull mcp` server. 
+ +## Requirements + +- Bun +- PostgreSQL with the `vector` extension available +- a database connection string in `DATABASE_URL` +- `OPENAI_API_KEY` for embedding generation +- the `docpull` CLI installed for fetching documentation + +## Setup + +```bash +cd mcp +bun install + +export DATABASE_URL="postgresql://user:pass@host:5432/docs" +export OPENAI_API_KEY="sk-..." + +bun run db:setup +bun run dev +``` + +`bun run db:setup` applies `schema.sql`, creates the migration tracking table, +and applies any pending migrations. `ensure_docs(..., index: true)` requires +both `DATABASE_URL` and `OPENAI_API_KEY`; `ensure_docs(..., index: false)` can +fetch docs without embedding them. + +## Database Commands + +```bash +bun run db:setup # initialize schema and apply pending migrations +bun run db:migrate # apply pending migrations only +bun run db:status # show applied and pending migrations +bun run db:rollback # roll back the latest applied migration +``` + +Applied migrations are tracked in `docpull_mcp_migrations`. + +## MCP Usage + +After the server is running, use the MCP tools: + +```text +ensure_docs(source: "react", index: true) +search_docs(query: "how do effects work", library: "react") +grep_docs(pattern: "useEffect", library: "react") +list_indexed() +``` + +Indexing is explicit opt-in. `ensure_docs(source: "react")` fetches docs +without embedding them; pass `index: true` to write embeddings into Postgres. diff --git a/mcp/.env.example b/mcp/.env.example index 009ff09..9967a89 100644 --- a/mcp/.env.example +++ b/mcp/.env.example @@ -4,6 +4,14 @@ OPENAI_API_KEY=sk-... 
# Optional DOCS_DIR=~/.local/share/docpull-mcp/docs + +# OpenAI embedding client controls +OPENAI_TIMEOUT_MS=30000 +OPENAI_MAX_RETRIES=2 +OPENAI_CIRCUIT_FAILURE_THRESHOLD=5 +OPENAI_CIRCUIT_RESET_MS=60000 + +# PostgreSQL pool controls DB_POOL_MAX=10 DB_POOL_MIN=2 DB_IDLE_TIMEOUT_MS=30000 diff --git a/mcp/.gitignore b/mcp/.gitignore deleted file mode 100644 index 9145c03..0000000 --- a/mcp/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -node_modules/ -dist/ -.env -.DS_Store diff --git a/mcp/README.md b/mcp/README.md index e8beec8..87a6e2e 100644 --- a/mcp/README.md +++ b/mcp/README.md @@ -1,6 +1,18 @@ # docpull-mcp -MCP server for fetching and searching documentation on-demand. Uses [docpull](https://github.com/raintree-technology/docpull) to pull docs and pgvector for semantic search. +Optional TypeScript MCP server for fetching and searching documentation +on-demand with PostgreSQL, pgvector, and OpenAI embeddings. + +Most users should use the Python stdio server shipped by the `docpull` package: + +```bash +pip install 'docpull[mcp]' +docpull mcp +``` + +This `mcp/` directory is for advanced users who specifically need DB-backed +semantic search. It uses [docpull](https://github.com/raintree-technology/docpull) +as the fetcher and pgvector for search. ## Features @@ -18,33 +30,38 @@ git clone https://github.com/raintree-technology/docpull-mcp cd docpull-mcp bun install -# Requires docpull CLI +# Requires the docpull CLI for fetching pip install docpull ``` -### For semantic search (optional but recommended) +### Semantic search setup ```bash -# PostgreSQL with pgvector -psql $DATABASE_URL -f schema.sql - -# Set environment variables +# PostgreSQL with pgvector, then set environment variables. export DATABASE_URL="postgresql://user:pass@localhost:5432/docs" export OPENAI_API_KEY="sk-..." 
+ +# Create schema and apply migrations +bun run db:setup ``` -Existing databases should apply migrations in order: +Existing databases can apply only pending migrations: ```bash -psql $DATABASE_URL -f migrations/001_harden_embeddings.up.sql +bun run db:migrate ``` -Rollback for that migration: +Useful database commands: ```bash -psql $DATABASE_URL -f migrations/001_harden_embeddings.down.sql +bun run db:status # show applied and pending migrations +bun run db:rollback # roll back the latest applied migration ``` +The migration runner records applied migrations in +`docpull_mcp_migrations`. If you need to debug manually, the raw SQL files +remain in `schema.sql` and `migrations/*.sql`. + ## Usage ### Claude Desktop @@ -89,7 +106,8 @@ Add to `~/.claude/settings.json`: ### ensure_docs -Fetch documentation for a configured source. Indexing is explicit opt-in. +Fetch documentation for a configured source. Indexing is explicit opt-in and +requires both `DATABASE_URL` and `OPENAI_API_KEY`. ``` ensure_docs(source: "react") # Fetch only @@ -97,7 +115,8 @@ ensure_docs(source: "react", force: true) # Force refresh ensure_docs(source: "react", index: true) # Fetch and index ``` -Direct URLs are intentionally disabled in MCP. Add custom sites to `~/.config/docpull-mcp/sources.yaml` and call `ensure_docs` with the alias name. +Direct URLs are intentionally disabled for `ensure_docs`. Add custom sites to +`~/.config/docpull-mcp/sources.yaml` and call `ensure_docs` with the alias name. ### search_docs @@ -110,7 +129,7 @@ search_docs(query: "row level security", library: "supabase") ### grep_docs -Fast exact pattern matching (requires DATABASE_URL). +Fast exact pattern matching against indexed chunks (requires `DATABASE_URL`). ``` grep_docs(pattern: "onConflictDoUpdate", library: "drizzle") @@ -128,7 +147,7 @@ list_sources(category: "frontend") # Filter by category ### list_indexed -List libraries that have been indexed for search. 
+List libraries that have been indexed in Postgres for search. ``` list_indexed() @@ -170,6 +189,9 @@ sources: To ingest docs without the MCP server: ```bash +# Prepare the database first +bun run db:setup + # Ingest all fetched docs bun run ingest diff --git a/mcp/migrations/001_harden_embeddings.down.sql b/mcp/migrations/001_harden_embeddings.down.sql index ee8aaf3..3039bec 100644 --- a/mcp/migrations/001_harden_embeddings.down.sql +++ b/mcp/migrations/001_harden_embeddings.down.sql @@ -1,7 +1,6 @@ -BEGIN; - +-- Restore the pre-hardening schema shape. This intentionally does not erase +-- created_at values that were backfilled by the up migration. ALTER TABLE doc_embeddings ALTER COLUMN embedding DROP NOT NULL, + ALTER COLUMN created_at DROP DEFAULT, ALTER COLUMN created_at DROP NOT NULL; - -COMMIT; diff --git a/mcp/migrations/001_harden_embeddings.up.sql b/mcp/migrations/001_harden_embeddings.up.sql index d991ebe..b773bc1 100644 --- a/mcp/migrations/001_harden_embeddings.up.sql +++ b/mcp/migrations/001_harden_embeddings.up.sql @@ -1,5 +1,5 @@ -BEGIN; - +-- Embeddings are required for pgvector similarity operators. Refuse to +-- silently harden a table that already contains unusable cache rows. DO $$ BEGIN IF EXISTS ( @@ -11,6 +11,7 @@ BEGIN END IF; END $$; +-- Existing rows predate the NOT NULL/default requirement. 
UPDATE doc_embeddings SET created_at = NOW() WHERE created_at IS NULL; @@ -19,5 +20,3 @@ ALTER TABLE doc_embeddings ALTER COLUMN embedding SET NOT NULL, ALTER COLUMN created_at SET DEFAULT NOW(), ALTER COLUMN created_at SET NOT NULL; - -COMMIT; diff --git a/mcp/package.json b/mcp/package.json index 0cf1a98..e67683b 100644 --- a/mcp/package.json +++ b/mcp/package.json @@ -6,7 +6,12 @@ "main": "src/server.ts", "scripts": { "dev": "bun run src/server.ts", + "db:setup": "bun run src/migrate.ts setup", + "db:migrate": "bun run src/migrate.ts migrate", + "db:rollback": "bun run src/migrate.ts rollback", + "db:status": "bun run src/migrate.ts status", "ingest": "bun run src/ingest.ts", + "test": "bun test src/*.test.ts", "typecheck": "tsc --noEmit" }, "dependencies": { diff --git a/mcp/schema.sql b/mcp/schema.sql index cfbec26..1ee0178 100644 --- a/mcp/schema.sql +++ b/mcp/schema.sql @@ -1,8 +1,9 @@ -- PostgreSQL schema for docpull-mcp -- Requires PostgreSQL with pgvector extension --- Enable pgvector extension +-- Enable extensions used by the schema. 
CREATE EXTENSION IF NOT EXISTS vector; +CREATE EXTENSION IF NOT EXISTS pgcrypto; -- Documentation embeddings table CREATE TABLE IF NOT EXISTS doc_embeddings ( diff --git a/mcp/src/db.test.ts b/mcp/src/db.test.ts index a510ccd..d961df6 100644 --- a/mcp/src/db.test.ts +++ b/mcp/src/db.test.ts @@ -1,11 +1,14 @@ -import { describe, expect, test } from "bun:test"; +import { afterEach, describe, expect, test } from "bun:test"; import { type DbClient, type EmbeddingDocument, + isDbConfigured, replaceLibraryEmbeddingsWithClient, } from "./db.js"; import { EMBEDDING_DIMENSIONS } from "./embeddings.js"; +const TEST_DATABASE_URL_ENV = "DATABASE_URL"; + interface QueryRecord { sql: string; params?: readonly unknown[]; @@ -44,6 +47,10 @@ function doc(overrides: Partial = {}): EmbeddingDocument { }; } +afterEach(() => { + delete process.env[TEST_DATABASE_URL_ENV]; +}); + describe("replaceLibraryEmbeddingsWithClient", () => { test("deletes and reinserts one library inside a single transaction", async () => { const client = new FakeClient(); @@ -109,3 +116,14 @@ describe("replaceLibraryEmbeddingsWithClient", () => { expect(sqls[sqls.length - 1]).toBe("COMMIT"); }); }); + +describe("isDbConfigured", () => { + test("reads DATABASE_URL at call time", () => { + delete process.env[TEST_DATABASE_URL_ENV]; + expect(isDbConfigured()).toBe(false); + + process.env[TEST_DATABASE_URL_ENV] = + "postgresql://user:pass@localhost:5432/docpull"; + expect(isDbConfigured()).toBe(true); + }); +}); diff --git a/mcp/src/db.ts b/mcp/src/db.ts index a57dbed..1bd9793 100644 --- a/mcp/src/db.ts +++ b/mcp/src/db.ts @@ -11,7 +11,6 @@ import { errorMessage, logStructured } from "./logger.js"; // SETUP // ============================================================================ -const DATABASE_URL = process.env.DATABASE_URL; const DEFAULT_DB_POOL_MAX = 10; const DEFAULT_DB_POOL_MIN = 2; const DEFAULT_DB_IDLE_TIMEOUT_MS = 30_000; @@ -27,46 +26,65 @@ const PARAMS_PER_EMBEDDING_ROW = 6; // ceiling — otherwise 
large libraries (thousands of chunks) fail to index. const MAX_EMBEDDING_ROWS_PER_INSERT = Math.floor(32767 / PARAMS_PER_EMBEDDING_ROW); -const DB_POOL_MAX = readIntegerEnv("DB_POOL_MAX", DEFAULT_DB_POOL_MAX, { - min: 1, - max: 100, -}); -const DB_POOL_MIN = readIntegerEnv("DB_POOL_MIN", DEFAULT_DB_POOL_MIN, { - min: 0, - max: DB_POOL_MAX, -}); -const DB_IDLE_TIMEOUT_MS = readIntegerEnv( - "DB_IDLE_TIMEOUT_MS", - DEFAULT_DB_IDLE_TIMEOUT_MS, - { min: 1_000, max: 3_600_000 }, -); -const DB_CONNECTION_TIMEOUT_MS = readIntegerEnv( - "DB_CONNECTION_TIMEOUT_MS", - DEFAULT_DB_CONNECTION_TIMEOUT_MS, - { min: 1_000, max: 300_000 }, -); -const DB_STATEMENT_TIMEOUT_MS = readIntegerEnv( - "DB_STATEMENT_TIMEOUT_MS", - DEFAULT_DB_STATEMENT_TIMEOUT_MS, - { min: 1_000, max: 300_000 }, -); - let pool: Pool | null = null; +function getDatabaseUrl(): string | undefined { + return process.env.DATABASE_URL; +} + +function getDbPoolMax(): number { + return readIntegerEnv("DB_POOL_MAX", DEFAULT_DB_POOL_MAX, { + min: 1, + max: 100, + }); +} + +function getDbPoolMin(): number { + return readIntegerEnv("DB_POOL_MIN", DEFAULT_DB_POOL_MIN, { + min: 0, + max: getDbPoolMax(), + }); +} + +function getDbIdleTimeoutMs(): number { + return readIntegerEnv("DB_IDLE_TIMEOUT_MS", DEFAULT_DB_IDLE_TIMEOUT_MS, { + min: 1_000, + max: 3_600_000, + }); +} + +function getDbConnectionTimeoutMs(): number { + return readIntegerEnv( + "DB_CONNECTION_TIMEOUT_MS", + DEFAULT_DB_CONNECTION_TIMEOUT_MS, + { min: 1_000, max: 300_000 }, + ); +} + +function getDbStatementTimeoutMs(): number { + return readIntegerEnv("DB_STATEMENT_TIMEOUT_MS", DEFAULT_DB_STATEMENT_TIMEOUT_MS, { + min: 1_000, + max: 300_000, + }); +} + function getPool(): Pool { - if (!DATABASE_URL) { + const databaseUrl = getDatabaseUrl(); + if (!databaseUrl) { throw new Error("DATABASE_URL environment variable is required"); } if (!pool) { + const dbPoolMax = getDbPoolMax(); + const dbStatementTimeoutMs = getDbStatementTimeoutMs(); pool = new Pool({ - 
connectionString: DATABASE_URL, - max: DB_POOL_MAX, - min: DB_POOL_MIN, - idleTimeoutMillis: DB_IDLE_TIMEOUT_MS, - connectionTimeoutMillis: DB_CONNECTION_TIMEOUT_MS, - statement_timeout: DB_STATEMENT_TIMEOUT_MS, - query_timeout: DB_STATEMENT_TIMEOUT_MS, + connectionString: databaseUrl, + max: dbPoolMax, + min: getDbPoolMin(), + idleTimeoutMillis: getDbIdleTimeoutMs(), + connectionTimeoutMillis: getDbConnectionTimeoutMs(), + statement_timeout: dbStatementTimeoutMs, + query_timeout: dbStatementTimeoutMs, }); pool.on("error", (error) => { @@ -80,7 +98,7 @@ function getPool(): Pool { } export function isDbConfigured(): boolean { - return !!DATABASE_URL; + return !!getDatabaseUrl(); } async function closePool(signal: string): Promise { diff --git a/mcp/src/embeddings.test.ts b/mcp/src/embeddings.test.ts new file mode 100644 index 0000000..3729539 --- /dev/null +++ b/mcp/src/embeddings.test.ts @@ -0,0 +1,29 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { + getConfiguredOpenAIClient, + requireConfiguredOpenAIClient, +} from "./embeddings.js"; + +const TEST_OPENAI_API_KEY_ENV = "OPENAI_API_KEY"; + +afterEach(() => { + delete process.env[TEST_OPENAI_API_KEY_ENV]; +}); + +describe("OpenAI client configuration", () => { + test("reads OPENAI_API_KEY at call time", () => { + delete process.env[TEST_OPENAI_API_KEY_ENV]; + expect(getConfiguredOpenAIClient()).toBeNull(); + + process.env[TEST_OPENAI_API_KEY_ENV] = "sk-test-123456789012"; + expect(getConfiguredOpenAIClient()).not.toBeNull(); + }); + + test("throws when OPENAI_API_KEY is missing", () => { + delete process.env[TEST_OPENAI_API_KEY_ENV]; + + expect(() => requireConfiguredOpenAIClient()).toThrow( + "OPENAI_API_KEY environment variable required", + ); + }); +}); diff --git a/mcp/src/embeddings.ts b/mcp/src/embeddings.ts index 89abe7d..dbac3a6 100644 --- a/mcp/src/embeddings.ts +++ b/mcp/src/embeddings.ts @@ -10,26 +10,34 @@ const DEFAULT_OPENAI_MAX_RETRIES = 2; const 
DEFAULT_CIRCUIT_FAILURE_THRESHOLD = 5; const DEFAULT_CIRCUIT_RESET_MS = 60_000; -const OPENAI_TIMEOUT_MS = readIntegerEnv( - "OPENAI_TIMEOUT_MS", - DEFAULT_OPENAI_TIMEOUT_MS, - { min: 1_000, max: 300_000 }, -); -const OPENAI_MAX_RETRIES = readIntegerEnv( - "OPENAI_MAX_RETRIES", - DEFAULT_OPENAI_MAX_RETRIES, - { min: 0, max: 10 }, -); -const CIRCUIT_FAILURE_THRESHOLD = readIntegerEnv( - "OPENAI_CIRCUIT_FAILURE_THRESHOLD", - DEFAULT_CIRCUIT_FAILURE_THRESHOLD, - { min: 1, max: 100 }, -); -const CIRCUIT_RESET_MS = readIntegerEnv( - "OPENAI_CIRCUIT_RESET_MS", - DEFAULT_CIRCUIT_RESET_MS, - { min: 1_000, max: 3_600_000 }, -); +function getOpenAITimeoutMs(): number { + return readIntegerEnv("OPENAI_TIMEOUT_MS", DEFAULT_OPENAI_TIMEOUT_MS, { + min: 1_000, + max: 300_000, + }); +} + +function getOpenAIMaxRetries(): number { + return readIntegerEnv("OPENAI_MAX_RETRIES", DEFAULT_OPENAI_MAX_RETRIES, { + min: 0, + max: 10, + }); +} + +function getCircuitFailureThreshold(): number { + return readIntegerEnv( + "OPENAI_CIRCUIT_FAILURE_THRESHOLD", + DEFAULT_CIRCUIT_FAILURE_THRESHOLD, + { min: 1, max: 100 }, + ); +} + +function getCircuitResetMs(): number { + return readIntegerEnv("OPENAI_CIRCUIT_RESET_MS", DEFAULT_CIRCUIT_RESET_MS, { + min: 1_000, + max: 3_600_000, + }); +} class CircuitBreaker { private failures = 0; @@ -39,14 +47,15 @@ class CircuitBreaker { if (this.openedAt === null) { return; } + const circuitResetMs = getCircuitResetMs(); const elapsedMs = Date.now() - this.openedAt; - if (elapsedMs >= CIRCUIT_RESET_MS) { + if (elapsedMs >= circuitResetMs) { this.openedAt = null; this.failures = 0; return; } throw new Error( - `OpenAI circuit is open; retry after ${Math.ceil((CIRCUIT_RESET_MS - elapsedMs) / 1000)}s`, + `OpenAI circuit is open; retry after ${Math.ceil((circuitResetMs - elapsedMs) / 1000)}s`, ); } @@ -57,7 +66,10 @@ class CircuitBreaker { recordFailure(error: unknown): void { this.failures += 1; - if (this.failures >= CIRCUIT_FAILURE_THRESHOLD && this.openedAt 
=== null) { + if ( + this.failures >= getCircuitFailureThreshold() && + this.openedAt === null + ) { this.openedAt = Date.now(); logStructured("error", "OpenAI circuit opened", { failures: this.failures, @@ -72,8 +84,8 @@ const embeddingCircuit = new CircuitBreaker(); function createOpenAIClient(apiKey: string): OpenAI { return new OpenAI({ apiKey, - timeout: OPENAI_TIMEOUT_MS, - maxRetries: OPENAI_MAX_RETRIES, + timeout: getOpenAITimeoutMs(), + maxRetries: getOpenAIMaxRetries(), }); } @@ -96,14 +108,16 @@ export async function createEmbeddings( ): Promise { embeddingCircuit.beforeRequest(); try { + const openAITimeoutMs = getOpenAITimeoutMs(); + const openAIMaxRetries = getOpenAIMaxRetries(); const response = await client.embeddings.create( { model: EMBEDDING_MODEL, input, }, { - timeout: OPENAI_TIMEOUT_MS, - maxRetries: OPENAI_MAX_RETRIES, + timeout: openAITimeoutMs, + maxRetries: openAIMaxRetries, }, ); const embeddings = response.data.map((item) => item.embedding); diff --git a/mcp/src/env.test.ts b/mcp/src/env.test.ts new file mode 100644 index 0000000..038122c --- /dev/null +++ b/mcp/src/env.test.ts @@ -0,0 +1,24 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { readIntegerEnv } from "./env.js"; + +const TEST_ENV = "DOCPULL_MCP_TEST_INTEGER"; + +afterEach(() => { + delete process.env[TEST_ENV]; +}); + +describe("readIntegerEnv", () => { + test("rejects partially numeric values", () => { + process.env[TEST_ENV] = "10abc"; + + expect(() => + readIntegerEnv(TEST_ENV, 5, { min: 1, max: 20 }), + ).toThrow("DOCPULL_MCP_TEST_INTEGER must be an integer between 1 and 20"); + }); + + test("accepts valid integer values", () => { + process.env[TEST_ENV] = "10"; + + expect(readIntegerEnv(TEST_ENV, 5, { min: 1, max: 20 })).toBe(10); + }); +}); diff --git a/mcp/src/env.ts b/mcp/src/env.ts index 3a303ca..d4b35c1 100644 --- a/mcp/src/env.ts +++ b/mcp/src/env.ts @@ -13,7 +13,7 @@ export function readIntegerEnv( if (raw === undefined || raw === "") { 
return defaultValue; } - const parsed = Number.parseInt(raw, 10); + const parsed = Number(raw); if (!Number.isInteger(parsed) || parsed < min || parsed > max) { throw new Error(`${name} must be an integer between ${min} and ${max}`); } diff --git a/mcp/src/ingest.test.ts b/mcp/src/ingest.test.ts new file mode 100644 index 0000000..dd7b1c9 --- /dev/null +++ b/mcp/src/ingest.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, test } from "bun:test"; +import { chunkText } from "./ingest.js"; + +describe("chunkText", () => { + test("drops empty chunks for empty files", () => { + expect(chunkText("", 1000, 200)).toEqual([]); + }); + + test("drops whitespace-only chunks", () => { + expect(chunkText(" \n\t\n", 1000, 200)).toEqual([]); + }); + + test("preserves non-empty content", () => { + expect(chunkText("# Heading\n\nBody", 1000, 200)).toEqual([ + "# Heading\n\nBody", + ]); + }); +}); diff --git a/mcp/src/ingest.ts b/mcp/src/ingest.ts index 74b5751..8a9522e 100644 --- a/mcp/src/ingest.ts +++ b/mcp/src/ingest.ts @@ -60,7 +60,15 @@ function estimateTokens(text: string): number { return Math.ceil(text.length / 4); } -function chunkText(text: string, maxTokens: number, overlap: number): string[] { +function hasNonWhitespaceContent(text: string): boolean { + return text.trim().length > 0; +} + +export function chunkText( + text: string, + maxTokens: number, + overlap: number, +): string[] { const chunks: string[] = []; const lines = text.split("\n"); let currentChunk: string[] = []; @@ -70,7 +78,10 @@ function chunkText(text: string, maxTokens: number, overlap: number): string[] { const lineTokens = estimateTokens(line); if (currentTokens + lineTokens > maxTokens && currentChunk.length > 0) { - chunks.push(currentChunk.join("\n")); + const completedChunk = currentChunk.join("\n"); + if (hasNonWhitespaceContent(completedChunk)) { + chunks.push(completedChunk); + } const overlapLines: string[] = []; let overlapTokens = 0; @@ -90,7 +101,10 @@ function chunkText(text: string, 
maxTokens: number, overlap: number): string[] { } if (currentChunk.length > 0) { - chunks.push(currentChunk.join("\n")); + const finalChunk = currentChunk.join("\n"); + if (hasNonWhitespaceContent(finalChunk)) { + chunks.push(finalChunk); + } } return chunks; diff --git a/mcp/src/migrate.test.ts b/mcp/src/migrate.test.ts new file mode 100644 index 0000000..5562536 --- /dev/null +++ b/mcp/src/migrate.test.ts @@ -0,0 +1,170 @@ +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, test } from "bun:test"; +import { + listMigrationFiles, + migrateDatabase, + migrationStatus, + parseMigrationFilename, + rollbackLatestMigration, + setupDatabase, +} from "./migrate.js"; + +interface QueryRecord { + sql: string; + params?: readonly unknown[]; +} + +class FakeMigrationClient { + readonly queries: QueryRecord[] = []; + private readonly applied = new Set(); + + constructor(applied: string[] = []) { + for (const id of applied) { + this.applied.add(id); + } + } + + async query( + sql: string, + params?: readonly unknown[], + ): Promise<{ rows: Array<{ id: string }> }> { + this.queries.push({ sql, params }); + if (sql.startsWith("SELECT id FROM docpull_mcp_migrations")) { + return { rows: [...this.applied].sort().map((id) => ({ id })) }; + } + if (sql.startsWith("INSERT INTO docpull_mcp_migrations")) { + this.applied.add(String(params?.[0])); + } + if (sql.startsWith("DELETE FROM docpull_mcp_migrations")) { + this.applied.delete(String(params?.[0])); + } + return { rows: [] }; + } +} + +let tempDirs: string[] = []; + +function tempMigrationDir(): string { + const dir = mkdtempSync(join(tmpdir(), "docpull-mcp-migrations-")); + tempDirs.push(dir); + return dir; +} + +function writeMigration(dir: string, filename: string, sql: string): void { + writeFileSync(join(dir, filename), sql); +} + +afterEach(() => { + for (const dir of tempDirs) { + rmSync(dir, { recursive: 
true, force: true }); + } + tempDirs = []; +}); + +describe("migration filename discovery", () => { + test("parses valid migration filenames", () => { + expect(parseMigrationFilename("001_harden_embeddings.up.sql")).toMatchObject({ + id: "001_harden_embeddings", + direction: "up", + }); + expect(parseMigrationFilename("001_harden_embeddings.down.sql")).toMatchObject({ + id: "001_harden_embeddings", + direction: "down", + }); + }); + + test("ignores non-migration files and sorts migrations", () => { + const dir = tempMigrationDir(); + writeMigration(dir, "002_second.up.sql", "SELECT 2"); + writeMigration(dir, "README.md", "ignore"); + writeMigration(dir, "001_first.up.sql", "SELECT 1"); + + expect(listMigrationFiles(dir).map((file) => file.filename)).toEqual([ + "001_first.up.sql", + "002_second.up.sql", + ]); + }); +}); + +describe("migrateDatabase", () => { + test("applies only pending up migrations and records them", async () => { + const dir = tempMigrationDir(); + writeMigration(dir, "001_first.up.sql", "SELECT 1"); + writeMigration(dir, "002_second.up.sql", "SELECT 2"); + const client = new FakeMigrationClient(["001_first"]); + + const ran = await migrateDatabase(client, { migrationDir: dir }); + + expect(ran).toEqual(["002_second"]); + expect(client.queries.map((query) => query.sql)).toEqual([ + expect.stringContaining("CREATE TABLE IF NOT EXISTS docpull_mcp_migrations"), + "SELECT id FROM docpull_mcp_migrations ORDER BY id", + "BEGIN", + "SELECT 2", + "INSERT INTO docpull_mcp_migrations (id) VALUES ($1)", + "COMMIT", + ]); + }); + + test("setup applies schema before pending migrations", async () => { + const dir = tempMigrationDir(); + const schemaFile = join(dir, "schema.sql"); + writeFileSync(schemaFile, "CREATE EXTENSION vector"); + writeMigration(dir, "001_first.up.sql", "SELECT 1"); + const client = new FakeMigrationClient(); + + const ran = await setupDatabase(client, { schemaFile, migrationDir: dir }); + + expect(ran).toEqual(["001_first"]); + 
expect(client.queries[0].sql).toBe("CREATE EXTENSION vector"); + }); +}); + +describe("rollbackLatestMigration", () => { + test("rolls back the latest applied migration", async () => { + const dir = tempMigrationDir(); + writeMigration(dir, "001_first.down.sql", "SELECT 'down 1'"); + writeMigration(dir, "002_second.down.sql", "SELECT 'down 2'"); + const client = new FakeMigrationClient(["001_first", "002_second"]); + + const rolledBack = await rollbackLatestMigration(client, { + migrationDir: dir, + }); + + expect(rolledBack).toBe("002_second"); + expect(client.queries.map((query) => query.sql)).toContain("SELECT 'down 2'"); + expect(client.queries.map((query) => query.sql)).toContain( + "DELETE FROM docpull_mcp_migrations WHERE id = $1", + ); + }); + + test("returns null when nothing has been applied", async () => { + const client = new FakeMigrationClient(); + + await expect(rollbackLatestMigration(client)).resolves.toBeNull(); + }); +}); + +describe("migrationStatus", () => { + test("reports applied and pending migrations", async () => { + const dir = tempMigrationDir(); + writeMigration(dir, "001_first.up.sql", "SELECT 1"); + writeMigration(dir, "002_second.up.sql", "SELECT 2"); + const client = new FakeMigrationClient(["001_first"]); + + await expect(migrationStatus(client, { migrationDir: dir })).resolves.toEqual([ + { + id: "001_first", + filename: "001_first.up.sql", + applied: true, + }, + { + id: "002_second", + filename: "002_second.up.sql", + applied: false, + }, + ]); + }); +}); diff --git a/mcp/src/migrate.ts b/mcp/src/migrate.ts new file mode 100644 index 0000000..9a4e134 --- /dev/null +++ b/mcp/src/migrate.ts @@ -0,0 +1,269 @@ +#!/usr/bin/env bun +import { existsSync, readdirSync, readFileSync } from "node:fs"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; +import { Pool } from "pg"; +import { errorMessage } from "./logger.js"; + +type Direction = "up" | "down"; + +interface Queryable { + query(sql: 
string, params?: readonly unknown[]): Promise; + end?(): Promise; +} + +export interface MigrationFile { + id: string; + direction: Direction; + filename: string; + path: string; +} + +export interface MigrationStatus { + id: string; + filename: string; + applied: boolean; +} + +const MIGRATION_RE = /^([0-9]+_[a-z0-9_]+)\.(up|down)\.sql$/; +const MIGRATIONS_TABLE = "docpull_mcp_migrations"; + +function repoRoot(): string { + return resolve(dirname(fileURLToPath(import.meta.url)), ".."); +} + +function migrationsDir(): string { + return join(repoRoot(), "migrations"); +} + +function schemaPath(): string { + return join(repoRoot(), "schema.sql"); +} + +function getDatabaseUrl(): string { + const url = process.env.DATABASE_URL; + if (!url) { + throw new Error("DATABASE_URL environment variable is required"); + } + return url; +} + +export function parseMigrationFilename( + filename: string, + dir = migrationsDir(), +): MigrationFile | null { + const match = MIGRATION_RE.exec(filename); + if (!match) { + return null; + } + return { + id: match[1], + direction: match[2] as Direction, + filename, + path: join(dir, filename), + }; +} + +export function listMigrationFiles(dir = migrationsDir()): MigrationFile[] { + if (!existsSync(dir)) { + return []; + } + return readdirSync(dir) + .map((filename) => parseMigrationFilename(filename, dir)) + .filter((file): file is MigrationFile => file !== null) + .sort((a, b) => a.filename.localeCompare(b.filename)); +} + +function upMigrations(dir = migrationsDir()): MigrationFile[] { + return listMigrationFiles(dir).filter((file) => file.direction === "up"); +} + +function downMigrationFor(id: string, dir = migrationsDir()): MigrationFile | null { + return ( + listMigrationFiles(dir).find( + (file) => file.id === id && file.direction === "down", + ) ?? 
null + ); +} + +async function ensureMigrationsTable(client: Queryable): Promise { + await client.query(` + CREATE TABLE IF NOT EXISTS ${MIGRATIONS_TABLE} ( + id TEXT PRIMARY KEY, + applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); +} + +async function appliedMigrationIds(client: Queryable): Promise> { + const result = await client.query( + `SELECT id FROM ${MIGRATIONS_TABLE} ORDER BY id`, + ); + const rows = (result as { rows?: Array<{ id: unknown }> }).rows ?? []; + return new Set(rows.map((row) => String(row.id))); +} + +async function applySqlFile(client: Queryable, path: string): Promise { + await client.query(readFileSync(path, "utf-8")); +} + +export async function setupDatabase( + client: Queryable, + { + schemaFile = schemaPath(), + migrationDir = migrationsDir(), + }: { schemaFile?: string; migrationDir?: string } = {}, +): Promise { + await applySqlFile(client, schemaFile); + return migrateDatabase(client, { migrationDir }); +} + +export async function migrateDatabase( + client: Queryable, + { migrationDir = migrationsDir() }: { migrationDir?: string } = {}, +): Promise { + await ensureMigrationsTable(client); + const applied = await appliedMigrationIds(client); + const ran: string[] = []; + + for (const migration of upMigrations(migrationDir)) { + if (applied.has(migration.id)) { + continue; + } + await client.query("BEGIN"); + try { + await applySqlFile(client, migration.path); + await client.query( + `INSERT INTO ${MIGRATIONS_TABLE} (id) VALUES ($1)`, + [migration.id], + ); + await client.query("COMMIT"); + ran.push(migration.id); + } catch (error) { + await client.query("ROLLBACK"); + throw error; + } + } + + return ran; +} + +export async function rollbackLatestMigration( + client: Queryable, + { migrationDir = migrationsDir() }: { migrationDir?: string } = {}, +): Promise { + await ensureMigrationsTable(client); + const applied = [...(await appliedMigrationIds(client))].sort(); + const latest = applied.at(-1); + if (!latest) { + return null; 
+ } + + const migration = downMigrationFor(latest, migrationDir); + if (!migration) { + throw new Error(`No down migration found for ${latest}`); + } + + await client.query("BEGIN"); + try { + await applySqlFile(client, migration.path); + await client.query(`DELETE FROM ${MIGRATIONS_TABLE} WHERE id = $1`, [ + latest, + ]); + await client.query("COMMIT"); + return latest; + } catch (error) { + await client.query("ROLLBACK"); + throw error; + } +} + +export async function migrationStatus( + client: Queryable, + { migrationDir = migrationsDir() }: { migrationDir?: string } = {}, +): Promise { + await ensureMigrationsTable(client); + const applied = await appliedMigrationIds(client); + return upMigrations(migrationDir).map((migration) => ({ + id: migration.id, + filename: migration.filename, + applied: applied.has(migration.id), + })); +} + +async function withPool(fn: (client: Queryable) => Promise): Promise { + const pool = new Pool({ connectionString: getDatabaseUrl() }); + try { + return await fn(pool); + } finally { + await pool.end(); + } +} + +function printUsage(): void { + process.stderr.write(`Usage: bun run src/migrate.ts + +Commands: + setup Apply schema.sql, create migration tracking, then apply pending migrations + migrate Apply pending *.up.sql migrations + rollback Apply the latest applied *.down.sql migration + status Show applied and pending migrations +`); +} + +async function main(): Promise { + const command = process.argv[2] ?? "status"; + try { + if (command === "setup") { + const ran = await withPool((client) => setupDatabase(client)); + process.stdout.write( + ran.length === 0 + ? "Database schema is ready; no pending migrations.\n" + : `Database schema is ready; applied migrations: ${ran.join(", ")}\n`, + ); + return 0; + } + if (command === "migrate") { + const ran = await withPool((client) => migrateDatabase(client)); + process.stdout.write( + ran.length === 0 + ? 
"No pending migrations.\n" + : `Applied migrations: ${ran.join(", ")}\n`, + ); + return 0; + } + if (command === "rollback") { + const rolledBack = await withPool((client) => + rollbackLatestMigration(client), + ); + process.stdout.write( + rolledBack + ? `Rolled back migration: ${rolledBack}\n` + : "No applied migrations to roll back.\n", + ); + return 0; + } + if (command === "status") { + const statuses = await withPool((client) => migrationStatus(client)); + if (statuses.length === 0) { + process.stdout.write("No migrations found.\n"); + return 0; + } + for (const status of statuses) { + process.stdout.write( + `${status.applied ? "applied" : "pending"} ${status.filename}\n`, + ); + } + return 0; + } + printUsage(); + return 1; + } catch (error) { + process.stderr.write(`Migration failed: ${errorMessage(error)}\n`); + return 1; + } +} + +if (import.meta.main) { + process.exitCode = await main(); +} diff --git a/mcp/src/server.ts b/mcp/src/server.ts index b261c3a..fc98ffc 100644 --- a/mcp/src/server.ts +++ b/mcp/src/server.ts @@ -42,8 +42,6 @@ const DOCPULL_TIMEOUT_MS = 10 * 60 * 1_000; const DOCPULL_KILL_GRACE_MS = 5_000; const MAX_DOCPULL_STDERR_BYTES = 10_000; -const openai = getConfiguredOpenAIClient(); - // ============================================================================ // SOURCE CONFIG // ============================================================================ @@ -393,7 +391,7 @@ async function runDocpull( // MCP SERVER // ============================================================================ -const server = new McpServer({ name: "docpull-mcp", version: "0.2.0" }); +const server = new McpServer({ name: "docpull-mcp", version: "0.3.0" }); // --------------------------------------------------------------------------- // ensure_docs - fetch and optionally index documentation @@ -437,6 +435,7 @@ server.tool( isError: true, }; } + const openai = getConfiguredOpenAIClient(); if (index && !openai) { return { content: [ @@ -499,6 +498,9 @@ 
server.tool( const fileCount = await countMarkdownFiles(join(DOCS_DIR, name)); let indexed = cache.exists ? cache.indexed : false; + if (needsFetch) { + indexed = false; + } // Index if requested and configured if (needsIndex && (!cache.exists || !cache.indexed || needsFetch)) { @@ -580,10 +582,9 @@ server.tool( // search_docs - semantic search (requires DB + OpenAI) // --------------------------------------------------------------------------- -if (isDbConfigured() && openai) { - server.tool( - "search_docs", - `Semantic search for CONCEPTS - use when you don't know the exact name. +server.tool( + "search_docs", + `Semantic search for CONCEPTS - use when you don't know the exact name. Use grep_docs instead if you're looking for a specific method/function name. @@ -591,161 +592,171 @@ Examples: - "how to stream responses" - "row level security" - "make object properties optional"`, - { - query: z - .string() - .min(2) - .max(500) - .describe("Natural language search query"), - library: z.string().optional().describe("Filter to specific library"), - limit: z - .number() - .int() - .min(1) - .max(50) - .default(5) - .describe("Max results (default: 5)"), - }, - async ({ query, library, limit }) => { - try { - const [queryEmbedding] = await createEmbeddings(openai, query); - if (!queryEmbedding) { - throw new Error("Failed to generate embedding"); - } - - const results = await searchDocs(queryEmbedding, { library, limit }); - - if (results.length === 0) { - return { - content: [ - { - type: "text" as const, - text: "No results found for: " + query, - }, - ], - }; - } + { + query: z + .string() + .min(2) + .max(500) + .describe("Natural language search query"), + library: z.string().optional().describe("Filter to specific library"), + limit: z + .number() + .int() + .min(1) + .max(50) + .default(5) + .describe("Max results (default: 5)"), + }, + async ({ query, library, limit }) => { + try { + if (!isDbConfigured()) { + throw new Error("DATABASE_URL is not 
configured"); + } + const openai = getConfiguredOpenAIClient(); + if (!openai) { + throw new Error("OPENAI_API_KEY is not configured"); + } + const [queryEmbedding] = await createEmbeddings(openai, query); + if (!queryEmbedding) { + throw new Error("Failed to generate embedding"); + } - const output = results - .map( - (r) => - `## ${r.library} - ${r.file_path}\n*Similarity: ${(r.similarity * 100).toFixed(1)}%*\n\n${r.content}`, - ) - .join("\n\n---\n\n"); + const results = await searchDocs(queryEmbedding, { library, limit }); - return { content: [{ type: "text" as const, text: output }] }; - } catch (error) { - const msg = errorMessage(error); + if (results.length === 0) { return { - content: [{ type: "text" as const, text: "Search failed: " + msg }], - isError: true, + content: [ + { + type: "text" as const, + text: "No results found for: " + query, + }, + ], }; } - }, - ); - // --------------------------------------------------------------------------- - // grep_docs - exact pattern matching (requires DB) - // --------------------------------------------------------------------------- + const output = results + .map( + (r) => + `## ${r.library} - ${r.file_path}\n*Similarity: ${(r.similarity * 100).toFixed(1)}%*\n\n${r.content}`, + ) + .join("\n\n---\n\n"); + + return { content: [{ type: "text" as const, text: output }] }; + } catch (error) { + const msg = errorMessage(error); + return { + content: [{ type: "text" as const, text: "Search failed: " + msg }], + isError: true, + }; + } + }, +); + +// --------------------------------------------------------------------------- +// grep_docs - exact pattern matching (requires DB) +// --------------------------------------------------------------------------- - server.tool( - "grep_docs", - `FAST exact text search - use for known method/function/component names. +server.tool( + "grep_docs", + `FAST exact text search - use for known method/function/component names. 
Examples: - "onConflictDoUpdate" - "usePrefetchQuery" - "streamText"`, - { - pattern: z.string().min(2).max(200).describe("Exact text to search for"), - library: z.string().optional().describe("Filter to specific library"), - limit: z - .number() - .int() - .min(1) - .max(20) - .default(5) - .describe("Max results (default: 5)"), - }, - async ({ pattern, library, limit }) => { - try { - const results = await grepDocs(pattern, { library, limit }); - - if (results.length === 0) { - return { - content: [ - { - type: "text" as const, - text: "No matches for: " + pattern, - }, - ], - }; - } + { + pattern: z.string().min(2).max(200).describe("Exact text to search for"), + library: z.string().optional().describe("Filter to specific library"), + limit: z + .number() + .int() + .min(1) + .max(20) + .default(5) + .describe("Max results (default: 5)"), + }, + async ({ pattern, library, limit }) => { + try { + if (!isDbConfigured()) { + throw new Error("DATABASE_URL is not configured"); + } + const results = await grepDocs(pattern, { library, limit }); - const output = results - .map((r) => { - const lines = r.content.split("\n"); - const patternLower = pattern.toLowerCase(); - const matchingLines = lines - .map((line, idx) => ({ line, idx })) - .filter(({ line }) => line.toLowerCase().includes(patternLower)) - .slice(0, 3) - .map(({ line, idx }) => ` ${idx + 1}: ${line.trim()}`) - .join("\n"); - return `## ${r.library} - ${r.file_path}\n${matchingLines}`; - }) - .join("\n\n"); - - return { content: [{ type: "text" as const, text: output }] }; - } catch (error) { - const msg = errorMessage(error); + if (results.length === 0) { return { - content: [{ type: "text" as const, text: "Grep failed: " + msg }], - isError: true, + content: [ + { + type: "text" as const, + text: "No matches for: " + pattern, + }, + ], }; } - }, - ); - - // --------------------------------------------------------------------------- - // list_indexed - list indexed libraries in the database - // 
--------------------------------------------------------------------------- - - server.tool( - "list_indexed", - "List all indexed documentation libraries with chunk counts", - {}, - async () => { - try { - const libraries = await listLibraries(); - if (libraries.length === 0) { - return { - content: [ - { - type: "text" as const, - text: "No libraries indexed. Use ensure_docs to fetch and index documentation.", - }, - ], - }; - } + const output = results + .map((r) => { + const lines = r.content.split("\n"); + const patternLower = pattern.toLowerCase(); + const matchingLines = lines + .map((line, idx) => ({ line, idx })) + .filter(({ line }) => line.toLowerCase().includes(patternLower)) + .slice(0, 3) + .map(({ line, idx }) => ` ${idx + 1}: ${line.trim()}`) + .join("\n"); + return `## ${r.library} - ${r.file_path}\n${matchingLines}`; + }) + .join("\n\n"); + + return { content: [{ type: "text" as const, text: output }] }; + } catch (error) { + const msg = errorMessage(error); + return { + content: [{ type: "text" as const, text: "Grep failed: " + msg }], + isError: true, + }; + } + }, +); - const lines = libraries.map( - (l) => `- ${l.library}: ${l.chunks} chunks`, - ); - return { - content: [{ type: "text" as const, text: lines.join("\n") }], - }; - } catch (error) { - const msg = errorMessage(error); +// --------------------------------------------------------------------------- +// list_indexed - list indexed libraries in the database +// --------------------------------------------------------------------------- + +server.tool( + "list_indexed", + "List all indexed documentation libraries with chunk counts", + {}, + async () => { + try { + if (!isDbConfigured()) { + throw new Error("DATABASE_URL is not configured"); + } + const libraries = await listLibraries(); + + if (libraries.length === 0) { return { - content: [{ type: "text" as const, text: "Failed to list: " + msg }], - isError: true, + content: [ + { + type: "text" as const, + text: "No libraries 
indexed. Use ensure_docs to fetch and index documentation.", + }, + ], }; } - }, - ); -} + + const lines = libraries.map((l) => `- ${l.library}: ${l.chunks} chunks`); + return { + content: [{ type: "text" as const, text: lines.join("\n") }], + }; + } catch (error) { + const msg = errorMessage(error); + return { + content: [{ type: "text" as const, text: "Failed to list: " + msg }], + isError: true, + }; + } + }, +); // --------------------------------------------------------------------------- // START SERVER diff --git a/plugin/.claude-plugin/plugin.json b/plugin/.claude-plugin/plugin.json index 5f71674..518d37e 100644 --- a/plugin/.claude-plugin/plugin.json +++ b/plugin/.claude-plugin/plugin.json @@ -1,18 +1,18 @@ { "name": "docpull", - "version": "0.2.0", - "description": "Pull docs from any URL into Claude Code. Indexes static docs sites in seconds with conditional-GET caching, then exposes them as MCP tools (fetch_url, ensure_docs, list_sources, list_indexed, grep_docs, read_doc, add_source, remove_source). Local, browser-free, no API keys.", + "version": "4.0.0", + "description": "Pull server-rendered web content from any URL into Claude Code. Indexes sites in seconds with conditional-GET caching, then exposes them as MCP tools (fetch_url, ensure_docs, list_sources, list_indexed, grep_docs, read_doc, add_source, remove_source). 
Local, browser-free, no API keys.", "author": { "name": "Raintree Technology", "email": "support@raintree.technology", - "url": "https://raintree.technology" + "url": "https://github.com/raintree-technology/docpull" }, "homepage": "https://github.com/raintree-technology/docpull", "repository": "https://github.com/raintree-technology/docpull", "license": "MIT", "keywords": [ - "documentation", - "docs", + "web", + "crawler", "fetch", "markdown", "rag", diff --git a/plugin/.codex-plugin/plugin.json b/plugin/.codex-plugin/plugin.json new file mode 100644 index 0000000..463474a --- /dev/null +++ b/plugin/.codex-plugin/plugin.json @@ -0,0 +1,6 @@ +{ + "name": "docpull", + "version": "4.0.0", + "description": "Pull server-rendered web content from any URL into Codex through docpull's local MCP server.", + "skills": "./skills/" +} diff --git a/plugin/README.md b/plugin/README.md index 1fb5535..93c4dfd 100644 --- a/plugin/README.md +++ b/plugin/README.md @@ -1,6 +1,17 @@ # docpull plugin for Claude Code -Pull docs from any URL into Claude Code. Local, fast, no API keys. +Pull server-rendered web content from any URL into Claude Code. Local, fast, no API keys. + +This package is the agent wrapper around docpull's local MCP server. Claude Code +can install it as a Claude plugin; Codex can package the same `skills/` folder +through `.codex-plugin/plugin.json`. Cursor and Claude Desktop connect +`docpull mcp` directly. + +This repo also carries host-native project guidance for the direct-MCP paths: +Claude Code can read `.mcp.json` plus `CLAUDE.md`; Cursor reads +`.cursor/mcp.json` plus `.cursor/rules/docpull-research.mdc`; Codex reads +`AGENTS.md`, supports project `.codex/config.toml` in trusted repos, and can +discover repo skills from `.agents/skills`. ## What you get @@ -8,13 +19,13 @@ Pull docs from any URL into Claude Code. Local, fast, no API keys. 
- Read: `fetch_url`, `list_sources`, `list_indexed`, `grep_docs`, `read_doc` - Write: `ensure_docs`, `add_source`, `remove_source` - All read tools advertise `readOnlyHint` so hosts that auto-approve safe tools won't prompt for them. -- **Slash commands**: - - `/docs-add ` — fetch a library into the local index. - - `/docs-search [library]` — regex-search cached docs and pull surrounding context for the top hits. - - `/docs-list` — show what's cached, with last-fetched age. - - `/docs-refresh ` — bypass the 7-day cache and re-fetch. - - `/docs-remove [--keep-cache]` — drop a user alias and its cached docs. -- **Meta-skill** (`docpull-research`): teaches Claude *when* to reach for docpull — so you don't have to remember the tool exists every time you ask about a library. +- **MCP prompts**: + - `/mcp__docpull__docs_add ` — fetch a built-in alias, or register an HTTPS docs URL and then fetch it into the local index. + - `/mcp__docpull__docs_search [library]` — regex-search cached docs and pull surrounding context for the top hits. + - `/mcp__docpull__docs_list` — show what's cached, with last-fetched age. + - `/mcp__docpull__docs_refresh ` — bypass the 7-day cache and re-fetch. + - `/mcp__docpull__docs_remove [--keep-cache]` — drop a user alias and its cached docs. +- **Meta-skill** (`docpull-research`): teaches Claude *when* to reach for docpull — so you don't have to remember the tool exists every time you ask about a library or web source. ## Prerequisite @@ -24,7 +35,7 @@ MCP server is available: ```bash pip install 'docpull[mcp]' # or: pipx install 'docpull[mcp]' # uv tool install 'docpull[mcp]' -docpull --version # should print 2.5.0 or newer +docpull --version # should print 4.0.0 or newer docpull mcp --help # confirm the MCP subcommand is wired ``` @@ -43,10 +54,42 @@ In Claude Code: The MCP server starts automatically. The skill activates when you ask Claude about a specific library. 
+In Codex, the same folder is a Codex plugin source via +`plugin/.codex-plugin/plugin.json`. For direct MCP setup, use: + +```bash +codex mcp add docpull -- docpull mcp +``` + +Or, in a trusted repo, add this to `.codex/config.toml`: + +```toml +[mcp_servers.docpull] +command = "docpull" +args = ["mcp"] +``` + +From this repo, regenerate Codex's project config, repo-scoped skill copy, and +local plugin marketplace with: + +```bash +make sync-agent-host-configs +``` + +For local bundle installs or smoke tests, first generate the self-contained +bundle: + +```bash +python scripts/sync_claude_plugin.py +``` + +Then point Claude Code at `.claude-plugin/`. The bundle is generated from +`plugin/`; the copied plugin payload is not checked into git. + ## 60-second demo ``` -> /docs-add fastapi +> /mcp__docpull__docs_add fastapi [fetches the FastAPI docs in ~15s; ~400 pages, full-text indexed locally] > How does FastAPI handle dependency injection scoping? @@ -54,20 +97,30 @@ The MCP server starts automatically. The skill activates when you ask Claude abo relevant section, and answers with attribution to the actual docs file] ``` +Older `/docs-add` plugin command wrappers are intentionally not shipped; the +workflows now live with the MCP server so they work through any host that +supports MCP prompts. + ## Built-in library aliases These are fetchable by name without any URL setup: `react`, `nextjs`, `tailwindcss`, `vite`, `hono`, `fastapi`, `express`, `anthropic`, `openai`, `langchain`, `supabase`, `drizzle`, `prisma`. -For anything else, pass an HTTPS URL: `/docs-add https://docs.your-library.com`. +For anything else, pass an HTTPS URL to the prompt. It derives an alias, writes +that alias to `~/.config/docpull-mcp/sources.yaml`, then calls `ensure_docs`: + +`/mcp__docpull__docs_add https://docs.your-library.com`. ## Where docs are cached -By default, fetched docs live under `$XDG_DATA_HOME/docpull/docs/` (or `~/.local/share/docpull/docs/` on macOS/Linux). 
Override with `DOCPULL_DOCS_DIR` if you want them somewhere else (e.g. one cache per project). +By default, fetched docs live under `$XDG_DATA_HOME/docpull-mcp/docs/` (or +`~/.local/share/docpull-mcp/docs/` on macOS/Linux). Override with +`DOCPULL_DOCS_DIR` if you want them somewhere else (e.g. one cache per +project). ## Privacy - 100% local. No telemetry. No remote services. -- The plugin only sends HTTP requests to the docs URLs you ask it to fetch. +- The plugin only sends HTTP requests to the URLs you ask it to fetch. - The User-Agent is `docpull/ (+https://github.com/raintree-technology/docpull)` — public, identifiable, robots.txt-respecting. ## Troubleshooting @@ -75,13 +128,14 @@ By default, fetched docs live under `$XDG_DATA_HOME/docpull/docs/` (or `~/.local | Symptom | Fix | |---------------------------------------------|-----| | MCP tools missing after install | Run `docpull mcp --help`. If it errors with "requires the 'mcp' package", reinstall with `pip install 'docpull[mcp]'`. | -| `/docs-add fastapi` says "unknown source" | Run `mcp__docpull__list_sources()` to see current aliases. Use a URL instead. | +| `/mcp__docpull__docs_add fastapi` says "unknown source" | Run `mcp__docpull__list_sources()` to see current aliases. If the source is not listed, use `/mcp__docpull__docs_add ` to register it. | | Slow first fetch | Normal — first crawl populates the cache. Subsequent runs hit the conditional-GET cache (~70 ms time-to-first-result). | | Want to refresh stale docs | `mcp__docpull__ensure_docs(source="", force=true)`. | -## Roadmap +## Notes -- **v0.3.0**: per-project docs cache directory, `/docs-skill ` for generating Claude Code skill scaffolds from fetched libraries, `docs-researcher` subagent for parallel multi-library research. +Direct MCP tools and MCP prompts are the supported workflow. Legacy plugin +command wrappers such as `/docs-add` are intentionally not shipped. 
## License diff --git a/plugin/commands/docs-add.md b/plugin/commands/docs-add.md deleted file mode 100644 index c2a2d9f..0000000 --- a/plugin/commands/docs-add.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -description: Fetch documentation for a library and make it searchable in this session. Accepts a built-in alias (e.g. "react"), an HTTPS URL, or "name url" to register a custom alias. -argument-hint: | | -allowed-tools: mcp__docpull__ensure_docs, mcp__docpull__add_source, mcp__docpull__list_sources, mcp__docpull__list_indexed ---- - -# Add docs to this session - -The user wants to add documentation to docpull's local index so it's searchable later via `/docs-search` (or directly via the `grep_docs` MCP tool). - -User input: **$ARGUMENTS** - -## How to handle the input - -Inspect `$ARGUMENTS`: - -1. **Empty or missing.** Reply with a one-line usage hint and stop: - `Usage: /docs-add , /docs-add , or /docs-add . Run /docs-list to see what's already cached.` - -2. **One token, no URL scheme** (e.g. `react`, `fastapi`). - - Treat as a built-in alias. Call `ensure_docs(source="")`. The default `rag` profile is right for most cases — only override if the user mentioned a specific profile. - - If the alias is unknown, the tool will return an error listing available aliases. In that case call `list_sources()` and suggest the closest match by edit distance, or recommend running `/docs-add ` with the docs URL. - -3. **One token, an HTTPS URL** (starts with `https://`). - - Auto-derive an alias name from the hostname: - 1. Take the hostname. - 2. Strip a leading `docs.` or `www.` if present. - 3. Take the first dot-separated label. - 4. Lowercase it. - 5. Examples: `https://docs.fastapi.tiangolo.com` → `fastapi`; `https://nextjs.org/docs` → `nextjs`; `https://example.com/api` → `example`. 
- - If the derived name collides with a built-in alias (`list_sources` to check) or an existing entry (`list_indexed`), tell the user and suggest the explicit `/docs-add ` form so they pick a unique name. - - Otherwise call `add_source(name=, url=)` to register, then `ensure_docs(source=)` to fetch. - -4. **Two tokens, second is an HTTPS URL** (` `). - - Validate the name is a sensible alias (alnum + `_ . -`, ≤128 chars). If not, ask for a cleaner name. - - Call `add_source(name=, url=)`. If it returns "is a builtin source", tell the user that `add_source` refuses to shadow builtins by default (the agent shouldn't pass `force=true` here without explicit user consent). - - Then call `ensure_docs(source=)` to fetch. - -## After it succeeds - -Report a one-line summary: -- Library name (alias used). -- Pages fetched (from the `ensure_docs` response — pages_fetched / pages_skipped / pages_failed). -- Suggest the next step: `/docs-search [library]` or ask Claude to grep for something specific. - -## After it fails - -Show the error in plain language. Common cases: -- **Unknown built-in alias** → list a few suggestions from `list_sources`. -- **URL rejected** (HTTP, localhost, private IP) → tell the user docpull is HTTPS-only by design and won't fetch internal hosts; suggest a public docs URL. -- **`add_source` refused a builtin** → tell the user the alias collides with a built-in; pick a different name. -- **Network / 4xx / 5xx during `ensure_docs`** → show the URL and status code; suggest checking network, the URL itself, or trying a different docs path. - -Do not use any tools beyond the ones listed in `allowed-tools`. Do not send filler messages while the fetch is running — let the tool output speak for itself. 
diff --git a/plugin/commands/docs-list.md b/plugin/commands/docs-list.md deleted file mode 100644 index 8edbb86..0000000 --- a/plugin/commands/docs-list.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -description: List documentation libraries currently cached locally, with last-fetched age. -allowed-tools: mcp__docpull__list_indexed, mcp__docpull__list_sources ---- - -# List cached docs - -Show what's available to `/docs-search` right now. - -## Workflow - -1. Call `list_indexed()`. It returns libraries that have been fetched, with file count and how long ago they were fetched. - -2. **If empty**: reply with a one-liner pointing to `/docs-add` and `list_sources` for the built-in alias list. Don't fetch anything. - -3. **If non-empty**: render the list as the tool returned it (it's already formatted). Note any libraries marked `stale` (older than 7 days) and suggest `/docs-refresh ` for those if there are any. - -4. If the user is likely going to follow up with a search, suggest `/docs-search [library]` once at the bottom. - -## Don't - -- Don't crawl, fetch, or call `ensure_docs` from this command. It's a read-only listing. -- Don't expand each library's file tree — `list_indexed` summarizes for a reason. diff --git a/plugin/commands/docs-refresh.md b/plugin/commands/docs-refresh.md deleted file mode 100644 index 9d62858..0000000 --- a/plugin/commands/docs-refresh.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -description: Re-fetch a cached library, ignoring the 7-day cache. Use when docs have been updated upstream. -argument-hint: -allowed-tools: mcp__docpull__ensure_docs, mcp__docpull__list_indexed ---- - -# Refresh cached docs - -Force-refetch a library that's already cached. The default `ensure_docs` honors a 7-day cache; this command bypasses it. - -User input: **$ARGUMENTS** - -## Workflow - -1. Parse `$ARGUMENTS` as a single library name. If empty: reply `Usage: /docs-refresh . Run /docs-list to see what's cached.` and stop. - -2. Call `ensure_docs(source=, force=true)`. 
The tool will re-crawl the source (using whatever URL the alias resolves to) and overwrite the cached `.md` files in place. - -3. **If the alias is unknown**: pass through the tool's error. Suggest `/docs-add ` if it's a built-in or `/docs-add ` if not. - -4. After success, report a one-line summary using the tool's response (pages fetched / skipped / failed). - -## When to use this vs `/docs-add` - -- `/docs-add ` — first time fetching, OR when the cache is fresh and you want to use it. -- `/docs-refresh ` — already cached but you want the latest. Don't run this every time you search; the conditional-GET cache makes it cheap, but it still hits the network for every page. - -## Don't - -- Don't loop this across all cached libraries unprompted. If the user wants a global refresh, ask first. -- Don't pass `force=true` to `ensure_docs` from any other command — that's what this command is for. diff --git a/plugin/commands/docs-remove.md b/plugin/commands/docs-remove.md deleted file mode 100644 index b24653e..0000000 --- a/plugin/commands/docs-remove.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -description: Remove a user-defined source alias from sources.yaml, optionally deleting its cached docs. -argument-hint: [--keep-cache] -allowed-tools: mcp__docpull__remove_source, mcp__docpull__list_indexed ---- - -# Remove a docs source - -The user wants to remove a previously-added source. By default this also deletes the cached docs to free disk and avoid stale answers. - -User input: **$ARGUMENTS** - -## How to handle the input - -Parse `$ARGUMENTS` as: - -- **First token = library name** (the alias to remove). -- **Optional `--keep-cache` flag** = remove the alias from the user registry but leave the cached `.md` files on disk. - -If empty: reply `Usage: /docs-remove [--keep-cache]. Run /docs-list to see what's cached.` and stop. - -## Workflow - -1. **Default (no flag): remove alias AND delete cache.** - Call `remove_source(name=, delete_cache=true)`. 
The MCP tool refuses to remove builtins (`react`, `nextjs`, etc.) — pass that error through to the user with the suggestion in step 3. - -2. **`--keep-cache` flag: remove alias only.** - Call `remove_source(name=, delete_cache=false)`. The cached docs stay; `/docs-search ` will keep working until the user runs this without the flag. - -3. **If the tool returns "is a builtin source"**: - Tell the user that builtins can't be removed but they can be shadowed with a custom URL via `/docs-add` (or by editing `sources.yaml` directly). - -4. **If the tool returns the no-op response** (no user source AND no cache to delete): tell the user there was nothing to remove. Don't error. - -## Output - -One line: confirm what was removed (alias only, cache only, both, or nothing). The MCP tool's response is already specific — relay it. - -## Don't - -- Don't run `rm -rf` via Bash; the MCP tool's `delete_cache=true` does the safe path-validated deletion. -- Don't call this on builtins thinking force will help — there's no force flag for removal by design. diff --git a/plugin/commands/docs-search.md b/plugin/commands/docs-search.md deleted file mode 100644 index 5905102..0000000 --- a/plugin/commands/docs-search.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -description: Search fetched docs by regex and pull surrounding context for the best hits. Optionally restrict to one library. -argument-hint: [library] -allowed-tools: mcp__docpull__grep_docs, mcp__docpull__read_doc, mcp__docpull__list_indexed ---- - -# Search fetched docs - -The user wants to search docs that have already been pulled by `/docs-add` (or `ensure_docs`). This composes two MCP tools: `grep_docs` finds matching files; `read_doc` pulls more context around the top hits so the answer is grounded, not just a list of file:line references. - -User input: **$ARGUMENTS** - -## How to handle the input - -Parse `$ARGUMENTS` as: - -- **First whitespace-separated token = pattern** (regex; can be quoted to include spaces). 
-- **Optional second token = library alias** to restrict the search to one library. - -If empty: reply `Usage: /docs-search [library]. Run /docs-list to see what's cached.` and stop. - -## Workflow - -1. **Find candidates.** Call `grep_docs(pattern=, library=, limit=10, context=2)`. The tool returns the top files ranked by match density with two lines of context above and below each hit. - -2. **Read deeper context for the top 2–3 files.** For each of the top files in the grep result (max 3), call `read_doc(library=, path=, line_start=, line_end=)` to pull a ~60-line window. Skip this step if the user's pattern is very narrow (a literal symbol name) and the grep context already answers the question. - -3. **If grep returns nothing**: - - If a library was specified, run `list_indexed()` to confirm the library is actually cached. If it isn't, suggest `/docs-add ` and stop. - - If no library was specified, broaden the pattern *once* (e.g. add common prefixes/suffixes, drop word boundaries) and retry. If still nothing, surface the gap to the user. - -4. **If `grep_docs` says "search timed out"**: the pattern is likely catastrophic. Suggest a tighter pattern (no nested quantifiers, anchor with `\b`). - -## Output - -- Lead with the synthesized answer to the user's likely question, grounded in what you read. -- Cite each source as `library/path.md:line` so the user can verify. -- Don't dump the full grep output unless the user asked for it — the goal is an answer, not a search log. - -## Don't - -- Don't call `ensure_docs` from this command. If the library isn't cached, send the user to `/docs-add` instead — auto-fetching from a search command surprises people. -- Don't re-`read_doc` the same file twice in one call. -- Don't use any tool not in `allowed-tools`. 
diff --git a/plugin/skills/docpull-research/SKILL.md b/plugin/skills/docpull-research/SKILL.md index c226cb1..f4f5fcb 100644 --- a/plugin/skills/docpull-research/SKILL.md +++ b/plugin/skills/docpull-research/SKILL.md @@ -1,6 +1,6 @@ --- name: docpull-research -description: Use the docpull MCP tools (list_indexed, ensure_docs, grep_docs, read_doc, fetch_url) to ground answers in real documentation when the user asks about a specific library, framework, or API — especially for fast-moving libraries (Next.js, FastAPI, LangChain, Pydantic, React, Tailwind, Drizzle, Prisma, Anthropic SDK, etc.) where training data is likely stale or incomplete. Activate on questions like "how do I X in [library]", "what's the API for [framework].[method]", "show me how [library] handles Y", or when a user pastes a docs URL. +description: Use the docpull MCP tools (list_indexed, list_sources, ensure_docs, grep_docs, read_doc, fetch_url) to ground answers in real documentation when the user asks about a specific library, framework, SDK, API surface, version-sensitive tool behavior, or pasted documentation URL. Especially useful for fast-moving libraries and tool ecosystems such as Next.js, FastAPI, LangChain, React, Tailwind, Drizzle, Prisma, Anthropic/OpenAI SDKs, Vercel AI SDK, and Vercel skills.sh / skills CLI docs. 
allowed-tools: mcp__docpull__list_indexed, mcp__docpull__list_sources, mcp__docpull__ensure_docs, mcp__docpull__grep_docs, mcp__docpull__read_doc, mcp__docpull__fetch_url --- @@ -10,11 +10,12 @@ Ground library/framework answers in real documentation instead of training-data ## When to use this skill -**Activate when** the user's question names a specific library, framework, SDK, or API surface — especially: +**Activate when** the user's question names a specific library, framework, SDK, API surface, or docs-backed tool ecosystem — especially: - **Fast-moving libraries** where training-data drift is likely: Next.js (App Router), Pydantic v2, LangChain, FastAPI, Anthropic SDK, OpenAI SDK, Drizzle, Prisma, Tailwind v4+, Vercel AI SDK. - **Version-specific questions** ("how does X work in [library] v[N]"). - **Pasted docs URLs** the user wants explained or referenced. +- **Agent/tooling ecosystems** with live docs or CLIs, including `skills.sh`, `github.com/vercel-labs/skills`, Vercel agent skills, MCP docs, and SDK command references. - **Code the user is actively writing** against a library, where wrong signatures will cost them debugging time. **Do NOT activate for**: @@ -36,7 +37,7 @@ list_indexed() → ["fastapi (3d ago)", "react (12h ago)", ...] ### 2. If the library is cached → search it -Use `grep_docs` with a focused regex. The library is already on disk, so this is a local search: +Use `grep_docs` with a focused regex. Prefer API nouns, method names, command names, or option flags over whole natural-language questions. The library is already on disk, so this is a local search: ``` grep_docs(library="fastapi", pattern="dependency injection", limit=10, context=2) @@ -44,17 +45,30 @@ grep_docs(library="fastapi", pattern="dependency injection", limit=10, context=2 If you want more context around a hit, use `read_doc(library, path, line_start, line_end)`. -### 3. If the library is NOT cached → decide whether to fetch +### 3. 
If the source is NOT cached → decide whether to fetch - **Built-in alias** (the library appears in `list_sources()`): call `ensure_docs(source="")`. This crawls and indexes the whole library. ~10–30s for typical sites. -- **Arbitrary URL**: call `fetch_url(url=...)` if you only need one page. For a whole site you don't have an alias for, tell the user to run `/docs-add ` (which uses the docpull CLI to crawl); the MCP `fetch_url` is single-page only. +- **Arbitrary URL**: call `fetch_url(url=...)` if you only need one page. For a whole site you don't have an alias for, tell the user to run `/mcp__docpull__docs_add `; the MCP `fetch_url` is single-page only. - **No alias, user didn't paste a URL**: ask the user once whether they'd like to add the library, and what the docs URL is. Don't fetch speculatively. -### 4. Quote with attribution +### 4. Special case: skills.sh and Vercel skills CLI + +For questions about Vercel skills, `skills.sh`, `npx skills`, agent skill installation, or `SKILL.md` structure: + +- Treat the docs as version-sensitive. First check `list_indexed` for an existing `skills`, `skills.sh`, or `vercel-labs-skills` source. +- If cached, search for exact commands or flags such as `skills add`, `--agent`, `--skill`, `--copy`, `--yes`, `skills use`, `skills list`, `skills find`, `skills update`, `skills remove`, `SKILL.md`, `frontmatter`, or the named agent. +- If not cached and the user pasted a skills.sh docs page, use `fetch_url` on that page. +- If not cached and no URL was pasted, prefer the official docs page `https://www.skills.sh/docs` for a quick one-page answer. For CLI option details, ask once before crawling the full GitHub repo, or use the official README URL if the user only needs install/use command syntax. +- When giving install commands for this repo, preserve project policy: `npx -y skills add --skill '*' --agent codex --copy --yes`, then remove installer artifacts with `rm -rf .agents skills-lock.json`. 
+- Mention the security boundary from skills.sh when relevant: public skills can be audited, but users should review skill contents before installing. + +### 5. Quote with attribution When you cite docs, include the source path returned by `grep_docs` / `read_doc` so the user can verify. Example: "Per `fastapi/tutorial/dependencies.md:42`, dependencies declared with `Depends()` are resolved per-request..." -### 5. Don't over-fetch +For one-page `fetch_url` answers, name the page URL or page title in the answer. For cached docs, prefer `library/path.md:line` style references. + +### 6. Don't over-fetch - Don't call `ensure_docs` for libraries the user didn't ask about ("while we're here, let me also fetch..."). - Don't crawl the same library twice in one session — `list_indexed` will tell you it's there. @@ -66,9 +80,10 @@ These are pre-configured and resolvable by `ensure_docs(source=...)` without set ## Failure modes -- **`ensure_docs` returns "unknown source"**: the alias isn't built-in. Either suggest `/docs-add ` or call `list_sources()` and propose a near match. +- **`ensure_docs` returns "unknown source"**: the alias isn't built-in. Either suggest `/mcp__docpull__docs_add ` or call `list_sources()` and propose a near match. - **`grep_docs` returns empty**: the pattern is too narrow, or the library doesn't cover the topic. Broaden once, then surface the gap to the user. -- **MCP server not responding**: tell the user to run `pip install docpull` and verify the plugin's MCP server is healthy. Fall back to answering from training data with an explicit caveat that docs weren't available. +- **`fetch_url` cannot read a docs page**: state that docpull could not fetch the page, then use a browser/search fallback only if the host permits it and the question needs current docs. +- **MCP server not responding**: tell the user to run `pip install 'docpull[mcp]'` and verify the plugin's MCP server is healthy. 
Fall back to answering from training data with an explicit caveat that docs weren't available. ## Tone diff --git a/pyproject.toml b/pyproject.toml index a5665b3..9076853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "docpull" version = "4.0.0" dynamic = [] -description = "Pull documentation from the web and convert to clean markdown" +description = "Pull server-rendered web content into clean markdown" readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" license = "MIT" @@ -19,7 +19,7 @@ maintainers = [ ] keywords = [ "python", "markdown", "documentation", "web-scraping", "developer-tools", - "claude", "ai-training-data" + "web-crawling", "claude", "ai-training-data" ] classifiers = [ # Development Status diff --git a/scripts/sync_agent_host_configs.py b/scripts/sync_agent_host_configs.py new file mode 100644 index 0000000..4493275 --- /dev/null +++ b/scripts/sync_agent_host_configs.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Sync project-local agent host config files from repo sources.""" + +from __future__ import annotations + +import argparse +import json +import shutil +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +PLUGIN_SKILL_DIR = REPO_ROOT / "plugin" / "skills" / "docpull-research" +CODEX_CONFIG_PATH = REPO_ROOT / ".codex" / "config.toml" +CODEX_SKILL_DIR = REPO_ROOT / ".agents" / "skills" / "docpull-research" +CODEX_SKILL_PATH = CODEX_SKILL_DIR / "SKILL.md" +CODEX_MARKETPLACE_PATH = REPO_ROOT / ".agents" / "plugins" / "marketplace.json" + +CODEX_CONFIG = """[mcp_servers.docpull] +command = "docpull" +args = ["mcp"] +""" + + +def sync(*, dry_run: bool = False) -> list[Path]: + """Write host config files and return the paths that would be/were touched.""" + + paths = [CODEX_CONFIG_PATH, CODEX_SKILL_PATH, CODEX_MARKETPLACE_PATH] + if dry_run: + return paths + + CODEX_CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True) + 
CODEX_CONFIG_PATH.write_text(CODEX_CONFIG, encoding="utf-8") + + if CODEX_SKILL_DIR.exists(): + shutil.rmtree(CODEX_SKILL_DIR) + shutil.copytree(PLUGIN_SKILL_DIR, CODEX_SKILL_DIR) + + CODEX_MARKETPLACE_PATH.parent.mkdir(parents=True, exist_ok=True) + CODEX_MARKETPLACE_PATH.write_text( + json.dumps( + { + "name": "docpull-local", + "plugins": [ + { + "name": "docpull", + "source": {"source": "local", "path": "./plugin"}, + "policy": {"installation": "AVAILABLE", "authentication": "ON_INSTALL"}, + "category": "Documentation", + } + ], + }, + indent=2, + ) + + "\n", + encoding="utf-8", + ) + + return paths + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--dry-run", action="store_true", help="print files without writing them") + args = parser.parse_args() + + for path in sync(dry_run=args.dry_run): + print(path.relative_to(REPO_ROOT).as_posix()) + + +if __name__ == "__main__": + main() diff --git a/scripts/sync_claude_plugin.py b/scripts/sync_claude_plugin.py new file mode 100644 index 0000000..a7c4a56 --- /dev/null +++ b/scripts/sync_claude_plugin.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +"""Sync the self-contained Claude plugin bundle from repo sources.""" + +from __future__ import annotations + +import json +import shutil +from pathlib import Path + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover - Python <3.11 fallback for dev tooling + import tomli as tomllib # type: ignore[no-redef] + + +REPO_ROOT = Path(__file__).resolve().parents[1] +PYPROJECT_PATH = REPO_ROOT / "pyproject.toml" +AUTHORING_PLUGIN_DIR = REPO_ROOT / "plugin" +BUNDLE_ROOT = REPO_ROOT / ".claude-plugin" +BUNDLE_PLUGIN_DIR = BUNDLE_ROOT / "plugin" + +PLUGIN_DESCRIPTION = ( + "Pull server-rendered web content from any URL into Claude Code. 
Indexes sites in seconds with " + "conditional-GET caching, then exposes them as MCP tools (fetch_url, ensure_docs, list_sources, " + "list_indexed, grep_docs, read_doc, add_source, remove_source). Local, browser-free, no API keys." +) +MARKETPLACE_DESCRIPTION = ( + "Pull server-rendered web content from any URL into Claude Code. Local, fast, no API keys." +) +PLUGIN_KEYWORDS = [ + "web", + "crawler", + "fetch", + "markdown", + "rag", + "mcp", + "local-first", +] +MARKETPLACE_KEYWORDS = PLUGIN_KEYWORDS[:4] + ["indexing"] + PLUGIN_KEYWORDS[5:] + + +def load_pyproject() -> dict: + with PYPROJECT_PATH.open("rb") as f: + return tomllib.load(f) + + +def write_json(path: Path, payload: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2) + "\n") + + +def sync_bundle_files() -> None: + if BUNDLE_PLUGIN_DIR.exists(): + shutil.rmtree(BUNDLE_PLUGIN_DIR) + + shutil.copytree( + AUTHORING_PLUGIN_DIR, + BUNDLE_PLUGIN_DIR, + ignore=shutil.ignore_patterns(".claude-plugin"), + ) + + +def build_plugin_metadata(pyproject: dict) -> dict: + project = pyproject["project"] + maintainer = project["maintainers"][0] + repository_url = project["urls"]["Repository"] + + return { + "name": project["name"], + "version": project["version"], + "description": PLUGIN_DESCRIPTION, + "author": { + "name": maintainer["name"], + "email": maintainer["email"], + "url": repository_url, + }, + "homepage": project["urls"]["Homepage"], + "repository": repository_url, + "license": project["license"], + "keywords": PLUGIN_KEYWORDS, + } + + +def build_marketplace_metadata(pyproject: dict, plugin_metadata: dict) -> dict: + author = plugin_metadata["author"] + return { + "name": plugin_metadata["name"], + "owner": { + "name": author["name"], + "email": author["email"], + "url": author["url"], + }, + "plugins": [ + { + "name": plugin_metadata["name"], + "source": "./plugin", + "description": MARKETPLACE_DESCRIPTION, + "version": plugin_metadata["version"], 
+ "author": { + "name": author["name"], + "email": author["email"], + }, + "homepage": plugin_metadata["homepage"], + "repository": plugin_metadata["repository"], + "license": plugin_metadata["license"], + "category": "documentation", + "keywords": MARKETPLACE_KEYWORDS, + } + ], + } + + +def main() -> None: + pyproject = load_pyproject() + sync_bundle_files() + + plugin_metadata = build_plugin_metadata(pyproject) + write_json(AUTHORING_PLUGIN_DIR / ".claude-plugin" / "plugin.json", plugin_metadata) + write_json(BUNDLE_PLUGIN_DIR / ".claude-plugin" / "plugin.json", plugin_metadata) + write_json(BUNDLE_ROOT / "marketplace.json", build_marketplace_metadata(pyproject, plugin_metadata)) + + +if __name__ == "__main__": + main() diff --git a/src/docpull/__init__.py b/src/docpull/__init__.py index 54d0ef7..c108745 100644 --- a/src/docpull/__init__.py +++ b/src/docpull/__init__.py @@ -1,11 +1,11 @@ """ -docpull - Fetch and convert documentation from any URL to markdown. +docpull - Fetch and convert server-rendered web content from any URL to markdown. 
Usage: from docpull import Fetcher, DocpullConfig, ProfileName config = DocpullConfig( - url="https://docs.example.com", + url="https://example.com", profile=ProfileName.RAG, ) diff --git a/src/docpull/cache/manager.py b/src/docpull/cache/manager.py index e77b373..3698894 100644 --- a/src/docpull/cache/manager.py +++ b/src/docpull/cache/manager.py @@ -5,9 +5,10 @@ import hashlib import json import logging +import tempfile from datetime import timedelta from pathlib import Path -from typing import TypedDict +from typing import Any, TypedDict from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso @@ -41,6 +42,7 @@ class DiscoveredUrlsState(TypedDict, total=False): start_url: str discovered_at: str + config_fingerprint: dict[str, Any] urls: list[str] @@ -56,20 +58,28 @@ def __init__(self) -> None: def from_cache_state(cls, state: CacheState) -> _InternalState: """Create internal state from serialized CacheState.""" internal = cls() - internal.fetched_urls = set(state.get("fetched_urls", [])) - internal.failed_urls = set(state.get("failed_urls", [])) - internal.last_run = state.get("last_run") + internal.fetched_urls = set(_string_list(state.get("fetched_urls"))) + internal.failed_urls = set(_string_list(state.get("failed_urls"))) + last_run = state.get("last_run") + internal.last_run = last_run if isinstance(last_run, str) else None return internal def to_cache_state(self) -> CacheState: """Convert to serializable CacheState.""" return { - "fetched_urls": list(self.fetched_urls), - "failed_urls": list(self.failed_urls), + "fetched_urls": sorted(self.fetched_urls), + "failed_urls": sorted(self.failed_urls), "last_run": self.last_run, } +def _string_list(value: object) -> list[str]: + """Return only string items from a persisted JSON list.""" + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, str)] + + class CacheManager: """Manage cache for tracking fetched documents and detecting updates. 
@@ -111,20 +121,66 @@ def _load_manifest(self) -> dict[str, ManifestEntry]: if self.manifest_file.exists(): try: with open(self.manifest_file, encoding="utf-8") as f: - data: dict[str, ManifestEntry] = json.load(f) - return data + data: Any = json.load(f) + if not isinstance(data, dict): + msg = "manifest root is not an object" + raise ValueError(msg) + manifest: dict[str, ManifestEntry] = {} + for url, raw_entry in data.items(): + if not isinstance(url, str) or not isinstance(raw_entry, dict): + logger.warning("Skipping invalid cache manifest entry for %r", url) + continue + entry: ManifestEntry = {} + checksum = raw_entry.get("checksum") + if isinstance(checksum, str): + entry["checksum"] = checksum + file_path = raw_entry.get("file_path") + if isinstance(file_path, str): + entry["file_path"] = file_path + fetched_at = raw_entry.get("fetched_at") + if isinstance(fetched_at, str): + entry["fetched_at"] = fetched_at + etag = raw_entry.get("etag") + if isinstance(etag, str): + entry["etag"] = etag + last_modified = raw_entry.get("last_modified") + if isinstance(last_modified, str): + entry["last_modified"] = last_modified + size = raw_entry.get("size") + if isinstance(size, int): + entry["size"] = size + manifest[url] = entry + return manifest except Exception as e: logger.warning(f"Could not load manifest: {e}") return {} + def _write_json(self, path: Path, data: object) -> None: + """Atomically write JSON data to a cache file.""" + with tempfile.NamedTemporaryFile( + "w", + encoding="utf-8", + dir=self.cache_dir, + prefix=f".{path.name}.", + suffix=".tmp", + delete=False, + ) as f: + temp_path = Path(f.name) + json.dump(data, f, indent=2, ensure_ascii=False) + f.write("\n") + try: + temp_path.replace(path) + except Exception: + temp_path.unlink(missing_ok=True) + raise + def _save_manifest(self) -> None: """Save manifest to disk (internal, called by flush).""" if not self._manifest_dirty: return try: - with open(self.manifest_file, "w", encoding="utf-8") as f: - 
json.dump(self.manifest, f, indent=2, ensure_ascii=False) + self._write_json(self.manifest_file, self.manifest) self._manifest_dirty = False except Exception as e: logger.error(f"Could not save manifest: {e}") @@ -138,8 +194,16 @@ def _load_state(self) -> CacheState: if self.state_file.exists(): try: with open(self.state_file, encoding="utf-8") as f: - data: CacheState = json.load(f) - return data + data: Any = json.load(f) + if not isinstance(data, dict): + msg = "state root is not an object" + raise ValueError(msg) + last_run = data.get("last_run") + return { + "fetched_urls": _string_list(data.get("fetched_urls")), + "failed_urls": _string_list(data.get("failed_urls")), + "last_run": last_run if isinstance(last_run, str) else None, + } except Exception as e: logger.warning(f"Could not load state: {e}") @@ -155,8 +219,7 @@ def _save_state(self) -> None: return try: state = self._state.to_cache_state() - with open(self.state_file, "w", encoding="utf-8") as f: - json.dump(state, f, indent=2, ensure_ascii=False) + self._write_json(self.state_file, state) self._state_dirty = False except Exception as e: logger.error(f"Could not save state: {e}") @@ -197,6 +260,13 @@ def compute_checksum(content: str | bytes) -> str: content = content.encode("utf-8") return hashlib.sha256(content).hexdigest() + @staticmethod + def _content_size(content: str | bytes) -> int: + """Return content size in bytes, matching checksum input.""" + if isinstance(content, str): + return len(content.encode("utf-8")) + return len(content) + def update_cache( self, url: str, @@ -221,7 +291,7 @@ def update_cache( "checksum": self.compute_checksum(content), "file_path": str(file_path), "fetched_at": utc_now_iso(), - "size": len(content), + "size": self._content_size(content), } if etag: @@ -241,7 +311,11 @@ def mark_fetched(self, url: str) -> None: Changes are batched. Call flush() to persist to disk. 
""" self._state.fetched_urls.add(url) + was_failed = url in self._state.failed_urls + self._state.failed_urls.discard(url) self._state_dirty = True + if was_failed: + logger.debug("Cleared previous failure state for fetched URL: %s", url) def mark_failed(self, url: str) -> None: """Mark URL as failed. @@ -253,6 +327,7 @@ def mark_failed(self, url: str) -> None: Changes are batched. Call flush() to persist to disk. """ self._state.failed_urls.add(url) + self._state.fetched_urls.discard(url) self._state_dirty = True def get_fetched_urls(self) -> set[str]: @@ -302,19 +377,29 @@ def evict_expired(self, ttl_days: int | None = None) -> int: del self.manifest[url] if to_remove: + for url in to_remove: + self._state.fetched_urls.discard(url) + self._state.failed_urls.discard(url) self._manifest_dirty = True + self._state_dirty = True logger.info(f"Evicted {len(to_remove)} expired cache entries") return len(to_remove) # Resume capability methods - def save_discovered_urls(self, urls: list[str], start_url: str) -> None: + def save_discovered_urls( + self, + urls: list[str], + start_url: str, + config_fingerprint: dict[str, Any] | None = None, + ) -> None: """Save discovered URLs for resume capability. Args: urls: List of discovered URLs start_url: The starting URL for this crawl + config_fingerprint: Deterministic crawl-shaping config snapshot. 
Note: This is written immediately (not batched) to ensure @@ -325,18 +410,24 @@ def save_discovered_urls(self, urls: list[str], start_url: str) -> None: "discovered_at": utc_now_iso(), "urls": urls, } + if config_fingerprint is not None: + data["config_fingerprint"] = config_fingerprint try: - with open(self.discovered_urls_file, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2, ensure_ascii=False) + self._write_json(self.discovered_urls_file, data) logger.info(f"Saved {len(urls)} discovered URLs for resume capability") except Exception as e: logger.error(f"Could not save discovered URLs: {e}") - def load_discovered_urls(self, start_url: str) -> list[str] | None: + def load_discovered_urls( + self, + start_url: str, + config_fingerprint: dict[str, Any] | None = None, + ) -> list[str] | None: """Load previously discovered URLs if they match the start URL. Args: start_url: The starting URL to match + config_fingerprint: Deterministic crawl-shaping config snapshot. Returns: List of discovered URLs if found and matching, None otherwise @@ -346,29 +437,45 @@ def load_discovered_urls(self, start_url: str) -> list[str] | None: try: with open(self.discovered_urls_file, encoding="utf-8") as f: - data: DiscoveredUrlsState = json.load(f) + data: Any = json.load(f) + if not isinstance(data, dict): + msg = "discovered URLs root is not an object" + raise ValueError(msg) if data.get("start_url") != start_url: logger.info("Discovered URLs file exists but start_url doesn't match") return None - - urls = data.get("urls", []) + if config_fingerprint is not None: + persisted_fingerprint = data.get("config_fingerprint") + if not isinstance(persisted_fingerprint, dict): + logger.info("Discovered URLs file exists but has no compatible config fingerprint") + return None + if persisted_fingerprint != config_fingerprint: + logger.info("Discovered URLs file exists but crawl fingerprint doesn't match") + return None + + urls = _string_list(data.get("urls")) logger.info(f"Loaded 
{len(urls)} discovered URLs from previous run") return urls except Exception as e: logger.warning(f"Could not load discovered URLs: {e}") return None - def get_pending_urls(self, start_url: str) -> list[str] | None: + def get_pending_urls( + self, + start_url: str, + config_fingerprint: dict[str, Any] | None = None, + ) -> list[str] | None: """Get URLs that were discovered but not yet fetched. Args: start_url: The starting URL to match + config_fingerprint: Deterministic crawl-shaping config snapshot. Returns: List of pending URLs, or None if no resume data available """ - discovered = self.load_discovered_urls(start_url) + discovered = self.load_discovered_urls(start_url, config_fingerprint=config_fingerprint) if discovered is None: return None diff --git a/src/docpull/cli.py b/src/docpull/cli.py index e2bb8f8..1e3f01a 100644 --- a/src/docpull/cli.py +++ b/src/docpull/cli.py @@ -7,57 +7,42 @@ import sys from pathlib import Path -# Check if --doctor flag is present before checking dependencies -if "--doctor" in sys.argv: - from .doctor import run_doctor - - output_dir = None - if "--output-dir" in sys.argv or "-o" in sys.argv: - flag = "--output-dir" if "--output-dir" in sys.argv else "-o" - flag_idx = sys.argv.index(flag) - if flag_idx + 1 < len(sys.argv): - output_dir = Path(sys.argv[flag_idx + 1]) - sys.exit(run_doctor(output_dir=output_dir)) - -# Verify core dependencies -try: - import aiohttp # noqa: F401 - import bs4 # noqa: F401 - import defusedxml # noqa: F401 - import html2text # noqa: F401 - import rich # noqa: F401 -except ImportError as e: - print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr) - print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr) - print("\nRecommended fixes:", file=sys.stderr) - print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr) - print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr) - print(" 3. 
For development: pip install -e .[dev]", file=sys.stderr) - print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr) - sys.exit(1) - -from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TextColumn - from . import __version__ -from .core.fetcher import Fetcher -from .models.config import DocpullConfig, ProfileName -from .models.events import EventType, SkipReason + + +def _verify_core_dependencies() -> bool: + """Return False with install guidance when a runtime dependency is missing.""" + try: + import aiohttp # noqa: F401 + import bs4 # noqa: F401 + import defusedxml # noqa: F401 + import html2text # noqa: F401 + import rich # noqa: F401 + except ImportError as e: + print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr) + print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr) + print("\nRecommended fixes:", file=sys.stderr) + print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr) + print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr) + print(" 3. 
For development: pip install -e .[dev]", file=sys.stderr) + print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr) + return False + return True def create_parser() -> argparse.ArgumentParser: """Create argument parser for CLI.""" parser = argparse.ArgumentParser( prog="docpull", - description="Fetch and convert documentation from any URL to markdown", + description="Fetch and convert server-rendered web content from any URL to markdown", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Fetch with default settings (RAG profile) - docpull https://docs.example.com + docpull https://example.com # Use a specific profile - docpull https://docs.example.com --profile mirror + docpull https://example.com/blog --profile mirror # Control crawl behavior docpull https://example.com --max-pages 100 --max-depth 3 @@ -70,7 +55,7 @@ def create_parser() -> argparse.ArgumentParser: parser.add_argument( "url", nargs="?", - help="URL to fetch documentation from", + help="URL to fetch content from", ) parser.add_argument( @@ -134,12 +119,12 @@ def create_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--naming-strategy", - choices=["full", "hierarchical", "flat", "short"], + choices=["full", "hierarchical"], default=None, help=( "URL-to-filename strategy. 'full' flattens with underscores; " "'hierarchical' preserves the URL path as nested directories. " - "Mirror profile defaults to hierarchical." + "Mirror keeps 'full' naming by default in 2.x." 
), ) parser.add_argument( @@ -371,6 +356,16 @@ def create_parser() -> argparse.ArgumentParser: def run_fetcher(args: argparse.Namespace) -> int: """Run the fetcher with given arguments.""" + if not _verify_core_dependencies(): + return 1 + + from rich.console import Console + from rich.progress import Progress, SpinnerColumn, TextColumn + + from .core.fetcher import Fetcher + from .models.config import DocpullConfig, ProfileName + from .models.events import EventType, SkipReason + console = Console() if not args.url: @@ -594,9 +589,21 @@ async def run() -> int: elif event.type == EventType.DISCOVERY_COMPLETE: progress.update(task, description=f"[green]Found {event.total} URLs") elif event.type == EventType.FETCH_PROGRESS: + processed = ( + event.processed_count + if event.processed_count is not None + else event.current + ) + total = event.total if event.total is not None else "?" + saved = event.saved_count if event.saved_count is not None else "?" + skipped = event.skipped_count if event.skipped_count is not None else "?" + failed = event.failed_count if event.failed_count is not None else "?" 
progress.update( task, - description=f"[cyan]Fetching {event.current}/{event.total}: {event.url}", + description=( + f"[cyan]Processed {processed}/{total} " + f"(saved {saved}, skipped {skipped}, failed {failed}): {event.url}" + ), ) elif event.type == EventType.FETCH_SKIPPED: if event.skip_reason: diff --git a/src/docpull/conversion/chunking.py b/src/docpull/conversion/chunking.py index 22defd0..d2837ce 100644 --- a/src/docpull/conversion/chunking.py +++ b/src/docpull/conversion/chunking.py @@ -71,7 +71,7 @@ def encoding(self) -> str: def _strip_frontmatter(markdown: str) -> tuple[str, str]: """Split YAML frontmatter from body.""" - if not markdown.startswith("---"): + if not markdown.startswith("---\n"): return "", markdown end = markdown.find("\n---", 3) if end == -1: @@ -155,41 +155,58 @@ def chunk_markdown( chunks: list[Chunk] = [] buf_parts: list[str] = [] buf_tokens = 0 + buf_heading: str | None = None current_heading: str | None = None + def first_chunk_prefix() -> str: + if keep_frontmatter_in_first and not chunks: + return frontmatter + return "" + + def would_fit(extra_tokens: int) -> bool: + prefix_tokens = counter.count(frontmatter) if first_chunk_prefix() else 0 + return buf_tokens + extra_tokens + prefix_tokens <= max_tokens + def flush() -> None: - nonlocal buf_parts, buf_tokens + nonlocal buf_heading, buf_parts, buf_tokens if not buf_parts: return text = "\n\n".join(part.strip() for part in buf_parts if part.strip()) if not text: buf_parts = [] buf_tokens = 0 + buf_heading = None return - prefix = frontmatter if keep_frontmatter_in_first and not chunks else "" + prefix = first_chunk_prefix() final = (prefix + text).strip() + "\n" chunks.append( Chunk( index=len(chunks), text=final, token_count=counter.count(final), - heading=current_heading, + heading=buf_heading, ) ) buf_parts = [] buf_tokens = 0 + buf_heading = None for heading, section in sections: if heading is not None: current_heading = heading section_tokens = counter.count(section) - if 
section_tokens <= max_tokens and buf_tokens + section_tokens <= max_tokens: + if section_tokens <= max_tokens and would_fit(section_tokens): + if not buf_parts: + buf_heading = current_heading buf_parts.append(section) buf_tokens += section_tokens continue # Section alone fits but buffer is full: flush then add. - if section_tokens <= max_tokens: + if section_tokens <= max_tokens and ( + not first_chunk_prefix() or section_tokens + counter.count(first_chunk_prefix()) <= max_tokens + ): flush() + buf_heading = current_heading buf_parts.append(section) buf_tokens = section_tokens continue @@ -207,13 +224,15 @@ def flush() -> None: Chunk( index=len(chunks), text=text, - token_count=p_tokens, + token_count=counter.count(text), heading=current_heading, ) ) continue - if buf_tokens + p_tokens > max_tokens: + if not would_fit(p_tokens): flush() + if not buf_parts: + buf_heading = current_heading buf_parts.append(para) buf_tokens += p_tokens diff --git a/src/docpull/conversion/special_cases.py b/src/docpull/conversion/special_cases.py index a35bfed..8adda02 100644 --- a/src/docpull/conversion/special_cases.py +++ b/src/docpull/conversion/special_cases.py @@ -380,18 +380,23 @@ def _render_operation( lines.append(op_desc) lines.append("") - self._render_parameters(lines, list(shared_params) + list(op.get("parameters") or [])) + self._render_parameters(lines, list(shared_params) + list(op.get("parameters") or []), spec) self._render_request_body(lines, op.get("requestBody"), spec) self._render_responses(lines, op.get("responses"), spec) - def _render_parameters(self, lines: list[str], params: list[Any]) -> None: + def _render_parameters(self, lines: list[str], params: list[Any], spec: dict[str, Any]) -> None: buckets: dict[str, list[tuple[str, str, bool, str]]] = {} for param in params: if not isinstance(param, dict): continue + if "$ref" in param: + resolved = _resolve_ref(spec, param["$ref"]) + if not isinstance(resolved, dict): + continue + param = resolved pin = 
param.get("in", "query") pname = param.get("name", "?") - ptype = _describe_type(param.get("schema") or {}, {}) + ptype = _describe_type(param.get("schema") or {}, spec) required = bool(param.get("required")) or pin == "path" pdesc = _clean_text(param.get("description") or "") buckets.setdefault(pin, []).append((pname, ptype, required, pdesc)) diff --git a/src/docpull/core/__init__.py b/src/docpull/core/__init__.py index 190868c..6427cb9 100644 --- a/src/docpull/core/__init__.py +++ b/src/docpull/core/__init__.py @@ -1,5 +1,5 @@ """Core fetcher API for docpull v2.0.""" -from .fetcher import Fetcher, fetch_blocking +from .fetcher import Fetcher, fetch_blocking, fetch_one -__all__ = ["Fetcher", "fetch_blocking"] +__all__ = ["Fetcher", "fetch_blocking", "fetch_one"] diff --git a/src/docpull/core/fetcher.py b/src/docpull/core/fetcher.py index 0366fca..4f4aa01 100644 --- a/src/docpull/core/fetcher.py +++ b/src/docpull/core/fetcher.py @@ -17,6 +17,7 @@ from ..models.config import DocpullConfig from ..models.events import EventType, FetchEvent, FetchStats, SkipReason from ..models.profiles import apply_profile +from ..models.run import RunIdentity from ..pipeline.base import FetchPipeline, PageContext from ..pipeline.base import FetchStep as FetchStepProtocol from ..pipeline.steps import ( @@ -39,8 +40,8 @@ def _url_to_filename(url: str, base_url: str | None = None) -> str: """ Convert URL to a safe flattened filename (e.g. ``api_auth_oauth2.md``). - Used by the ``full`` / ``flat`` / ``short`` naming strategies. For the - ``hierarchical`` strategy, see :func:`_url_to_path_parts`. + Used by the ``full`` naming strategy. For the ``hierarchical`` strategy, + see :func:`_url_to_path_parts`. 
Args: url: The URL to convert @@ -50,13 +51,7 @@ def _url_to_filename(url: str, base_url: str | None = None) -> str: Safe filename string """ parsed = urlparse(url) - path = parsed.path.strip("/") - - # Remove base URL prefix if provided - if base_url: - base_path = urlparse(base_url).path.strip("/") - if path.startswith(base_path): - path = path[len(base_path) :].strip("/") + path = _strip_base_path(parsed.path, base_url).strip("/") # Convert path to filename if not path or path == "/": @@ -79,6 +74,23 @@ def _url_to_filename(url: str, base_url: str | None = None) -> str: _PATH_SAFE_RE = re.compile(r"[^\w\-.]") +def _strip_base_path(raw_path: str, base_url: str | None) -> str: + """Strip a base URL path only when it matches a full path segment prefix.""" + if not base_url: + return raw_path + + base_path = urlparse(base_url).path.strip("/") + if not base_path: + return raw_path + + stripped_path = raw_path.strip("/") + if stripped_path == base_path: + return "/" + if stripped_path.startswith(base_path + "/"): + return "/" + stripped_path[len(base_path) + 1 :] + return raw_path + + def _sanitize_path_segment(segment: str) -> str: """Make a single URL path segment safe for use as a filesystem name. 
@@ -109,14 +121,7 @@ def _url_to_path_parts(url: str, base_url: str | None = None) -> list[str]: ``["index.md"]`` """ parsed = urlparse(url) - raw_path = parsed.path - - if base_url: - base_path = urlparse(base_url).path.strip("/") - stripped = raw_path.strip("/") - if base_path and stripped.startswith(base_path): - stripped = stripped[len(base_path) :] - raw_path = "/" + stripped + ("/" if raw_path.endswith("/") else "") + raw_path = _strip_base_path(parsed.path, base_url) trailing_slash = raw_path.endswith("/") parts = [seg for seg in raw_path.split("/") if seg] @@ -272,6 +277,7 @@ async def __aenter__(self) -> Fetcher: user_agent=self.config.network.user_agent, proxy=self.config.network.proxy, default_timeout=float(self.config.network.read_timeout), + connect_timeout=float(self.config.network.connect_timeout), auth_headers=auth_headers, url_validator=self._url_validator, allow_insecure_tls=self.config.network.insecure_tls, @@ -315,6 +321,7 @@ async def __aenter__(self) -> Fetcher: ValidateStep( url_validator=self._url_validator, robots_checker=self._robots_checker, + rate_limiter=self._rate_limiter, check_existing=True, cache_enabled=cache_enabled, ), @@ -505,20 +512,22 @@ async def fetch_one(self, url: str, *, save: bool = True) -> PageContext: raise RuntimeError("Fetcher not initialized. 
Use 'async with' context manager.") output_path = self._compute_output_path(url) - steps = self._pipeline.steps + pipeline = self._pipeline if not save: - steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}] - pipeline = type(self._pipeline)(steps=steps) - ctx = await pipeline.execute(url, output_path) - if ctx.error: + save_step_names = {"save", "save_json", "save_ndjson", "save_sqlite"} + steps = [s for s in self._pipeline.steps if s.name not in save_step_names] + pipeline = FetchPipeline(steps=steps) + result = await pipeline.execute_result(url, output_path) + ctx = result.ctx + if save: + self._record_result(url, output_path, ctx) + elif ctx.error: self._stats.pages_failed += 1 elif ctx.should_skip: self._stats.pages_skipped += 1 else: self._stats.pages_fetched += 1 self._stats.bytes_downloaded += ctx.bytes_downloaded - if save: - self._stats.files_saved += 1 return ctx def _compute_output_path(self, url: str) -> Path: @@ -542,6 +551,60 @@ def _compute_output_path(self, url: str) -> Path: filename = _url_to_filename(url, self.config.url) return output_dir / filename + def _resume_fingerprint(self) -> dict[str, object]: + """Stable crawl-shaping config snapshot for resume compatibility.""" + return RunIdentity.from_config(self.config).resume_fingerprint() + + @staticmethod + def _progress_event( + *, + url: str, + processed: int, + total: int | None, + saved: int, + skipped: int, + failed: int, + ) -> FetchEvent: + total_for_message = total if total is not None else processed + return FetchEvent( + type=EventType.FETCH_PROGRESS, + url=url, + current=processed, + total=total, + processed_count=processed, + saved_count=saved, + skipped_count=skipped, + failed_count=failed, + message=( + f"Processed {processed}/{total_for_message} " + f"(saved {saved}, skipped {skipped}, failed {failed}): {url}" + ), + ) + + @staticmethod + def _update_progress_counts( + counts: dict[str, int], + *, + ctx: PageContext | None = None, + 
dry_run: bool = False, + failed: bool = False, + ) -> None: + counts["processed"] += 1 + if dry_run: + counts["skipped"] += 1 + return + if failed: + counts["failed"] += 1 + return + if ctx is None: + return + if ctx.error: + counts["failed"] += 1 + elif ctx.should_skip: + counts["skipped"] += 1 + else: + counts["saved"] += 1 + async def run(self) -> AsyncIterator[FetchEvent]: """ Execute the fetch operation, yielding events. @@ -613,7 +676,10 @@ def _resume_urls(self) -> list[str] | None: or not self.config.url ): return None - pending = self._cache_manager.get_pending_urls(self.config.url) + pending = self._cache_manager.get_pending_urls( + self.config.url, + config_fingerprint=self._resume_fingerprint(), + ) if pending is None: return None max_pages = self.config.crawl.max_pages @@ -641,7 +707,11 @@ async def _run_collected(self) -> AsyncIterator[FetchEvent]: return if self.config.cache.enabled and self._cache_manager: - self._cache_manager.save_discovered_urls(discovered, start_url) + self._cache_manager.save_discovered_urls( + discovered, + start_url, + config_fingerprint=self._resume_fingerprint(), + ) yield FetchEvent( type=EventType.DISCOVERY_COMPLETE, @@ -662,25 +732,18 @@ async def _fetch_collected(self, urls: list[str]) -> AsyncIterator[FetchEvent]: assert self._pipeline is not None assert self._start_time is not None self._stats.urls_discovered = len(urls) + progress_counts = {"processed": 0, "saved": 0, "skipped": 0, "failed": 0} collected_events: list[FetchEvent] = [] def collect(event: FetchEvent) -> None: collected_events.append(event) - for i, url in enumerate(urls): + for url in urls: if self._cancelled: yield FetchEvent(type=EventType.CANCELLED, message="Fetch cancelled by user") return - yield FetchEvent( - type=EventType.FETCH_PROGRESS, - url=url, - current=i + 1, - total=len(urls), - message=f"Processing {i + 1}/{len(urls)}: {url}", - ) - if self.config.dry_run: yield FetchEvent( type=EventType.FETCH_SKIPPED, @@ -690,14 +753,33 @@ def 
collect(event: FetchEvent) -> None: skip_reason=SkipReason.DRY_RUN, ) self._stats.pages_skipped += 1 + self._update_progress_counts(progress_counts, dry_run=True) + yield self._progress_event( + url=url, + processed=progress_counts["processed"], + total=len(urls), + saved=progress_counts["saved"], + skipped=progress_counts["skipped"], + failed=progress_counts["failed"], + ) continue output_path = self._compute_output_path(url) collected_events.clear() - ctx = await self._pipeline.execute(url, output_path, emit=collect) + result = await self._pipeline.execute_result(url, output_path, emit=collect) + ctx = result.ctx for ev in collected_events: yield ev + self._update_progress_counts(progress_counts, ctx=ctx) self._record_result(url, output_path, ctx) + yield self._progress_event( + url=url, + processed=progress_counts["processed"], + total=len(urls), + saved=progress_counts["saved"], + skipped=progress_counts["skipped"], + failed=progress_counts["failed"], + ) self._stats.duration_seconds = time.monotonic() - self._start_time if self._cache_manager and self._stats.pages_failed == 0: @@ -736,7 +818,8 @@ async def _run_streaming(self) -> AsyncIterator[FetchEvent]: url_queue: asyncio.Queue[str | None] = asyncio.Queue(maxsize=worker_count * 4) event_queue: asyncio.Queue[FetchEvent | None] = asyncio.Queue() - progress_counter = {"saved": 0} + progress_counter = {"processed": 0, "saved": 0, "skipped": 0, "failed": 0} + discovery_state: dict[str, BaseException | None] = {"error": None} async def discover_into_queue() -> None: discovered_for_resume: list[str] = [] @@ -759,10 +842,18 @@ async def discover_into_queue() -> None: and self._cache_manager and len(discovered_for_resume) % 200 == 0 ): - self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url) + self._cache_manager.save_discovered_urls( + list(discovered_for_resume), + start_url, + config_fingerprint=self._resume_fingerprint(), + ) finally: if self.config.cache.enabled and 
self._cache_manager: - self._cache_manager.save_discovered_urls(discovered_for_resume, start_url) + self._cache_manager.save_discovered_urls( + discovered_for_resume, + start_url, + config_fingerprint=self._resume_fingerprint(), + ) self._stats.urls_discovered = len(discovered_for_resume) await event_queue.put( FetchEvent( @@ -792,6 +883,18 @@ async def worker() -> None: ) ) self._stats.pages_skipped += 1 + self._update_progress_counts(progress_counter, dry_run=True) + total_so_far = self._stats.urls_discovered or progress_counter["processed"] + await event_queue.put( + self._progress_event( + url=url, + processed=progress_counter["processed"], + total=total_so_far, + saved=progress_counter["saved"], + skipped=progress_counter["skipped"], + failed=progress_counter["failed"], + ) + ) continue local_events: list[FetchEvent] = [] @@ -803,7 +906,8 @@ def emit(ev: FetchEvent, _sink: list[FetchEvent] = local_events) -> None: _sink.append(ev) try: - ctx = await pipeline.execute(url, output_path, emit=emit) + result = await pipeline.execute_result(url, output_path, emit=emit) + ctx = result.ctx except Exception as err: # noqa: BLE001 await event_queue.put( FetchEvent( @@ -814,34 +918,52 @@ def emit(ev: FetchEvent, _sink: list[FetchEvent] = local_events) -> None: ) ) self._stats.pages_failed += 1 + self._update_progress_counts(progress_counter, failed=True) + total_so_far = self._stats.urls_discovered or progress_counter["processed"] + await event_queue.put( + self._progress_event( + url=url, + processed=progress_counter["processed"], + total=total_so_far, + saved=progress_counter["saved"], + skipped=progress_counter["skipped"], + failed=progress_counter["failed"], + ) + ) continue for ev in local_events: await event_queue.put(ev) + self._update_progress_counts(progress_counter, ctx=ctx) self._record_result(url, output_path, ctx) - if ctx.markdown and not ctx.error and not ctx.should_skip: - progress_counter["saved"] += 1 - # Synthesize a progress event so the CLI bar 
moves; - # `total` may still be unknown, so report what we know. - total_so_far = self._stats.urls_discovered or progress_counter["saved"] - await event_queue.put( - FetchEvent( - type=EventType.FETCH_PROGRESS, - url=url, - current=progress_counter["saved"], - total=total_so_far, - message=f"Saved {progress_counter['saved']}/{total_so_far}: {url}", - ) + # Synthesize a progress event so the CLI bar moves; `total` + # may still be unknown, so report what discovery has yielded. + total_so_far = self._stats.urls_discovered or progress_counter["processed"] + await event_queue.put( + self._progress_event( + url=url, + processed=progress_counter["processed"], + total=total_so_far, + saved=progress_counter["saved"], + skipped=progress_counter["skipped"], + failed=progress_counter["failed"], ) + ) discover_task = asyncio.create_task(discover_into_queue()) worker_tasks = [asyncio.create_task(worker()) for _ in range(worker_count)] async def wait_for_drain() -> None: - await discover_task - await asyncio.gather(*worker_tasks, return_exceptions=True) - await event_queue.put(None) + try: + await discover_task + except BaseException as err: # noqa: BLE001 + discovery_state["error"] = err + finally: + for _ in range(worker_count): + await url_queue.put(None) + await asyncio.gather(*worker_tasks, return_exceptions=True) + await event_queue.put(None) drain_task = asyncio.create_task(wait_for_drain()) @@ -854,6 +976,9 @@ async def wait_for_drain() -> None: finally: await drain_task + if discovery_state["error"] is not None: + raise discovery_state["error"] + self._stats.duration_seconds = time.monotonic() - self._start_time if self._cache_manager and self._stats.pages_failed == 0: self._cache_manager.clear_discovered_urls() @@ -884,19 +1009,22 @@ def _record_result(self, url: str, output_path: Path, ctx: PageContext) -> None: self._cache_manager.mark_failed(url) return if ctx.should_skip: + if ctx.skip_code == SkipReason.DUPLICATE_CONTENT: + self._stats.pages_deduplicated += 1 
self._stats.pages_skipped += 1 return self._stats.pages_fetched += 1 self._stats.bytes_downloaded += ctx.bytes_downloaded self._stats.files_saved += 1 - if self._cache_manager and ctx.markdown: - self._cache_manager.update_cache( - url=url, - content=ctx.markdown, - file_path=output_path, - etag=ctx.etag, - last_modified=ctx.last_modified, - ) + if self._cache_manager: + if ctx.markdown is not None: + self._cache_manager.update_cache( + url=url, + content=ctx.markdown, + file_path=ctx.persisted_path or output_path, + etag=ctx.etag, + last_modified=ctx.last_modified, + ) self._cache_manager.mark_fetched(url) diff --git a/src/docpull/discovery/crawler.py b/src/docpull/discovery/crawler.py index 2b1bd89..41b7915 100644 --- a/src/docpull/discovery/crawler.py +++ b/src/docpull/discovery/crawler.py @@ -117,13 +117,13 @@ def _extract_links(self, html: bytes, base_url: str) -> list[str]: def _should_crawl(self, url: str) -> bool: """ - Check if a URL should be crawled. + Check if a URL is safe and in-scope for crawling. 
Args: url: URL to check Returns: - True if URL should be crawled + True if URL can be fetched and traversed """ # Security validation if not self._validator.is_valid(url): @@ -134,10 +134,10 @@ def _should_crawl(self, url: str) -> bool: return False # Domain filter - if self._domain_filter and not self._domain_filter.should_include(url): - return False + return not self._domain_filter or self._domain_filter.should_include(url) - # Pattern filter + def _should_include(self, url: str) -> bool: + """Check whether a crawled URL should be emitted to consumers.""" return not (self._pattern_filter and not self._pattern_filter.should_include(url)) async def discover( @@ -171,15 +171,20 @@ async def discover( # Use provided max_depth or instance default effective_max_depth = max_depth if max_depth is not None else self._max_depth + count = 0 + + if not self._should_crawl(start_url): + logger.info("Skipping disallowed start URL: %s", start_url) + return + # BFS queue: (url, depth) queue: deque[tuple[str, int]] = deque() queue.append((start_url, 0)) self._seen.add(start_url) - count = 0 - - # Yield the starting URL first - if self._should_crawl(start_url): + # Traverse the seed even if path filters exclude it, otherwise a common + # "start at /, include only /docs/*" crawl never reaches the docs tree. 
+ if self._should_include(start_url): yield start_url count += 1 @@ -215,6 +220,13 @@ async def discover( if not self._should_crawl(link): continue + # Add to queue for further crawling + if depth + 1 < effective_max_depth: + queue.append((link, depth + 1)) + + if not self._should_include(link): + continue + # Yield the URL yield link count += 1 @@ -222,8 +234,4 @@ async def discover( if max_urls is not None and count >= max_urls: return - # Add to queue for further crawling - if depth + 1 < effective_max_depth: - queue.append((link, depth + 1)) - logger.info(f"Crawl complete: found {count} URLs") diff --git a/src/docpull/discovery/link_extractors/static.py b/src/docpull/discovery/link_extractors/static.py index bb0b559..f1a955b 100644 --- a/src/docpull/discovery/link_extractors/static.py +++ b/src/docpull/discovery/link_extractors/static.py @@ -128,6 +128,12 @@ def _resolve_url(self, href: str, base_url: str) -> str | None: # Remove fragment parsed = urlparse(absolute_url) + if not parsed.scheme or not parsed.netloc: + return None + + if parsed.scheme not in ("http", "https"): + return None + clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" if parsed.query: clean_url += f"?{parsed.query}" diff --git a/src/docpull/discovery/sitemap.py b/src/docpull/discovery/sitemap.py index be1c820..b2958a7 100644 --- a/src/docpull/discovery/sitemap.py +++ b/src/docpull/discovery/sitemap.py @@ -63,6 +63,7 @@ def __init__( self._filter = pattern_filter self._robots = robots_checker self._seen = SeenUrlTracker() + self._seen_sitemaps = SeenUrlTracker() self._domain_filter: DomainFilter | None = None def _is_in_scope(self, url: str) -> bool: @@ -206,6 +207,10 @@ async def _discover_from_sitemap( logger.warning(f"Max sitemap depth exceeded at {sitemap_url}") return + if not self._seen_sitemaps.add(sitemap_url): + logger.debug("Skipping duplicate sitemap document: %s", sitemap_url) + return + content = await self._fetch_sitemap(sitemap_url) if content is None: return @@ 
-276,10 +281,12 @@ async def discover( Discovered URLs """ self._seen.clear() + self._seen_sitemaps.clear() self._domain_filter = DomainFilter(start_url, allow_subdomains=False) - # If URL looks like a sitemap, use it directly - if start_url.endswith(".xml"): + # If the URL path looks like a sitemap, use it directly even when the + # caller passed a query string or fragment. + if urlparse(start_url).path.endswith(".xml"): async for url in self._discover_from_sitemap(start_url, max_urls=max_urls): yield url return diff --git a/src/docpull/http/client.py b/src/docpull/http/client.py index 267f7da..ec28849 100644 --- a/src/docpull/http/client.py +++ b/src/docpull/http/client.py @@ -8,6 +8,8 @@ import re import secrets import socket +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime from types import TracebackType from urllib.parse import urljoin, urlparse @@ -18,14 +20,6 @@ from .protocols import HttpResponse from .rate_limiter import AdaptiveRateLimiter, PerHostRateLimiter -# Better encoding detection (charset-normalizer is an aiohttp dependency) -try: - from charset_normalizer import from_bytes as detect_encoding - - CHARSET_NORMALIZER_AVAILABLE = True -except ImportError: - CHARSET_NORMALIZER_AVAILABLE = False - logger = logging.getLogger(__name__) @@ -88,7 +82,6 @@ class AsyncHttpClient: - Exponential backoff retry for transient failures - Per-host rate limiting via PerHostRateLimiter - Content size limits to prevent memory exhaustion - - Intelligent encoding detection - Timeout controls Example: @@ -103,7 +96,6 @@ class AsyncHttpClient: _CRLF_RE = re.compile(r"[\r\n\x00]") MAX_CONTENT_SIZE = 50 * 1024 * 1024 # 50 MB - MAX_DOWNLOAD_TIME = 300 # 5 minutes MAX_REDIRECTS = 10 # Status codes that warrant a retry @@ -127,6 +119,7 @@ def __init__( user_agent: str | None = None, proxy: str | None = None, default_timeout: float = 30.0, + connect_timeout: float = 10.0, auth_headers: dict[str, str] | None = None, url_validator: 
UrlValidator | None = None, allow_insecure_tls: bool = False, @@ -143,7 +136,8 @@ def __init__( max_content_size: Maximum response size in bytes user_agent: Custom User-Agent string proxy: Proxy URL (http:// or socks5://) - default_timeout: Default request timeout in seconds + default_timeout: Default socket read timeout in seconds + connect_timeout: Connection timeout in seconds auth_headers: Authentication headers to include in all requests """ self._rate_limiter = rate_limiter @@ -152,9 +146,12 @@ def __init__( self._max_content_size = max_content_size self._proxy = proxy self._default_timeout = default_timeout + self._connect_timeout = connect_timeout self._auth_headers = auth_headers or {} self._url_validator = url_validator - self._auth_scope_hosts = {host.lower() for host in auth_scope_hosts} if auth_scope_hosts else None + self._auth_scope_hosts = ( + {self._normalize_hostname(host) for host in auth_scope_hosts} if auth_scope_hosts else None + ) if allow_insecure_tls: raise ValueError("Insecure TLS is not supported; certificate verification is always enforced") @@ -229,11 +226,61 @@ def _headers_for_url(self, headers: dict[str, str], target_url: str) -> dict[str return headers hostname = urlparse(target_url).hostname - if hostname and hostname.lower() in self._auth_scope_hosts: + if hostname and self._normalize_hostname(hostname) in self._auth_scope_hosts: return headers return {key: value for key, value in headers.items() if key.lower() not in self.SENSITIVE_HEADERS} + @staticmethod + def _normalize_hostname(hostname: str) -> str: + """Normalize hostnames for scope comparisons.""" + return hostname.lower().rstrip(".") + + def _build_request_headers( + self, + headers: dict[str, str] | None, + target_url: str, + ) -> dict[str, str]: + """Merge auth and request headers, validate them, then apply scope rules.""" + request_headers = dict(self._auth_headers) + if headers: + request_headers.update(headers) + + for name, value in request_headers.items(): + 
self._validate_header_value(name, value) + + return self._headers_for_url(request_headers, target_url) + + @staticmethod + def _parse_retry_after(retry_after: str | None) -> int | None: + """Parse Retry-After seconds or HTTP-date into a positive delay.""" + if not retry_after: + return None + + value = retry_after.strip() + if value.isdigit(): + seconds = int(value) + return seconds if seconds > 0 else None + + try: + retry_at = parsedate_to_datetime(value) + except (TypeError, ValueError, IndexError, OverflowError): + return None + + if retry_at.tzinfo is None: + retry_at = retry_at.replace(tzinfo=timezone.utc) + + delay = int((retry_at - datetime.now(timezone.utc)).total_seconds()) + return delay if delay > 0 else None + + def _client_timeout(self, read_timeout: float) -> aiohttp.ClientTimeout: + """Build an aiohttp timeout from separate connect/read settings.""" + return aiohttp.ClientTimeout( + total=self._connect_timeout + read_timeout, + connect=self._connect_timeout, + sock_read=read_timeout, + ) + def _next_redirect( self, response: aiohttp.ClientResponse, @@ -279,7 +326,7 @@ async def __aenter__(self) -> AsyncHttpClient: connector = aiohttp.TCPConnector( limit=100, - limit_per_host=10, + limit_per_host=max(1, self._rate_limiter.default_concurrent), ttl_dns_cache=300, resolver=resolver, ) @@ -315,52 +362,25 @@ def _calculate_retry_delay(self, attempt: int) -> float: jitter: float = secrets.randbits(24) / float(1 << 24) return delay + jitter - def _decode_content(self, content: bytes, content_type: str) -> str: - """ - Decode content with intelligent encoding detection. - - Fallback chain: - 1. Content-Type header charset - 2. charset-normalizer detection - 3. 
UTF-8 with replacement - - Args: - content: Raw bytes content - content_type: Content-Type header value - - Returns: - Decoded string - """ - # First, try to get encoding from Content-Type header - encoding = None - if content_type: - for part in content_type.split(";"): - part = part.strip() - if part.lower().startswith("charset="): - encoding = part.split("=", 1)[1].strip().strip("\"'") - break - - # Try declared encoding first - if encoding: + async def _read_response_content(self, response: aiohttp.ClientResponse) -> bytes: + """Read a response body while enforcing the configured size cap.""" + content_length = response.headers.get("Content-Length") + if content_length: try: - return content.decode(encoding) - except (UnicodeDecodeError, LookupError): - logger.debug(f"Failed to decode with declared encoding: {encoding}") + if int(content_length) > self._max_content_size: + raise ValueError(f"Content too large: {content_length} bytes") + except ValueError: + if content_length.isdigit(): + raise + logger.debug(f"Ignoring invalid Content-Length header: {content_length}") - # Use charset-normalizer for better detection - if CHARSET_NORMALIZER_AVAILABLE: - try: - result = detect_encoding(content) - if result: - best_match = result.best() - if best_match: - logger.debug(f"Detected encoding: {best_match.encoding}") - return str(best_match) - except Exception as e: - logger.debug(f"Encoding detection failed: {e}") - - # Final fallback: UTF-8 with replacement - return content.decode("utf-8", errors="replace") + content = bytearray() + async for chunk in response.content.iter_chunked(8192): + content.extend(chunk) + if len(content) > self._max_content_size: + raise ValueError(f"Content size limit exceeded: >{self._max_content_size} bytes") + + return bytes(content) async def get( self, @@ -387,18 +407,14 @@ async def get( if self._session is None: raise RuntimeError("Client not initialized. 
Use 'async with' context manager.") - timeout_val = timeout or self._default_timeout - # Merge auth headers with request-specific headers - request_headers = dict(self._auth_headers) - if headers: - request_headers.update(headers) + timeout_val = self._default_timeout if timeout is None else timeout last_error: Exception | None = None for attempt in range(self._max_retries + 1): try: current_url = url - current_headers = self._headers_for_url(dict(request_headers), current_url) + current_headers = self._build_request_headers(headers, current_url) redirect_count = 0 while True: @@ -408,7 +424,7 @@ async def get( self._rate_limiter.limit(current_url), self._session.get( current_url, - timeout=aiohttp.ClientTimeout(total=timeout_val), + timeout=self._client_timeout(timeout_val), headers=current_headers, proxy=self._proxy, allow_redirects=False, @@ -423,10 +439,7 @@ async def get( if response.status in self.RETRYABLE_STATUS_CODES: if response.status == 429 and isinstance(self._rate_limiter, AdaptiveRateLimiter): - retry_after = response.headers.get("Retry-After") - retry_seconds = ( - int(retry_after) if retry_after and retry_after.isdigit() else None - ) + retry_seconds = self._parse_retry_after(response.headers.get("Retry-After")) await self._rate_limiter.record_rate_limit(current_url, retry_seconds) if attempt < self._max_retries: @@ -439,17 +452,7 @@ async def get( break response.raise_for_status() - content_length = response.headers.get("Content-Length") - if content_length and int(content_length) > self._max_content_size: - raise ValueError(f"Content too large: {content_length} bytes") - - content = b"" - async for chunk in response.content.iter_chunked(8192): - content += chunk - if len(content) > self._max_content_size: - raise ValueError( - f"Content size limit exceeded: >{self._max_content_size} bytes" - ) + content = await self._read_response_content(response) content_type = response.headers.get("Content-Type", "") @@ -503,37 +506,73 @@ async def head( if 
self._session is None: raise RuntimeError("Client not initialized. Use 'async with' context manager.") - # Merge auth headers with request-specific headers - request_headers = dict(self._auth_headers) - if headers: - request_headers.update(headers) + last_error: Exception | None = None - current_url = url - current_headers = self._headers_for_url(dict(request_headers), current_url) - redirect_count = 0 + for attempt in range(self._max_retries + 1): + try: + current_url = url + current_headers = self._build_request_headers(headers, current_url) + redirect_count = 0 - while True: - self._validate_url(current_url) + while True: + self._validate_url(current_url) - async with ( - self._rate_limiter.limit(current_url), - self._session.head( - current_url, - timeout=aiohttp.ClientTimeout(total=timeout), - headers=current_headers if current_headers else None, - proxy=self._proxy, - allow_redirects=False, - ) as response, - ): - redirect = self._next_redirect(response, current_url, current_headers, redirect_count, url) - if redirect is not None: - current_url, current_headers, redirect_count = redirect - continue - - return HttpResponse( - status_code=response.status, - content=b"", - content_type=response.headers.get("Content-Type", ""), - headers=dict(response.headers), - url=str(response.url), - ) + async with ( + self._rate_limiter.limit(current_url), + self._session.head( + current_url, + timeout=self._client_timeout(timeout), + headers=current_headers if current_headers else None, + proxy=self._proxy, + allow_redirects=False, + ) as response, + ): + redirect = self._next_redirect( + response, current_url, current_headers, redirect_count, url + ) + if redirect is not None: + current_url, current_headers, redirect_count = redirect + continue + + if response.status in self.RETRYABLE_STATUS_CODES: + if response.status == 429 and isinstance(self._rate_limiter, AdaptiveRateLimiter): + retry_seconds = self._parse_retry_after(response.headers.get("Retry-After")) + await 
self._rate_limiter.record_rate_limit(current_url, retry_seconds) + + if attempt < self._max_retries: + delay = self._calculate_retry_delay(attempt) + logger.warning( + f"Got {response.status} for {current_url}, retrying in {delay:.1f}s " + f"(attempt {attempt + 1}/{self._max_retries + 1})" + ) + await asyncio.sleep(delay) + break + response.raise_for_status() + + if isinstance(self._rate_limiter, AdaptiveRateLimiter): + await self._rate_limiter.record_success(current_url) + + return HttpResponse( + status_code=response.status, + content=b"", + content_type=response.headers.get("Content-Type", ""), + headers=dict(response.headers), + url=str(response.url), + ) + + except self.RETRYABLE_EXCEPTIONS as e: + last_error = e + if attempt < self._max_retries: + delay = self._calculate_retry_delay(attempt) + logger.warning( + f"Error fetching {url}: {e}, retrying in {delay:.1f}s " + f"(attempt {attempt + 1}/{self._max_retries + 1})" + ) + await asyncio.sleep(delay) + else: + logger.error(f"HTTP fetch error for {url} after {self._max_retries + 1} attempts: {e}") + raise + + if last_error: + raise last_error + raise RuntimeError(f"Unexpected error fetching {url}") diff --git a/src/docpull/http/protocols.py b/src/docpull/http/protocols.py index 575286c..e0953b0 100644 --- a/src/docpull/http/protocols.py +++ b/src/docpull/http/protocols.py @@ -64,6 +64,7 @@ async def head( url: str, *, timeout: float = 10.0, + headers: dict[str, str] | None = None, ) -> HttpResponse: """ Perform an HTTP HEAD request. @@ -71,6 +72,7 @@ async def head( Args: url: The URL to check timeout: Request timeout in seconds + headers: Optional additional headers Returns: HttpResponse (content will be empty bytes) diff --git a/src/docpull/http/rate_limiter.py b/src/docpull/http/rate_limiter.py index 5277114..1428753 100644 --- a/src/docpull/http/rate_limiter.py +++ b/src/docpull/http/rate_limiter.py @@ -28,7 +28,7 @@ class PerHostRateLimiter: await fetch_page(...) 
async with limiter.limit("https://example.com/page2"): - # Will wait at least 0.5s after page1 completes + # Will wait at least 0.5s after page1 starts await fetch_page(...) """ @@ -47,23 +47,58 @@ def __init__( host_configs: Optional per-host overrides, e.g.: {"api.example.com": {"delay": 1.0, "concurrent": 2}} """ + self._validate_delay(default_delay) + self._validate_concurrency(default_concurrent) + self.default_delay = default_delay self.default_concurrent = default_concurrent - self.host_configs = host_configs or {} + self.host_configs = {} + for host, config in (host_configs or {}).items(): + normalized_host = self._normalize_host(host) + delay = config.get("delay", self.default_delay) + concurrent = config.get("concurrent", self.default_concurrent) + self._validate_delay(delay) + self._validate_concurrency(concurrent) + self.host_configs[normalized_host] = { + "delay": delay, + "concurrent": concurrent, + } # Per-host state self._semaphores: dict[str, asyncio.Semaphore] = {} + self._timing_locks: dict[str, asyncio.Lock] = {} self._last_request: dict[str, float] = {} - self._lock = asyncio.Lock() + self._state_lock = asyncio.Lock() def _get_host(self, url: str) -> str: - """Extract host from URL.""" - return urlparse(url).netloc + """Extract a normalized host key from URL.""" + parsed = urlparse(url) + if parsed.hostname: + return self._normalize_host(parsed.hostname) + return self._normalize_host(parsed.netloc) + + @staticmethod + def _normalize_host(host: str) -> str: + """Normalize host keys for configs and runtime tracking.""" + return host.lower().rstrip(".") + + @staticmethod + def _validate_delay(delay: float) -> None: + """Reject invalid delay values early.""" + if delay < 0: + raise ValueError("Rate limit delay must be >= 0") + + @staticmethod + def _validate_concurrency(concurrent: int) -> None: + """Reject invalid concurrency values early.""" + if concurrent < 1: + raise ValueError("Per-host concurrency must be >= 1") def _get_config(self, host: 
str) -> tuple[float, int]: """Get delay and concurrency for a specific host.""" - if host in self.host_configs: - cfg = self.host_configs[host] + normalized_host = self._normalize_host(host) + if normalized_host in self.host_configs: + cfg = self.host_configs[normalized_host] return ( cfg.get("delay", self.default_delay), cfg.get("concurrent", self.default_concurrent), @@ -72,12 +107,19 @@ def _get_config(self, host: str) -> tuple[float, int]: async def _get_semaphore(self, host: str) -> asyncio.Semaphore: """Get or create semaphore for a host.""" - async with self._lock: + async with self._state_lock: if host not in self._semaphores: _, concurrent = self._get_config(host) self._semaphores[host] = asyncio.Semaphore(concurrent) return self._semaphores[host] + async def _get_timing_lock(self, host: str) -> asyncio.Lock: + """Get or create the delay-enforcement lock for a host.""" + async with self._state_lock: + if host not in self._timing_locks: + self._timing_locks[host] = asyncio.Lock() + return self._timing_locks[host] + @asynccontextmanager async def limit(self, url: str) -> AsyncIterator[None]: """ @@ -103,8 +145,9 @@ async def limit(self, url: str) -> AsyncIterator[None]: # Acquire semaphore slot async with sem: - # Enforce per-host delay - async with self._lock: + # Enforce per-host delay without serializing unrelated hosts. + timing_lock = await self._get_timing_lock(host) + async with timing_lock: now = time.monotonic() last = self._last_request.get(host, 0.0) wait_time = max(0.0, delay - (now - last)) @@ -136,13 +179,16 @@ def update_host_config( Changes to concurrent won't affect existing semaphores. For safety, set host configs before starting requests. 
""" - if host not in self.host_configs: - self.host_configs[host] = {} + normalized_host = self._normalize_host(host) + if normalized_host not in self.host_configs: + self.host_configs[normalized_host] = {} if delay is not None: - self.host_configs[host]["delay"] = delay + self._validate_delay(delay) + self.host_configs[normalized_host]["delay"] = delay if concurrent is not None: - self.host_configs[host]["concurrent"] = concurrent + self._validate_concurrency(concurrent) + self.host_configs[normalized_host]["concurrent"] = concurrent def get_stats(self) -> dict: """Get rate limiter statistics.""" diff --git a/src/docpull/mcp/prompts.py b/src/docpull/mcp/prompts.py new file mode 100644 index 0000000..88455d1 --- /dev/null +++ b/src/docpull/mcp/prompts.py @@ -0,0 +1,163 @@ +"""Reusable MCP prompts for common docpull workflows.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class PromptSpec: + name: str + title: str + description: str + argument_description: str | None + template: str + + +_DOCS_ADD_TEMPLATE = """# Add docs to the local docpull index + +User input: {arguments} + +The user wants to add documentation to docpull's local index so it is searchable later with grep_docs. + +Use this workflow: + +1. If the input is empty, reply with: + Usage: /mcp__docpull__docs_add , /mcp__docpull__docs_add , + or /mcp__docpull__docs_add . +2. If the input is one token without a URL scheme, treat it as a source alias + and call ensure_docs(source=). +3. If the input is one HTTPS URL, derive an alias from the hostname by stripping + a leading docs. or www., taking the first label, and lowercasing it. Check + list_sources and list_indexed for collisions. If there is no collision, call + add_source(name=, url=) and then ensure_docs(source=). +4. If the input is two tokens and the second token is an HTTPS URL, validate the + first token as an alias, call add_source(name=, url=), then + ensure_docs(source=). 
+ +After success, report the alias and the fetch counts from ensure_docs. If a URL +is rejected, explain that docpull only accepts public HTTPS docs URLs. +""" + +_DOCS_SEARCH_TEMPLATE = """# Search fetched docs + +User input: {arguments} + +The user wants to search documentation that has already been fetched into docpull's local index. + +Use this workflow: + +1. If the input is empty, reply with: Usage: /mcp__docpull__docs_search [library]. +2. Parse the first argument as the regex pattern and the optional second argument as the library alias. +3. Call grep_docs(pattern=, library=, limit=10, context=2). +4. For the top two or three useful file hits, call read_doc with the returned + library and path, using a narrow line window around the first match. +5. If no library was provided and nothing matches, broaden the regex once. If a + library was provided and nothing matches, call list_indexed to confirm the + library exists. + +Answer with the synthesized result, citing docpull paths as library/path.md:line. +Do not dump raw grep output unless the user asks for it. +""" + +_DOCS_LIST_TEMPLATE = """# List cached docs + +Show what documentation libraries are available in the local docpull index. + +Use this workflow: + +1. Call list_indexed(). +2. If nothing is cached, say that no docs are indexed yet and suggest /mcp__docpull__docs_add . +3. If libraries are cached, render the list concisely with file counts and freshness. +4. If any cached library is stale, suggest /mcp__docpull__docs_refresh . +""" + +_DOCS_REFRESH_TEMPLATE = """# Refresh cached docs + +User input: {arguments} + +The user wants to force-refresh a fetched documentation library, bypassing the normal cache. + +Use this workflow: + +1. If the input is empty, reply with: Usage: /mcp__docpull__docs_refresh . +2. Parse the input as a single library alias. +3. Call ensure_docs(source=, force=true). +4. After success, summarize pages fetched, skipped, and failed from the tool response. 
+ +Do not refresh every cached library unless the user explicitly asks for that broader operation. +""" + +_DOCS_REMOVE_TEMPLATE = """# Remove a docs source + +User input: {arguments} + +The user wants to remove a user-defined source alias and, by default, delete its cached docs. + +Use this workflow: + +1. If the input is empty, reply with: Usage: /mcp__docpull__docs_remove [--keep-cache]. +2. Parse the first token as the library alias. +3. If --keep-cache is present, call remove_source(name=, delete_cache=false). +4. Otherwise call remove_source(name=, delete_cache=true). +5. Relay the tool result plainly. If the source is builtin, explain that builtins cannot be removed. + +Do not delete files with shell commands; use remove_source for validated cache deletion. +""" + +PROMPTS: tuple[PromptSpec, ...] = ( + PromptSpec( + name="docs_add", + title="Add docs", + description="Fetch a built-in docs alias or register an HTTPS docs URL and index it locally.", + argument_description=" | | ", + template=_DOCS_ADD_TEMPLATE, + ), + PromptSpec( + name="docs_search", + title="Search docs", + description="Search fetched docs by regex and read surrounding context from the best hits.", + argument_description=" [library]", + template=_DOCS_SEARCH_TEMPLATE, + ), + PromptSpec( + name="docs_list", + title="List cached docs", + description="List documentation libraries currently cached locally.", + argument_description=None, + template=_DOCS_LIST_TEMPLATE, + ), + PromptSpec( + name="docs_refresh", + title="Refresh docs", + description="Force-refresh a cached documentation library.", + argument_description="", + template=_DOCS_REFRESH_TEMPLATE, + ), + PromptSpec( + name="docs_remove", + title="Remove docs", + description="Remove a user-defined source alias, optionally keeping cached docs.", + argument_description=" [--keep-cache]", + template=_DOCS_REMOVE_TEMPLATE, + ), +) + +_PROMPTS_BY_NAME = {prompt.name: prompt for prompt in PROMPTS} + + +def render_prompt(name: str, arguments: 
dict[str, str] | None = None) -> str: + """Render a prompt by name for the MCP server.""" + try: + prompt = _PROMPTS_BY_NAME[name] + except KeyError: + raise ValueError(f"Unknown prompt: {name}") from None + + raw_arguments = "" + if arguments: + raw_arguments = arguments.get("input", "") + + return prompt.template.format(arguments=raw_arguments or "(empty)") + + +__all__ = ["PROMPTS", "PromptSpec", "render_prompt"] diff --git a/src/docpull/mcp/server.py b/src/docpull/mcp/server.py index bed6b6d..3d81768 100644 --- a/src/docpull/mcp/server.py +++ b/src/docpull/mcp/server.py @@ -1,7 +1,8 @@ -"""stdio MCP server exposing docpull tools to AI agents. +"""stdio MCP server exposing docpull tools and prompts to AI agents. Requires the optional ``mcp`` Python package (install with -``pip install docpull[mcp]``). The server registers eight tools: +``pip install docpull[mcp]``). The server registers eight tools and five +workflow prompts. Read-only: - ``fetch_url(url)`` — one-shot fetch, no discovery. Agent-oriented fast path. 
@@ -24,6 +25,7 @@ import sys from typing import Any +from .prompts import PROMPTS, render_prompt from .tools import ( ToolResult, add_source, @@ -199,6 +201,15 @@ def _coerce_int(value: Any, *, name: str, default: int) -> int: raise ValueError(f"'{name}' must be an integer, got {type(value).__name__}") +def _coerce_bool(value: Any, *, name: str, default: bool) -> bool: + """Accept booleans only; reject truthy strings like ``"false"``.""" + if value is None: + return default + if isinstance(value, bool): + return value + raise ValueError(f"'{name}' must be a boolean, got {type(value).__name__}") + + def _require_str(arguments: dict[str, Any], key: str) -> str: if key not in arguments: raise ValueError(f"Missing required argument: '{key}'") @@ -212,7 +223,16 @@ async def _run_stdio() -> int: try: from mcp.server import Server from mcp.server.stdio import stdio_server - from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations + from mcp.types import ( + CallToolResult, + GetPromptResult, + Prompt, + PromptArgument, + PromptMessage, + TextContent, + Tool, + ToolAnnotations, + ) except ImportError: print( "docpull mcp requires the 'mcp' package. 
Install with: pip install docpull[mcp]", @@ -222,6 +242,45 @@ async def _run_stdio() -> int: server: Server = Server("docpull", instructions=SERVER_INSTRUCTIONS) + @server.list_prompts() # type: ignore[misc,no-untyped-call] + async def _list_prompts() -> list[Prompt]: + return [ + Prompt( + name=prompt.name, + title=prompt.title, + description=prompt.description, + arguments=( + [ + PromptArgument( + name="input", + description=prompt.argument_description, + required=False, + ) + ] + if prompt.argument_description is not None + else None + ), + ) + for prompt in PROMPTS + ] + + @server.get_prompt() # type: ignore[misc,no-untyped-call] + async def _get_prompt(name: str, arguments: dict[str, str] | None) -> GetPromptResult: + try: + text = render_prompt(name, arguments) + except ValueError as err: + text = str(err) + + return GetPromptResult( + description=f"docpull workflow: {name}", + messages=[ + PromptMessage( + role="user", + content=TextContent(type="text", text=text), + ) + ], + ) + @server.list_tools() # type: ignore[misc,no-untyped-call] async def _list_tools() -> list[Tool]: return [ @@ -405,9 +464,12 @@ async def _list_tools() -> list[Tool]: description=( "Add or update a user source alias in the writable " "sources.yaml. Refuses to shadow a builtin alias unless " - "force=true. URL is HTTPS-only and validated against the " - "same SSRF rules as fetch_url. Use list_sources to confirm " - "the change." + "force=true. URL is HTTPS-only and screened against the " + "source-registry hostname policy (localhost/internal " + "suffixes and literal private IPs are rejected). The " + "fetcher re-validates resolved addresses before any " + "network connection. Use list_sources to confirm the " + "change." 
), annotations=ToolAnnotations( title="Add or update a user source", @@ -518,7 +580,7 @@ async def _call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult: on_progress = await _make_progress_callback() result = await ensure_docs( source, - force=bool(arguments.get("force", False)), + force=_coerce_bool(arguments.get("force"), name="force", default=False), profile=arguments.get("profile"), on_progress=on_progress, ) @@ -538,7 +600,11 @@ async def _call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult: pattern, library=library, limit=_coerce_int(arguments.get("limit"), name="limit", default=20), - case_sensitive=bool(arguments.get("case_sensitive", False)), + case_sensitive=_coerce_bool( + arguments.get("case_sensitive"), + name="case_sensitive", + default=False, + ), context=_coerce_int(arguments.get("context"), name="context", default=1), ) elif name == "read_doc": @@ -568,13 +634,17 @@ async def _call_tool(name: str, arguments: dict[str, Any]) -> CallToolResult: description=description, category=category, max_pages=_coerce_int(max_pages, name="max_pages", default=0) or None, - force=bool(arguments.get("force", False)), + force=_coerce_bool(arguments.get("force"), name="force", default=False), ) elif name == "remove_source": rm_name = _require_str(arguments, "name") result = remove_source( rm_name, - delete_cache=bool(arguments.get("delete_cache", False)), + delete_cache=_coerce_bool( + arguments.get("delete_cache"), + name="delete_cache", + default=False, + ), ) else: result = ToolResult(f"Unknown tool: {name}", is_error=True) diff --git a/src/docpull/mcp/sources.py b/src/docpull/mcp/sources.py index 843a3f8..5b29141 100644 --- a/src/docpull/mcp/sources.py +++ b/src/docpull/mcp/sources.py @@ -59,7 +59,20 @@ class SourceConfig: _LIBRARY_NAME_RE = re.compile(r"^[a-zA-Z0-9_.-]+$") MAX_LIBRARY_NAME_LENGTH = 128 MAX_USER_SOURCE_PAGES = 100_000 -_USER_SOURCE_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"}) + + +def 
_registry_resolver(_hostname: str) -> list[str]: + """Avoid live DNS while parsing source aliases. + + Source aliases are configuration, not network I/O. Fetch-time validators + still resolve and screen the actual destination before connecting; the + registry only needs deterministic checks for URL shape, blocked hostnames, + and literal unsafe IPs. + """ + return ["93.184.216.34"] + + +_USER_SOURCE_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"}, resolver=_registry_resolver) def is_safe_library_name(name: str) -> bool: @@ -134,7 +147,13 @@ def load_user_sources(path: Path | None = None) -> dict[str, SourceConfig]: except yaml.YAMLError as err: logger.warning("Failed to parse %s: %s", path, err) return {} + if not isinstance(raw, dict): + logger.warning("Ignoring %s: root YAML value must be a mapping", path) + return {} entries = raw.get("sources") or {} + if not isinstance(entries, dict): + logger.warning("Ignoring %s: 'sources' must be a mapping", path) + return {} result: dict[str, SourceConfig] = {} for name, cfg in entries.items(): source_name = str(name) diff --git a/src/docpull/mcp/tools.py b/src/docpull/mcp/tools.py index c4641d1..1e7a900 100644 --- a/src/docpull/mcp/tools.py +++ b/src/docpull/mcp/tools.py @@ -31,6 +31,7 @@ from .sources import ( _URL_SCHEME_RE, BUILTIN_SOURCES, + _registry_resolver, all_sources, default_config_dir, default_docs_dir, @@ -42,6 +43,7 @@ logger = logging.getLogger(__name__) CACHE_TTL_SECONDS = 7 * 24 * 60 * 60 # 7 days +META_SCHEMA_VERSION = 1 MAX_GREP_PATTERN_LEN = 1000 GREP_TIMEOUT_SECONDS = 10.0 GREP_LINE_TIMEOUT_SECONDS = 0.05 @@ -74,7 +76,13 @@ def _source_dir(docs_dir: Path, source: str) -> Path: return docs_dir / source -def _cache_fresh(meta_path: Path) -> bool: +def _cache_fresh( + meta_path: Path, + *, + expected_url: str | None = None, + expected_profile: str | None = None, + expected_max_pages: int | None = None, +) -> bool: if not meta_path.exists(): return False try: @@ -86,10 +94,26 @@ def 
_cache_fresh(meta_path: Path) -> bool: return False if data.get("partial") is True: return False + if data.get("schema_version") != META_SCHEMA_VERSION: + return False + if expected_url is not None and data.get("url") != expected_url: + return False + if expected_profile is not None and data.get("profile") != expected_profile: + return False + if data.get("max_pages") != expected_max_pages: + return False return (time.time() - fetched_at) < CACHE_TTL_SECONDS -def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None: +def _write_meta( + meta_path: Path, + source: str, + url: str, + pages: int, + *, + profile: str, + max_pages: int | None, +) -> None: """Atomic-ish meta write: tmp file + rename so a crash mid-write never leaves a half-parsed JSON behind.""" meta_path.parent.mkdir(parents=True, exist_ok=True) @@ -99,6 +123,9 @@ def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None: { "source": source, "url": url, + "schema_version": META_SCHEMA_VERSION, + "profile": profile, + "max_pages": max_pages, "fetched_at_epoch": time.time(), "fetched_at": utc_now_iso(), "page_count": pages, @@ -109,7 +136,15 @@ def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None: os.replace(tmp, meta_path) -def _write_partial_meta(meta_path: Path, source: str, url: str, pages: int) -> None: +def _write_partial_meta( + meta_path: Path, + source: str, + url: str, + pages: int, + *, + profile: str, + max_pages: int | None, +) -> None: """Mark a fetch as partial. 
``_cache_fresh`` treats partial as stale.""" meta_path.parent.mkdir(parents=True, exist_ok=True) tmp = meta_path.with_suffix(meta_path.suffix + ".tmp") @@ -118,6 +153,9 @@ def _write_partial_meta(meta_path: Path, source: str, url: str, pages: int) -> N { "source": source, "url": url, + "schema_version": META_SCHEMA_VERSION, + "profile": profile, + "max_pages": max_pages, "fetched_at_epoch": time.time(), "fetched_at": utc_now_iso(), "page_count": pages, @@ -200,9 +238,19 @@ async def ensure_docs( target_dir = _source_dir(docs_dir, source) meta_path = _meta_path(docs_dir, source) - - if not force and _cache_fresh(meta_path) and target_dir.exists() and any(target_dir.rglob("*.md")): - files = list(target_dir.rglob("*.md")) + profile_name = profile_enum.value + + files = list(target_dir.rglob("*.md")) if target_dir.exists() else [] + if ( + not force + and _cache_fresh( + meta_path, + expected_url=resolved.url, + expected_profile=profile_name, + expected_max_pages=resolved.max_pages, + ) + and files + ): return ToolResult( f"Cached: {source} ({len(files)} files at {target_dir}). Call with force=true to refresh.", data={ @@ -233,11 +281,25 @@ async def ensure_docs( crashed = True # Mark whatever made it to disk as a partial fetch so future # ensure_docs calls re-fetch instead of trusting half a crawl. 
- _write_partial_meta(meta_path, source, resolved.url, fetched) + _write_partial_meta( + meta_path, + source, + resolved.url, + fetched, + profile=profile_name, + max_pages=resolved.max_pages, + ) raise if not crashed: - _write_meta(meta_path, source, resolved.url, stats.pages_fetched) + _write_meta( + meta_path, + source, + resolved.url, + stats.pages_fetched, + profile=profile_name, + max_pages=resolved.max_pages, + ) return ToolResult( f"Fetched {source}: {stats.pages_fetched} pages saved to {target_dir} " f"({stats.pages_skipped} skipped, {stats.pages_failed} failed).", @@ -649,7 +711,7 @@ def read_doc( MAX_DESCRIPTION_LEN = 500 ALLOWED_USER_CATEGORIES = {"frontend", "backend", "ai", "database", "user"} -_ADD_SOURCE_VALIDATOR = UrlValidator(allowed_schemes={"https"}) +_ADD_SOURCE_VALIDATOR = UrlValidator(allowed_schemes={"https"}, resolver=_registry_resolver) def _user_sources_path(config_dir: Path | None = None) -> Path: @@ -678,9 +740,12 @@ def _read_user_sources_raw(path: Path) -> dict[str, dict[str, Any]]: try: raw = yaml.safe_load(path.read_text()) or {} except yaml.YAMLError as err: - logger.warning("user sources.yaml is malformed; treating as empty: %s", err) - return {} + raise ValueError(f"user sources.yaml is malformed: {err}") from err + if not isinstance(raw, dict): + raise ValueError("user sources.yaml root must be a mapping") entries = raw.get("sources") or {} + if not isinstance(entries, dict): + raise ValueError("user sources.yaml 'sources' key must be a mapping") out: dict[str, dict[str, Any]] = {} for name, cfg in entries.items(): if isinstance(cfg, dict) and isinstance(cfg.get("url"), str): @@ -701,17 +766,15 @@ def add_source( """Add or update a user source alias in ``sources.yaml``. Refuses to shadow a builtin alias unless ``force=True`` (the agent is - explicitly choosing to override it). URL is validated with the same - SSRF rules ``fetch_url`` uses. + explicitly choosing to override it). 
Registration applies the same + static hostname/IP policy as the TS source resolver; fetch-time + validators still re-check resolved addresses before connecting. """ if not is_safe_library_name(name): return ToolResult( f"Invalid source name '{name}'. Use alnum + ``_ . -``, max 128 chars.", is_error=True, ) - validation = _ADD_SOURCE_VALIDATOR.validate(url) - if not validation.is_valid: - return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True) if description is not None and len(description) > MAX_DESCRIPTION_LEN: return ToolResult(f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True) if category is not None and category not in ALLOWED_USER_CATEGORIES: @@ -726,9 +789,15 @@ def add_source( f"'{name}' is a builtin source. Pass force=true to shadow it with a user override.", is_error=True, ) + validation = _ADD_SOURCE_VALIDATOR.validate(url) + if not validation.is_valid: + return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True) path = _user_sources_path(config_dir) - existing = _read_user_sources_raw(path) + try: + existing = _read_user_sources_raw(path) + except ValueError as err: + return ToolResult(f"Refusing to modify {path}: {err}.", is_error=True) replaced = name in existing entry: dict[str, Any] = {"url": url} @@ -782,7 +851,10 @@ def remove_source( ) path = _user_sources_path(config_dir) - existing = _read_user_sources_raw(path) + try: + existing = _read_user_sources_raw(path) + except ValueError as err: + return ToolResult(f"Refusing to modify {path}: {err}.", is_error=True) removed = name in existing if removed: del existing[name] diff --git a/src/docpull/metadata_extractor.py b/src/docpull/metadata_extractor.py index 98f6ba9..a232078 100644 --- a/src/docpull/metadata_extractor.py +++ b/src/docpull/metadata_extractor.py @@ -77,12 +77,12 @@ def extract(self, html: str, url: str) -> RichMetadata: # Extract JSON-LD data jsonld_data = data.get("json-ld", []) if jsonld_data and 
isinstance(jsonld_data, list): - metadata.update(self._extract_jsonld(jsonld_data)) # type: ignore[typeddict-item] + self._merge_missing(metadata, self._extract_jsonld(jsonld_data)) # Extract microdata microdata = data.get("microdata", []) if microdata and isinstance(microdata, list): - metadata.update(self._extract_microdata(microdata)) # type: ignore[typeddict-item] + self._merge_missing(metadata, self._extract_microdata(microdata)) except ImportError: logger.warning("extruct not installed, rich metadata extraction disabled") @@ -91,7 +91,13 @@ def extract(self, html: str, url: str) -> RichMetadata: return metadata - def _extract_opengraph(self, og_properties: list[dict[str, Any]]) -> dict[str, Any]: + def _merge_missing(self, target: RichMetadata, source: dict[str, Any]) -> None: + """Fill metadata fields without overwriting higher-priority sources.""" + for key, value in source.items(): + if value and not target.get(key): # type: ignore[literal-required] + target[key] = value # type: ignore[literal-required] + + def _extract_opengraph(self, og_properties: list[Any]) -> dict[str, Any]: """Extract Open Graph metadata. Args: @@ -102,17 +108,29 @@ def _extract_opengraph(self, og_properties: list[dict[str, Any]]) -> dict[str, A """ result: dict[str, Any] = {} - # Build dict from properties list + # Build dict from extruct's property list. Current extruct returns + # tuples like ("og:title", "Title"), while older/faked tests may use + # dicts. Preserve repeated values such as article:tag. 
og_dict: dict[str, Any] = {} + + def add_value(key: str, value: Any) -> None: + clean_key = key.replace("og:", "") + existing = og_dict.get(clean_key) + if existing is None: + og_dict[clean_key] = value + elif isinstance(existing, list): + existing.append(value) + else: + og_dict[clean_key] = [existing, value] + for prop in og_properties: if isinstance(prop, dict): for key, value in prop.items(): - # Handle both 'og:title' and 'title' formats - clean_key = key.replace("og:", "") - if isinstance(value, list) and len(value) > 0: - og_dict[clean_key] = value[0] - else: - og_dict[clean_key] = value + add_value(key, value) + elif isinstance(prop, (list, tuple)) and len(prop) == 2: + key, value = prop + if isinstance(key, str): + add_value(key, value) # Map OG fields to our metadata if "title" in og_dict: @@ -202,11 +220,24 @@ def _extract_jsonld(self, jsonld_list: list[dict[str, Any]]) -> dict[str, Any]: image = item["image"] if isinstance(image, dict): result["image"] = self._safe_string(image.get("url", "")) + elif isinstance(image, list): + result["image"] = self._extract_image_from_list(image) elif isinstance(image, str): result["image"] = self._safe_string(image) return result + def _extract_image_from_list(self, images: list[Any]) -> str: + """Extract the first useful image URL from JSON-LD image arrays.""" + for image in images: + if isinstance(image, dict): + value = self._safe_string(image.get("url", "")) + else: + value = self._safe_string(image) + if value: + return value + return "" + def _extract_microdata(self, microdata_list: list[dict[str, Any]]) -> dict[str, Any]: """Extract microdata metadata. 
diff --git a/src/docpull/models/__init__.py b/src/docpull/models/__init__.py index 68095aa..ff53332 100644 --- a/src/docpull/models/__init__.py +++ b/src/docpull/models/__init__.py @@ -15,6 +15,7 @@ ) from .events import EventType, FetchEvent, FetchStats, SkipReason from .profiles import PROFILES, apply_profile +from .run import RunIdentity __all__ = [ # Config @@ -34,6 +35,7 @@ "FetchEvent", "FetchStats", "SkipReason", + "RunIdentity", # Profiles "PROFILES", "apply_profile", diff --git a/src/docpull/models/config.py b/src/docpull/models/config.py index c906197..d4de295 100644 --- a/src/docpull/models/config.py +++ b/src/docpull/models/config.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any, Literal -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator class ProfileName(str, Enum): @@ -192,6 +192,16 @@ class OutputConfig(BaseModel): model_config = {"extra": "forbid"} + @model_validator(mode="after") + def _validate_chunk_and_skill_settings(self) -> OutputConfig: + if self.emit_chunks and self.max_tokens_per_file is None: + raise ValueError("emit_chunks requires max_tokens_per_file so chunked output actually exists") + + if self.skill_name is not None: + object.__setattr__(self, "naming_strategy", "hierarchical") + + return self + def _expand_env_var(value: str | None) -> str | None: """Expand environment variable references in a string. 
@@ -254,10 +264,12 @@ def model_post_init(self, __context: object) -> None: # Use object.__setattr__ to bypass frozen model if needed if self.token: object.__setattr__(self, "token", _expand_env_var(self.token)) + _reject_header_injection(self.token, "token") if self.password: object.__setattr__(self, "password", _expand_env_var(self.password)) if self.cookie: object.__setattr__(self, "cookie", _expand_env_var(self.cookie)) + _reject_header_injection(self.cookie, "cookie") if self.header_value: object.__setattr__(self, "header_value", _expand_env_var(self.header_value)) # Re-check after env var expansion (env vars could introduce CRLF) @@ -265,6 +277,37 @@ def model_post_init(self, __context: object) -> None: if self.header_name: _reject_header_injection(self.header_name, "header_name") + @model_validator(mode="after") + def _validate_auth_payload(self) -> AuthConfig: + if self.type == AuthType.NONE: + if any( + value is not None + for value in ( + self.token, + self.username, + self.password, + self.cookie, + self.header_name, + self.header_value, + ) + ): + raise ValueError("auth fields were provided but auth.type is 'none'") + return self + + if self.type == AuthType.BEARER and not self.token: + raise ValueError("auth.type 'bearer' requires token") + + if self.type == AuthType.BASIC and (not self.username or not self.password): + raise ValueError("auth.type 'basic' requires both username and password") + + if self.type == AuthType.COOKIE and not self.cookie: + raise ValueError("auth.type 'cookie' requires cookie") + + if self.type == AuthType.HEADER and (not self.header_name or not self.header_value): + raise ValueError("auth.type 'header' requires both header_name and header_value") + + return self + class NetworkConfig(BaseModel): """Configuration for HTTP client and network behavior.""" @@ -302,6 +345,15 @@ def _reject_insecure_tls(cls, value: bool) -> bool: def _reject_crlf_in_user_agent(cls, v: str | None, info: Any) -> str | None: return 
_reject_header_injection(v, info.field_name) + @model_validator(mode="after") + def _validate_proxy_dns_posture(self) -> NetworkConfig: + if self.require_pinned_dns and self.proxy is not None: + raise ValueError( + "require_pinned_dns is set but a proxy was configured. " + "Remove proxy or disable require_pinned_dns." + ) + return self + class PerformanceConfig(BaseModel): """Configuration for performance tuning.""" @@ -336,6 +388,12 @@ class CacheConfig(BaseModel): model_config = {"extra": "forbid"} + @model_validator(mode="after") + def _validate_resume_requires_cache(self) -> CacheConfig: + if self.resume and not self.enabled: + raise ValueError("cache.resume requires cache.enabled=True") + return self + class DocpullConfig(BaseModel): """ diff --git a/src/docpull/models/events.py b/src/docpull/models/events.py index 8e78123..1dfb71f 100644 --- a/src/docpull/models/events.py +++ b/src/docpull/models/events.py @@ -10,11 +10,15 @@ class SkipReason(str, Enum): """Reasons for skipping a URL during fetch.""" + URL_VALIDATION_FAILED = "url_validation_failed" ROBOTS_DISALLOWED = "robots_disallowed" ALREADY_FETCHED = "already_fetched" CACHE_UNCHANGED = "cache_unchanged" INVALID_CONTENT_TYPE = "invalid_content_type" DUPLICATE_CONTENT = "duplicate_content" + NO_CONTENT_EXTRACTED = "no_content_extracted" + JS_ONLY_SPA = "js_only_spa" + NO_CONTENT_TO_SAVE = "no_content_to_save" PATTERN_EXCLUDED = "pattern_excluded" MAX_DEPTH_EXCEEDED = "max_depth_exceeded" HTTP_ERROR = "http_error" @@ -90,6 +94,10 @@ class FetchEvent: # Progress tracking current: int | None = None total: int | None = None + processed_count: int | None = None + saved_count: int | None = None + skipped_count: int | None = None + failed_count: int | None = None # Typed payload fields for specific events bytes_downloaded: int | None = None diff --git a/src/docpull/models/profiles.py b/src/docpull/models/profiles.py index 8387828..363f436 100644 --- a/src/docpull/models/profiles.py +++ 
b/src/docpull/models/profiles.py @@ -45,7 +45,8 @@ }, }, ProfileName.LLM: { - # Token-aware output, streaming NDJSON, fail-loud on JS-only pages. + # Token-aware output, streaming NDJSON, and clear JS-only skips. + # Fail-loud JS handling remains an explicit --strict-js-required choice. # This is what "AI-ready Markdown" should actually mean: predictable # chunk sizes, stable hashes, one-record-per-line streaming. "crawl": { diff --git a/src/docpull/models/run.py b/src/docpull/models/run.py new file mode 100644 index 0000000..37e21a5 --- /dev/null +++ b/src/docpull/models/run.py @@ -0,0 +1,67 @@ +"""Run identity models for resume, cache, and output compatibility.""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + +from .config import DocpullConfig + + +class RunIdentity(BaseModel): + """Stable, non-secret description of a docpull run's semantics.""" + + schema_version: int = Field(1, description="Schema version for RunIdentity itself") + profile: str + start_url: str | None = None + max_pages: int | None = None + max_depth: int + include_paths: list[str] = Field(default_factory=list) + exclude_paths: list[str] = Field(default_factory=list) + output_format: str + naming_strategy: str + rich_metadata: bool + extractor: str + enable_special_cases: bool + strict_js_required: bool + max_tokens_per_file: int | None = None + emit_chunks: bool = False + tokenizer: str + auth_type: str + + @classmethod + def from_config(cls, config: DocpullConfig) -> RunIdentity: + return cls( + profile=config.profile.value, + start_url=config.url, + max_pages=config.crawl.max_pages, + max_depth=config.crawl.max_depth, + include_paths=sorted(config.crawl.include_paths), + exclude_paths=sorted(config.crawl.exclude_paths), + output_format=config.output.format, + naming_strategy=config.output.naming_strategy, + rich_metadata=config.output.rich_metadata, + extractor=config.content_filter.extractor, + 
enable_special_cases=config.content_filter.enable_special_cases, + strict_js_required=config.content_filter.strict_js_required, + max_tokens_per_file=config.output.max_tokens_per_file, + emit_chunks=config.output.emit_chunks, + tokenizer=config.output.tokenizer, + auth_type=config.auth.type.value, + ) + + def resume_fingerprint(self) -> dict[str, object]: + """Subset that affects traversal and resume safety.""" + return { + "version": 2, + "profile": self.profile, + "max_pages": self.max_pages, + "max_depth": self.max_depth, + "include_paths": self.include_paths, + "exclude_paths": self.exclude_paths, + "extractor": self.extractor, + "enable_special_cases": self.enable_special_cases, + "strict_js_required": self.strict_js_required, + "output_format": self.output_format, + "emit_chunks": self.emit_chunks, + "auth_type": self.auth_type, + } diff --git a/src/docpull/pipeline/base.py b/src/docpull/pipeline/base.py index 7fd1a5e..5bfbba7 100644 --- a/src/docpull/pipeline/base.py +++ b/src/docpull/pipeline/base.py @@ -4,10 +4,11 @@ from collections.abc import Callable from dataclasses import dataclass, field +from enum import Enum from pathlib import Path -from typing import Protocol, runtime_checkable +from typing import Any, Protocol, runtime_checkable -from ..models.events import EventType, FetchEvent +from ..models.events import EventType, FetchEvent, SkipReason # Type alias for event emitter function EventEmitter = Callable[[FetchEvent], None] @@ -42,16 +43,19 @@ class PageContext: markdown: str | None = None metadata: dict = field(default_factory=dict) title: str | None = None + extraction_info: dict[str, Any] = field(default_factory=dict) # Status should_skip: bool = False skip_reason: str | None = None + skip_code: SkipReason | None = None error: str | None = None # Additional data from fetch status_code: int | None = None content_type: str | None = None bytes_downloaded: int = 0 + persisted_path: Path | None = None # HTTP caching headers (for incremental 
updates) etag: str | None = None @@ -61,6 +65,46 @@ class PageContext: source_type: str | None = None chunks: list[object] = field(default_factory=list) + def mark_skipped(self, reason: str, code: SkipReason) -> None: + """Mark the page as a non-fatal skip with a typed reason.""" + self.should_skip = True + self.skip_reason = reason + self.skip_code = code + + def mark_failed(self, error: str) -> None: + """Mark the page as a failure.""" + self.error = error + self.should_skip = False + + +class PipelineStatus(str, Enum): + """Terminal outcome of a pipeline execution.""" + + SUCCEEDED = "succeeded" + SKIPPED = "skipped" + FAILED = "failed" + + +@dataclass +class PipelineResult: + """Typed result for a single pipeline execution.""" + + ctx: PageContext + status: PipelineStatus + failed_step: str | None = None + + @property + def succeeded(self) -> bool: + return self.status == PipelineStatus.SUCCEEDED + + @property + def skipped(self) -> bool: + return self.status == PipelineStatus.SKIPPED + + @property + def failed(self) -> bool: + return self.status == PipelineStatus.FAILED + @runtime_checkable class FetchStep(Protocol): @@ -140,12 +184,12 @@ class FetchPipeline: steps: list[FetchStep] - async def execute( + async def execute_result( self, url: str, output_path: Path, emit: EventEmitter | None = None, - ) -> PageContext: + ) -> PipelineResult: """ Execute the pipeline for a URL. @@ -155,19 +199,15 @@ async def execute( emit: Optional callback for emitting events Returns: - PageContext with final state (check error/should_skip for status) + Typed pipeline result with status and final context. 
""" ctx = PageContext(url=url, output_path=output_path) for step in self.steps: - if ctx.should_skip: - break - try: ctx = await step.execute(ctx, emit) except Exception as e: - ctx.error = f"{step.name}: {e}" - ctx.should_skip = True + ctx.mark_failed(f"{step.name}: {e}") # Emit failure event if emit: @@ -178,6 +218,20 @@ async def execute( error=ctx.error, ) ) - break + return PipelineResult(ctx=ctx, status=PipelineStatus.FAILED, failed_step=step.name) + + if ctx.error: + return PipelineResult(ctx=ctx, status=PipelineStatus.FAILED, failed_step=step.name) + if ctx.should_skip: + return PipelineResult(ctx=ctx, status=PipelineStatus.SKIPPED) + + return PipelineResult(ctx=ctx, status=PipelineStatus.SUCCEEDED) - return ctx + async def execute( + self, + url: str, + output_path: Path, + emit: EventEmitter | None = None, + ) -> PageContext: + """Compatibility wrapper returning only the final context.""" + return (await self.execute_result(url, output_path, emit)).ctx diff --git a/src/docpull/pipeline/steps/convert.py b/src/docpull/pipeline/steps/convert.py index cbea451..76225ec 100644 --- a/src/docpull/pipeline/steps/convert.py +++ b/src/docpull/pipeline/steps/convert.py @@ -16,7 +16,7 @@ looks_like_spa, looks_like_spa_output, ) -from ...models.events import EventType, FetchEvent +from ...models.events import EventType, FetchEvent, SkipReason from ..base import EventEmitter, PageContext if TYPE_CHECKING: @@ -257,6 +257,7 @@ def _handle_empty_content(self, ctx: PageContext, emit: EventEmitter | None) -> return ctx ctx.should_skip = True ctx.skip_reason = "JS-only SPA: no content without JS render" if is_spa else "No content extracted" + ctx.skip_code = SkipReason.JS_ONLY_SPA if is_spa else SkipReason.NO_CONTENT_EXTRACTED if is_spa: logger.warning("Likely JS-only SPA at %s (no server-rendered content)", ctx.url) else: @@ -267,6 +268,7 @@ def _handle_empty_content(self, ctx: PageContext, emit: EventEmitter | None) -> type=EventType.FETCH_SKIPPED, url=ctx.url, 
message=ctx.skip_reason, + skip_reason=ctx.skip_code, ) ) return ctx diff --git a/src/docpull/pipeline/steps/dedup.py b/src/docpull/pipeline/steps/dedup.py index b67d7e8..baad152 100644 --- a/src/docpull/pipeline/steps/dedup.py +++ b/src/docpull/pipeline/steps/dedup.py @@ -4,7 +4,7 @@ from ...cache import StreamingDeduplicator from ...conversion.chunking import _strip_frontmatter -from ...models.events import EventType, FetchEvent +from ...models.events import EventType, FetchEvent, SkipReason from ..base import EventEmitter, PageContext logger = logging.getLogger(__name__) @@ -85,8 +85,18 @@ async def execute( if not should_save and duplicate_of: ctx.should_skip = True ctx.skip_reason = f"Duplicate of {duplicate_of}" + ctx.skip_code = SkipReason.DUPLICATE_CONTENT if emit: + emit( + FetchEvent( + type=EventType.FETCH_SKIPPED, + url=ctx.url, + duplicate_of=duplicate_of, + message=f"Duplicate content (original: {duplicate_of})", + skip_reason=SkipReason.DUPLICATE_CONTENT, + ) + ) emit( FetchEvent( type=EventType.PAGE_DEDUPLICATED, diff --git a/src/docpull/pipeline/steps/fetch.py b/src/docpull/pipeline/steps/fetch.py index a52fec5..f2a9ff6 100644 --- a/src/docpull/pipeline/steps/fetch.py +++ b/src/docpull/pipeline/steps/fetch.py @@ -1,6 +1,7 @@ """FetchStep - HTTP fetching pipeline step.""" import logging +from pathlib import Path from typing import TYPE_CHECKING from ...http.protocols import HttpClient @@ -130,10 +131,12 @@ def _conditional_headers(self, url: str, output_path_exists: bool) -> dict[str, entry = self._cache_manager.manifest.get(url) if not entry: return {} + persisted_file = entry.get("file_path") + persisted_exists = isinstance(persisted_file, str) and Path(persisted_file).exists() # Force a fresh body when the cache has us on record but the file # is missing. Otherwise a 304 would short-circuit to skip and we'd # never write the file the user expects. 
- if not output_path_exists: + if not output_path_exists and not persisted_exists: return {} headers: dict[str, str] = {} etag = self._sanitize_validator(entry.get("etag")) @@ -202,6 +205,7 @@ async def execute( if response.status_code == 304: ctx.should_skip = True ctx.skip_reason = "Not modified (304)" + ctx.skip_code = SkipReason.CACHE_UNCHANGED logger.debug(f"304 Not Modified: {url}") if emit: emit( @@ -219,6 +223,7 @@ async def execute( if 400 <= response.status_code < 500: ctx.should_skip = True ctx.skip_reason = f"HTTP {response.status_code}" + ctx.skip_code = SkipReason.HTTP_ERROR logger.debug(f"Skipping {url}: HTTP {response.status_code}") if emit: @@ -228,6 +233,7 @@ async def execute( url=url, status_code=response.status_code, message=f"Skipped: HTTP {response.status_code}", + skip_reason=SkipReason.HTTP_ERROR, ) ) return ctx @@ -236,6 +242,7 @@ async def execute( if self._validate_content_type and not self._is_valid_content_type(response.content_type): ctx.should_skip = True ctx.skip_reason = f"Invalid content type: {response.content_type}" + ctx.skip_code = SkipReason.INVALID_CONTENT_TYPE logger.debug(f"Skipping {url}: invalid content type {response.content_type}") if emit: @@ -245,6 +252,7 @@ async def execute( url=url, content_type=response.content_type, message="Skipped: invalid content type", + skip_reason=SkipReason.INVALID_CONTENT_TYPE, ) ) return ctx @@ -278,15 +286,5 @@ async def execute( except Exception as e: logger.error(f"Fetch error for {url}: {e}") - if emit: - emit( - FetchEvent( - type=EventType.FETCH_FAILED, - url=url, - error=str(e), - message=f"Fetch failed: {e}", - ) - ) - # Re-raise to let pipeline handle it raise diff --git a/src/docpull/pipeline/steps/save.py b/src/docpull/pipeline/steps/save.py index 76a7e95..91cc762 100644 --- a/src/docpull/pipeline/steps/save.py +++ b/src/docpull/pipeline/steps/save.py @@ -4,7 +4,7 @@ import logging from pathlib import Path -from ...models.events import EventType, FetchEvent +from 
...models.events import EventType, FetchEvent, SkipReason from ..base import EventEmitter, PageContext logger = logging.getLogger(__name__) @@ -95,6 +95,9 @@ async def execute( Returns: PageContext (unchanged, or with error set) """ + if ctx.should_skip or ctx.error: + return ctx + url = ctx.url output_path = ctx.output_path @@ -107,6 +110,7 @@ async def execute( else: ctx.should_skip = True ctx.skip_reason = "No content to save" + ctx.skip_code = SkipReason.NO_CONTENT_TO_SAVE logger.warning(f"Skipping {url}: no content to save") if emit: @@ -115,6 +119,7 @@ async def execute( type=EventType.FETCH_SKIPPED, url=url, message="No content to save", + skip_reason=SkipReason.NO_CONTENT_TO_SAVE, ) ) return ctx @@ -132,11 +137,15 @@ async def execute( parent = validated_path.parent ext = validated_path.suffix or ".md" width = max(2, len(str(len(ctx.chunks) - 1))) + first_chunk_path: Path | None = None for chunk in ctx.chunks: idx = getattr(chunk, "index", 0) text = getattr(chunk, "text", "") chunk_path = parent / f"{stem}.{idx:0{width}d}{ext}" await asyncio.to_thread(chunk_path.write_text, text, encoding="utf-8") + if first_chunk_path is None: + first_chunk_path = chunk_path + ctx.persisted_path = first_chunk_path logger.info("Saved %d chunks: %s.*%s", len(ctx.chunks), parent / stem, ext) else: # Write full document (use asyncio.to_thread to avoid blocking) @@ -145,6 +154,7 @@ async def execute( content, encoding="utf-8", ) + ctx.persisted_path = validated_path logger.info(f"Saved: {validated_path}") # Snapshot the first successful page's metadata for SKILL.md. 
@@ -169,33 +179,12 @@ async def execute( ctx.error = str(e) ctx.should_skip = True logger.error(f"Path validation failed for {url}: {e}") - - if emit: - emit( - FetchEvent( - type=EventType.FETCH_FAILED, - url=url, - error=str(e), - message=f"Path validation failed: {e}", - ) - ) raise except OSError as e: # File system error ctx.error = f"Failed to save: {e}" logger.error(f"Failed to save {url} to {output_path}: {e}") - - if emit: - emit( - FetchEvent( - type=EventType.FETCH_FAILED, - url=url, - output_path=output_path, - error=str(e), - message=f"Failed to save: {e}", - ) - ) raise def finalize(self) -> None: diff --git a/src/docpull/pipeline/steps/save_json.py b/src/docpull/pipeline/steps/save_json.py index d3a2f72..489abed 100644 --- a/src/docpull/pipeline/steps/save_json.py +++ b/src/docpull/pipeline/steps/save_json.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import contextlib import json import logging @@ -63,6 +64,7 @@ def __init__( self._temp_file: TextIO | None = None self._temp_path: str | None = None self._first_doc = True + self._lock = asyncio.Lock() def _ensure_temp_file(self) -> TextIO: """Create temp file for streaming writes if not already open.""" @@ -79,6 +81,19 @@ def _ensure_temp_file(self) -> TextIO: self._first_doc = True return self._temp_file + def _write_document(self, doc: dict[str, object]) -> None: + """Append one document to the temporary JSON stream.""" + f = self._ensure_temp_file() + + if not self._first_doc: + f.write(",\n") + self._first_doc = False + + doc_json = json.dumps(doc, indent=2, ensure_ascii=False) + indented = "\n".join(" " + line for line in doc_json.split("\n")) + f.write(indented) + f.flush() + async def execute( self, ctx: PageContext, @@ -94,10 +109,10 @@ async def execute( Returns: PageContext (unchanged) """ - if ctx.should_skip or not ctx.markdown: + if ctx.should_skip or ctx.error or not ctx.markdown: return ctx - doc = { + doc: dict[str, object] = { "url": ctx.url, "title": 
ctx.title, "content": ctx.markdown, @@ -105,20 +120,10 @@ async def execute( "fetched_at": utc_now_iso(), } - f = self._ensure_temp_file() - - # Write comma separator between documents - if not self._first_doc: - f.write(",\n") - self._first_doc = False - - # Write document with indentation - doc_json = json.dumps(doc, indent=2, ensure_ascii=False) - # Indent each line by 4 spaces (2 for documents array + 2 for item) - indented = "\n".join(" " + line for line in doc_json.split("\n")) - f.write(indented) - - self._document_count += 1 + async with self._lock: + await asyncio.to_thread(self._write_document, doc) + self._document_count += 1 + ctx.persisted_path = self._output_file if emit: emit( diff --git a/src/docpull/pipeline/steps/save_ndjson.py b/src/docpull/pipeline/steps/save_ndjson.py index 3df3c94..f7a1e1c 100644 --- a/src/docpull/pipeline/steps/save_ndjson.py +++ b/src/docpull/pipeline/steps/save_ndjson.py @@ -98,6 +98,7 @@ async def execute( record["hash"] = hashlib.sha256(ctx.markdown.encode("utf-8")).hexdigest() self._write_record(record) self._document_count += 1 + ctx.persisted_path = self._output_path if emit: emit( diff --git a/src/docpull/pipeline/steps/save_sqlite.py b/src/docpull/pipeline/steps/save_sqlite.py index aea7bf6..ff136d8 100644 --- a/src/docpull/pipeline/steps/save_sqlite.py +++ b/src/docpull/pipeline/steps/save_sqlite.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import json import logging import sqlite3 @@ -54,12 +55,13 @@ def __init__( self._conn: sqlite3.Connection | None = None self._document_count = 0 self._pending_count = 0 # Track uncommitted documents + self._lock = asyncio.Lock() def _ensure_db(self) -> sqlite3.Connection: """Ensure the database and table exist.""" if self._conn is None: self._base_dir.mkdir(parents=True, exist_ok=True) - self._conn = sqlite3.connect(self._db_path) + self._conn = sqlite3.connect(self._db_path, check_same_thread=False) # Create table with index on URL 
self._conn.execute(""" @@ -79,6 +81,29 @@ def _ensure_db(self) -> sqlite3.Connection: return self._conn + def _insert_document(self, ctx: PageContext) -> bool: + """Insert one document and batch-commit when needed.""" + conn = self._ensure_db() + cursor = conn.execute( + """INSERT OR IGNORE INTO documents + (url, title, content, metadata, fetched_at) + VALUES (?, ?, ?, ?, ?)""", + ( + ctx.url, + ctx.title, + ctx.markdown, + json.dumps(ctx.metadata, ensure_ascii=False), + utc_now_iso(), + ), + ) + inserted = cursor.rowcount > 0 + if inserted: + self._pending_count += 1 + if self._pending_count >= self.BATCH_SIZE: + conn.commit() + self._pending_count = 0 + return inserted + async def execute( self, ctx: PageContext, @@ -94,33 +119,15 @@ async def execute( Returns: PageContext (unchanged) """ - if ctx.should_skip or not ctx.markdown: + if ctx.should_skip or ctx.error or not ctx.markdown: return ctx - conn = self._ensure_db() - try: - cursor = conn.execute( - """INSERT OR IGNORE INTO documents - (url, title, content, metadata, fetched_at) - VALUES (?, ?, ?, ?, ?)""", - ( - ctx.url, - ctx.title, - ctx.markdown, - json.dumps(ctx.metadata, ensure_ascii=False), - utc_now_iso(), - ), - ) - # Only count if a row was actually inserted (not ignored) - if cursor.rowcount > 0: - self._document_count += 1 - self._pending_count += 1 - - # Batch commits for performance - if self._pending_count >= self.BATCH_SIZE: - conn.commit() - self._pending_count = 0 + async with self._lock: + inserted = await asyncio.to_thread(self._insert_document, ctx) + if inserted: + self._document_count += 1 + ctx.persisted_path = self._db_path if emit: emit( diff --git a/src/docpull/pipeline/steps/validate.py b/src/docpull/pipeline/steps/validate.py index eaa9c66..d9494a3 100644 --- a/src/docpull/pipeline/steps/validate.py +++ b/src/docpull/pipeline/steps/validate.py @@ -1,8 +1,10 @@ """ValidateStep - URL validation pipeline step.""" import logging +from urllib.parse import urlparse -from 
...models.events import EventType, FetchEvent +from ...http.rate_limiter import PerHostRateLimiter +from ...models.events import EventType, FetchEvent, SkipReason from ...security.robots import RobotsChecker from ...security.url_validator import UrlValidator from ..base import EventEmitter, PageContext @@ -36,6 +38,7 @@ def __init__( self, url_validator: UrlValidator, robots_checker: RobotsChecker, + rate_limiter: PerHostRateLimiter | None = None, check_existing: bool = True, cache_enabled: bool = False, ) -> None: @@ -45,6 +48,7 @@ def __init__( Args: url_validator: UrlValidator instance for security checks robots_checker: RobotsChecker instance for robots.txt compliance + rate_limiter: Optional rate limiter to update from Crawl-delay check_existing: If True AND ``cache_enabled`` is False, skip URLs where the output file already exists. When caching is enabled, freshness is owned by FetchStep's conditional GET. @@ -54,6 +58,7 @@ def __init__( """ self._url_validator = url_validator self._robots_checker = robots_checker + self._rate_limiter = rate_limiter self._check_existing = check_existing and not cache_enabled async def execute( @@ -78,6 +83,7 @@ async def execute( if not validation_result.is_valid: ctx.should_skip = True ctx.skip_reason = f"URL validation failed: {validation_result.rejection_reason}" + ctx.skip_code = SkipReason.URL_VALIDATION_FAILED logger.debug(f"Skipping {url}: {validation_result.rejection_reason}") if emit: @@ -86,6 +92,7 @@ async def execute( type=EventType.FETCH_SKIPPED, url=url, message=f"URL validation failed: {validation_result.rejection_reason}", + skip_reason=SkipReason.URL_VALIDATION_FAILED, ) ) return ctx @@ -94,6 +101,7 @@ async def execute( if not self._robots_checker.is_allowed(url): ctx.should_skip = True ctx.skip_reason = "Blocked by robots.txt" + ctx.skip_code = SkipReason.ROBOTS_DISALLOWED logger.debug(f"Skipping {url}: blocked by robots.txt") if emit: @@ -102,14 +110,22 @@ async def execute( type=EventType.FETCH_SKIPPED, 
url=url, message="Blocked by robots.txt", + skip_reason=SkipReason.ROBOTS_DISALLOWED, ) ) return ctx + if self._rate_limiter is not None: + crawl_delay = self._robots_checker.get_crawl_delay(url) + hostname = urlparse(url).hostname + if crawl_delay is not None and hostname: + self._rate_limiter.update_host_config(hostname, delay=crawl_delay) + # 3. Check if output file already exists if self._check_existing and ctx.output_path.exists(): ctx.should_skip = True ctx.skip_reason = "Output file already exists" + ctx.skip_code = SkipReason.FILE_EXISTS logger.debug(f"Skipping {url}: output file exists at {ctx.output_path}") if emit: @@ -119,6 +135,7 @@ async def execute( url=url, output_path=ctx.output_path, message="Output file already exists", + skip_reason=SkipReason.FILE_EXISTS, ) ) return ctx diff --git a/src/docpull/security/robots.py b/src/docpull/security/robots.py index 8f40632..d803fa5 100644 --- a/src/docpull/security/robots.py +++ b/src/docpull/security/robots.py @@ -3,6 +3,7 @@ from __future__ import annotations import http.client +import ipaddress import logging import socket import ssl @@ -118,7 +119,7 @@ def __init__( self.user_agent = user_agent self.timeout = timeout self.logger = logger or logging.getLogger(__name__) - self._url_validator = url_validator + self._url_validator = url_validator or UrlValidator(allowed_schemes={"https"}) self._max_redirects = max_redirects if allow_insecure_tls: @@ -129,9 +130,6 @@ def __init__( def _validate_url(self, url: str) -> bool: """Validate robots URLs before requesting them or following redirects.""" - if self._url_validator is None: - return True - result = self._url_validator.validate(url) if result.is_valid: return True @@ -142,11 +140,38 @@ def _validate_url(self, url: str) -> bool: def _get_robots_url(self, url: str) -> str: """Get robots.txt URL for a given page URL.""" parsed = urlparse(url) - return f"{parsed.scheme}://{parsed.netloc}/robots.txt" + hostname = parsed.hostname + if hostname is None: + return 
f"{parsed.scheme.lower()}://{parsed.netloc}/robots.txt" + + netloc = self._format_netloc(hostname, parsed.port) + + return f"{parsed.scheme.lower()}://{netloc}/robots.txt" def _get_domain(self, url: str) -> str: """Extract domain from URL.""" - return urlparse(url).netloc + parsed = urlparse(url) + hostname = parsed.hostname + if hostname is None: + return parsed.netloc.lower() + + return self._format_netloc(hostname, parsed.port) + + @staticmethod + def _format_netloc(hostname: str, port: int | None) -> str: + """Format a canonical host[:port] string, preserving IPv6 brackets.""" + host = hostname + try: + ip = ipaddress.ip_address(hostname) + except ValueError: + ip = None + + if isinstance(ip, ipaddress.IPv6Address): + host = f"[{hostname}]" + + if port is None or port == 443: + return host + return f"{host}:{port}" def _fetch_robots(self, domain: str, robots_url: str) -> _RobotsCacheEntry: """ @@ -206,18 +231,7 @@ def _fetch_robots(self, domain: str, robots_url: str) -> _RobotsCacheEntry: def _resolve_addresses(self, hostname: str) -> list[str]: """Resolve hostnames through the validator so the connect path stays pinned.""" - if self._url_validator is not None: - return self._url_validator.resolve_allowed_addresses(hostname) - - addresses: set[str] = set() - for family, _, _, _, sockaddr in socket.getaddrinfo(hostname, None, type=socket.SOCK_STREAM): - if family in {socket.AF_INET, socket.AF_INET6}: - addresses.add(str(sockaddr[0])) - - if not addresses: - raise OSError(f"No addresses found for {hostname}") - - return sorted(addresses) + return self._url_validator.resolve_allowed_addresses(hostname) def _build_ssl_context(self) -> ssl.SSLContext: return ssl.create_default_context() diff --git a/src/docpull/security/url_validator.py b/src/docpull/security/url_validator.py index 09ee597..8a41645 100644 --- a/src/docpull/security/url_validator.py +++ b/src/docpull/security/url_validator.py @@ -5,6 +5,7 @@ import ipaddress import logging import socket +import time 
from collections.abc import Callable from dataclasses import dataclass from urllib.parse import urlparse @@ -47,13 +48,24 @@ class UrlValidator: # Default security settings DEFAULT_ALLOWED_SCHEMES = {"https"} - INTERNAL_SUFFIXES = {".internal", ".local", ".localhost", ".localdomain"} + INTERNAL_SUFFIXES = { + ".internal", + ".local", + ".localhost", + ".localdomain", + ".lan", + # Wildcard rebinding domains that encode arbitrary IPs in the hostname. + ".nip.io", + ".sslip.io", + ".xip.io", + } LOCALHOST_NAMES = {"localhost", "localhost.localdomain"} # RFC 6598 carrier-grade NAT / shared address space. Python's ``ipaddress`` # does not flag 100.64.0.0/10 as private, but it is non-globally-routable and # is used as internal address space by many cloud and Kubernetes networks, so # we block it the same way the TypeScript MCP gate does (``isCGNAT()``). _CGNAT_NETWORK = ipaddress.ip_network("100.64.0.0/10") + _RESOLUTION_CACHE_TTL_SECONDS = 1.0 def __init__( self, @@ -73,10 +85,15 @@ def __init__( logger: Optional logger for validation messages """ self.allowed_schemes = allowed_schemes or self.DEFAULT_ALLOWED_SCHEMES - self.allowed_domains = allowed_domains + self.allowed_domains = ( + {self._normalize_hostname(domain) for domain in allowed_domains} + if allowed_domains is not None + else None + ) self.block_private_ips = block_private_ips self.logger = logger or logging.getLogger(__name__) self._resolver = resolver or self._resolve_hostname + self._resolution_cache: dict[str, tuple[float, list[str], UrlValidationResult | None]] = {} def validate(self, url: str) -> UrlValidationResult: """ @@ -191,7 +208,7 @@ def resolve_allowed_addresses(self, hostname: str) -> list[str]: raise ValueError(f"No addresses found for {normalized}") return addresses - addresses, rejection = self._resolve_and_screen(normalized) + addresses, rejection = self._resolve_and_screen_cached(normalized) if rejection is not None: raise ValueError(rejection.rejection_reason or "Hostname failed 
validation") return addresses @@ -271,9 +288,22 @@ def _check_resolved_addresses(self, hostname: str) -> UrlValidationResult | None else: return None # Literal IPs are screened by _check_ip_address. - _addresses, rejection = self._resolve_and_screen(hostname) + _addresses, rejection = self._resolve_and_screen_cached(hostname) return rejection + def _resolve_and_screen_cached(self, hostname: str) -> tuple[list[str], UrlValidationResult | None]: + """Resolve and screen a hostname once per short-lived TTL window.""" + cached = self._resolution_cache.get(hostname) + now = time.monotonic() + if cached is not None: + cached_at, addresses, rejection = cached + if now - cached_at <= self._RESOLUTION_CACHE_TTL_SECONDS: + return addresses, rejection + + addresses, rejection = self._resolve_and_screen(hostname) + self._resolution_cache[hostname] = (now, addresses, rejection) + return addresses, rejection + def _resolve_and_screen(self, hostname: str) -> tuple[list[str], UrlValidationResult | None]: """Resolve ``hostname`` once and screen every answer against the policy. diff --git a/tests/benchmarks/test_10k_pages.py b/tests/benchmarks/test_10k_pages.py index 1e59888..ee523c3 100644 --- a/tests/benchmarks/test_10k_pages.py +++ b/tests/benchmarks/test_10k_pages.py @@ -19,6 +19,7 @@ from __future__ import annotations import json +import os import resource import statistics import sys @@ -77,6 +78,12 @@ def _body_for(index: int) -> str: ) +def _duplicate_count(page_count: int = PAGE_COUNT) -> int: + """Return the exact duplicate count generated by the synthetic server.""" + dup_step = int(1 / DUPLICATE_FRACTION) + return sum(1 for i in range(1, page_count) if i % dup_step == 0) + + def _peak_rss_bytes() -> int: """Process peak RSS in bytes. 
macOS ru_maxrss is bytes, Linux is KiB.""" rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss @@ -122,7 +129,13 @@ async def robots_handler(_request: web.Request) -> web.Response: runner = web.AppRunner(app) await runner.setup() site = web.TCPSite(runner, "127.0.0.1", 0) - await site.start() + try: + await site.start() + except PermissionError as exc: + await runner.cleanup() + if os.environ.get("CI") == "true": + raise + pytest.skip(f"localhost bind unavailable: {exc}") port = site._server.sockets[0].getsockname()[1] # type: ignore[union-attr] # Same monkeypatches as test_cache_conditional_get: allow http://127.0.0.1. @@ -196,7 +209,8 @@ async def test_10k_pages_end_to_end(server, tmp_path: Path) -> None: manifest_path = tmp_path / "cache" / "manifest.json" manifest_size = manifest_path.stat().st_size if manifest_path.exists() else 0 - expected_unique = PAGE_COUNT - int(PAGE_COUNT * DUPLICATE_FRACTION) + expected_duplicates = _duplicate_count() + expected_unique = PAGE_COUNT - expected_duplicates discovery_secs = ( (discovery_complete - discovery_started) if discovery_started and discovery_complete else 0.0 ) @@ -208,6 +222,7 @@ async def test_10k_pages_end_to_end(server, tmp_path: Path) -> None: "pages_failed": fetcher.stats.pages_failed, "duplicates_detected": duplicates_seen, "expected_unique_pages": expected_unique, + "expected_duplicates": expected_duplicates, "wall_seconds": round(wall, 2), "discovery_seconds": round(discovery_secs, 2), "fetch_seconds": round(wall - discovery_secs, 2), @@ -228,7 +243,7 @@ async def test_10k_pages_end_to_end(server, tmp_path: Path) -> None: # Fetched + skipped (dedup hits skip with should_skip) should equal total. assert fetcher.stats.pages_fetched + fetcher.stats.pages_skipped == PAGE_COUNT, report # Dedup detected something close to the injected rate. 
- assert duplicates_seen >= int(PAGE_COUNT * DUPLICATE_FRACTION * 0.9), report + assert duplicates_seen >= int(expected_duplicates * 0.9), report # Memory ceiling: fail if we burn more than 200 MiB on this workload. # Real number on a clean run should land well under 100 MiB. assert (rss_peak - rss_baseline) < 200 * 1024 * 1024, f"RSS regression: {report}" diff --git a/tests/benchmarks/test_performance.py b/tests/benchmarks/test_performance.py index 28d10f5..fa38fc2 100644 --- a/tests/benchmarks/test_performance.py +++ b/tests/benchmarks/test_performance.py @@ -7,8 +7,7 @@ - Deduplication performance - Configuration parsing speed -Run with: pytest tests/benchmarks/ -v --benchmark-only -Or with: python -m pytest tests/benchmarks/ -v +Run with: python -m pytest tests/benchmarks/test_performance.py -v -s """ import gc @@ -63,6 +62,13 @@ LARGE_HTML_BYTES = LARGE_HTML.encode() +def _deep_size_of_strings_map(mapping: dict[str, str]) -> int: + """Approximate the bytes used by a string-to-string dict and its contents.""" + return sys.getsizeof(mapping) + sum( + sys.getsizeof(key) + sys.getsizeof(value) for key, value in mapping.items() + ) + + class TestConversionPerformance: """Benchmarks for HTML to Markdown conversion.""" @@ -158,7 +164,7 @@ def test_streaming_dedup_memory_efficiency(self): # Measure memory before gc.collect() - mem_before = sys.getsizeof(dedup._seen) + mem_before = _deep_size_of_strings_map(dedup._seen) # Add many entries (using sync compute_hash for simplicity) for i in range(10000): @@ -167,13 +173,13 @@ def test_streaming_dedup_memory_efficiency(self): # Measure memory after gc.collect() - mem_after = sys.getsizeof(dedup._seen) + mem_after = _deep_size_of_strings_map(dedup._seen) - # Memory should grow linearly with entries (hash size is fixed) - # Each hash is ~64 bytes, so 10000 entries should be ~640KB + # Memory should grow linearly with entries: each entry stores one + # fixed-size SHA-256 hex digest plus one representative URL. 
mem_growth = mem_after - mem_before print(f"\nStreamingDeduplicator memory: {mem_growth / 1024:.1f}KB for 10000 entries") - assert mem_growth < 2 * 1024 * 1024, "Memory should be <2MB for 10000 entries" + assert mem_growth < 3 * 1024 * 1024, "Memory should be <3MB for 10000 entries" class TestPipelinePerformance: diff --git a/tests/conftest.py b/tests/conftest.py index ea0cfab..e065594 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,11 +9,11 @@ def pytest_ignore_collect(collection_path: object, config: pytest.Config) -> bool: - """Keep the long benchmark out of default test collection unless enabled.""" + """Keep benchmark tests out of default collection unless enabled.""" path = Path(str(collection_path)) - if os.environ.get("DOCPULL_BENCHMARK_10K") == "1": + if os.environ.get("DOCPULL_BENCHMARKS") == "1" or os.environ.get("DOCPULL_BENCHMARK_10K") == "1": return False - return path.name == "test_10k_pages.py" and path.parent.name == "benchmarks" + return path.parent.name == "benchmarks" @pytest.fixture diff --git a/tests/test_cache_conditional_get.py b/tests/test_cache_conditional_get.py index 6707b82..bb29316 100644 --- a/tests/test_cache_conditional_get.py +++ b/tests/test_cache_conditional_get.py @@ -69,7 +69,11 @@ async def robots(_request: web.Request) -> web.Response: runner = web.AppRunner(app) await runner.setup() site = web.TCPSite(runner, "127.0.0.1", 0) - await site.start() + try: + await site.start() + except PermissionError as err: + await runner.cleanup() + pytest.skip(f"localhost bind unavailable in this environment: {err}") server_socket = site._server.sockets[0] # type: ignore[union-attr] port = server_socket.getsockname()[1] @@ -84,6 +88,11 @@ def permissive_validate(self, hostname): # type: ignore[no-untyped-def] return UrlValidationResult.valid() monkeypatch.setattr(UrlValidator, "validate_hostname", permissive_validate) + monkeypatch.setattr( + UrlValidator, + "resolve_allowed_addresses", + lambda self, hostname: ["127.0.0.1"], + 
) # Also override scheme to allow http (test server is plain HTTP). original_init = UrlValidator.__init__ @@ -206,3 +215,33 @@ async def test_missing_output_file_forces_full_fetch(server, tmp_path: Path, mon # And no conditional header should have been sent. no_ifmatch = [h for h in server["request_log"] if "If-None-Match" not in h] assert no_ifmatch, "expected at least one unconditional request" + + +@pytest.mark.asyncio +async def test_chunked_output_uses_conditional_get_from_persisted_chunk(server, tmp_path: Path): + """Chunked markdown output should still send validators on the second run.""" + output_dir = tmp_path / "out" + cache_dir = tmp_path / "cache" + cfg = DocpullConfig( + url=server["url"], + output={ + "directory": output_dir, + "max_tokens_per_file": 10, + "emit_chunks": True, + }, + cache={"enabled": True, "directory": cache_dir, "skip_unchanged": True}, + crawl={"max_pages": 1, "max_depth": 1}, + ) + + fetched1, _ = await _run(cfg) + assert fetched1 == 1 + chunk_files = list(output_dir.glob("*.md")) + assert chunk_files, f"expected chunk files in {output_dir}" + + server["request_log"].clear() + + fetched2, skip_reasons = await _run(cfg) + assert fetched2 == 0 + assert SkipReason.CACHE_UNCHANGED in skip_reasons + headers_seen = [h.get("If-None-Match") for h in server["request_log"] if "If-None-Match" in h] + assert headers_seen == ['"abc123"'] diff --git a/tests/test_cache_manager.py b/tests/test_cache_manager.py new file mode 100644 index 0000000..edaf318 --- /dev/null +++ b/tests/test_cache_manager.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import json +from datetime import timedelta + +from docpull.cache import CacheManager +from docpull.time_utils import utc_now + + +def test_cache_state_persists_deterministically(tmp_path): + cache = CacheManager(tmp_path) + + cache.mark_fetched("https://example.com/b") + cache.mark_fetched("https://example.com/a") + cache.mark_failed("https://example.com/d") + 
cache.mark_failed("https://example.com/c") + cache.flush() + + state = json.loads((tmp_path / "state.json").read_text()) + assert state["fetched_urls"] == ["https://example.com/a", "https://example.com/b"] + assert state["failed_urls"] == ["https://example.com/c", "https://example.com/d"] + + +def test_mark_fetched_clears_stale_failure_state(tmp_path): + cache = CacheManager(tmp_path) + + cache.mark_failed("https://example.com/page") + cache.mark_fetched("https://example.com/page") + cache.flush() + + state = json.loads((tmp_path / "state.json").read_text()) + assert state["fetched_urls"] == ["https://example.com/page"] + assert state["failed_urls"] == [] + + +def test_mark_failed_clears_stale_fetched_state(tmp_path): + cache = CacheManager(tmp_path) + + cache.mark_fetched("https://example.com/page") + cache.mark_failed("https://example.com/page") + cache.flush() + + state = json.loads((tmp_path / "state.json").read_text()) + assert state["fetched_urls"] == [] + assert state["failed_urls"] == ["https://example.com/page"] + + +def test_update_cache_records_byte_size_for_text(tmp_path): + cache = CacheManager(tmp_path) + + cache.update_cache("https://example.com/page", "snowman: \u2603", tmp_path / "page.md") + + assert cache.manifest["https://example.com/page"]["size"] == len("snowman: \u2603".encode()) + + +def test_evict_expired_removes_matching_resume_state(tmp_path): + cache = CacheManager(tmp_path, ttl_days=30) + url = "https://example.com/old" + old_timestamp = (utc_now() - timedelta(days=31)).isoformat() + cache.manifest[url] = { + "checksum": "abc", + "file_path": "old.md", + "fetched_at": old_timestamp, + "size": 3, + } + cache.mark_fetched(url) + + evicted = cache.evict_expired() + + assert evicted == 1 + assert url not in cache.manifest + assert url not in cache.get_fetched_urls() + + +def test_malformed_manifest_and_state_load_as_empty_cache(tmp_path): + (tmp_path / "manifest.json").write_text('["not", "a", "manifest"]') + (tmp_path / 
"state.json").write_text( + json.dumps( + { + "fetched_urls": "https://example.com/not-a-list", + "failed_urls": [1, "https://example.com/failed"], + "last_run": 123, + } + ) + ) + + cache = CacheManager(tmp_path) + + assert cache.manifest == {} + assert cache.get_fetched_urls() == set() + cache.flush() + + +def test_discovered_urls_loader_ignores_non_string_urls(tmp_path): + cache = CacheManager(tmp_path) + (tmp_path / "discovered_urls.json").write_text( + json.dumps( + { + "start_url": "https://example.com", + "urls": ["https://example.com/a", 1, None, "https://example.com/b"], + } + ) + ) + + assert cache.load_discovered_urls("https://example.com") == [ + "https://example.com/a", + "https://example.com/b", + ] + + +def test_discovered_urls_loader_rejects_mismatched_fingerprint(tmp_path): + cache = CacheManager(tmp_path) + (tmp_path / "discovered_urls.json").write_text( + json.dumps( + { + "start_url": "https://example.com", + "config_fingerprint": {"version": 1, "max_depth": 2}, + "urls": ["https://example.com/a"], + } + ) + ) + + assert ( + cache.load_discovered_urls( + "https://example.com", + config_fingerprint={"version": 1, "max_depth": 3}, + ) + is None + ) + + +def test_discovered_urls_round_trip_persists_fingerprint(tmp_path): + cache = CacheManager(tmp_path) + fingerprint = {"version": 1, "max_depth": 3, "include_paths": ["/docs/*"]} + + cache.save_discovered_urls( + ["https://example.com/a"], + "https://example.com", + config_fingerprint=fingerprint, + ) + + assert cache.load_discovered_urls("https://example.com", config_fingerprint=fingerprint) == [ + "https://example.com/a" + ] diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 979894a..a08ea65 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -50,6 +50,25 @@ def test_chunk_heading_captured(): assert any(c.heading and "Second" in c.heading for c in chunks) +def test_chunk_heading_matches_buffer_start_when_next_section_flushes(): + md = "# First\n\n" + ("alpha " * 20) 
+ "\n\n## Second\n\n" + ("beta " * 20) + chunks = chunk_markdown(md, max_tokens=25) + + assert chunks[0].heading == "First" + assert "Second" not in chunks[0].text + assert any(chunk.heading == "Second" for chunk in chunks[1:]) + + +def test_frontmatter_is_counted_in_first_chunk_budget(): + md = "---\ntitle: " + ("x" * 400) + "\n---\n\n# H\n\n" + ("word " * 80) + chunks = chunk_markdown(md, max_tokens=40) + + assert chunks[0].text.startswith("---") + assert chunks[0].token_count == TokenCounter().count(chunks[0].text) + assert chunks[0].token_count > 40 + assert len(chunks) > 1 + + def test_oversize_paragraph_becomes_own_chunk(): huge = "word " * 10000 md = f"# H\n\n{huge}\n" diff --git a/tests/test_ci_policy.py b/tests/test_ci_policy.py index 6e7e535..0cd18a3 100644 --- a/tests/test_ci_policy.py +++ b/tests/test_ci_policy.py @@ -36,3 +36,52 @@ def test_publish_workflow_is_tag_only() -> None: publish = (WORKFLOW_DIR / "publish.yml").read_text() assert "workflow_dispatch" not in publish assert '"v*.*.*"' in publish + + +def test_publish_workflow_requires_main_branch_provenance() -> None: + publish = (WORKFLOW_DIR / "publish.yml").read_text() + assert "fetch-depth: 0" in publish + assert 'git merge-base --is-ancestor "$GITHUB_SHA" "origin/main"' in publish + + +def test_ci_matrix_covers_declared_supported_python_versions() -> None: + ci = (WORKFLOW_DIR / "ci.yml").read_text() + assert 'python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]' in ci + + +def test_python_security_audits_shipped_optional_dependencies() -> None: + security = (WORKFLOW_DIR / "security.yml").read_text() + assert "dependency-groups: all,dev" in security + assert "make python-security" in security + + +def test_benchmark_workflow_watches_full_python_source_tree() -> None: + benchmark = (WORKFLOW_DIR / "benchmark.yml").read_text() + assert '- "src/docpull/**"' in benchmark + + +def test_python_workflows_use_shared_setup_action() -> None: + for workflow_name in ["benchmark.yml", "ci.yml", 
"publish.yml", "security.yml"]: + workflow = (WORKFLOW_DIR / workflow_name).read_text() + assert "uses: ./.github/actions/setup-python-docpull" in workflow + + +def test_workflows_delegate_python_gate_commands_to_makefile() -> None: + ci = (WORKFLOW_DIR / "ci.yml").read_text() + publish = (WORKFLOW_DIR / "publish.yml").read_text() + security = (WORKFLOW_DIR / "security.yml").read_text() + benchmark = (WORKFLOW_DIR / "benchmark.yml").read_text() + + assert "make test-cov" in ci + assert "make lint-check" in ci + assert "make pre-commit-check" in ci + assert "make typecheck" in ci + assert "make release-gates" in publish + assert "make python-security" in security + assert "make benchmark-10k" in benchmark + assert "set -o pipefail" in benchmark + + +def test_web_security_job_uses_declared_node_major() -> None: + security = (WORKFLOW_DIR / "security.yml").read_text() + assert 'node-version: "24"' in security diff --git a/tests/test_claude_plugin_bundle.py b/tests/test_claude_plugin_bundle.py new file mode 100644 index 0000000..de6b425 --- /dev/null +++ b/tests/test_claude_plugin_bundle.py @@ -0,0 +1,150 @@ +"""Consistency checks for the Claude plugin bundle.""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover - Python <3.11 fallback for test envs + import tomli as tomllib # type: ignore[no-redef] + + +REPO_ROOT = Path(__file__).resolve().parents[1] +PYPROJECT_PATH = REPO_ROOT / "pyproject.toml" +AUTHORING_PLUGIN_DIR = REPO_ROOT / "plugin" +BUNDLE_ROOT = REPO_ROOT / ".claude-plugin" +BUNDLE_PLUGIN_DIR = BUNDLE_ROOT / "plugin" +SYNC_SCRIPT = REPO_ROOT / "scripts" / "sync_claude_plugin.py" +SYNC_AGENT_HOSTS_SCRIPT = REPO_ROOT / "scripts" / "sync_agent_host_configs.py" + + +def setup_module() -> None: + subprocess.run([sys.executable, str(SYNC_SCRIPT)], check=True) + + +def _load_json(path: Path) -> dict: + return 
json.loads(path.read_text()) + + +def _load_pyproject() -> dict: + with PYPROJECT_PATH.open("rb") as f: + return tomllib.load(f) + + +def test_self_contained_bundle_includes_plugin_payload() -> None: + required_files = [ + BUNDLE_ROOT / "marketplace.json", + BUNDLE_PLUGIN_DIR / ".claude-plugin" / "plugin.json", + BUNDLE_PLUGIN_DIR / ".mcp.json", + BUNDLE_PLUGIN_DIR / "README.md", + BUNDLE_PLUGIN_DIR / "skills" / "docpull-research" / "SKILL.md", + ] + + missing = [path.relative_to(REPO_ROOT).as_posix() for path in required_files if not path.exists()] + + assert missing == [] + + +def test_bundle_payload_matches_authoring_plugin_files() -> None: + relative_files = [ + Path(".mcp.json"), + Path("README.md"), + Path("skills/docpull-research/SKILL.md"), + ] + + for relative_path in relative_files: + authoring = (AUTHORING_PLUGIN_DIR / relative_path).read_text() + bundled = (BUNDLE_PLUGIN_DIR / relative_path).read_text() + assert bundled == authoring, relative_path.as_posix() + + +def test_plugin_bundle_does_not_ship_host_specific_command_wrappers() -> None: + assert not (AUTHORING_PLUGIN_DIR / "commands").exists() + assert not (BUNDLE_PLUGIN_DIR / "commands").exists() + + +def test_bundle_metadata_matches_authoring_plugin_and_package_version() -> None: + pyproject = _load_pyproject() + package_version = pyproject["project"]["version"] + authoring = _load_json(AUTHORING_PLUGIN_DIR / ".claude-plugin" / "plugin.json") + bundled = _load_json(BUNDLE_PLUGIN_DIR / ".claude-plugin" / "plugin.json") + codex = _load_json(AUTHORING_PLUGIN_DIR / ".codex-plugin" / "plugin.json") + marketplace = _load_json(BUNDLE_ROOT / "marketplace.json") + + assert authoring == bundled + assert authoring["version"] == package_version + assert codex["version"] == package_version + assert codex["skills"] == "./skills/" + assert marketplace["plugins"][0]["version"] == package_version + assert marketplace["plugins"][0]["source"] == "./plugin" + + +def test_plugin_skill_requires_mcp_extra_for_recovery() 
-> None: + skill = (AUTHORING_PLUGIN_DIR / "skills" / "docpull-research" / "SKILL.md").read_text() + + assert "pip install 'docpull[mcp]'" in skill + assert "pip install docpull" not in skill + + +def test_agent_host_configs_keep_docpull_mcp_aligned() -> None: + claude_mcp = _load_json(AUTHORING_PLUGIN_DIR / ".mcp.json") + project_mcp = _load_json(REPO_ROOT / ".mcp.json") + cursor_mcp = _load_json(REPO_ROOT / ".cursor" / "mcp.json") + + assert project_mcp["mcpServers"]["docpull"] == claude_mcp["mcpServers"]["docpull"] + assert cursor_mcp["mcpServers"]["docpull"] == { + "type": "stdio", + "command": claude_mcp["mcpServers"]["docpull"]["command"], + "args": claude_mcp["mcpServers"]["docpull"]["args"], + } + + required_files = [ + REPO_ROOT / "CLAUDE.md", + REPO_ROOT / ".cursor" / "rules" / "docpull-research.mdc", + REPO_ROOT / "AGENTS.md", + ] + missing = [path.relative_to(REPO_ROOT).as_posix() for path in required_files if not path.exists()] + + assert missing == [] + + +def test_codex_host_sync_script_declares_official_project_paths() -> None: + result = subprocess.run( + [sys.executable, str(SYNC_AGENT_HOSTS_SCRIPT), "--dry-run"], + check=True, + capture_output=True, + text=True, + ) + + assert result.stdout.splitlines() == [ + ".codex/config.toml", + ".agents/skills/docpull-research/SKILL.md", + ".agents/plugins/marketplace.json", + ] + + +def test_agent_research_guidance_mentions_skills_cli_docs() -> None: + required_fragments = [ + "skills.sh", + "npx skills", + "skills add", + "--agent", + "--skill", + "--copy", + "--yes", + ] + guidance_files = [ + AUTHORING_PLUGIN_DIR / "skills" / "docpull-research" / "SKILL.md", + BUNDLE_PLUGIN_DIR / "skills" / "docpull-research" / "SKILL.md", + REPO_ROOT / ".cursor" / "rules" / "docpull-research.mdc", + REPO_ROOT / "AGENTS.md", + ] + + for path in guidance_files: + text = path.read_text() + missing = [fragment for fragment in required_fragments if fragment not in text] + assert missing == [], 
path.relative_to(REPO_ROOT).as_posix() diff --git a/tests/test_cli.py b/tests/test_cli.py index a1003e7..494d295 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,7 @@ """CLI regression tests.""" +import subprocess +import sys from importlib.metadata import version import pytest @@ -18,3 +20,32 @@ def test_parser_rejects_removed_js_flag(): with pytest.raises(SystemExit): parser.parse_args(["https://example.com", "--js"]) + + +def test_parser_rejects_removed_naming_aliases(): + """Removed naming aliases should stay unavailable at the CLI surface.""" + parser = create_parser() + + with pytest.raises(SystemExit): + parser.parse_args(["https://example.com", "--naming-strategy", "flat"]) + + with pytest.raises(SystemExit): + parser.parse_args(["https://example.com", "--naming-strategy", "short"]) + + +def test_importing_cli_has_no_doctor_side_effect(): + """Importing the CLI module must not inspect sys.argv and exit.""" + result = subprocess.run( + [ + sys.executable, + "-c", + ("import sys; sys.argv=['docpull', '--doctor']; import docpull.cli; print('imported')"), + ], + check=False, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + assert result.stdout.strip() == "imported" + assert "Running docpull diagnostics" not in result.stdout diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 8c39843..58529ce 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -205,6 +205,38 @@ async def test_respects_max_urls(self, mock_http_client, mock_validator): assert len(urls) == 2 + @pytest.mark.asyncio + async def test_deduplicates_nested_sitemap_documents(self, mock_http_client, mock_validator): + """Repeated nested sitemap references should only fetch each document once.""" + index_content = b""" + + https://example.com/child.xml + https://example.com/child.xml + """ + child_content = b""" + + https://example.com/page1 + """ + + def make_response(content: bytes) -> MagicMock: + response = MagicMock() + 
response.status_code = 200 + response.content = content + return response + + mock_http_client.get.side_effect = [ + make_response(index_content), + make_response(child_content), + ] + + discoverer = SitemapDiscoverer(mock_http_client, mock_validator) + urls = [] + async for url in discoverer.discover("https://example.com/sitemap.xml"): + urls.append(url) + + assert urls == ["https://example.com/page1"] + assert mock_http_client.get.await_count == 2 + @pytest.mark.asyncio async def test_blocks_off_domain_urls_from_sitemap(self, mock_http_client, mock_validator): """Test that sitemap discovery stays on the crawl origin.""" @@ -226,6 +258,30 @@ async def test_blocks_off_domain_urls_from_sitemap(self, mock_http_client, mock_ assert urls == ["https://example.com/page1"] + @pytest.mark.asyncio + async def test_treats_direct_sitemap_url_with_query_as_sitemap(self, mock_http_client, mock_validator): + """Direct sitemap URLs should still be honored when they include a query string.""" + sitemap_content = b""" + + https://example.com/page1 + """ + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = sitemap_content + mock_http_client.get.return_value = mock_response + + discoverer = SitemapDiscoverer(mock_http_client, mock_validator) + urls = [] + async for url in discoverer.discover("https://example.com/sitemap.xml?source=test"): + urls.append(url) + + assert urls == ["https://example.com/page1"] + mock_http_client.get.assert_awaited_once_with( + "https://example.com/sitemap.xml?source=test", + timeout=30.0, + ) + def test_parse_sitemap_rejects_external_entity_payload(self, mock_http_client, mock_validator): """Hostile sitemap XML must not resolve external entities.""" sitemap_content = b""" @@ -334,6 +390,53 @@ async def test_respects_max_depth(self, mock_http_client, mock_validator, mock_r # Only start URL, no crawling assert len(urls) == 1 + @pytest.mark.asyncio + async def test_skips_disallowed_start_url(self, mock_http_client, 
mock_validator, mock_robots): + """The crawler must not fetch a blocked seed URL.""" + mock_robots.is_allowed.return_value = False + + crawler = LinkCrawler( + mock_http_client, + mock_validator, + mock_robots, + max_depth=1, + ) + + urls = [] + async for url in crawler.discover("https://example.com"): + urls.append(url) + + assert urls == [] + mock_http_client.get.assert_not_awaited() + + @pytest.mark.asyncio + async def test_crawls_seed_even_when_include_filter_excludes_it( + self, mock_http_client, mock_validator, mock_robots + ): + """The seed URL should still be traversed so included descendants can be found.""" + html_content = b"""Docs""" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = html_content + mock_response.content_type = "text/html" + mock_http_client.get.return_value = mock_response + + crawler = LinkCrawler( + mock_http_client, + mock_validator, + mock_robots, + max_depth=1, + pattern_filter=PatternFilter(include_patterns=["/docs/*"]), + ) + + urls = [] + async for url in crawler.discover("https://example.com", max_depth=1): + urls.append(url) + + assert urls == ["https://example.com/docs/getting-started"] + mock_http_client.get.assert_awaited_once() + class TestCompositeDiscoverer: """Tests for CompositeDiscoverer.""" diff --git a/tests/test_integration.py b/tests/test_integration.py index 82c9897..fa5c70e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,5 +1,6 @@ """Integration tests for the docpull API.""" +import asyncio from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch @@ -14,7 +15,25 @@ FetchStats, ProfileName, ) +from docpull.core import fetch_blocking as core_fetch_blocking +from docpull.core import fetch_one as core_fetch_one from docpull.core.fetcher import _url_to_filename +from docpull.models.events import SkipReason +from docpull.pipeline.base import FetchPipeline, PageContext, PipelineResult, PipelineStatus + + +class _PipelineStub: + 
def __init__(self, execute): + self.steps = [] + self._execute = execute + + async def execute(self, url: str, output_path: Path, emit=None): + return await self._execute(url, output_path, emit=emit) + + async def execute_result(self, url: str, output_path: Path, emit=None): + ctx = await self.execute(url, output_path, emit=emit) + status = PipelineStatus.SKIPPED if ctx.should_skip else PipelineStatus.SUCCEEDED + return PipelineResult(ctx=ctx, status=status) class TestDocpullConfig: @@ -81,6 +100,14 @@ def test_config_rejects_insecure_tls(self): network={"insecure_tls": True}, ) + def test_config_rejects_proxy_with_require_pinned_dns(self): + """Model validation should reject proxy mode when pinned DNS is required.""" + with pytest.raises(ValueError, match="require_pinned_dns"): + DocpullConfig( + url="https://example.com", + network={"proxy": "http://proxy:8080", "require_pinned_dns": True}, + ) + def test_config_dry_run(self): """Test config with dry run enabled.""" config = DocpullConfig(url="https://example.com", dry_run=True) @@ -120,6 +147,52 @@ def test_config_rejects_removed_browser_settings(self): performance={"browser_contexts": 2}, ) + def test_emit_chunks_requires_chunking(self): + """Chunk emission is invalid unless chunking is configured.""" + with pytest.raises(ValueError, match="emit_chunks requires max_tokens_per_file"): + DocpullConfig( + url="https://example.com", + output={"emit_chunks": True}, + ) + + def test_skill_mode_forces_hierarchical_naming(self): + """Skill output should always normalize to hierarchical naming.""" + config = DocpullConfig( + url="https://example.com", + output={"skill_name": "example-skill", "naming_strategy": "full"}, + ) + assert config.output.naming_strategy == "hierarchical" + + def test_resume_requires_cache_enabled(self): + """Resume mode should be rejected unless the cache is enabled.""" + with pytest.raises(ValueError, match="cache.resume requires cache.enabled=True"): + DocpullConfig( + 
url="https://example.com", + cache={"resume": True}, + ) + + def test_auth_type_requires_matching_payload(self): + """Typed auth modes should reject missing required fields.""" + with pytest.raises(ValueError, match="requires token"): + DocpullConfig(url="https://example.com", auth={"type": "bearer"}) + + with pytest.raises(ValueError, match="requires both username and password"): + DocpullConfig(url="https://example.com", auth={"type": "basic", "username": "u"}) + + with pytest.raises(ValueError, match="requires cookie"): + DocpullConfig(url="https://example.com", auth={"type": "cookie"}) + + with pytest.raises(ValueError, match="requires both header_name and header_value"): + DocpullConfig(url="https://example.com", auth={"type": "header", "header_name": "X-Test"}) + + def test_auth_fields_require_non_none_type(self): + """Auth payload should not silently no-op under auth.type=none.""" + with pytest.raises(ValueError, match="auth.type is 'none'"): + DocpullConfig( + url="https://example.com", + auth={"token": "secret-token"}, + ) + class TestUrlToFilename: """Tests for URL to filename conversion.""" @@ -168,12 +241,20 @@ def test_create_progress_event(self): url="https://example.com/page", current=5, total=10, + processed_count=5, + saved_count=3, + skipped_count=1, + failed_count=1, message="Fetching 5/10", ) assert event.type == EventType.FETCH_PROGRESS assert event.url == "https://example.com/page" assert event.current == 5 assert event.total == 10 + assert event.processed_count == 5 + assert event.saved_count == 3 + assert event.skipped_count == 1 + assert event.failed_count == 1 def test_create_completed_event(self): """Test creating a completed event.""" @@ -267,6 +348,184 @@ async def test_fetcher_cancel(self, mock_config): fetcher.cancel() assert fetcher._cancelled is True + @pytest.mark.asyncio + async def test_streaming_discovery_failure_does_not_hang(self, mock_config): + """Test discovery errors surface instead of deadlocking the stream.""" + with 
patch("docpull.core.fetcher.AsyncHttpClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client_cls.return_value = mock_client + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client.user_agent = "docpull-test" + + fetcher = Fetcher(mock_config) + async with fetcher: + + async def broken_discover(_: str, max_urls: int | None = None): + if max_urls: + pass + yield "https://docs.example.com/page-1" + raise RuntimeError("discovery exploded") + + async def execute(url: str, output_path: Path, emit=None): + if emit: + emit(FetchEvent(type=EventType.FETCH_PROGRESS, url=url, message="progress")) + return PageContext(url=url, output_path=output_path, markdown="ok", bytes_downloaded=1) + + fetcher._discoverer = MagicMock() + fetcher._discoverer.discover = broken_discover + fetcher._pipeline = _PipelineStub(execute) + + with pytest.raises(RuntimeError, match="discovery exploded"): + await asyncio.wait_for(_collect_events(fetcher.run()), timeout=1) + + @pytest.mark.asyncio + async def test_record_result_marks_empty_markdown_as_fetched(self, mock_config): + """Test resume state is updated even when markdown is empty.""" + fetcher = Fetcher(mock_config) + fetcher._cache_manager = MagicMock() + ctx = PageContext( + url="https://docs.example.com/empty", + output_path=mock_config.output.directory / "empty.md", + markdown="", + ) + + fetcher._record_result(ctx.url, ctx.output_path, ctx) + + fetcher._cache_manager.update_cache.assert_called_once_with( + url=ctx.url, + content="", + file_path=ctx.output_path, + etag=None, + last_modified=None, + ) + fetcher._cache_manager.mark_fetched.assert_called_once_with(ctx.url) + + @pytest.mark.asyncio + async def test_fetch_one_save_updates_cache_state(self, mock_config): + """Test single-page saved fetches share normal cache bookkeeping.""" + + class StubStep: + name = "stub" + + async def execute(self, ctx: PageContext, emit=None) -> PageContext: + 
ctx.markdown = "body" + ctx.bytes_downloaded = 12 + return ctx + + fetcher = Fetcher(mock_config) + fetcher._pipeline = FetchPipeline(steps=[StubStep()]) + fetcher._cache_manager = MagicMock() + + ctx = await fetcher.fetch_one("https://docs.example.com/one", save=True) + + assert ctx.markdown == "body" + assert fetcher.stats.pages_fetched == 1 + assert fetcher.stats.files_saved == 1 + fetcher._cache_manager.update_cache.assert_called_once() + fetcher._cache_manager.mark_fetched.assert_called_once_with("https://docs.example.com/one") + + @pytest.mark.asyncio + async def test_streaming_progress_counts_empty_markdown(self, mock_config): + """Test streaming mode still emits progress for empty successful pages.""" + mock_config.crawl.max_concurrent = 1 + with patch("docpull.core.fetcher.AsyncHttpClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client_cls.return_value = mock_client + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client.user_agent = "docpull-test" + + fetcher = Fetcher(mock_config) + async with fetcher: + + async def single_discover(_: str, max_urls: int | None = None): + if max_urls: + pass + yield "https://docs.example.com/empty" + + async def execute(url: str, output_path: Path, emit=None): + return PageContext(url=url, output_path=output_path, markdown="", bytes_downloaded=0) + + fetcher._discoverer = MagicMock() + fetcher._discoverer.discover = single_discover + fetcher._pipeline = _PipelineStub(execute) + + events = await _collect_events(fetcher.run()) + + progress_events = [event for event in events if event.type == EventType.FETCH_PROGRESS] + assert any( + event.url == "https://docs.example.com/empty" + and event.current == 1 + and event.processed_count == 1 + and event.saved_count == 1 + and event.skipped_count == 0 + and event.failed_count == 0 + for event in progress_events + ) + + @pytest.mark.asyncio + async def 
test_streaming_progress_counts_failures_as_processed(self, mock_config): + """Streaming progress should advance for failures, not only saves.""" + mock_config.crawl.max_concurrent = 1 + with patch("docpull.core.fetcher.AsyncHttpClient") as mock_client_cls: + mock_client = AsyncMock() + mock_client_cls.return_value = mock_client + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client.user_agent = "docpull-test" + + fetcher = Fetcher(mock_config) + async with fetcher: + + async def two_urls(_: str, max_urls: int | None = None): + if max_urls: + pass + yield "https://docs.example.com/ok" + yield "https://docs.example.com/fail" + + async def execute(url: str, output_path: Path, emit=None): + if url.endswith("/fail"): + raise RuntimeError("boom") + return PageContext(url=url, output_path=output_path, markdown="body", bytes_downloaded=1) + + fetcher._discoverer = MagicMock() + fetcher._discoverer.discover = two_urls + fetcher._pipeline = _PipelineStub(execute) + + events = await _collect_events(fetcher.run()) + + progress_events = [event for event in events if event.type == EventType.FETCH_PROGRESS] + assert any( + event.url == "https://docs.example.com/fail" + and event.current == 2 + and event.processed_count == 2 + and event.saved_count == 1 + and event.skipped_count == 0 + and event.failed_count == 1 + for event in progress_events + ) + + @pytest.mark.asyncio + async def test_record_result_counts_deduplicated_skips_separately(self, mock_config): + fetcher = Fetcher(mock_config) + ctx = PageContext( + url="https://docs.example.com/dup", + output_path=mock_config.output.directory / "dup.md", + should_skip=True, + skip_reason="Duplicate of https://docs.example.com/original", + skip_code=SkipReason.DUPLICATE_CONTENT, + ) + + fetcher._record_result(ctx.url, ctx.output_path, ctx) + + assert fetcher.stats.pages_skipped == 1 + assert fetcher.stats.pages_deduplicated == 1 + + +async def 
_collect_events(stream): + return [event async for event in stream] + class TestEventTypes: """Tests for all event types.""" @@ -319,6 +578,14 @@ def test_quick_profile_defaults(self): assert config.crawl.max_pages == 50 assert config.crawl.max_depth == 2 + +class TestCoreExports: + """Tests for the docpull.core package surface.""" + + def test_sync_helpers_are_exported(self): + assert callable(core_fetch_one) + assert callable(core_fetch_blocking) + def test_explicit_user_value_beats_profile_value(self): """User-supplied values must win over profile values on collision. diff --git a/tests/test_link_extractors.py b/tests/test_link_extractors.py index cdba23f..d8cbffc 100644 --- a/tests/test_link_extractors.py +++ b/tests/test_link_extractors.py @@ -92,6 +92,13 @@ async def test_resolves_relative_urls(self, extractor): links = await extractor.extract_links("https://example.com/docs/api/", content=html) assert "https://example.com/docs/other/page" in links + @pytest.mark.asyncio + async def test_rejects_non_http_urls(self, extractor): + """Static extraction should match enhanced extraction's URL contract.""" + html = b'File' + links = await extractor.extract_links("https://example.com", content=html) + assert links == [] + class TestEnhancedLinkExtractor: """Tests for EnhancedLinkExtractor.""" diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index f9ac4cd..9b85998 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -12,9 +12,20 @@ from mcp.client.stdio import stdio_client +from docpull.mcp.server import _coerce_bool from mcp import ClientSession, StdioServerParameters +def test_coerce_bool_rejects_string_inputs(): + with pytest.raises(ValueError, match="must be a boolean"): + _coerce_bool("false", name="force", default=False) + + +def test_coerce_bool_accepts_bool_inputs(): + assert _coerce_bool(True, name="force", default=False) is True + assert _coerce_bool(None, name="force", default=False) is False + + @pytest.mark.asyncio 
async def test_stdio_server_lists_and_calls_tools(tmp_path): env = os.environ.copy() @@ -42,6 +53,20 @@ async def test_stdio_server_lists_and_calls_tools(tmp_path): "remove_source", } + prompts = await session.list_prompts() + prompt_names = {prompt.name for prompt in prompts.prompts} + assert prompt_names == { + "docs_add", + "docs_search", + "docs_list", + "docs_refresh", + "docs_remove", + } + + prompt = await session.get_prompt("docs_search", {"input": "Depends fastapi"}) + assert "grep_docs" in prompt.messages[0].content.text + assert "Depends fastapi" in prompt.messages[0].content.text + result = await session.call_tool("list_sources", {}) assert result.isError is False assert result.structuredContent is not None diff --git a/tests/test_mcp_tools.py b/tests/test_mcp_tools.py index 9aa0567..b2c908a 100644 --- a/tests/test_mcp_tools.py +++ b/tests/test_mcp_tools.py @@ -25,6 +25,19 @@ from docpull.security.url_validator import UrlValidationResult +class _AllowAllValidator: + def validate(self, url: str) -> UrlValidationResult: + return UrlValidationResult.valid() + + +def _allow_user_source_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("docpull.mcp.sources._USER_SOURCE_URL_VALIDATOR", _AllowAllValidator()) + + +def _allow_add_source_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("docpull.mcp.tools._ADD_SOURCE_VALIDATOR", _AllowAllValidator()) + + def test_builtin_sources_include_common_libraries(): assert "react" in BUILTIN_SOURCES assert "nextjs" in BUILTIN_SOURCES @@ -82,7 +95,8 @@ def test_load_user_sources_missing_file(tmp_path): assert sources == {} -def test_load_user_sources_parses_yaml(tmp_path): +def test_load_user_sources_parses_yaml(tmp_path, monkeypatch): + _allow_user_source_validation(monkeypatch) path = tmp_path / "sources.yaml" path.write_text(""" sources: @@ -99,6 +113,7 @@ def test_load_user_sources_parses_yaml(tmp_path): def test_all_sources_merges_builtin_and_user(tmp_path, 
monkeypatch): + _allow_user_source_validation(monkeypatch) path = tmp_path / "sources.yaml" path.write_text("sources:\n custom1:\n url: https://example.com\n") monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path)) @@ -230,6 +245,12 @@ async def test_fetch_url_rejects_localhost(): assert "localhost" in result.text.lower() or "rejected" in result.text.lower() +@pytest.mark.asyncio +async def test_fetch_url_rejects_dns_rebinding_suffix(): + result = await fetch_url("https://169.254.169.254.nip.io/latest/meta-data/") + assert result.is_error + + @pytest.mark.asyncio async def test_fetch_url_rejects_metadata_ip(): result = await fetch_url("https://169.254.169.254/latest/meta-data/") @@ -374,6 +395,15 @@ def test_load_user_sources_logs_yaml_error(tmp_path, caplog): assert any("Failed to parse" in rec.message for rec in caplog.records) +def test_load_user_sources_ignores_non_mapping_root(tmp_path, caplog): + path = tmp_path / "sources.yaml" + path.write_text("not-a-mapping\n") + with caplog.at_level(logging.WARNING, logger="docpull.mcp.sources"): + sources = load_user_sources(path=path) + assert sources == {} + assert any("root YAML value must be a mapping" in rec.message for rec in caplog.records) + + def test_load_user_sources_rejects_unsafe_manual_entries(tmp_path, caplog, monkeypatch): class FakeValidator: def validate(self, url: str) -> UrlValidationResult: @@ -432,6 +462,79 @@ def test_partial_meta_treats_cache_as_stale(tmp_path): assert _cache_fresh(meta) is False +def test_meta_cache_rejects_changed_url_or_profile(tmp_path): + """Meta freshness should include the crawl fingerprint, not only TTL.""" + import json + import time + + from docpull.mcp.tools import _cache_fresh + + meta = tmp_path / ".x.meta.json" + meta.write_text( + json.dumps( + { + "schema_version": 1, + "url": "https://old.example/docs", + "profile": "rag", + "max_pages": 50, + "fetched_at_epoch": time.time(), + "page_count": 5, + } + ) + ) + assert _cache_fresh( + meta, + 
expected_url="https://old.example/docs", + expected_profile="rag", + expected_max_pages=50, + ) + assert not _cache_fresh( + meta, + expected_url="https://new.example/docs", + expected_profile="rag", + expected_max_pages=50, + ) + assert not _cache_fresh( + meta, + expected_url="https://old.example/docs", + expected_profile="mirror", + expected_max_pages=50, + ) + assert not _cache_fresh( + meta, + expected_url="https://old.example/docs", + expected_profile="rag", + expected_max_pages=100, + ) + + +def test_meta_cache_rejects_wrong_schema_version(tmp_path): + import json + import time + + from docpull.mcp.tools import _cache_fresh + + meta = tmp_path / ".x.meta.json" + meta.write_text( + json.dumps( + { + "schema_version": 999, + "url": "https://old.example/docs", + "profile": "rag", + "max_pages": 50, + "fetched_at_epoch": time.time(), + "page_count": 5, + } + ) + ) + assert not _cache_fresh( + meta, + expected_url="https://old.example/docs", + expected_profile="rag", + expected_max_pages=50, + ) + + def test_grep_docs_context_two_renders_two_lines(tmp_path): """`context=2` should render two lines above and below, not silently cap at 1.""" lib = tmp_path / "lib" @@ -449,7 +552,7 @@ def test_atomic_meta_write_no_tmp_left_behind(tmp_path): from docpull.mcp.tools import _write_meta meta = tmp_path / ".x.meta.json" - _write_meta(meta, "x", "https://x.test", 3) + _write_meta(meta, "x", "https://x.test", 3, profile="rag", max_pages=10) assert meta.exists() assert not (tmp_path / ".x.meta.json.tmp").exists() @@ -457,10 +560,11 @@ def test_atomic_meta_write_no_tmp_left_behind(tmp_path): # --- add_source / remove_source -------------------------------------- -def test_add_source_writes_user_yaml(tmp_path): +def test_add_source_writes_user_yaml(tmp_path, monkeypatch): """add_source persists the new entry to sources.yaml under config_dir.""" import yaml + _allow_add_source_validation(monkeypatch) result = add_source( "mydocs", "https://example.com/docs", @@ -502,25 +606,33 
@@ def test_add_source_rejects_localhost(tmp_path): assert result.is_error +def test_add_source_rejects_dns_rebinding_suffix(tmp_path): + result = add_source("rebind", "https://169.254.169.254.nip.io/", config_dir=tmp_path) + assert result.is_error + + def test_add_source_rejects_private_ip(tmp_path): result = add_source("internal", "https://10.0.0.1/", config_dir=tmp_path) assert result.is_error -def test_add_source_refuses_builtin_without_force(tmp_path): +def test_add_source_refuses_builtin_without_force(tmp_path, monkeypatch): # NB: URL must be DNS-resolvable because UrlValidator does live lookups. + _allow_add_source_validation(monkeypatch) result = add_source("react", "https://example.com/", config_dir=tmp_path) assert result.is_error assert "builtin" in result.text.lower() -def test_add_source_force_overrides_builtin(tmp_path): +def test_add_source_force_overrides_builtin(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) result = add_source("react", "https://example.com/", force=True, config_dir=tmp_path) assert not result.is_error assert result.data["shadowed_builtin"] is True -def test_add_source_updates_existing_user_source(tmp_path): +def test_add_source_updates_existing_user_source(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) add_source("mydocs", "https://example.com/a", config_dir=tmp_path) result = add_source("mydocs", "https://example.com/b", config_dir=tmp_path) assert not result.is_error @@ -537,13 +649,15 @@ def test_add_source_rejects_oversized_description(tmp_path): assert result.is_error -def test_add_source_rejects_unknown_category(tmp_path): +def test_add_source_rejects_unknown_category(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) result = add_source("mydocs", "https://example.com/", category="bogus", config_dir=tmp_path) assert result.is_error assert "category" in result.text.lower() -def test_remove_source_removes_user_entry(tmp_path): +def 
test_remove_source_removes_user_entry(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) add_source("mydocs", "https://example.com/", config_dir=tmp_path) result = remove_source("mydocs", config_dir=tmp_path, docs_dir=tmp_path) assert not result.is_error @@ -557,7 +671,8 @@ def test_remove_source_refuses_builtin(tmp_path): assert "builtin" in result.text.lower() -def test_remove_source_with_delete_cache(tmp_path): +def test_remove_source_with_delete_cache(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) add_source("mydocs", "https://example.com/", config_dir=tmp_path) cache = tmp_path / "mydocs" cache.mkdir() @@ -585,12 +700,36 @@ def test_remove_source_rejects_traversal(tmp_path): assert result.is_error -def test_add_source_atomic_no_tmp_left_behind(tmp_path): +def test_add_source_atomic_no_tmp_left_behind(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) add_source("mydocs", "https://example.com/", config_dir=tmp_path) assert (tmp_path / "sources.yaml").exists() assert not (tmp_path / "sources.yaml.tmp").exists() +def test_add_source_refuses_to_overwrite_malformed_yaml(tmp_path, monkeypatch): + _allow_add_source_validation(monkeypatch) + path = tmp_path / "sources.yaml" + path.write_text("not-a-mapping\n") + + result = add_source("mydocs", "https://example.com/", config_dir=tmp_path) + + assert result.is_error + assert "Refusing to modify" in result.text + assert path.read_text() == "not-a-mapping\n" + + +def test_remove_source_refuses_to_overwrite_malformed_yaml(tmp_path): + path = tmp_path / "sources.yaml" + path.write_text("sources: nope\n") + + result = remove_source("mydocs", config_dir=tmp_path, docs_dir=tmp_path) + + assert result.is_error + assert "Refusing to modify" in result.text + assert path.read_text() == "sources: nope\n" + + # --- Structured output (outputSchema / structuredContent) ------------ diff --git a/tests/test_metadata_extractor.py b/tests/test_metadata_extractor.py new file mode 
100644 index 0000000..0c2da21 --- /dev/null +++ b/tests/test_metadata_extractor.py @@ -0,0 +1,65 @@ +"""Tests for rich metadata extraction.""" + +from docpull.metadata_extractor import RichMetadataExtractor + + +def test_extract_opengraph_handles_extruct_tuple_properties() -> None: + extractor = RichMetadataExtractor() + + result = extractor._extract_opengraph( + [ + ("og:title", "Open Graph Title"), + ("og:description", "Open Graph description"), + ("og:image", "https://example.com/og.png"), + ("article:tag", "python"), + ("article:tag", "docs"), + ] + ) + + assert result["title"] == "Open Graph Title" + assert result["description"] == "Open Graph description" + assert result["image"] == "https://example.com/og.png" + assert result["tags"] == ["python", "docs"] + + +def test_extract_prefers_opengraph_over_jsonld_when_both_exist() -> None: + extractor = RichMetadataExtractor() + html = """ + + + + + + + + + """ + + result = extractor.extract(html, "https://example.com/docs") + + assert result["title"] == "Open Graph Title" + assert result["description"] == "Open Graph description" + assert result["author"] == "Ada Lovelace" + assert result["keywords"] == ["python", "docs"] + assert result["image"] == "https://example.com/hero.png" + + +def test_merge_with_fallback_removes_empty_values() -> None: + extractor = RichMetadataExtractor() + + result = extractor.merge_with_fallback( + {"url": "https://example.com/docs", "title": None, "description": ""}, + fallback_title="Fallback", + ) + + assert result == {"url": "https://example.com/docs", "title": "Fallback"} diff --git a/tests/test_naming.py b/tests/test_naming.py index ea053e1..5599569 100644 --- a/tests/test_naming.py +++ b/tests/test_naming.py @@ -36,6 +36,13 @@ def test_strips_base_path(self) -> None: ) assert result == ["api", "auth.md"] + def test_does_not_strip_sibling_base_prefix(self) -> None: + result = _url_to_path_parts( + "https://docs.foo.com/docs-api/auth", + base_url="https://docs.foo.com/docs", + ) + 
assert result == ["docs-api", "auth.md"] + def test_unsafe_segment_sanitized(self) -> None: result = _url_to_path_parts("https://docs.foo.com/foo bar/with$special") assert result == ["foo_bar", "with_special.md"] @@ -99,3 +106,12 @@ def test_strips_base(self) -> None: def test_root_becomes_index(self) -> None: assert _url_to_filename("https://docs.foo.com/") == "index.md" + + def test_sibling_base_prefix_is_not_stripped(self) -> None: + assert ( + _url_to_filename( + "https://docs.foo.com/docs-api/auth", + base_url="https://docs.foo.com/docs", + ) + == "docs-api_auth.md" + ) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 770687b..d3965e5 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -6,6 +6,7 @@ import pytest from docpull.cache import StreamingDeduplicator +from docpull.models.events import EventType, SkipReason from docpull.pipeline.base import FetchPipeline, PageContext from docpull.pipeline.steps import ( ConvertStep, @@ -111,6 +112,28 @@ async def test_robots_blocked_skipped(self, mock_validator, mock_robots): assert result.should_skip is True assert "robots.txt" in result.skip_reason + @pytest.mark.asyncio + async def test_crawl_delay_updates_rate_limiter(self, mock_validator, mock_robots): + """Test that Crawl-delay is propagated to the rate limiter.""" + valid_result = MagicMock() + valid_result.is_valid = True + mock_validator.validate.return_value = valid_result + mock_robots.is_allowed.return_value = True + mock_robots.get_crawl_delay.return_value = 2.5 + mock_rate_limiter = MagicMock() + + step = ValidateStep( + url_validator=mock_validator, + robots_checker=mock_robots, + rate_limiter=mock_rate_limiter, + check_existing=False, + ) + ctx = PageContext(url="https://Example.com/page", output_path=Path("/tmp/out.md")) + result = await step.execute(ctx) + + assert result.should_skip is False + mock_rate_limiter.update_host_config.assert_called_once_with("example.com", delay=2.5) + class TestFetchStep: """Tests for 
FetchStep.""" @@ -274,6 +297,7 @@ async def test_unique_content_passes(self, deduplicator): @pytest.mark.asyncio async def test_duplicate_content_skipped(self, deduplicator): """Test that duplicate content is skipped.""" + events = [] step = DedupStep(deduplicator=deduplicator) # First page @@ -290,10 +314,15 @@ async def test_duplicate_content_skipped(self, deduplicator): output_path=Path("/tmp/out2.md"), markdown="# Same Content\n\nThis is the same.", ) - result = await step.execute(ctx2) + result = await step.execute(ctx2, emit=events.append) assert result.should_skip is True assert "Duplicate" in result.skip_reason + assert result.skip_code == SkipReason.DUPLICATE_CONTENT + assert any( + event.type == EventType.FETCH_SKIPPED and event.skip_reason == SkipReason.DUPLICATE_CONTENT + for event in events + ) @pytest.mark.asyncio async def test_duplicate_body_with_different_frontmatter_is_skipped(self, deduplicator): @@ -411,6 +440,71 @@ async def execute(self, ctx, emit=None): assert execution_order == ["skip"] assert "never" not in execution_order + @pytest.mark.asyncio + async def test_pipeline_stops_on_error_without_running_next_step(self): + """A step that sets ctx.error should halt the pipeline immediately.""" + execution_order = [] + + class ErrorStep: + name = "error" + + async def execute(self, ctx, emit=None): + execution_order.append("error") + ctx.error = "conversion failed" + return ctx + + class NeverReached: + name = "never" + + async def execute(self, ctx, emit=None): + execution_order.append("never") + return ctx + + pipeline = FetchPipeline(steps=[ErrorStep(), NeverReached()]) + ctx = await pipeline.execute("https://example.com", Path("/tmp/out.md")) + + assert execution_order == ["error"] + assert ctx.error == "conversion failed" + + @pytest.mark.asyncio + async def test_pipeline_emits_single_failure_event_for_raised_exception(self): + """Raised step errors should produce one FETCH_FAILED event.""" + events = [] + + class ExplodingStep: + name = 
"explode" + + async def execute(self, ctx, emit=None): + raise RuntimeError("boom") + + pipeline = FetchPipeline(steps=[ExplodingStep()]) + ctx = await pipeline.execute("https://example.com", Path("/tmp/out.md"), emit=events.append) + + assert ctx.error == "explode: boom" + failure_events = [event for event in events if event.type == EventType.FETCH_FAILED] + assert len(failure_events) == 1 + assert failure_events[0].error == "explode: boom" + + +class TestSaveStepErrorHandling: + """SaveStep should never write when an earlier step already failed.""" + + @pytest.mark.asyncio + async def test_save_step_skips_when_context_has_error(self, tmp_path): + step = SaveStep(base_output_dir=tmp_path) + output_file = tmp_path / "test.md" + ctx = PageContext( + url="https://example.com/page", + output_path=output_file, + html=b"raw", + error="conversion failed", + ) + + result = await step.execute(ctx) + + assert result.error == "conversion failed" + assert not output_file.exists() + class TestSkillManifestGeneration: """SaveStep.finalize() writes a SKILL.md when output.skill_name is set. 
diff --git a/tests/test_save_ndjson.py b/tests/test_save_ndjson.py index cf804b4..02c36eb 100644 --- a/tests/test_save_ndjson.py +++ b/tests/test_save_ndjson.py @@ -10,7 +10,9 @@ from docpull.conversion.chunking import Chunk from docpull.pipeline.base import PageContext from docpull.pipeline.steps.chunk import ChunkStep +from docpull.pipeline.steps.save_json import JsonSaveStep from docpull.pipeline.steps.save_ndjson import NdjsonSaveStep +from docpull.pipeline.steps.save_sqlite import SqliteSaveStep @pytest.mark.asyncio @@ -77,3 +79,49 @@ async def test_chunk_step_skips_when_no_markdown(): ctx = PageContext(url="https://example.com/", output_path=Path("/tmp/x.md"), markdown=None) ctx = await step.execute(ctx) assert ctx.chunks == [] + + +@pytest.mark.asyncio +async def test_json_save_uses_to_thread(monkeypatch, tmp_path): + calls: list[str] = [] + + async def fake_to_thread(func, *args, **kwargs): + calls.append(func.__name__) + return func(*args, **kwargs) + + monkeypatch.setattr("docpull.pipeline.steps.save_json.asyncio.to_thread", fake_to_thread) + + step = JsonSaveStep(base_output_dir=tmp_path) + ctx = PageContext( + url="https://example.com/", + output_path=tmp_path / "page.md", + markdown="# Page\n\nBody.", + ) + + await step.execute(ctx) + step.finalize() + + assert calls == ["_write_document"] + + +@pytest.mark.asyncio +async def test_sqlite_save_uses_to_thread(monkeypatch, tmp_path): + calls: list[str] = [] + + async def fake_to_thread(func, *args, **kwargs): + calls.append(func.__name__) + return func(*args, **kwargs) + + monkeypatch.setattr("docpull.pipeline.steps.save_sqlite.asyncio.to_thread", fake_to_thread) + + step = SqliteSaveStep(base_output_dir=tmp_path) + ctx = PageContext( + url="https://example.com/", + output_path=tmp_path / "page.md", + markdown="# Page\n\nBody.", + ) + + await step.execute(ctx) + step.close() + + assert calls == ["_insert_document"] diff --git a/tests/test_security_hardening.py b/tests/test_security_hardening.py index 
a63d0ce..eb9e532 100644 --- a/tests/test_security_hardening.py +++ b/tests/test_security_hardening.py @@ -2,7 +2,9 @@ from __future__ import annotations +import asyncio import socket +import time from types import SimpleNamespace from unittest.mock import MagicMock @@ -10,6 +12,7 @@ from pydantic import ValidationError from docpull.http.client import AsyncHttpClient, _ValidatedResolver +from docpull.http.rate_limiter import AdaptiveRateLimiter, PerHostRateLimiter from docpull.security.robots import RobotsChecker, _RobotsResponse from docpull.security.url_validator import UrlValidationResult, UrlValidator @@ -23,6 +26,8 @@ async def __aexit__(self, exc_type, exc, tb) -> None: class _DummyRateLimiter: + default_concurrent = 3 + def limit(self, url: str) -> _NullAsyncContext: return _NullAsyncContext() @@ -80,6 +85,19 @@ def head(self, url: str, **kwargs: object) -> _FakeRequestContext: return _FakeRequestContext(self._responses.pop(0)) +class _RecordingAdaptiveRateLimiter(AdaptiveRateLimiter): + def __init__(self) -> None: + super().__init__(default_delay=0.0, default_concurrent=3) + self.rate_limit_calls: list[tuple[str, int | None]] = [] + self.success_calls: list[str] = [] + + async def record_rate_limit(self, url: str, retry_after: int | None = None) -> None: + self.rate_limit_calls.append((url, retry_after)) + + async def record_success(self, url: str) -> None: + self.success_calls.append(url) + + class TestUrlValidatorResolution: def test_rejects_public_hostname_that_resolves_to_loopback(self) -> None: validator = UrlValidator(resolver=lambda hostname: ["127.0.0.1"]) @@ -131,6 +149,35 @@ def rebinding_resolver(hostname: str) -> list[str]: assert calls["n"] == 1 assert addresses == ["1.2.3.4"] + def test_validate_and_connect_path_share_screened_resolution(self) -> None: + calls = {"n": 0} + + def resolver(hostname: str) -> list[str]: + calls["n"] += 1 + return ["93.184.216.34"] + + validator = UrlValidator(resolver=resolver) + + assert 
validator.validate("https://docs.example.com").is_valid is True + assert validator.resolve_allowed_addresses("docs.example.com") == ["93.184.216.34"] + assert calls["n"] == 1 + + def test_resolution_cache_expires_and_reresolves(self, monkeypatch: pytest.MonkeyPatch) -> None: + calls = {"n": 0} + now = {"value": 100.0} + + def resolver(hostname: str) -> list[str]: + calls["n"] += 1 + return [f"93.184.216.{calls['n']}"] + + validator = UrlValidator(resolver=resolver) + monkeypatch.setattr("docpull.security.url_validator.time.monotonic", lambda: now["value"]) + + assert validator.resolve_allowed_addresses("docs.example.com") == ["93.184.216.1"] + now["value"] += UrlValidator._RESOLUTION_CACHE_TTL_SECONDS + 0.01 + assert validator.resolve_allowed_addresses("docs.example.com") == ["93.184.216.2"] + assert calls["n"] == 2 + def test_resolve_allowed_addresses_rejects_blocked_resolution(self) -> None: validator = UrlValidator(resolver=lambda hostname: ["169.254.169.254"]) @@ -164,6 +211,23 @@ def test_trailing_dot_does_not_bypass_localhost(self) -> None: assert validator.validate("https://localhost./admin").is_valid is False assert validator.validate_hostname("service.internal.").is_valid is False + def test_blocks_dns_rebinding_suffixes_without_resolution(self) -> None: + validator = UrlValidator(resolver=lambda hostname: ["93.184.216.34"]) + + for host in ("docs.nip.io", "api.sslip.io", "box.xip.io", "router.lan"): + result = validator.validate(f"https://{host}/") + assert result.is_valid is False, host + assert result.rejection_reason is not None + assert "not allowed" in result.rejection_reason + + def test_allowed_domains_are_normalized(self) -> None: + validator = UrlValidator( + allowed_domains={"Docs.Example.com."}, + resolver=lambda hostname: ["93.184.216.34"], + ) + + assert validator.validate("https://docs.example.com/page").is_valid is True + class TestValidatedResolver: @pytest.mark.asyncio @@ -251,6 +315,87 @@ async def 
test_http_client_strips_auth_headers_for_off_scope_requests(self) -> N _, kwargs = client._session.calls[0] assert kwargs["headers"] == {"X-Trace": "keep-me"} + @pytest.mark.asyncio + async def test_http_client_head_accepts_request_headers(self) -> None: + client = AsyncHttpClient(rate_limiter=_DummyRateLimiter()) + client._session = _FakeSession( + [ + _FakeResponse( + 200, + headers={"Content-Type": "text/html"}, + url="https://docs.example.com/page", + ) + ] + ) + + await client.head("https://docs.example.com/page", headers={"X-Test": "1"}) + + assert client._session is not None + _, kwargs = client._session.calls[0] + assert kwargs["headers"]["X-Test"] == "1" + + @pytest.mark.asyncio + async def test_http_client_uses_separate_connect_and_read_timeouts(self) -> None: + client = AsyncHttpClient( + rate_limiter=_DummyRateLimiter(), + default_timeout=7.0, + connect_timeout=2.0, + ) + client._session = _FakeSession( + [ + _FakeResponse( + 200, + headers={"Content-Type": "text/html"}, + chunks=[b"ok"], + url="https://docs.example.com/page", + ) + ] + ) + + await client.get("https://docs.example.com/page") + + assert client._session is not None + _, kwargs = client._session.calls[0] + timeout = kwargs["timeout"] + assert timeout.total == 9.0 + assert timeout.connect == 2.0 + assert timeout.sock_read == 7.0 + + @pytest.mark.asyncio + async def test_http_client_rejects_crlf_in_request_headers(self) -> None: + client = AsyncHttpClient(rate_limiter=_DummyRateLimiter()) + client._session = _FakeSession([]) + + with pytest.raises(ValueError, match="header injection"): + await client.get("https://docs.example.com/page", headers={"X-Test": "ok\r\nbad: yes"}) + + @pytest.mark.asyncio + async def test_http_head_retries_and_tracks_adaptive_backoff(self) -> None: + limiter = _RecordingAdaptiveRateLimiter() + client = AsyncHttpClient(rate_limiter=limiter, max_retries=1, retry_base_delay=0.0) + client._session = _FakeSession( + [ + _FakeResponse( + 429, + headers={"Retry-After": 
"120", "Content-Type": "text/html"}, + url="https://docs.example.com/page", + ), + _FakeResponse( + 200, + headers={"Content-Type": "text/html"}, + url="https://docs.example.com/page", + ), + ] + ) + + response = await client.head("https://docs.example.com/page") + + assert response.status_code == 200 + assert client._session is not None + assert len(client._session.calls) == 2 + assert limiter.rate_limit_calls == [("https://docs.example.com/page", 120)] + assert limiter.success_calls == ["https://docs.example.com/page"] + def test_http_client_rejects_insecure_tls_override(self) -> None: with pytest.raises(ValueError, match="Insecure TLS is not supported"): AsyncHttpClient( @@ -298,12 +443,23 @@ def fake_fetch(url: str) -> _RobotsResponse: assert checker.is_allowed("https://public.example/docs") is False + def test_robots_checker_uses_validator_by_default(self, monkeypatch: pytest.MonkeyPatch) -> None: + checker = RobotsChecker() + monkeypatch.setattr( + checker._url_validator, + "validate", + lambda url: UrlValidationResult.invalid("blocked for test"), + ) + + assert checker.is_allowed("https://public.example/docs") is False + def test_robots_checker_allows_when_robots_missing(self, monkeypatch: pytest.MonkeyPatch) -> None: checker = RobotsChecker() def fake_fetch(url: str) -> _RobotsResponse: return _RobotsResponse(status_code=404, headers={}, text="") + monkeypatch.setattr(checker, "_validate_url", lambda url: True) monkeypatch.setattr(checker, "_fetch_url", fake_fetch) assert checker.is_allowed("https://public.example/docs") is True @@ -358,6 +514,29 @@ def close(self) -> None: with pytest.raises(ValueError, match="exceeds maximum size"): checker._fetch_url("https://evil.example.com/robots.txt") + def test_robots_cache_key_is_canonicalized(self, monkeypatch: pytest.MonkeyPatch) -> None: + checker = RobotsChecker() + calls: list[tuple[str, str]] = [] + + def fake_fetch(domain: str, robots_url: str): + calls.append((domain, robots_url)) + return 
SimpleNamespace(parser=None, status="missing") + + monkeypatch.setattr(checker, "_fetch_robots", fake_fetch) + + assert checker.is_allowed("HTTPS://Example.com/docs") is True + assert checker.is_allowed("https://example.com:443/other") is True + + assert calls == [("example.com", "https://example.com/robots.txt")] + + def test_robots_url_preserves_ipv6_brackets(self) -> None: + checker = RobotsChecker() + + assert ( + checker._get_robots_url("https://[2606:2800:220:1:248:1893:25c8:1946]:8443/docs") + == "https://[2606:2800:220:1:248:1893:25c8:1946]:8443/robots.txt" + ) + # --------------------------------------------------------------------------- # CRLF header injection prevention @@ -403,6 +582,22 @@ def test_auth_header_value_rejects_crlf(self) -> None: with pytest.raises(ValidationError, match="must not contain CR, LF"): AuthConfig(type=AuthType.HEADER, header_name="X-Auth", header_value="token\r\nX-Evil: true") + def test_auth_token_env_expansion_rejects_crlf(self, monkeypatch: pytest.MonkeyPatch) -> None: + from docpull.models.config import AuthConfig, AuthType + + monkeypatch.setenv("DOCPULL_TEST_TOKEN", "tok\r\nX-Evil: true") + + with pytest.raises(ValueError, match="token must not contain CR, LF, or null"): + AuthConfig(type=AuthType.BEARER, token="$DOCPULL_TEST_TOKEN") + + def test_auth_cookie_env_expansion_rejects_crlf(self, monkeypatch: pytest.MonkeyPatch) -> None: + from docpull.models.config import AuthConfig, AuthType + + monkeypatch.setenv("DOCPULL_TEST_COOKIE", "session=abc\r\nX-Evil: true") + + with pytest.raises(ValueError, match="cookie must not contain CR, LF, or null"): + AuthConfig(type=AuthType.COOKIE, cookie="$DOCPULL_TEST_COOKIE") + def test_auth_header_accepts_clean_values(self) -> None: from docpull.models.config import AuthConfig, AuthType @@ -483,3 +678,40 @@ def test_git_committed_event_removed(self) -> None: from docpull.models.events import EventType assert not hasattr(EventType, "GIT_COMMITTED") + + +class TestRateLimiterIsolation: 
+ @pytest.mark.asyncio + async def test_waiting_host_does_not_block_other_hosts(self) -> None: + limiter = PerHostRateLimiter(default_delay=0.2, default_concurrent=1) + limiter._last_request["slow.example"] = time.monotonic() + + entered_fast = asyncio.Event() + + async def slow_request() -> None: + async with limiter.limit("https://slow.example/page"): + await asyncio.sleep(0.01) + + async def fast_request() -> None: + async with limiter.limit("https://fast.example/page"): + entered_fast.set() + + slow_task = asyncio.create_task(slow_request()) + await asyncio.sleep(0) + fast_task = asyncio.create_task(fast_request()) + + await asyncio.wait_for(entered_fast.wait(), timeout=0.05) + await asyncio.gather(slow_task, fast_task) + + def test_default_ports_and_case_share_one_host_key(self) -> None: + limiter = PerHostRateLimiter(default_delay=0.1, default_concurrent=1) + + assert limiter._get_host("HTTPS://Example.com:443/path") == "example.com" + assert limiter._get_host("https://example.com./path") == "example.com" + + def test_invalid_rate_limiter_config_is_rejected(self) -> None: + with pytest.raises(ValueError, match="delay must be >= 0"): + PerHostRateLimiter(default_delay=-0.1) + + with pytest.raises(ValueError, match="concurrency must be >= 1"): + PerHostRateLimiter(default_concurrent=0) diff --git a/tests/test_special_cases.py b/tests/test_special_cases.py index 443a923..e284de2 100644 --- a/tests/test_special_cases.py +++ b/tests/test_special_cases.py @@ -191,6 +191,43 @@ def test_separates_path_and_query_parameters(self): assert "`id` (string) (required)" in md assert "`expand` (array)" in md + def test_resolves_component_parameter_refs(self): + spec = { + "openapi": "3.0.0", + "info": {"title": "API"}, + "paths": { + "/items/{id}": { + "parameters": [{"$ref": "#/components/parameters/TraceId"}], + "get": { + "parameters": [{"$ref": "#/components/parameters/ItemId"}], + "responses": {"200": {"description": "ok"}}, + }, + } + }, + "components": { + 
"parameters": { + "TraceId": { + "name": "trace_id", + "in": "header", + "schema": {"type": "string"}, + "description": "

Request trace.

", + }, + "ItemId": { + "name": "id", + "in": "path", + "schema": {"type": "string"}, + }, + } + }, + } + html = json.dumps(spec).encode() + result = OpenApiExtractor().try_extract(html, "https://example.com/openapi.json") + assert result is not None + md = result.markdown + assert "**Header parameters:**" in md + assert "`trace_id` (string) — Request trace." in md + assert "`id` (string) (required)" in md + def test_handles_form_encoded_request_body(self): spec = { "openapi": "3.0.0", diff --git a/tests/test_update_metrics_script.py b/tests/test_update_metrics_script.py new file mode 100644 index 0000000..f0eeb92 --- /dev/null +++ b/tests/test_update_metrics_script.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + + +def _load_update_metrics_module(): + repo_root = Path(__file__).resolve().parents[1] + script_path = repo_root / ".github" / "scripts" / "update_metrics.py" + spec = importlib.util.spec_from_file_location("update_metrics", script_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_build_snapshot_rows_formats_all_expected_metrics() -> None: + update_metrics = _load_update_metrics_module() + + rows = update_metrics.build_snapshot_rows( + recent={"last_day": 12, "last_week": 3456, "last_month": 78901}, + repo={"stargazers_count": 11, "forks_count": 22, "subscribers_count": 33}, + open_issues=4, + open_prs=5, + clones={"count": 66, "uniques": 7}, + views={"count": 88, "uniques": 9}, + ) + + assert rows == [ + ["PyPI downloads (last 24h)", "12"], + ["PyPI downloads (last 7d)", "3,456"], + ["PyPI downloads (last 30d)", "78,901"], + ["GitHub stars", "11"], + ["GitHub forks", "22"], + ["GitHub watchers", "33"], + ["Open issues", "4"], + ["Open PRs", "5"], + ["Repo clones (last 14d)", "66"], + ["Unique cloners (last 14d)", "7"], + ["Repo views (last 14d)", "88"], + ["Unique 
visitors (last 14d)", "9"], + ] + + +def test_build_path_rows_trims_repo_prefix_and_wraps_code() -> None: + update_metrics = _load_update_metrics_module() + + rows = update_metrics.build_path_rows( + [ + {"path": "/raintree-technology/docpull", "count": 10, "uniques": 2}, + {"path": "/raintree-technology/docpull/docs/getting-started", "count": 3, "uniques": 1}, + ] + ) + + assert rows == [ + ["`/`", "10", "2"], + ["`/docs/getting-started`", "3", "1"], + ] + + +def test_append_section_with_table_uses_empty_state_when_no_rows() -> None: + update_metrics = _load_update_metrics_module() + + lines: list[str] = [] + update_metrics.append_section_with_table( + lines, + "## Example", + ["Col A", "Col B"], + [], + empty_message="_Nothing here._", + ) + + assert lines == [ + "## Example", + "", + "_Nothing here._", + "", + ] + + +def test_append_table_or_empty_uses_table_when_rows_exist() -> None: + update_metrics = _load_update_metrics_module() + + lines: list[str] = [] + update_metrics.append_table_or_empty( + lines, + ["Col A", "Col B"], + [["x", "y"]], + empty_message="_Nothing here._", + ) + + assert lines == [ + "| Col A | Col B |", + "|---|---|", + "| x | y |", + "", + ] diff --git a/web/.gitignore b/web/.gitignore deleted file mode 100644 index 63a9443..0000000 --- a/web/.gitignore +++ /dev/null @@ -1,37 +0,0 @@ -# Dependencies -node_modules/ - -# Next.js -.next/ -out/ -# Auto-generated by `next build` / `next dev`. Path inside flips between -# .next/types/routes.d.ts and .next/dev/types/routes.d.ts depending on -# which command was last run, so keeping it tracked produces noise. 
-next-env.d.ts - -# Build -dist/ - -# macOS -.DS_Store - -# IDEs -.vscode/ -.idea/ -*.swp -*.swo - -# Claude Code -.claude/ - -# Environment -.env -.env.local -.env.*.local - -# Logs -*.log -npm-debug.log* - -# TypeScript -*.tsbuildinfo diff --git a/web/README.md b/web/README.md index 9c326f4..25546e5 100644 --- a/web/README.md +++ b/web/README.md @@ -1,6 +1,6 @@ # docpull web -Marketing website for [docpull](https://github.com/raintree-technology/docpull) - the documentation fetcher for AI. +Marketing website for [docpull](https://github.com/raintree-technology/docpull) - the local web puller for turning server-rendered sites into Markdown. ## Tech Stack @@ -8,7 +8,7 @@ Marketing website for [docpull](https://github.com/raintree-technology/docpull) - React 19 - TypeScript - Tailwind CSS -- Framer Motion +- Vercel Analytics ## Development diff --git a/web/app/.well-known/agent-skills.json/route.ts b/web/app/.well-known/agent-skills.json/route.ts new file mode 100644 index 0000000..85ad244 --- /dev/null +++ b/web/app/.well-known/agent-skills.json/route.ts @@ -0,0 +1,33 @@ +import { absoluteUrl, discoveryPaths } from "@/lib/site"; + +export const dynamic = "force-static"; + +const payload = { + version: "https://agent-skills.dev/v1", + skills: [ + { + name: "docpull-research", + title: "docpull research", + description: + "Ground answers about libraries, frameworks, SDKs, and docs URLs in real fetched documentation via docpull MCP tools.", + url: absoluteUrl(discoveryPaths.docpullResearchSkill), + inputModes: ["text"], + tags: [ + "documentation", + "research", + "mcp", + "libraries", + "frameworks", + "grounding", + ], + }, + ], +}; + +export function GET() { + return Response.json(payload, { + headers: { + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/.well-known/security.txt/route.ts b/web/app/.well-known/security.txt/route.ts new file mode 100644 index 0000000..70fa5b3 --- /dev/null +++ 
b/web/app/.well-known/security.txt/route.ts @@ -0,0 +1,19 @@ +import { absoluteUrl, discoveryPaths, site } from "@/lib/site"; + +export const dynamic = "force-static"; + +const body = `Contact: mailto:support@raintree.technology +Expires: ${site.securityExpiresAt} +Preferred-Languages: en +Canonical: ${absoluteUrl(discoveryPaths.security)} +Acknowledgments: https://github.com/raintree-technology/docpull +`; + +export function GET() { + return new Response(body, { + headers: { + "Content-Type": "text/plain; charset=utf-8", + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/agent-skills/docpull-research.md/route.ts b/web/app/agent-skills/docpull-research.md/route.ts new file mode 100644 index 0000000..5449a80 --- /dev/null +++ b/web/app/agent-skills/docpull-research.md/route.ts @@ -0,0 +1,56 @@ +export const dynamic = "force-static"; + +const body = `--- +name: docpull-research +description: Use the docpull MCP tools to ground answers in real documentation when the user asks about a specific library, framework, SDK, API, or pasted docs URL. +allowed-tools: mcp__docpull__list_indexed, mcp__docpull__list_sources, mcp__docpull__ensure_docs, mcp__docpull__grep_docs, mcp__docpull__read_doc, mcp__docpull__fetch_url +--- + +# docpull research + +Ground library and framework answers in fetched documentation instead of model recall. + +## When to use this skill + +Activate when: +- the user names a specific library, framework, SDK, or API +- the question is version-sensitive or likely to drift +- the user pastes a docs URL +- a wrong answer would cause implementation churn + +Do not activate for: +- general programming explanations +- the user's own codebase +- highly stable standard-library questions + +## Workflow + +1. Check what is already cached with \`list_indexed\`. +2. If the library is cached, search it with \`grep_docs\`. +3. Use \`read_doc\` for line-level follow-up context. +4. 
If the library is not cached: + - use \`ensure_docs\` for a built-in alias + - use \`fetch_url\` for a single pasted page + - otherwise ask for the docs URL once; do not crawl unrelated docs speculatively +5. Answer with attribution to the fetched source. + +## Guidance + +- Prefer the docs over memory for fast-moving libraries. +- Do not over-fetch unrelated libraries. +- Broaden a search once before concluding the docs do not cover the topic. +- Say once that the answer is grounded in the docs, then stay concise. + +## Built-in aliases + +These aliases can be passed to \`ensure_docs(source=...)\` without additional setup: \`react\`, \`nextjs\`, \`tailwindcss\`, \`vite\`, \`hono\`, \`fastapi\`, \`express\`, \`anthropic\`, \`openai\`, \`langchain\`, \`supabase\`, \`drizzle\`, \`prisma\`. +`; + +export function GET() { + return new Response(body, { + headers: { + "Content-Type": "text/markdown; charset=utf-8", + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/globals.css b/web/app/globals.css index 75690d9..3b81dc4 100644 --- a/web/app/globals.css +++ b/web/app/globals.css @@ -109,11 +109,29 @@ * { @apply border-border; } + html { + background: + radial-gradient(circle at top, rgba(92, 143, 255, 0.18), transparent 38%), + linear-gradient(180deg, rgba(245, 247, 250, 0.98), rgba(235, 239, 244, 0.94)); + } body { @apply bg-background text-foreground; + font-family: + ui-sans-serif, + -apple-system, + BlinkMacSystemFont, + "Segoe UI", + sans-serif; font-feature-settings: "rlig" 1, "calt" 1; + background: + radial-gradient(circle at top, rgba(92, 143, 255, 0.12), transparent 28rem), + linear-gradient(180deg, rgba(252, 253, 255, 0.96), rgba(241, 244, 248, 0.9)); + } + + ::selection { + background: rgba(68, 116, 214, 0.2); } } @@ -168,25 +186,98 @@ } :root .glass { - background: rgba(255, 255, 255, 0.85); - backdrop-filter: blur(32px); - -webkit-backdrop-filter: blur(32px); - border: 1px solid rgba(0, 0, 0, 0.06); 
+ background: linear-gradient(180deg, rgba(255, 255, 255, 0.88), rgba(248, 250, 252, 0.8)); + backdrop-filter: blur(28px); + -webkit-backdrop-filter: blur(28px); + border: 1px solid rgba(15, 23, 42, 0.08); box-shadow: - 0 4px 24px rgba(0, 0, 0, 0.04), + 0 18px 48px rgba(15, 23, 42, 0.08), inset 0 1px 0 rgba(255, 255, 255, 0.8); } .dark .glass { - background: rgba(30, 30, 30, 0.85); - backdrop-filter: blur(32px); - -webkit-backdrop-filter: blur(32px); + background: linear-gradient(180deg, rgba(21, 24, 32, 0.84), rgba(16, 18, 25, 0.88)); + backdrop-filter: blur(28px); + -webkit-backdrop-filter: blur(28px); border: 1px solid rgba(255, 255, 255, 0.08); box-shadow: - 0 4px 24px rgba(0, 0, 0, 0.2), + 0 18px 48px rgba(0, 0, 0, 0.28), inset 0 1px 0 rgba(255, 255, 255, 0.1); } + .apple-panel { + background: + linear-gradient(180deg, rgba(255, 255, 255, 0.92), rgba(248, 250, 252, 0.82)); + border: 1px solid rgba(15, 23, 42, 0.08); + box-shadow: + 0 18px 44px rgba(15, 23, 42, 0.08), + inset 0 1px 0 rgba(255, 255, 255, 0.7); + backdrop-filter: blur(24px); + -webkit-backdrop-filter: blur(24px); + } + + .dark .apple-panel { + background: + linear-gradient(180deg, rgba(18, 21, 29, 0.9), rgba(12, 14, 20, 0.94)); + border: 1px solid rgba(255, 255, 255, 0.08); + box-shadow: + 0 24px 52px rgba(0, 0, 0, 0.34), + inset 0 1px 0 rgba(255, 255, 255, 0.08); + } + + .apple-button { + @apply inline-flex items-center justify-center rounded-full px-5 py-3 text-sm font-medium transition-all; + min-height: 44px; + background: linear-gradient(180deg, rgba(25, 28, 36, 0.96), rgba(15, 18, 25, 0.96)); + color: white; + box-shadow: + 0 10px 28px rgba(15, 23, 42, 0.18), + inset 0 1px 0 rgba(255, 255, 255, 0.12); + } + + .apple-button:hover { + transform: translateY(-1px); + box-shadow: + 0 14px 30px rgba(15, 23, 42, 0.22), + inset 0 1px 0 rgba(255, 255, 255, 0.16); + } + + .apple-button-secondary { + @apply inline-flex items-center justify-center rounded-full px-5 py-3 text-sm font-medium 
transition-colors; + min-height: 44px; + background: rgba(255, 255, 255, 0.62); + color: hsl(var(--foreground)); + border: 1px solid rgba(15, 23, 42, 0.08); + } + + .dark .apple-button-secondary { + background: rgba(255, 255, 255, 0.06); + border-color: rgba(255, 255, 255, 0.08); + } + + .section-kicker { + @apply text-[11px] uppercase tracking-[0.22em]; + color: rgba(71, 85, 105, 0.82); + } + + .section-title { + @apply text-3xl font-medium tracking-tight sm:text-4xl; + text-wrap: balance; + } + + .section-copy { + @apply text-base leading-relaxed sm:text-lg; + color: color-mix(in srgb, hsl(var(--foreground)) 74%, transparent); + text-wrap: pretty; + } + + .hairline-grid { + background-image: + linear-gradient(to right, rgba(148, 163, 184, 0.08) 1px, transparent 1px), + linear-gradient(to bottom, rgba(148, 163, 184, 0.08) 1px, transparent 1px); + background-size: 32px 32px; + } + /* Code styling */ pre { @apply rounded-lg overflow-x-auto; @@ -237,6 +328,18 @@ /* Responsive utilities */ } +html.dark { + background: + radial-gradient(circle at top, rgba(73, 111, 193, 0.22), transparent 34%), + linear-gradient(180deg, rgba(11, 13, 18, 1), rgba(8, 10, 14, 1)); +} + +.dark body { + background: + radial-gradient(circle at top, rgba(73, 111, 193, 0.2), transparent 28rem), + linear-gradient(180deg, rgba(10, 12, 17, 0.98), rgba(7, 9, 13, 1)); +} + /* Flow animations for How-it-works connectors */ @keyframes flow-right { 0% { @@ -277,3 +380,22 @@ a:focus-visible { outline: 2px solid hsl(var(--ring)); outline-offset: 3px; } + +.skip-link { + position: absolute; + left: 1rem; + top: 1rem; + z-index: 100; + transform: translateY(-200%); + padding: 0.75rem 1rem; + border-radius: 0.5rem; + background: hsl(var(--background)); + color: hsl(var(--foreground)); + border: 1px solid hsl(var(--border)); + box-shadow: 0 8px 24px rgba(0, 0, 0, 0.12); + transition: transform 0.15s ease; +} + +.skip-link:focus { + transform: translateY(0); +} diff --git a/web/app/layout.tsx 
b/web/app/layout.tsx index b641134..1d1d2c3 100644 --- a/web/app/layout.tsx +++ b/web/app/layout.tsx @@ -1,15 +1,16 @@ -import type { Metadata } from "next"; +import type { Metadata, Viewport } from "next"; import { Analytics } from "@vercel/analytics/next"; import "./globals.css"; import { ThemeProvider } from "@/components/ThemeProvider"; - -const baseUrl = "https://docpull.raintree.technology"; +import { site } from "@/lib/site"; export const metadata: Metadata = { - metadataBase: new URL(baseUrl), - title: "docpull - Fetch docs. Get AI-ready Markdown.", - description: - "Fast, type-safe, secure documentation fetcher. Transform any docs site into clean, AI-ready Markdown for LLMs, RAG pipelines, and offline archives.", + metadataBase: new URL(site.baseUrl), + title: { + default: "docpull - Turn the web into Markdown.", + template: "%s - docpull", + }, + description: site.description, applicationName: "docpull", authors: [{ name: "Raintree Technology", url: "https://raintree.technology" }], creator: "Raintree Technology", @@ -32,23 +33,22 @@ export const metadata: Metadata = { icon: "/logo.svg", }, keywords: [ - "documentation", "markdown", - "AI", - "LLM", - "RAG", "web scraping", + "web crawling", + "website archiving", "python", "cli", - "docs", "fetcher", + "knowledge base", + "server-rendered sites", ], openGraph: { - title: "docpull - Fetch docs. Get clean, AI-ready Markdown.", + title: "docpull - Turn the web into Markdown.", description: - "Turn any documentation site into clean Markdown for LLMs, RAG pipelines, and training datasets. Fast, secure, and built for AI workflows.", - url: baseUrl, + "Turn server-rendered sites into clean local Markdown with caching, deduplication, and strict network guards.", + url: site.baseUrl, type: "website", siteName: "docpull", locale: "en_US", @@ -57,21 +57,29 @@ export const metadata: Metadata = { url: "/og-image.png", width: 1200, height: 630, - alt: "docpull - Fetch docs. 
Get clean Markdown.", + alt: "docpull - Turn the web into Markdown.", }, ], }, twitter: { card: "summary_large_image", - title: "docpull - Fetch docs. Get clean, AI-ready Markdown.", + title: "docpull - Turn the web into Markdown.", description: - "Turn any documentation site into clean Markdown for LLMs, RAG pipelines, and training datasets. Fast, secure, and built for AI workflows.", + "Turn server-rendered sites into clean local Markdown with caching, deduplication, and strict network guards.", site: "@raintree_tech", creator: "@raintree_tech", images: ["/og-image.png"], }, }; +export const viewport: Viewport = { + colorScheme: "light dark", + themeColor: [ + { media: "(prefers-color-scheme: light)", color: "#f8fafc" }, + { media: "(prefers-color-scheme: dark)", color: "#0a0c11" }, + ], +}; + export default function RootLayout({ children, }: Readonly<{ @@ -80,6 +88,9 @@ export default function RootLayout({ return ( + + Skip to main content + {children} diff --git a/web/app/llms-full.txt/route.ts b/web/app/llms-full.txt/route.ts new file mode 100644 index 0000000..b893541 --- /dev/null +++ b/web/app/llms-full.txt/route.ts @@ -0,0 +1,124 @@ +import { absoluteUrl, discoveryPaths, site } from "@/lib/site"; + +export const dynamic = "force-static"; + +const body = `# ${site.name} + +## Summary + +docpull is a local, browser-free web puller that turns server-rendered pages into clean Markdown. It is built for documentation, blogs, help centers, pricing pages, changelogs, and other HTML-first content. It is useful for local archives, search indexes, agent workflows, and RAG ingestion. + +## Positioning + +Turn the web into Markdown. +Keep it all local. + +docpull pulls server-rendered web pages into clean Markdown on your machine. Use it for docs, blogs, help centers, pricing pages, changelogs, and other HTML-first sites without a hosted crawler or browser runner in the loop. 
+ +## Installation + +\`\`\`bash +pip install docpull +\`\`\` + +Optional setup: +- PyPI: https://pypi.org/project/docpull/ +- Claude plugin: https://github.com/raintree-technology/docpull/tree/main/plugin + +## MCP and agent setup + +docpull can be connected through a local MCP server for Claude Code, Cursor, and Codex. + +Claude Code: +\`\`\`bash +pip install 'docpull[mcp]' +claude mcp add --transport stdio --scope user docpull -- docpull mcp +\`\`\` + +Codex: +\`\`\`bash +pip install 'docpull[mcp]' +codex mcp add docpull -- docpull mcp +\`\`\` + +Claude plugin install: +\`\`\`bash +pip install 'docpull[mcp]' +/plugin marketplace add raintree-technology/docpull +/plugin install docpull@docpull +\`\`\` + +## Core workflow + +1. Point docpull at a server-rendered URL. +2. Let the fetch pipeline discover pages and convert HTML to Markdown. +3. Use the output in a local docs folder, a search index, or an agent skill. + +Agent skill output: +\`\`\`bash +docpull https://docs.example.com --skill example-docs --max-pages 100 +\`\`\` + +## Product strengths + +- Markdown you can actually reuse. +- Dedup before disk fills up. +- Network rules stay enforced. +- Re-fetches stay selective. +- Partial crawls are first-class. + +## Profiles + +- RAG: deduped, metadata-rich output for LLMs and vector stores. +- Mirror: full archive with caching and resume support. +- Quick: 50 pages, depth 2, for testing and sampling. +- LLM: token-aware NDJSON for model ingestion, with clear skip reasons for JS-only pages. Add --strict-js-required when fail-loud routing is needed. + +## Example output + +\`\`\` +./docs/pricing.md: + +--- +title: "Pricing" +source: https://stripe.com/pricing +--- + +# Pricing + +Choose the plan that matches your business. +Usage-based billing starts when you move past +the free tier. +\`\`\` + +## Constraints + +- docpull does not run a browser. +- JavaScript-heavy pages that require client-side rendering are detected and skipped. 
+- For JS-rendered sites, use a browser crawler when necessary. +- Config examples use the one-URL-per-DocpullConfig shape. For multiple sites, + run separate CLI commands, load several configs in Python, or use MCP source + aliases. + +## Related resources + +- Homepage: ${absoluteUrl(discoveryPaths.home)} +- llms.txt: ${absoluteUrl(discoveryPaths.llms)} +- llms-full.txt: ${absoluteUrl(discoveryPaths.llmsFull)} +- agent skills: ${absoluteUrl(discoveryPaths.agentSkills)} +- sitemap.xml: ${absoluteUrl(discoveryPaths.sitemap)} +- robots.txt: ${absoluteUrl(discoveryPaths.robots)} +- RSS: ${absoluteUrl(discoveryPaths.rss)} +- security.txt: ${absoluteUrl(discoveryPaths.security)} +- README: https://github.com/raintree-technology/docpull#readme +- Changelog: https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md +`; + +export function GET() { + return new Response(body, { + headers: { + "Content-Type": "text/markdown; charset=utf-8", + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/llms.txt/route.ts b/web/app/llms.txt/route.ts new file mode 100644 index 0000000..1265c7a --- /dev/null +++ b/web/app/llms.txt/route.ts @@ -0,0 +1,62 @@ +import { absoluteUrl, discoveryPaths, site } from "@/lib/site"; + +export const dynamic = "force-static"; + +const body = `# ${site.name} + +> docpull turns server-rendered web pages into clean Markdown locally. + +docpull is a browser-free web puller for documentation, blogs, help centers, changelogs, pricing pages, and other HTML-first sites. It is designed for local archives, agent workflows, RAG ingestion, and auditable content pipelines. 
+ +## Canonical site +- ${absoluteUrl(discoveryPaths.home)} + +## Key resources +- Homepage: ${absoluteUrl(discoveryPaths.home)} +- README: https://github.com/raintree-technology/docpull#readme +- PyPI: https://pypi.org/project/docpull/ +- Changelog: https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md +- Plugin: https://github.com/raintree-technology/docpull/tree/main/plugin +- MCP server docs: https://github.com/raintree-technology/docpull#mcp-server + +## Machine-readable endpoints +- llms.txt: ${absoluteUrl(discoveryPaths.llms)} +- llms-full.txt: ${absoluteUrl(discoveryPaths.llmsFull)} +- sitemap.xml: ${absoluteUrl(discoveryPaths.sitemap)} +- robots.txt: ${absoluteUrl(discoveryPaths.robots)} +- RSS: ${absoluteUrl(discoveryPaths.rss)} +- security.txt: ${absoluteUrl(discoveryPaths.security)} +- agent skills: ${absoluteUrl(discoveryPaths.agentSkills)} + +## What matters +- Local-first operation. The crawl stays on your machine. +- Browser-free fetching. JavaScript-heavy pages are detected and skipped. +- Clean Markdown output with source metadata. +- Profiles for RAG, mirror, quick sampling, and LLM ingestion. +- MCP support for Claude Code, Cursor, and Codex workflows. + +## Primary sections on the homepage +- MCP setup +- URL In, Corpus Out +- What Holds Up +- Presets With Opinions +- Real Output +- Start Local +- Sharp Edges + +## Notes for agents +- Prefer the README for installation and feature details. +- Prefer homepage copy for current product positioning. +- Prefer llms-full.txt when a single consolidated summary is more useful than fetching the homepage HTML. +- Prefer .well-known/agent-skills.json when you want explicit task-level instructions for docs-grounded research. +- docpull does not render JavaScript-heavy sites; route to a browser crawler when strict HTML-first fetching is insufficient. 
+`; + +export function GET() { + return new Response(body, { + headers: { + "Content-Type": "text/markdown; charset=utf-8", + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/not-found.tsx b/web/app/not-found.tsx index 5f16015..c4a3de8 100644 --- a/web/app/not-found.tsx +++ b/web/app/not-found.tsx @@ -11,7 +11,11 @@ export const metadata: Metadata = { export default function NotFound() { return ( -
+

404

diff --git a/web/app/page.tsx b/web/app/page.tsx index 49a2bbb..06a1191 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -9,6 +9,7 @@ import CodeExamples from "@/components/CodeExamples"; import Install from "@/components/Install"; import FAQ from "@/components/FAQ"; import StructuredData from "@/components/StructuredData"; +import McpSetup from "@/components/McpSetup"; export default function Home() { return ( @@ -16,12 +17,13 @@ export default function Home() {
-
+
+
diff --git a/web/app/robots.ts b/web/app/robots.ts index 83bc004..0235fab 100644 --- a/web/app/robots.ts +++ b/web/app/robots.ts @@ -1,6 +1,6 @@ import type { MetadataRoute } from "next"; -const baseUrl = "https://docpull.raintree.technology"; +import { absoluteUrl, discoveryPaths, site } from "@/lib/site"; // Spec: SEO / robots.txt (RFC 9309). Allow all crawlers and point them at the sitemap. export default function robots(): MetadataRoute.Robots { @@ -9,7 +9,7 @@ export default function robots(): MetadataRoute.Robots { userAgent: "*", allow: "/", }, - sitemap: `${baseUrl}/sitemap.xml`, - host: baseUrl, + sitemap: absoluteUrl(discoveryPaths.sitemap), + host: site.baseUrl, }; } diff --git a/web/app/rss.xml/route.ts b/web/app/rss.xml/route.ts new file mode 100644 index 0000000..181ddac --- /dev/null +++ b/web/app/rss.xml/route.ts @@ -0,0 +1,34 @@ +import { absoluteUrl, discoveryPaths, site, utcDate } from "@/lib/site"; +import { escapeXml } from "@/lib/utils"; + +export const dynamic = "force-static"; + +export function GET() { + const published = utcDate(site.publishedAt); + const xml = ` + + + ${escapeXml(site.name)} + ${absoluteUrl(discoveryPaths.home)} + ${escapeXml(site.rssDescription)} + en-us + ${published} + + ${escapeXml("docpull homepage")} + ${absoluteUrl(discoveryPaths.home)} + ${absoluteUrl(discoveryPaths.home)} + ${published} + ${escapeXml( + "Product overview, setup instructions, examples, and agent-readiness links.", + )} + + +`; + + return new Response(xml, { + headers: { + "Content-Type": "application/rss+xml; charset=utf-8", + "Cache-Control": "public, s-maxage=3600, stale-while-revalidate=86400", + }, + }); +} diff --git a/web/app/sitemap.ts b/web/app/sitemap.ts index 19236e4..00381ef 100644 --- a/web/app/sitemap.ts +++ b/web/app/sitemap.ts @@ -1,16 +1,34 @@ import type { MetadataRoute } from "next"; -const baseUrl = "https://docpull.raintree.technology"; +import { absoluteUrl, discoveryPaths, site } from "@/lib/site"; -// Spec: SEO / XML 
sitemaps. A single-page site lists one canonical URL. -// Fragment anchors (#features, #install, …) are not separate URLs and are omitted. export default function sitemap(): MetadataRoute.Sitemap { + const lastModified = new Date(site.publishedAt); + return [ { - url: baseUrl, - lastModified: new Date(), + url: absoluteUrl(discoveryPaths.home), + lastModified, changeFrequency: "monthly", priority: 1, }, + { + url: absoluteUrl(discoveryPaths.llms), + lastModified, + changeFrequency: "monthly", + priority: 0.8, + }, + { + url: absoluteUrl(discoveryPaths.llmsFull), + lastModified, + changeFrequency: "monthly", + priority: 0.8, + }, + { + url: absoluteUrl(discoveryPaths.docpullResearchSkill), + lastModified, + changeFrequency: "monthly", + priority: 0.7, + }, ]; } diff --git a/web/components/AsciiBackground.tsx b/web/components/AsciiBackground.tsx index ca4009f..f33e46a 100644 --- a/web/components/AsciiBackground.tsx +++ b/web/components/AsciiBackground.tsx @@ -6,6 +6,7 @@ import { useReducedMotion } from "@/lib/use-reduced-motion"; const ASCII_CHARS = " .·:;+*#%@"; const TARGET_FPS = 24; const FRAME_INTERVAL = 1000 / TARGET_FPS; +const MAX_DEVICE_PIXEL_RATIO = 2; export default function AsciiBackground() { const canvasRef = useRef(null); @@ -25,25 +26,19 @@ export default function AsciiBackground() { const charWidth = 14; const charHeight = 20; - const resize = () => { - canvas.width = window.innerWidth; - canvas.height = window.innerHeight; - if (reducedMotion) { - renderFrame(); - } - }; - const renderFrame = () => { const t = timeRef.current; - const cols = Math.ceil(canvas.width / charWidth); - const rows = Math.ceil(canvas.height / charHeight); + const width = window.innerWidth; + const height = window.innerHeight; + const cols = Math.ceil(width / charWidth); + const rows = Math.ceil(height / charHeight); const centerX = cols / 2; const centerY = rows / 2; const isDark = document.documentElement.classList.contains("dark"); - ctx.clearRect(0, 0, canvas.width, 
canvas.height); + ctx.clearRect(0, 0, width, height); ctx.font = `${charHeight - 2}px "SF Mono", "Fira Code", Consolas, monospace`; ctx.textAlign = "center"; ctx.textBaseline = "middle"; @@ -85,6 +80,25 @@ export default function AsciiBackground() { } }; + const resize = () => { + const pixelRatio = Math.min( + window.devicePixelRatio || 1, + MAX_DEVICE_PIXEL_RATIO, + ); + const width = window.innerWidth; + const height = window.innerHeight; + + canvas.width = Math.floor(width * pixelRatio); + canvas.height = Math.floor(height * pixelRatio); + canvas.style.width = `${width}px`; + canvas.style.height = `${height}px`; + ctx.setTransform(pixelRatio, 0, 0, pixelRatio, 0, 0); + + if (reducedMotion) { + renderFrame(); + } + }; + const animate = (timestamp: number) => { animationRef.current = requestAnimationFrame(animate); @@ -97,6 +111,12 @@ export default function AsciiBackground() { renderFrame(); }; + const themeObserver = new MutationObserver(renderFrame); + themeObserver.observe(document.documentElement, { + attributeFilter: ["class"], + attributes: true, + }); + resize(); if (reducedMotion) { renderFrame(); @@ -109,6 +129,7 @@ export default function AsciiBackground() { return () => { cancelAnimationFrame(animationRef.current); window.removeEventListener("resize", resize); + themeObserver.disconnect(); }; }, [reducedMotion]); diff --git a/web/components/CodeExamples.tsx b/web/components/CodeExamples.tsx index be888a2..ce43207 100644 --- a/web/components/CodeExamples.tsx +++ b/web/components/CodeExamples.tsx @@ -2,84 +2,9 @@ import { useState, useCallback, memo, type KeyboardEvent } from "react"; import { Copy, Check } from "lucide-react"; +import { codeExamples } from "@/lib/content/home"; import { cn } from "@/lib/utils"; - -const examples = [ - { - id: "default", - name: "Default", - code: `docpull https://docs.stripe.com`, - output: `./docs/authentication.md: - ---- -title: "Authentication" -source: https://docs.stripe.com/authentication ---- - -# 
Authentication - -The Stripe API uses API keys to authenticate requests. -You can view and manage your API keys in the Stripe -Dashboard. - -Test mode secret keys have the prefix sk_test_ and live -mode secret keys have the prefix sk_live_...`, - }, - { - id: "rag", - name: "RAG", - code: `docpull https://docs.anthropic.com --profile rag`, - output: `./docs/messages.md: - ---- -title: "Messages" -source: https://docs.anthropic.com/en/api/messages -description: "Send a structured list of input messages and get the model's response." ---- - -# Messages - -Send messages to Claude using the Messages API...`, - }, - { - id: "skills", - name: "Claude Code", - code: `docpull https://sdk.vercel.ai -o .claude/skills/vercel-ai`, - output: `.claude/skills/vercel-ai/ -├── getting-started.md -├── streaming.md -├── tools.md -└── providers.md - -./.claude/skills/vercel-ai/getting-started.md: - ---- -title: "Getting Started" -source: https://sdk.vercel.ai/docs/getting-started ---- - -# Getting Started - -Install the Vercel AI SDK to build AI-powered applications...`, - }, - { - id: "python", - name: "Python", - code: `from docpull import Fetcher, DocpullConfig - -config = DocpullConfig(url="https://docs.example.com") -async with Fetcher(config) as fetcher: - async for event in fetcher.run(): - print(f"{event.current}/{event.total}: {event.url}")`, - output: `1/124: https://docs.example.com/intro -2/124: https://docs.example.com/quickstart -3/124: https://docs.example.com/api/overview -... 
-124/124: https://docs.example.com/changelog - -Completed: 124 pages, 4.2 MB`, - }, -] as const; +import { useCopyToClipboard } from "@/lib/hooks/use-copy-to-clipboard"; const CodeBlock = memo(function CodeBlock({ code, @@ -88,13 +13,8 @@ const CodeBlock = memo(function CodeBlock({ code: string; output: string; }) { - const [copied, setCopied] = useState(false); - - const handleCopy = useCallback(() => { - navigator.clipboard.writeText(code); - setCopied(true); - setTimeout(() => setCopied(false), 2000); - }, [code]); + const { copiedId, copy } = useCopyToClipboard(); + const copied = copiedId === "code"; return (
@@ -106,7 +26,7 @@ const CodeBlock = memo(function CodeBlock({
@@ -44,20 +44,22 @@ function FaqItem({ q, a, index }: { q: string; a: ReactNode; index: number }) { export default function FAQ() { return ( -
+
-
-

- Why docpull? +
+

Sharp edges

+

+ The questions people ask right before they try it.

-

- Answers to questions people ask before installing. +

+ This is where the practical constraints belong: JavaScript-heavy + sites, auth, MCP usage, and what the fetcher will or will not do.

-
+
{faqs.map((faq, i) => ( - + ))}
diff --git a/web/components/Features.tsx b/web/components/Features.tsx index ef60fc1..5d98fa3 100644 --- a/web/components/Features.tsx +++ b/web/components/Features.tsx @@ -1,54 +1,92 @@ -const features = [ - { - title: "AI-Ready Output", - description: - "Markdown with YAML frontmatter — title, source URL, heading outline, OpenGraph description. Drops into a vector store or a `.claude/skills/` directory.", - }, - { - title: "Streaming Dedup", - description: - "SHA-256-hashed at fetch time. Constant memory per page — duplicate pages are detected before they're written to disk, not after.", - }, - { - title: "Zero-Trust Networking", - description: - "HTTPS-only, robots.txt compliant, SSRF-protected with DNS pinning at connect time. Built for crawls where an agent picks the URLs — pass --require-pinned-dns to refuse weakened proxy configurations.", - }, - { - title: "Conditional Re-fetch", - description: - "If-None-Match / If-Modified-Since on every cached page. Re-runs only transfer what changed; the discovered URL list is persisted so a crash resumes instead of restarts.", - }, - { - title: "Path & Pattern Filters", - description: - "--include-paths and --exclude-paths glob filters at discovery time. Ship only the routes your model needs, not the entire site.", - }, -]; +import { featuredFeatures, supportingFeatures } from "@/lib/content/home"; export default function Features() { return ( -
-
-
-

- Features +
+
+
+

What matters

+

+ The core behaviors carry the page.

-

- Everything you need for production-grade doc ingestion. +

+ The product is strongest when it is concrete: clean Markdown, early + dedup, and strict network behavior. Those should read like the core + of the page, not six identical cards.

-
- {features.map((feature, index) => ( -
-

{feature.title}

-

+

+ {featuredFeatures.map((feature) => ( +
+
+ + + Core behavior + +
+

+ {feature.title} +

+

{feature.description}

-
+ +
+
+                  {feature.snippet}
+                
+
+ +
+ {feature.points.map((point) => ( +
+ {point} +
+ ))} +
+ ))}
+ +
+
+
+

+ The supporting details still matter +

+

+ These are important, but they read better as a tighter list than + as another wall of equal-weight cards. +

+
+
+ +
+ {supportingFeatures.map((feature) => ( +
+

+ {feature.title} +

+

+ {feature.description} +

+
+ ))} +
+
); diff --git a/web/components/Footer.tsx b/web/components/Footer.tsx index ba7c03c..e4d6933 100644 --- a/web/components/Footer.tsx +++ b/web/components/Footer.tsx @@ -43,146 +43,119 @@ function RaintreeLogo({ ); } +const links = [ + { + label: "PyPI", + href: "https://pypi.org/project/docpull/", + }, + { + label: "README", + href: "https://github.com/raintree-technology/docpull#readme", + }, + { + label: "llms.txt", + href: "/llms.txt", + }, + { + label: "llms-full.txt", + href: "/llms-full.txt", + }, + { + label: "Agent Skills", + href: "/.well-known/agent-skills.json", + }, + { + label: "RSS", + href: "/rss.xml", + }, + { + label: "security.txt", + href: "/.well-known/security.txt", + }, + { + label: "Sitemap", + href: "/sitemap.xml", + }, +] as const; + export default function Footer() { return ( -