Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 43 additions & 21 deletions web/components/Features.tsx
Original file line number Diff line number Diff line change
@@ -1,56 +1,78 @@
const REPO = "https://github.com/raintree-technology/docpull/blob/main";

const features = [
{
title: "Markdown Agents Can Use",
title: "Clean Markdown, ready to use",
description:
"Every page includes clean Markdown plus frontmatter for title, source URL, headings, and description. Drop it into RAG, search, or a skill directory.",
"Every page becomes Markdown with a frontmatter header — title, source URL, and description. Code blocks, tables, and images are preserved. Nav, footers, and cookie banners are stripped.",
srcPath: "src/docpull/conversion/extractor.py",
srcLabel: "extractor.py",
},
{
title: "No Duplicate Slop",
title: "No duplicates",
description:
"Pages are SHA-256 hashed while they stream in, so duplicates are caught before they hit disk instead of cleaned up later.",
"Pages are content-hashed as they stream in — duplicates are caught before they touch disk.",
srcPath: "src/docpull/pipeline/steps/dedup.py",
srcLabel: "dedup.py",
},
{
title: "Safe for Agent-Chosen URLs",
title: "Safe for AI agents",
description:
"HTTPS-only, robots.txt compliant, SSRF-protected, and DNS-pinned at connect time. Use --require-pinned-dns when proxy settings weaken that guarantee.",
"HTTPS-only, robots.txt compliant, and protected against URL-based attacks — necessary when an AI agent is choosing which URLs to fetch.",
srcPath: "src/docpull/security/url_validator.py",
srcLabel: "url_validator.py",
},
{
title: "Cheap to Re-run",
title: "Cheap to re-run",
description:
"Cached pages use If-None-Match and If-Modified-Since. Re-runs fetch what changed, and saved frontier state lets interrupted crawls resume.",
"Only re-fetches pages that changed since the last run. Interrupted crawls resume where they left off.",
srcPath: "src/docpull/pipeline/steps/fetch.py",
srcLabel: "fetch.py",
},
{
title: "Crawl the Parts That Matter",
title: "Crawl only what matters",
description:
"Include and exclude path globs during discovery, so your model gets the relevant docs instead of every route the site exposes.",
"Include and exclude URL patterns during discovery so your agent gets the relevant pages instead of every route the site exposes.",
srcPath: "src/docpull/discovery/filters.py",
srcLabel: "filters.py",
},
{
title: "Parallel Pack Workflows",
title: "Parallel search packs",
description:
"Optional Parallel Search, Extract, Task, entity, batch, monitor, and API-spec workflows become local packs with AGENT_CONTEXT.md, source files, manifests, IDs, and usage metadata.",
"Optional integration with Parallel to find and extract live web sources, organized into a local pack with a load plan your agent can follow.",
srcPath: "src/docpull/parallel_workflows.py",
srcLabel: "parallel_workflows.py",
},
];
] as const;

export default function Features() {
return (
<section id="features" className="pt-16 sm:pt-32 pb-24 border-t">
<div className="mx-auto max-w-5xl px-6">
<div className="mb-12 text-center sm:text-left">
<h2 className="text-2xl font-medium mb-3">
<span>Features</span>
</h2>
<h2 className="text-2xl font-medium mb-3">Features</h2>
<p className="text-muted-foreground">
The boring pieces that make documentation ingestion dependable.
The pieces that make documentation fetching dependable.
</p>
</div>

<div className="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-4 sm:gap-6">
{features.map((feature, index) => (
<div key={index} className="p-4 rounded-xl glass">
<h3 className="font-medium text-sm mb-1">{feature.title}</h3>
<p className="text-sm text-muted-foreground leading-relaxed">
{features.map((feature) => (
<div key={feature.title} className="p-4 rounded-xl glass flex flex-col gap-2">
<h3 className="font-medium text-sm">{feature.title}</h3>
<p className="text-sm text-muted-foreground leading-relaxed flex-1">
{feature.description}
</p>
{feature.srcPath && (
<a
href={`${REPO}/${feature.srcPath}`}
target="_blank"
rel="noopener noreferrer"
className="text-[11px] font-mono text-muted-foreground/50 hover:text-muted-foreground transition-colors w-fit"
>
{feature.srcLabel}
</a>
)}
</div>
))}
</div>
Expand Down
9 changes: 3 additions & 6 deletions web/components/Hero.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,9 @@ export default function Hero() {
</div>

<p className="mt-4 text-xs text-muted-foreground max-w-md leading-relaxed">
Best for static docs, API references, and server-rendered
sites. JS-rendered SPAs are detected and skipped — pass{" "}
<code className="font-mono text-[11px] bg-background/60 px-1 rounded">
--strict-js-required
</code>{" "}
to make that an error so your agent can route elsewhere.
Works with static docs, API references, and server-rendered
sites. JavaScript-heavy pages are detected and skipped
automatically.
</p>
</div>

Expand Down
2 changes: 1 addition & 1 deletion web/components/HowItWorks.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ export default function HowItWorks() {
/>
<StepText
title="Use"
desc="Use the Markdown in search, RAG, offline archives, or skills."
desc="Load the Markdown into your agent, search index, or skill directory."
active={activeIdx === 2}
/>
</div>
Expand Down
128 changes: 28 additions & 100 deletions web/components/ParallelPacks.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,50 @@ import Image from "next/image";

const workflows = [
{
title: "Discovery + Extract Packs",
title: "Discovery & research packs",
command: "context-pack / discover-docs",
description:
"Parallel discovers and extracts current web sources; docpull ranks candidates, writes crawl plans, AGENT_CONTEXT.md, Markdown, NDJSON chunks, source indexes, manifests, IDs, hashes, and usage metadata.",
"Parallel finds and extracts current web sources. docpull saves them locally as Markdown, structured records, source indexes, and an AGENT_CONTEXT.md load plan.",
},
{
title: "Fallback + Diff Packs",
command: "fallback-pack / diff-brief",
title: "API specs & entity research",
command: "api-pack / entity-pack",
description:
"Try core docpull first, fall back to Parallel Extract only for misses, then score sources or send pack diffs through Parallel Task for change briefs.",
"Turn llms.txt files and OpenAPI specs into local packs, or build dossiers on companies, vendors, and research targets from Parallel's entity search.",
},
{
title: "Entity Dossiers",
command: "entity-pack / findall-pack",
title: "Diffs & change briefs",
command: "diff-brief / fallback-pack",
description:
"Entity Search and FindAll become local candidate packs for companies, people, vendors, competitors, or research targets.",
},
{
title: "Batch + Monitor Packs",
command: "taskgroup-pack --wait / monitor-pack",
description:
"TaskGroup rows can wait for completed outputs, while Monitor create, list, retrieve, update, cancel, trigger, and event pages become reusable local artifacts.",
},
{
title: "API Context Packs",
command: "api-pack / pack score / pack sources",
description:
"Turn llms.txt and OpenAPI specs into docpull packs, then grade readiness, rank sources, or diff refreshed snapshots before agents load the context.",
"Compare two snapshots of a pack to see what changed, or fall back to Parallel Extract only for pages your local crawl missed.",
},
] as const;

const decisionCards = [
{
title: "Use core docpull for known docs",
title: "Use docpull for known docs",
description:
"Start with the local crawler when you already know the docs URL and want a same-domain Markdown mirror with no browser and no API key.",
"Start here when you already have the URL and want a clean Markdown mirror — no browser, no API key.",
points: [
"static docs and API references",
"RAG or skill-ready Markdown",
"repeatable site mirrors",
"search-ready or skill-ready Markdown",
"repeatable, offline-friendly archives",
],
},
{
title: "Use Parallel packs for web research",
title: "Add Parallel for web research",
description:
"Add the Parallel layer when discovery, extraction, research, entities, or monitoring should happen before docpull writes local context artifacts and a load plan.",
"Use the Parallel layer when you need to find sources first, extract live content, or run entity and batch research before writing local context.",
points: [
"research packs from search queries",
"ranked docs discovery and crawl commands",
"cited source bundles for agents",
"AGENT_CONTEXT.md load plan",
"repeatable NDJSON, manifests, and source files",
"API-doc or vendor comparison research",
"fallback, diff, task, entity, batch, and monitor workflows",
"ranked docs discovery with crawl plans",
"cited source bundles with a load plan",
"API-doc and vendor comparison research",
"diffs, entity dossiers, and batch workflows",
],
},
] as const;

const keyFlow = [
"pip install 'docpull[parallel]'",
"docpull parallel init",
"docpull parallel auth --json",
"docpull parallel init --project",
"docpull parallel context-pack ... --dry-run --max-estimated-cost 0.05",
] as const;

const controls = [
"--dry-run",
"--max-estimated-cost",
"--include-domain / --exclude-domain",
"--after-date",
"--fetch-max-age-seconds",
"--excerpt-chars-per-result",
"--client-model",
"pack sources",
] as const;

export default function ParallelPacks() {
return (
<section id="parallel" className="py-16 sm:py-24 border-t">
Expand All @@ -102,11 +69,11 @@ export default function ParallelPacks() {
context packs
</span>
</h2>
<p className="text-sm sm:text-base text-muted-foreground max-w-3xl">
Parallel is the optional source-discovery and research layer. Use
core docpull to mirror a known docs site; use Parallel when an agent
needs current web sources found, extracted, scored, and packaged
into a local context pack before it starts work.
<p className="text-sm sm:text-base text-muted-foreground max-w-2xl">
Parallel is an optional source-discovery layer. Use docpull when
you already know the URL. Add Parallel when an agent needs to find
sources, extract live content, and package everything into a local
context pack before it starts work.
</p>
</div>

Expand All @@ -123,7 +90,10 @@ export default function ParallelPacks() {
key={point}
className="flex gap-2 text-xs text-muted-foreground leading-relaxed"
>
<span aria-hidden="true" className="mt-1.5 h-1 w-1 shrink-0 rounded-full bg-foreground/50" />
<span
aria-hidden="true"
className="mt-1.5 h-1 w-1 shrink-0 rounded-full bg-foreground/50"
/>
<span>{point}</span>
</li>
))}
Expand All @@ -132,49 +102,7 @@ export default function ParallelPacks() {
))}
</div>

<div className="grid grid-cols-1 lg:grid-cols-[1.1fr_0.9fr] gap-4 sm:gap-6 mb-4 sm:mb-6">
<div className="p-4 sm:p-5 rounded-xl glass">
<h3 className="font-medium text-sm mb-3">API key flow</h3>
<div className="space-y-2">
{keyFlow.map((command) => (
<code
key={command}
className="block px-3 py-2 bg-background/60 rounded-md text-xs font-mono text-muted-foreground overflow-x-auto"
>
{command}
</code>
))}
</div>
<p className="mt-3 text-xs text-muted-foreground leading-relaxed">
Keys live in the environment, user config, or project .env.local.
docpull does not echo{" "}
<code className="font-mono text-[11px]">PARALLEL_API_KEY</code>,
but pack artifacts can include source content, task inputs,
outputs, and metadata.
</p>
</div>

<div className="p-4 sm:p-5 rounded-xl glass">
<h3 className="font-medium text-sm mb-3">Cost and source controls</h3>
<div className="flex flex-wrap gap-2">
{controls.map((control) => (
<code
key={control}
className="px-2.5 py-1.5 bg-background/60 rounded-md text-[11px] font-mono text-muted-foreground"
>
{control}
</code>
))}
</div>
<p className="mt-3 text-xs text-muted-foreground leading-relaxed">
Dry runs estimate spend before live calls, domain filters pin the
source policy, and AGENT_CONTEXT.md gives agents a deterministic
load order before they inspect deeper metadata.
</p>
</div>
</div>

<div className="grid grid-cols-1 sm:grid-cols-2 gap-3 sm:gap-4">
<div className="grid grid-cols-1 sm:grid-cols-3 gap-3 sm:gap-4">
{workflows.map((workflow) => (
<div key={workflow.title} className="p-4 rounded-xl glass">
<div className="flex flex-wrap items-baseline justify-between gap-2 mb-2">
Expand Down
14 changes: 7 additions & 7 deletions web/components/Profiles.tsx
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
const profiles = [
{
name: "RAG",
description: "Clean Markdown with metadata and deduping for retrieval.",
description: "Clean Markdown with metadata and deduplication for search and retrieval.",
example: "docpull URL --profile rag",
},
{
name: "Mirror",
description: "A fuller local archive with cache, resume, and stable paths.",
description: "A full local archive with caching, resume on interrupt, and stable file paths.",
example: "docpull URL --profile mirror",
},
{
name: "Quick",
description: "A 50-page sample when you need to inspect output first.",
description: "A 50-page sample when you want to inspect output before committing to a full crawl.",
example: "docpull URL --profile quick",
},
{
name: "LLM",
description:
"Token-aware NDJSON chunks that skip JS-only pages unless strict mode is enabled.",
"Chunked, streaming records sized for language model context windows. JavaScript-only pages are skipped unless strict mode is on.",
example: "docpull URL --profile llm --stream | jq .",
},
];
Expand All @@ -28,16 +28,16 @@ export default function Profiles() {
<div className="mx-auto max-w-5xl px-6">
<div className="mb-8 sm:mb-12 text-center sm:text-left">
<h2 className="text-xl sm:text-2xl font-medium mb-2 sm:mb-3">
<span>Profiles</span>
Profiles
</h2>
<p className="text-sm sm:text-base text-muted-foreground">
Choose the output shape before you crawl.
</p>
</div>

<div className="grid grid-cols-1 sm:grid-cols-2 gap-3 sm:gap-4">
{profiles.map((profile, index) => (
<div key={index} className="p-4 rounded-xl glass">
{profiles.map((profile) => (
<div key={profile.name} className="p-4 rounded-xl glass">
<h3 className="font-medium mb-2">{profile.name}</h3>
<p className="text-sm text-muted-foreground mb-3">
{profile.description}
Expand Down
Loading