-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.ts
More file actions
62 lines (56 loc) · 2.12 KB
/
crawl.ts
File metadata and controls
62 lines (56 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import { defineCommand } from "citty";
import * as scrapegraphai from "scrapegraph-js";
import { resolveApiKey } from "../lib/folders.js";
import * as log from "../lib/log.js";
/**
 * Parse a numeric CLI flag, rejecting values that are not positive integers.
 * Previously `Number(...)` let NaN / negative values flow into the request.
 */
function parsePositiveInt(value: string, flag: string): number {
  const n = Number(value);
  if (!Number.isInteger(n) || n <= 0) {
    throw new Error(`--${flag} must be a positive integer, got "${value}"`);
  }
  return n;
}

/**
 * Parse a JSON CLI flag, replacing the raw SyntaxError with a message that
 * names the offending flag.
 */
function parseJsonFlag(value: string, flag: string): unknown {
  try {
    return JSON.parse(value);
  } catch {
    throw new Error(`--${flag} is not valid JSON: ${value}`);
  }
}

/**
 * `crawl` command: crawl a site starting at URL and either extract structured
 * data (default, 10 credits/page) or return markdown only (--no-extraction,
 * 2 credits/page).
 */
export default defineCommand({
  meta: {
    name: "crawl",
    description: "Crawl and extract data from multiple pages",
  },
  args: {
    url: {
      type: "positional",
      description: "Starting URL to crawl",
      required: true,
    },
    prompt: {
      type: "string",
      alias: "p",
      description: "Extraction prompt (required when extraction mode is on)",
    },
    "no-extraction": {
      type: "boolean",
      description: "Return markdown only (2 credits/page instead of 10)",
    },
    "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
    depth: { type: "string", description: "Crawl depth (default 1)" },
    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
    rules: { type: "string", description: "Crawl rules as JSON object string" },
    "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
  },
  run: async ({ args }) => {
    const out = log.create(!!args.json);
    out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
    const key = await resolveApiKey(!!args.json);

    // Assemble the request body; every flag is validated before it goes in.
    const base: Record<string, unknown> = { url: args.url };
    if (args["max-pages"]) base.max_pages = parsePositiveInt(args["max-pages"], "max-pages");
    if (args.depth) base.depth = parsePositiveInt(args.depth, "depth");
    if (args.rules) base.rules = parseJsonFlag(args.rules, "rules");
    if (args["no-sitemap"]) base.sitemap = false;
    if (args.stealth) base.stealth = true;
    if (args["no-extraction"]) {
      base.extraction_mode = false;
    } else {
      // The help text for --prompt says it is required in extraction mode;
      // enforce that here instead of sending a request the API will reject.
      if (!args.prompt) {
        throw new Error("--prompt is required unless --no-extraction is set");
      }
      base.prompt = args.prompt;
      if (args.schema) base.schema = parseJsonFlag(args.schema, "schema");
    }
    const params = base as scrapegraphai.CrawlParams;

    out.start("Crawling");
    const result = await scrapegraphai.crawl(key, params, out.poll);
    out.stop(result.elapsedMs);
    if (result.data) out.result(result.data);
    else out.error(result.error);
  },
});