import axios from 'axios';
import handleError from './utils/handleError.js';
import { ZodType } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { isMockEnabled, getMockConfig } from './utils/mockConfig.js';
import { getMockResponse } from './utils/mockResponse.js';
/**
 * Start a crawl job using the ScrapeGraphAI API.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key
 * @param {string} url - The starting URL for the crawl
 * @param {string|null} prompt - The prompt to guide the crawl and extraction (null for markdown mode)
 * @param {Object|ZodType|null} schema - JSON schema or Zod schema defining the structure of the extracted data (null for markdown mode)
 * @param {Object} [options] - Optional crawl parameters
 * @param {boolean} [options.extractionMode=true] - true for AI extraction, false for markdown conversion (no AI/LLM)
 * @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
 * @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
 * @param {number|null} [options.breadth] - Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth and is useful for limiting crawl scope on large sites. Note: maxPages always takes priority. Ignored when sitemap=true.
 * @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
 * @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
 * @param {boolean} [options.sitemap=false] - Whether to use the sitemap for better page discovery
 * @param {number} [options.batchSize=1] - Batch size for processing pages (1-10)
 * @param {boolean} [options.mock] - Override mock mode for this request
 * @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page
 * @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
 * @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
 * @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
 * @param {string} [options.webhookUrl] - URL to receive webhook notifications when the crawl job completes
 * @returns {Promise<Object>} The crawl job response
 * @throws {Error} Throws an error if the HTTP request fails
 */
export async function crawl(
  apiKey,
  url,
  prompt,
  schema,
  options = {}
) {
  const {
    mock = null,
    renderHeavyJs = false,
    stealth = false,
    includePaths = null,
    excludePaths = null,
    webhookUrl = null,
  } = options;

  // Check if mock mode is enabled
  const useMock = mock !== null ? mock : isMockEnabled();
  if (useMock) {
    console.log('🧪 Mock mode active. Returning stub for crawl request');
    const mockConfig = getMockConfig();
    const mockData = getMockResponse(
      'POST',
      'https://api.scrapegraphai.com/v1/crawl',
      mockConfig.customResponses,
      mockConfig.customHandler
    );
    return mockData;
  }

  const endpoint = 'https://api.scrapegraphai.com/v1/crawl';
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
    'Content-Type': 'application/json',
  };

  // Convert a Zod schema to JSON Schema; plain objects are passed through as-is
  let schemaPayload = null;
  if (schema !== null && schema !== undefined) {
    if (schema instanceof ZodType) {
      schemaPayload = zodToJsonSchema(schema);
    } else if (typeof schema === 'object') {
      schemaPayload = schema;
    } else {
      throw new Error('The schema must be a Zod schema, a plain object, or null');
    }
  }

  const {
    cacheWebsite = true,
    depth = 2,
    breadth = null,
    maxPages = 2,
    sameDomainOnly = true,
    sitemap = false,
    batchSize = 1,
  } = options;

  const payload = {
    url,
    prompt,
    schema: schemaPayload,
    cache_website: cacheWebsite,
    depth,
    max_pages: maxPages,
    same_domain_only: sameDomainOnly,
    sitemap,
    batch_size: batchSize,
    render_heavy_js: renderHeavyJs,
  };

  // Optional fields are only added to the payload when explicitly provided
  if (breadth !== null && breadth !== undefined) {
    payload.breadth = breadth;
  }
  if (stealth) {
    payload.stealth = stealth;
  }
  if (includePaths) {
    payload.include_paths = includePaths;
  }
  if (excludePaths) {
    payload.exclude_paths = excludePaths;
  }
  if (webhookUrl) {
    payload.webhook_url = webhookUrl;
  }

  try {
    const response = await axios.post(endpoint, payload, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}
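
/*
 * Example usage (a minimal sketch, not part of this module). The API key, URL,
 * prompt, and schema fields below are illustrative placeholders, and the exact
 * shape of the returned object is determined by the ScrapeGraphAI API.
 *
 *   import { z } from 'zod';
 *   import { crawl } from './crawl.js';
 *
 *   const schema = z.object({
 *     title: z.string(),
 *     description: z.string(),
 *   });
 *
 *   const job = await crawl(
 *     'your-api-key',
 *     'https://example.com',
 *     'Extract the title and description of each page',
 *     schema,
 *     {
 *       depth: 2,
 *       maxPages: 5,
 *       sitemap: true,
 *       includePaths: ['/blog/**'],   // only crawl blog pages
 *       excludePaths: ['/admin/*'],   // never crawl admin pages
 *     }
 *   );
 *   // The returned object identifies the crawl job; pass its id to
 *   // getCrawlRequest (below) to poll for the result.
 */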
/**
 * Get the result of a crawl job by ID.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key
 * @param {string} crawlId - The crawl job ID
 * @param {Object} [options] - Optional request parameters
 * @param {boolean} [options.mock] - Override mock mode for this request
 * @returns {Promise<Object>} The crawl result
 * @throws {Error} Throws an error if the HTTP request fails
 */
export async function getCrawlRequest(apiKey, crawlId, options = {}) {
  const { mock = null } = options;

  // Check if mock mode is enabled
  const useMock = mock !== null ? mock : isMockEnabled();
  if (useMock) {
    console.log('🧪 Mock mode active. Returning stub for getCrawlRequest');
    const mockConfig = getMockConfig();
    const mockData = getMockResponse(
      'GET',
      `https://api.scrapegraphai.com/v1/crawl/${crawlId}`,
      mockConfig.customResponses,
      mockConfig.customHandler
    );
    return mockData;
  }

  const endpoint = `https://api.scrapegraphai.com/v1/crawl/${crawlId}`;
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
  };

  try {
    const response = await axios.get(endpoint, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}
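
/*
 * Example polling loop (a sketch; the 'id' and 'status' field names on the
 * responses are assumptions about the API payload, not guaranteed by this module):
 *
 *   const crawlId = job.id;
 *   let result = await getCrawlRequest('your-api-key', crawlId);
 *   while (result.status === 'pending' || result.status === 'processing') {
 *     await new Promise((resolve) => setTimeout(resolve, 5000)); // wait 5 seconds between polls
 *     result = await getCrawlRequest('your-api-key', crawlId);
 *   }
 *   console.log(result);
 */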