import os
from typing import Any, Dict, List, Optional

from firecrawl import FirecrawlApp

# The client reads its key from the FIRECRAWL_API_KEY environment variable.
app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))


def web_scrape(url: str):
    """
    Scrape a URL and return its contents as markdown. Use this to read a single page;
    use web_crawl only if you also need to read the pages it links to.
    """
    scrape_result = app.scrape_url(url, params={'formats': ['markdown']})
    return scrape_result
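
# A minimal usage sketch (placeholder URL). With the 'markdown' format requested
# above, the scraped text is returned under a 'markdown' key, though the exact
# result shape depends on the Firecrawl SDK version:
#
#     page = web_scrape('https://example.com')
#     print(page.get('markdown'))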


def web_crawl(url: str):
    """
    Scrape a URL and crawl through other links from that page, scraping their contents.
    This tool returns a crawl_id that you will need to use, after waiting for a period
    of time, to retrieve the final contents. You should attempt to accomplish another
    task while waiting for the crawl to complete.
    Crawl ignores sublinks of a page if they aren't children of the URL you provide.
    So website.com/other-parent/blog-1 wouldn't be returned if you crawled website.com/blogs/.
    """
    crawl_status = app.crawl_url(
        url, params={'limit': 100, 'scrapeOptions': {'formats': ['markdown']}}, poll_interval=30
    )
    return crawl_status


def retrieve_web_crawl(crawl_id: str):
    """
    Retrieve the results of a previously started web crawl. Crawls take time to process,
    so be sure to only use this tool some time after initiating a crawl. The result
    will tell you if the crawl is finished. If it is not, wait some more time, then try again.
    """
    return app.check_crawl_status(crawl_id)
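
# Sketch of the intended crawl-then-retrieve workflow (placeholder URL). Note that
# crawl_url with a poll_interval may block and poll until the crawl finishes in some
# SDK versions; when it instead returns early, the response carries the job id:
#
#     job = web_crawl('https://example.com/blogs/')
#     crawl_id = job.get('id')  # work on another task while the crawl runs
#     result = retrieve_web_crawl(crawl_id)
#     if result.get('status') == 'completed':
#         pages = result.get('data', [])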


def batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
    """
    Batch scrape multiple URLs simultaneously.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats (e.g., ['markdown', 'html'])

    Returns:
        Dictionary containing the batch scrape results
    """
    batch_result = app.batch_scrape_urls(urls, {'formats': formats})
    return batch_result


def async_batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
    """
    Asynchronously batch scrape multiple URLs.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats (e.g., ['markdown', 'html'])

    Returns:
        Dictionary containing the job ID and status URL
    """
    batch_job = app.async_batch_scrape_urls(urls, {'formats': formats})
    return batch_job


def check_batch_status(job_id: str):
    """
    Check the status of an asynchronous batch scrape job.

    Args:
        job_id: The ID of the batch scrape job

    Returns:
        Dictionary containing the current status and results if completed
    """
    return app.check_batch_scrape_status(job_id)
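
# Sketch of the async batch workflow (placeholder URLs). In the Firecrawl v1 API,
# an async job response carries an 'id' that can be polled until completion:
#
#     job = async_batch_scrape(['https://example.com/a', 'https://example.com/b'])
#     status = check_batch_status(job.get('id'))
#     if status.get('status') == 'completed':
#         documents = status.get('data', [])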


def extract_data(urls: List[str], schema: Optional[Dict[str, Any]] = None,
                 prompt: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract structured data from URLs using LLMs.

    Args:
        urls: List of URLs to extract data from
        schema: Optional JSON schema defining the structure of data to extract
        prompt: Optional natural language prompt describing the data to extract

    Returns:
        Dictionary containing the extracted structured data
    """
    # Prefer the prompt if both are given; otherwise fall back to the schema.
    params: Dict[str, Any] = {}
    if prompt is not None:
        params['prompt'] = prompt
    elif schema is not None:
        params['schema'] = schema
    data = app.extract(urls, params)
    return data
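
# Example call with a hand-written JSON schema (URL and fields are illustrative;
# extract_data also accepts a free-form prompt instead of a schema):
#
#     product_schema = {
#         'type': 'object',
#         'properties': {
#             'title': {'type': 'string'},
#             'price': {'type': 'string'},
#         },
#     }
#     product = extract_data(['https://example.com/product'], schema=product_schema)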


def map_website(url: str, search: Optional[str] = None):
    """
    Map a website to get all URLs, with optional search functionality.

    Args:
        url: The base URL to map
        search: Optional search term to filter URLs

    Returns:
        Dictionary containing the list of discovered URLs
    """
    params = {'search': search} if search else {}
    map_result = app.map_url(url, params)
    return map_result
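
# Example: discover blog-related URLs on a site (placeholder domain and search term;
# the v1 map endpoint returns the discovered URLs under a 'links' key):
#
#     sitemap = map_website('https://example.com', search='blog')
#     urls = sitemap.get('links', [])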


def batch_extract(urls: List[str], extract_params: Dict[str, Any]):
    """
    Batch extract structured data from multiple URLs.

    Args:
        urls: List of URLs to extract data from
        extract_params: Dictionary containing extraction parameters including prompt or schema

    Returns:
        Dictionary containing the extracted data from all URLs
    """
    params = {
        'formats': ['extract'],
        'extract': extract_params
    }
    batch_result = app.batch_scrape_urls(urls, params)
    return batch_result
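
# Example: pull the same fields from several pages in one call (placeholder URLs;
# the prompt is forwarded through the 'extract' scrape option set above):
#
#     results = batch_extract(
#         ['https://example.com/a', 'https://example.com/b'],
#         {'prompt': 'Extract the article title and author name.'},
#     )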