import os
from typing import Any, Dict, List, Optional

from firecrawl import FirecrawlApp

# The client reads its key from the FIRECRAWL_API_KEY environment variable.
app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))


def web_scrape(url: str):
    """
    Scrape a URL and return its contents as markdown. Use this to read a single page;
    use web_crawl only if you also need to read the pages it links to.
    """
    scrape_result = app.scrape_url(url, params={'formats': ['markdown']})
    return scrape_result
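
# A minimal usage sketch (placeholder URL). With the 'markdown' format requested
# above, the scraped text is returned under a 'markdown' key, though the exact
# result shape depends on the Firecrawl SDK version:
#
#     page = web_scrape('https://example.com')
#     print(page.get('markdown'))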


def web_crawl(url: str):
    """
    Scrape a URL and crawl through other links from that page, scraping their contents.
    This tool returns a crawl_id that you will need to use, after waiting for a period
    of time, to retrieve the final contents. You should attempt to accomplish another
    task while waiting for the crawl to complete.
    Crawl ignores sublinks of a page if they aren't children of the URL you provide.
    So website.com/other-parent/blog-1 wouldn't be returned if you crawled website.com/blogs/.
    """
    crawl_status = app.crawl_url(
        url, params={'limit': 100, 'scrapeOptions': {'formats': ['markdown']}}, poll_interval=30
    )
    return crawl_status


def retrieve_web_crawl(crawl_id: str):
    """
    Retrieve the results of a previously started web crawl. Crawls take time to process,
    so be sure to only use this tool some time after initiating a crawl. The result
    will tell you if the crawl is finished. If it is not, wait some more time, then try again.
    """
    return app.check_crawl_status(crawl_id)
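
# Sketch of the intended crawl-then-retrieve workflow (placeholder URL). Note that
# crawl_url with a poll_interval may block and poll until the crawl finishes in some
# SDK versions; when it instead returns early, the response carries the job id:
#
#     job = web_crawl('https://example.com/blogs/')
#     crawl_id = job.get('id')  # work on another task while the crawl runs
#     result = retrieve_web_crawl(crawl_id)
#     if result.get('status') == 'completed':
#         pages = result.get('data', [])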


def batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
    """
    Batch scrape multiple URLs simultaneously.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats (e.g., ['markdown', 'html'])

    Returns:
        Dictionary containing the batch scrape results
    """
    batch_result = app.batch_scrape_urls(urls, {'formats': formats})
    return batch_result


def async_batch_scrape(urls: List[str], formats: List[str] = ['markdown', 'html']):
    """
    Asynchronously batch scrape multiple URLs.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats (e.g., ['markdown', 'html'])

    Returns:
        Dictionary containing the job ID and status URL
    """
    batch_job = app.async_batch_scrape_urls(urls, {'formats': formats})
    return batch_job


def check_batch_status(job_id: str):
    """
    Check the status of an asynchronous batch scrape job.

    Args:
        job_id: The ID of the batch scrape job

    Returns:
        Dictionary containing the current status and results if completed
    """
    return app.check_batch_scrape_status(job_id)
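
# Sketch of the async batch workflow (placeholder URLs). In the Firecrawl v1 API,
# an async job response carries an 'id' that can be polled until completion:
#
#     job = async_batch_scrape(['https://example.com/a', 'https://example.com/b'])
#     status = check_batch_status(job.get('id'))
#     if status.get('status') == 'completed':
#         documents = status.get('data', [])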


def extract_data(urls: List[str], schema: Optional[Dict[str, Any]] = None,
                 prompt: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract structured data from URLs using LLMs.

    Args:
        urls: List of URLs to extract data from
        schema: Optional JSON schema defining the structure of data to extract
        prompt: Optional natural language prompt describing the data to extract

    Returns:
        Dictionary containing the extracted structured data
    """
    # Prefer the prompt if both are given; otherwise fall back to the schema.
    params: Dict[str, Any] = {}
    if prompt is not None:
        params['prompt'] = prompt
    elif schema is not None:
        params['schema'] = schema
    data = app.extract(urls, params)
    return data
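
# Example call with a hand-written JSON schema (URL and fields are illustrative;
# extract_data also accepts a free-form prompt instead of a schema):
#
#     product_schema = {
#         'type': 'object',
#         'properties': {
#             'title': {'type': 'string'},
#             'price': {'type': 'string'},
#         },
#     }
#     product = extract_data(['https://example.com/product'], schema=product_schema)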


def map_website(url: str, search: Optional[str] = None):
    """
    Map a website to get all URLs, with optional search functionality.

    Args:
        url: The base URL to map
        search: Optional search term to filter URLs

    Returns:
        Dictionary containing the list of discovered URLs
    """
    params = {'search': search} if search else {}
    map_result = app.map_url(url, params)
    return map_result
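
# Example: discover blog-related URLs on a site (placeholder domain and search term;
# the v1 map endpoint returns the discovered URLs under a 'links' key):
#
#     sitemap = map_website('https://example.com', search='blog')
#     urls = sitemap.get('links', [])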


def batch_extract(urls: List[str], extract_params: Dict[str, Any]):
    """
    Batch extract structured data from multiple URLs.

    Args:
        urls: List of URLs to extract data from
        extract_params: Dictionary containing extraction parameters including prompt or schema

    Returns:
        Dictionary containing the extracted data from all URLs
    """
    params = {
        'formats': ['extract'],
        'extract': extract_params
    }
    batch_result = app.batch_scrape_urls(urls, params)
    return batch_result
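
# Example: pull the same fields from several pages in one call (placeholder URLs;
# the prompt is forwarded through the 'extract' scrape option set above):
#
#     results = batch_extract(
#         ['https://example.com/a', 'https://example.com/b'],
#         {'prompt': 'Extract the article title and author name.'},
#     )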