-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraping_with_agentsql.py
More file actions
61 lines (38 loc) · 1.18 KB
/
scraping_with_agentsql.py
File metadata and controls
61 lines (38 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import agentql
from playwright.sync_api import sync_playwright
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
def scrape_agentql(playwright):
    """Scrape product titles and prices from the first five Costco candy pages.

    Uses AgentQL's natural-language-style query API on top of a Playwright
    Chrome session to extract structured product data.

    Parameters
    ----------
    playwright :
        The object yielded by ``sync_playwright()``.

    Returns
    -------
    list[dict]
        One dict per product with ``title`` and ``price`` keys, accumulated
        across pages 0-4 (24 products per page).
    """
    page_nr = 0
    # Launch a persistent Chrome profile so cookies/logins survive between runs.
    # NOTE: raw string is required — "C:\playwright" contains "\p", an invalid
    # escape sequence that warns on Python 3.12+ and will eventually be an error.
    browser = playwright.chromium.launch_persistent_context(
        user_data_dir=r"C:\playwright",
        channel="chrome",
        headless=False,   # visible browser helps avoid bot detection
        no_viewport=True,
    )
    data = []
    page = agentql.wrap(browser.new_page())
    # Describe the desired fields in AgentQL's declarative query language.
    # Hoisted out of the loop: the query is identical for every page.
    QUERY = """
    {
        products[] {
            title
            price
        }
    }
    """
    while page_nr <= 4:
        # Paginate via query-string parameters rather than clicking "next".
        page.goto(f"https://www.costco.com/candy.html?currentPage={page_nr}&pageSize=24")
        # query_data returns structured data matching QUERY from the live page.
        response = page.query_data(QUERY)
        data.extend(response['products'])
        page_nr += 1
    return data
if __name__ == "__main__":
    # Guard the entry point so importing this module does not launch a browser.
    with sync_playwright() as playwright:
        products = scrape_agentql(playwright)
    # Persist the scraped products to an Excel workbook (requires openpyxl).
    df = pd.DataFrame(products)
    df.to_excel("agentql_products.xlsx", index=False)