-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraping_with_agentsql.py
More file actions
61 lines (38 loc) · 1.18 KB
/
scraping_with_agentsql.py
File metadata and controls
61 lines (38 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import agentql
from playwright.sync_api import sync_playwright
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
def scrape_agentql(playwright):
    """Scrape product titles and prices from the first five Costco candy pages.

    Uses AgentQL's natural-language-style query API on top of a Playwright
    Chrome session to extract structured product data.

    Parameters
    ----------
    playwright :
        The object yielded by ``sync_playwright()``.

    Returns
    -------
    list[dict]
        One dict per product with ``title`` and ``price`` keys, accumulated
        across pages 0-4 (24 products per page).
    """
    page_nr = 0
    # Launch a persistent Chrome profile so cookies/logins survive between runs.
    # NOTE: raw string is required — "C:\playwright" contains "\p", an invalid
    # escape sequence that warns on Python 3.12+ and will eventually be an error.
    browser = playwright.chromium.launch_persistent_context(
        user_data_dir=r"C:\playwright",
        channel="chrome",
        headless=False,   # visible browser helps avoid bot detection
        no_viewport=True,
    )
    data = []
    page = agentql.wrap(browser.new_page())
    # Describe the desired fields in AgentQL's declarative query language.
    # Hoisted out of the loop: the query is identical for every page.
    QUERY = """
    {
        products[] {
            title
            price
        }
    }
    """
    while page_nr <= 4:
        # Paginate via query-string parameters rather than clicking "next".
        page.goto(f"https://www.costco.com/candy.html?currentPage={page_nr}&pageSize=24")
        # query_data returns structured data matching QUERY from the live page.
        response = page.query_data(QUERY)
        data.extend(response['products'])
        page_nr += 1
    return data
if __name__ == "__main__":
    # Guard the entry point so importing this module does not launch a browser.
    with sync_playwright() as playwright:
        products = scrape_agentql(playwright)
    # Persist the scraped products to an Excel workbook (requires openpyxl).
    df = pd.DataFrame(products)
    df.to_excel("agentql_products.xlsx", index=False)