# patchright_scraping_indeed.py — from the ThomasJanssen-tech/Playwright-Tutorial repository (main branch)
from patchright.sync_api import sync_playwright
import pandas as pd
import time

def _first_text_by_test_id(page, test_id):
    """Return the inner text of the first element carrying *test_id*, or "" if none."""
    matches = page.get_by_test_id(test_id)
    if matches.count() == 0:
        return ""
    # ``.first`` avoids Playwright's strict-mode error when several elements
    # share the same test id.
    return matches.first.inner_text()


def scrape_indeed(playwright):
    """Scrape "python developer" vacancies from Indeed.

    Launches a persistent, headed Chrome context, walks the first two
    search-result pages to collect each vacancy's title and URL, then visits
    every vacancy's detail page to read company name, location and salary
    information.

    Args:
        playwright: The object yielded by ``sync_playwright()``.

    Returns:
        A list of dicts, one per vacancy, with the keys ``Title``, ``URL``,
        ``CompanyName``, ``Location`` and ``Salaryinfo``. Fields that are not
        found on the detail page are left as empty strings.
    """
    browser = playwright.chromium.launch_persistent_context(
        # Raw string: "\p" is not a valid escape sequence in a normal literal.
        user_data_dir=r"C:\playwright",
        channel="chrome",
        headless=False,
        no_viewport=True,
    )

    # Always close the browser context, even if a navigation or selector
    # lookup raises part-way through the scrape.
    try:
        page = browser.new_page()

        jobs = []

        for page_index in range(2):
            print("SCRAPING LIST ITEMS")

            # Indeed paginates search results with a ``start`` offset in steps of 10.
            page.goto(
                'https://www.indeed.com/jobs?q=python+developer&start='
                + str(page_index * 10)
            )

            # Fixed pause to let the result list render before querying it.
            time.sleep(10)

            vacancies = page.locator('.cardOutline')

            for vacancy in vacancies.element_handles():
                heading = vacancy.query_selector("h2")
                link = vacancy.query_selector("a")
                href = link.get_attribute("href") if link is not None else None

                # Skip cards without a title or link instead of crashing the
                # whole run on an AttributeError/TypeError.
                if heading is None or href is None:
                    continue

                jobs.append({
                    'Title': heading.inner_text(),
                    'URL': "https://www.indeed.com" + href,
                })

        all_items = []

        for job in jobs:
            print("SCRAPING DETAILS PAGE")

            page.goto(job['URL'])

            # Short pause to let the detail page render.
            time.sleep(2)

            all_items.append({
                "Title": job['Title'],
                "URL": job["URL"],
                "CompanyName": _first_text_by_test_id(
                    page, "inlineHeader-companyName"
                ),
                "Location": _first_text_by_test_id(
                    page, "inlineHeader-companyLocation"
                ),
                "Salaryinfo": _first_text_by_test_id(
                    page, "jobsearch-OtherJobDetailsContainer"
                ),
            })

        return all_items
    finally:
        browser.close()


# Script entry: run the scraper inside a managed Playwright session and export
# the collected vacancies to an Excel workbook in the working directory.
with sync_playwright() as playwright:
    jobs = scrape_indeed(playwright)

    # One spreadsheet row per vacancy; index=False keeps the pandas row index
    # out of the sheet.
    df = pd.DataFrame(data=jobs)
    df.to_excel(excel_writer="jobs.xlsx", index=False)