-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpatchright_scraping_indeed.py
More file actions
93 lines (51 loc) · 1.95 KB
/
patchright_scraping_indeed.py
File metadata and controls
93 lines (51 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import time
from urllib.parse import urlencode, urljoin

import pandas as pd
from patchright.sync_api import sync_playwright
_INDEED_BASE_URL = "https://www.indeed.com"

# (output column, data-testid on the job details page), in output order.
_DETAIL_FIELDS = (
    ("CompanyName", "inlineHeader-companyName"),
    ("Location", "inlineHeader-companyLocation"),
    ("Salaryinfo", "jobsearch-OtherJobDetailsContainer"),
)


def _collect_listings(page, query, max_pages):
    """Return a ``{"Title", "URL"}`` dict for each job card on the result pages."""
    listings = []
    for page_index in range(max_pages):
        print("SCRAPING LIST ITEMS")
        # The ``start`` offset advances by 10 per result page.
        params = urlencode({"q": query, "start": page_index * 10})
        page.goto(f"{_INDEED_BASE_URL}/jobs?{params}")
        # Fixed pause before reading the page -- presumably to let the
        # results finish rendering; TODO confirm and replace with an
        # explicit wait if that is the only purpose.
        time.sleep(10)
        for card in page.locator(".cardOutline").element_handles():
            heading = card.query_selector("h2")
            link = card.query_selector("a")
            href = link.get_attribute("href") if link is not None else None
            if heading is None or not href:
                # query_selector/get_attribute return None when nothing
                # matches; skip such cards instead of crashing the run.
                continue
            listings.append(
                {
                    "Title": heading.inner_text(),
                    # urljoin copes with both relative and absolute hrefs.
                    "URL": urljoin(_INDEED_BASE_URL, href),
                }
            )
    return listings


def _scrape_details(page, job):
    """Open one job's details page and return its output row."""
    print("SCRAPING DETAILS PAGE")
    page.goto(job["URL"])
    time.sleep(2)
    item = {"Title": job["Title"], "URL": job["URL"]}
    for field, test_id in _DETAIL_FIELDS:
        matches = page.get_by_test_id(test_id)
        # ``.first`` avoids a strict-mode error when the test id matches
        # more than one element; a missing element yields an empty string.
        item[field] = matches.first.inner_text() if matches.count() > 0 else ""
    return item


def scrape_indeed(playwright, query="python developer", max_pages=2,
                  user_data_dir=r"C:\playwright"):
    """Scrape Indeed search results and the details page of every hit.

    Launches a headed Chrome persistent context, walks ``max_pages`` result
    pages for ``query`` to collect job titles and links, then visits each
    link to read company name, location and salary information.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
        query: Search phrase sent as the ``q`` URL parameter.
        max_pages: Number of result pages to walk.
        user_data_dir: Browser profile directory for the persistent context.

    Returns:
        A list of dicts with the keys ``Title``, ``URL``, ``CompanyName``,
        ``Location`` and ``Salaryinfo``; fields not found on the details
        page are empty strings.
    """
    context = playwright.chromium.launch_persistent_context(
        user_data_dir=user_data_dir,
        channel="chrome",
        headless=False,
        no_viewport=True,
    )
    try:
        page = context.new_page()
        listings = _collect_listings(page, query, max_pages)
        return [_scrape_details(page, job) for job in listings]
    finally:
        # Close the browser even when a navigation or locator call raises.
        context.close()
# Script entry: run the scraper inside a managed Playwright session and
# write the collected rows to an Excel workbook (no index column).
with sync_playwright() as pw:
    scraped_jobs = scrape_indeed(pw)
    pd.DataFrame(scraped_jobs).to_excel("jobs.xlsx", index=False)