# glassdoor_scraper.py
from bs4 import BeautifulSoup
from seleniumbase import SB
import re
import traceback


def add_value(data, key, value):
    """Append value under key, converting an existing scalar entry to a list."""
    if key in data:
        if isinstance(data[key], list):
            data[key].append(value)
        else:
            data[key] = [data[key], value]
    else:
        data[key] = value
    return data
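
# A minimal sketch of how add_value accumulates fields (hypothetical data):
#   row = {"j1": "Engineer"}
#   add_value(row, "j1", "Acme")    # -> {"j1": ["Engineer", "Acme"]}
#   add_value(row, "j1", "Berlin")  # -> {"j1": ["Engineer", "Acme", "Berlin"]}
#   add_value(row, "j2", "Analyst") # -> new key, stored as a plain scalar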


# Persistent browser profile so cookie-consent and session state survive runs.
USER_DATA_DIR = "C:/Users/pmail/PycharmProjects/Job_Scraper/Job_Scraper/data/browser_profile/profile_glassdoor"


def glassdoor_scraper(url):
    my_dict = {}
    try:
        with SB(uc=True, headless=True, pls="eager", user_data_dir=USER_DATA_DIR) as sb:
            sb.activate_cdp_mode(url)
            sb.cdp.sleep(5)
            # Dismiss the cookie-consent banner if it appears.
            sb.click_if_visible('button[id="onetrust-accept-btn-handler"]', timeout=5)
            # Keep clicking "Load more" until the full result list is rendered,
            # closing any sign-in modals that pop up along the way.
            while sb.cdp.is_element_visible('button[data-test="load-more"]'):
                sb.click_if_visible('button[class="CloseButton"]', timeout=0.1)
                sb.click_if_visible('button[data-test="auth-modal-close-button"]', timeout=0.1)
                sb.click_if_visible('button[data-test="load-more"]')
                sb.cdp.sleep(2)
            html = sb.cdp.get_page_source()
            soup = BeautifulSoup(html, "html.parser")
            all_jobs = soup.find_all("li", {"data-test": "jobListing"})
            if not all_jobs:
                raise Exception("No jobs found")
            last_id = None
            for job in all_jobs:
                id_ = job.get("data-jobid")
                title = job.find("a", {"data-test": "job-title"})
                name = title.text.strip().replace('"', '')
                link = title.get("href")
                # Close any overlays first, or the click below may be intercepted.
                sb.click_if_visible('button[class="CloseButton"]', timeout=0.1)
                sb.click_if_visible('button[data-test="auth-modal-close-button"]', timeout=0.1)
                sb.click_if_visible('button[id="onetrust-accept-btn-handler"]', timeout=0.1)
                # Select the listing so its full description renders in the detail panel.
                sb.click(f'li[data-jobid="{id_}"]')
                html = sb.cdp.get_page_source()
                soup = BeautifulSoup(html, "html.parser")
                description = soup.select(f'div[class^="JobDetails_jobDescription"][data-brandviews*="{id_}"]')
                company_name = soup.find("h4", {"class": re.compile(r"heading_Subhead")}).text.strip()
                location = sb.cdp.find_element('div[data-test="location"]')
                # The description sometimes fails to render, so toggle to the previous
                # listing and back to force a reload. Note that soup.select() returns
                # a list, never None, so the emptiness check is "not description".
                while not description:
                    if last_id is not None:
                        sb.click(f'li[data-jobid="{last_id}"]')
                        sb.cdp.sleep(1)
                    sb.click(f'li[data-jobid="{id_}"]')
                    sb.cdp.sleep(1)
                    html = sb.cdp.get_page_source()
                    soup = BeautifulSoup(html, "html.parser")
                    description = soup.select(f'div[class^="JobDetails_jobDescription"][data-brandviews*="{id_}"]')
                desc = "".join(d.text.strip() for d in description)
                # Store the job title as a spreadsheet HYPERLINK formula, then append
                # the remaining fields so each id maps to
                # [hyperlink, company, location, description].
                my_dict.update({id_: f'=HYPERLINK("{link}","{name}")'})
                add_value(my_dict, id_, company_name)
                add_value(my_dict, id_, location.text.strip())
                add_value(my_dict, id_, desc)
                last_id = id_
    except Exception as e:
        my_dict.clear()
        my_dict.update({'ERROR in glassdoor script': str(e)})
        print(traceback.format_exc())
    return my_dict
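

# A minimal usage sketch, assuming a hypothetical Glassdoor search URL; the real
# entry point that calls this scraper lives elsewhere in the project.
if __name__ == "__main__":
    search_url = "https://www.glassdoor.com/Job/jobs.htm?sc.keyword=python"  # hypothetical
    jobs = glassdoor_scraper(search_url)
    for job_id, fields in jobs.items():
        print(job_id, fields)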