# indeed_scraper.py
import traceback

from bs4 import BeautifulSoup
from seleniumbase import sb_cdp


def add_value(data, key, value):
    """Collect value under key, promoting an existing scalar to a list."""
    if key in data:
        if isinstance(data[key], list):
            data[key].append(value)
        else:
            data[key] = [data[key], value]
    else:
        data[key] = value
    return data
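
# Example behavior (illustrative):
#   row = {}
#   add_value(row, "jk1", "Job title")  # {"jk1": "Job title"}
#   add_value(row, "jk1", "Company")    # {"jk1": ["Job title", "Company"]}
#   add_value(row, "jk1", "Location")   # {"jk1": ["Job title", "Company", "Location"]}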


def indeed_scraper(url):
    my_dict = {}
    sb = None
    try:
        # I couldn't figure out why, but this code does not work in headless mode.
        sb = sb_cdp.Chrome(url)
        sb.sleep(2)
        sb.solve_captcha()
        sb.sleep(2)
        # Dismiss the cookie banner and the Google sign-in modal if they appear.
        sb.click_if_visible('button[id="onetrust-accept-btn-handler"]')
        sb.click_if_visible('button[onclick="closeGoogleOnlyModal()"]')
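        # One iteration per results page; the loop exits when no "next page"
        # link remains (see the break further down).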
while True:
if sb.is_element_present('main[class="error"]'):
sb.click_captcha()
sb.sleep(2)
sb.click_if_visible('button[id="onetrust-accept-btn-handler"]')
sb.click_if_visible('button[onclick="closeGoogleOnlyModal()"]')
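            # Parse the rendered page; each job card anchor carries its job id
            # in the data-jk attribute.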
html = sb.get_page_source()
soup = BeautifulSoup(html, "html.parser")
all_job = soup.select('a[data-jk]')
all_company = soup.select('span[data-testid="company-name"]')
all_locations = soup.select('div[data-testid="text-location"]')
            for i in range(len(all_job)):
                id_ = all_job[i].get('data-jk')
                jobtitle = all_job[i].select_one('span[id^="jobTitle"]').get('id')
                # Skip anchors whose jobTitle span id does not match the
                # card's data-jk attribute.
                if jobtitle.replace('jobTitle-', '') != id_:
                    continue
                name = all_job[i].get_text().strip()
                company_name = all_company[i].get_text().strip()
                location_name = all_locations[i].get_text().strip()
                my_dict[id_] = f'=HYPERLINK("https://ch-fr.indeed.com/viewjob?jk={id_}","{name}")'
                add_value(my_dict, id_, company_name)
                add_value(my_dict, id_, location_name)
            # This is not the loop condition because, in some cases, there is
            # no next page at all.
            if not sb.is_element_present('a[data-testid="pagination-page-next"]'):
                break
            sb.click_if_visible('a[data-testid="pagination-page-next"]')
            sb.sleep(1)
            # If Indeed redirects to its sign-in form, back out, restart the
            # browser on the current URL, and retry the next-page link.
            while sb.is_element_present('div[class^="pass-FormContent"]'):
                sb.go_back()
                sb.sleep(1)
                url = sb.get_current_url()
                sb.sleep(1)
                sb.driver.quit()
                sb.sleep(1)
                sb = sb_cdp.Chrome(url)
                sb.sleep(1)
                sb.click_if_visible('a[data-testid="pagination-page-next"]')
                sb.sleep(1)
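            # Wait for the next results page to finish rendering before
            # scraping it.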
while not sb.is_element_present('div[class^="jobsearch-RightPane"]'):
sb.sleep(0.1)
if sb.is_element_present('main[class="error"]'):
sb.click_captcha()
sb.sleep(2)
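        # If nothing was scraped, surface that as an error instead of quietly
        # returning an empty dict.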
if not my_dict:
raise Exception("No jobs found")
        # I tried parallelizing this part, but unlike the LinkedIn code it did
        # not speed up data extraction. If you really want to parallelize it,
        # multiprocessing is preferable to multithreading (see the sketch
        # after this function).
        for id_ in my_dict:
            sb.open(f"https://ch-fr.indeed.com/viewjob?jk={id_}")
            while not sb.is_element_present('div[id="jobDescriptionText"]'):
                sb.sleep(0.1)
                if sb.is_element_present('main[class="error"]'):
                    sb.click_captcha()
                    sb.sleep(2)
                if sb.is_element_present('main[class="error-wrapper"]'):
                    # The job page could not be loaded; report it and give up.
                    print(my_dict.get(id_))
                    break
            html = sb.get_page_source()
            soup = BeautifulSoup(html, "html.parser")
            description = soup.select_one('div[id="jobDescriptionText"]')
            # Guard against pages where the description never loaded.
            if description is not None:
                add_value(my_dict, id_, description.text.strip())
        for job in my_dict.values():
            print(job)
    except Exception as e:
        my_dict.clear()
        my_dict.update({'ERROR in indeed script': str(e)})
        print(traceback.format_exc())
    finally:
        # Only stop the browser if it was actually started.
        if sb:
            sb.driver.stop()
    return my_dict
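

# The comment above notes that multiprocessing is preferable to multithreading
# for this kind of work, even though parallelizing did not pay off here. Below
# is a minimal sketch of that idea, assuming each worker process opens its own
# browser; fetch_description and fetch_descriptions_parallel are hypothetical
# helpers, not part of the scraper above.
from multiprocessing import Pool


def fetch_description(id_):
    """Open a fresh browser, fetch one job description, return (id, text)."""
    sb = sb_cdp.Chrome(f"https://ch-fr.indeed.com/viewjob?jk={id_}")
    try:
        sb.sleep(2)
        soup = BeautifulSoup(sb.get_page_source(), "html.parser")
        description = soup.select_one('div[id="jobDescriptionText"]')
        return id_, description.text.strip() if description else None
    finally:
        sb.driver.stop()


def fetch_descriptions_parallel(ids, workers=4):
    """Fetch several job descriptions in parallel, one process per browser."""
    with Pool(workers) as pool:
        return dict(pool.map(fetch_description, ids))


# Minimal usage sketch; the query URL is a hypothetical example of an Indeed
# search on ch-fr.indeed.com, not one taken from the original script.
if __name__ == "__main__":
    jobs = indeed_scraper("https://ch-fr.indeed.com/jobs?q=python")
    print(f"{len(jobs)} rows collected")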