# indeed_scraper.py
import traceback

from bs4 import BeautifulSoup
from seleniumbase import sb_cdp


def add_value(data, key, value):
    """Collect value under key, promoting an existing scalar to a list."""
    if key in data:
        if isinstance(data[key], list):
            data[key].append(value)
        else:
            data[key] = [data[key], value]
    else:
        data[key] = value
    return data
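
# Example behavior (illustrative):
#   row = {}
#   add_value(row, "jk1", "Job title")  # {"jk1": "Job title"}
#   add_value(row, "jk1", "Company")    # {"jk1": ["Job title", "Company"]}
#   add_value(row, "jk1", "Location")   # {"jk1": ["Job title", "Company", "Location"]}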


def indeed_scraper(url):
    my_dict = {}
    sb = None
    try:
        # I couldn't figure out why, but this code does not work in headless mode.
        sb = sb_cdp.Chrome(url)
        sb.sleep(2)
        sb.solve_captcha()
        sb.sleep(2)
        # Dismiss the cookie banner and the Google sign-in modal if they appear.
        sb.click_if_visible('button[id="onetrust-accept-btn-handler"]')
        sb.click_if_visible('button[onclick="closeGoogleOnlyModal()"]')
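        # One iteration per results page; the loop exits when no "next page"
        # link remains (see the break further down).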
while True:
if sb.is_element_present('main[class="error"]'):
sb.click_captcha()
sb.sleep(2)
sb.click_if_visible('button[id="onetrust-accept-btn-handler"]')
sb.click_if_visible('button[onclick="closeGoogleOnlyModal()"]')
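            # Parse the rendered page; each job card anchor carries its job id
            # in the data-jk attribute.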
html = sb.get_page_source()
soup = BeautifulSoup(html, "html.parser")
all_job = soup.select('a[data-jk]')
all_company = soup.select('span[data-testid="company-name"]')
all_locations = soup.select('div[data-testid="text-location"]')
            for i in range(len(all_job)):
                id_ = all_job[i].get('data-jk')
                jobtitle = all_job[i].select_one('span[id^="jobTitle"]').get('id')
                # Skip anchors whose jobTitle span id does not match the
                # card's data-jk attribute.
                if jobtitle.replace('jobTitle-', '') != id_:
                    continue
                name = all_job[i].get_text().strip()
                company_name = all_company[i].get_text().strip()
                location_name = all_locations[i].get_text().strip()
                my_dict[id_] = f'=HYPERLINK("https://ch-fr.indeed.com/viewjob?jk={id_}","{name}")'
                add_value(my_dict, id_, company_name)
                add_value(my_dict, id_, location_name)
            # This is not the loop condition because, in some cases, there is
            # no next page at all.
            if not sb.is_element_present('a[data-testid="pagination-page-next"]'):
                break
            sb.click_if_visible('a[data-testid="pagination-page-next"]')
            sb.sleep(1)
            # If Indeed redirects to its sign-in form, back out, restart the
            # browser on the current URL, and retry the next-page link.
            while sb.is_element_present('div[class^="pass-FormContent"]'):
                sb.go_back()
                sb.sleep(1)
                url = sb.get_current_url()
                sb.sleep(1)
                sb.driver.quit()
                sb.sleep(1)
                sb = sb_cdp.Chrome(url)
                sb.sleep(1)
                sb.click_if_visible('a[data-testid="pagination-page-next"]')
                sb.sleep(1)
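            # Wait for the next results page to finish rendering before
            # scraping it.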
while not sb.is_element_present('div[class^="jobsearch-RightPane"]'):
sb.sleep(0.1)
if sb.is_element_present('main[class="error"]'):
sb.click_captcha()
sb.sleep(2)
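        # If nothing was scraped, surface that as an error instead of quietly
        # returning an empty dict.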
if not my_dict:
raise Exception("No jobs found")
        # I tried parallelizing this part, but unlike the LinkedIn code it did
        # not speed up data extraction. If you really want to parallelize it,
        # multiprocessing is preferable to multithreading (see the sketch
        # after this function).
        for id_ in my_dict:
            sb.open(f"https://ch-fr.indeed.com/viewjob?jk={id_}")
            while not sb.is_element_present('div[id="jobDescriptionText"]'):
                sb.sleep(0.1)
                if sb.is_element_present('main[class="error"]'):
                    sb.click_captcha()
                    sb.sleep(2)
                if sb.is_element_present('main[class="error-wrapper"]'):
                    # The job page could not be loaded; report it and give up.
                    print(my_dict.get(id_))
                    break
            html = sb.get_page_source()
            soup = BeautifulSoup(html, "html.parser")
            description = soup.select_one('div[id="jobDescriptionText"]')
            # Guard against pages where the description never loaded.
            if description is not None:
                add_value(my_dict, id_, description.text.strip())
        for job in my_dict.values():
            print(job)
    except Exception as e:
        my_dict.clear()
        my_dict.update({'ERROR in indeed script': str(e)})
        print(traceback.format_exc())
    finally:
        # Only stop the browser if it was actually started.
        if sb:
            sb.driver.stop()
    return my_dict
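

# The comment above notes that multiprocessing is preferable to multithreading
# for this kind of work, even though parallelizing did not pay off here. Below
# is a minimal sketch of that idea, assuming each worker process opens its own
# browser; fetch_description and fetch_descriptions_parallel are hypothetical
# helpers, not part of the scraper above.
from multiprocessing import Pool


def fetch_description(id_):
    """Open a fresh browser, fetch one job description, return (id, text)."""
    sb = sb_cdp.Chrome(f"https://ch-fr.indeed.com/viewjob?jk={id_}")
    try:
        sb.sleep(2)
        soup = BeautifulSoup(sb.get_page_source(), "html.parser")
        description = soup.select_one('div[id="jobDescriptionText"]')
        return id_, description.text.strip() if description else None
    finally:
        sb.driver.stop()


def fetch_descriptions_parallel(ids, workers=4):
    """Fetch several job descriptions in parallel, one process per browser."""
    with Pool(workers) as pool:
        return dict(pool.map(fetch_description, ids))


# Minimal usage sketch; the query URL is a hypothetical example of an Indeed
# search on ch-fr.indeed.com, not one taken from the original script.
if __name__ == "__main__":
    jobs = indeed_scraper("https://ch-fr.indeed.com/jobs?q=python")
    print(f"{len(jobs)} rows collected")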