-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
46 lines (39 loc) · 2.63 KB
/
main.py
File metadata and controls
46 lines (39 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import traceback

import pandas as pd

from Job_Scraper.indeed_scraper import indeed_scraper
from linkedin_scraper import *
from jobup_scraper import *
from glassdoor_scraper import *
# TODO : - add new websites like leTemps, jobScout, MichaelPage, etc.
#        - find a way to remove duplicates
#        - add the possibility to work with a single file, so that a job already seen goes straight to the trash
#        - maybe develop a frontend, or not (I'm lazy and it works for me as is)
if __name__ == '__main__':
    # Entry point: scrape job postings, filter them by keyword, and write
    # both the raw and the filtered results to an Excel workbook.
    #
    # To adjust the LinkedIn filters encoded in the URL, see
    # https://gist.github.com/Diegiwg/51c22fa7ec9d92ed9b5d1f537b9e1107?permalink_comment_id=5418613
    # For the other sites, simply copy the search URL from the browser.
    # linkedin_dict = linkedin_scraper("https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Software+Engineer+OR+Embedded+Engineer&location=Switzerland&f_TPR=r86400")
    # jobup_dict = jobup_scraper("https://www.jobup.ch/fr/emplois/?publication-date=1&term=software%20engineer")
    # glassdoor_dict = glassdoor_scraper("https://fr.glassdoor.ch/Emploi/software-engineer-emplois-SRCH_KO0,17.htm?fromAge=7")
    indeed_dict = indeed_scraper("https://ch-fr.indeed.com/jobs?q=ing%C3%A9nieur+informatique&l=&fromage=7")
    all_job_dict = indeed_dict

    # Keywords matched (case-insensitively) against the job description.
    # Single-character keywords must be wrapped in a regex like r"\Wc\W" so
    # that "c" only matches as a standalone token (\W = any non-word
    # character such as "*-+=)"); see
    # https://www.rexegg.com/regex-quickstart.php for a regex refresher.
    filters = ["python", "vhdl", r"\Wc\W", "linux", "IOT", "systemverilog"]

    def _write_sheet(writer, sheet_name, frame):
        """Write *frame* to *sheet_name* and apply the shared formatting."""
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name]
        worksheet.autofit()
        worksheet.set_column('A:A', 50)  # widen the Title column
        worksheet.autofilter(0, 0, frame.shape[0], frame.shape[1])

    # Excel part
    try:
        df = pd.DataFrame.from_dict(all_job_dict, orient='index', columns=['Title', 'Company', 'Location', 'Description'])
        filtered_df = df[df['Description'].str.contains('|'.join(filters), case=False, regex=True, na=False)]
        # The context manager guarantees the workbook is closed (and the
        # file flushed to disk) even if formatting fails part-way through —
        # this replaces the manual writer = None / finally / close() dance.
        with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
            _write_sheet(writer, 'job', df)
            _write_sheet(writer, 'job_filtered', filtered_df)
    except Exception:
        # NOTE(review): the original called traceback.format_exc() without
        # importing traceback, so the handler itself raised NameError;
        # `import traceback` is now at the top of the file.
        print(traceback.format_exc())
    print('Job scraping finished')