-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
46 lines (39 loc) · 2.63 KB
/
main.py
File metadata and controls
46 lines (39 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import traceback

import pandas as pd

from Job_Scraper.indeed_scraper import indeed_scraper
from linkedin_scraper import *
from jobup_scraper import *
from glassdoor_scraper import *
# TODO : - add new websites like leTemps, jobScout, MichaelPage, etc.
#        - find a way to remove duplicates
#        - add the possibility to work with a single file, so that a job already seen goes straight to the trash
#        - maybe develop a frontend, or not (I'm lazy and it works for me as is)
if __name__ == '__main__':
    # Entry point: scrape job postings, filter them by keyword, and write
    # both the raw and the filtered results to an Excel workbook.
    #
    # To adjust the LinkedIn filters encoded in the URL, see
    # https://gist.github.com/Diegiwg/51c22fa7ec9d92ed9b5d1f537b9e1107?permalink_comment_id=5418613
    # For the other sites, simply copy the search URL from the browser.
    # linkedin_dict = linkedin_scraper("https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Software+Engineer+OR+Embedded+Engineer&location=Switzerland&f_TPR=r86400")
    # jobup_dict = jobup_scraper("https://www.jobup.ch/fr/emplois/?publication-date=1&term=software%20engineer")
    # glassdoor_dict = glassdoor_scraper("https://fr.glassdoor.ch/Emploi/software-engineer-emplois-SRCH_KO0,17.htm?fromAge=7")
    indeed_dict = indeed_scraper("https://ch-fr.indeed.com/jobs?q=ing%C3%A9nieur+informatique&l=&fromage=7")
    all_job_dict = indeed_dict

    # Keywords matched (case-insensitively) against the job description.
    # Single-character keywords must be wrapped in a regex like r"\Wc\W" so
    # that "c" only matches as a standalone token (\W = any non-word
    # character such as "*-+=)"); see
    # https://www.rexegg.com/regex-quickstart.php for a regex refresher.
    filters = ["python", "vhdl", r"\Wc\W", "linux", "IOT", "systemverilog"]

    def _write_sheet(writer, sheet_name, frame):
        """Write *frame* to *sheet_name* and apply the shared formatting."""
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name]
        worksheet.autofit()
        worksheet.set_column('A:A', 50)  # widen the Title column
        worksheet.autofilter(0, 0, frame.shape[0], frame.shape[1])

    # Excel part
    try:
        df = pd.DataFrame.from_dict(all_job_dict, orient='index', columns=['Title', 'Company', 'Location', 'Description'])
        filtered_df = df[df['Description'].str.contains('|'.join(filters), case=False, regex=True, na=False)]
        # The context manager guarantees the workbook is closed (and the
        # file flushed to disk) even if formatting fails part-way through —
        # this replaces the manual writer = None / finally / close() dance.
        with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
            _write_sheet(writer, 'job', df)
            _write_sheet(writer, 'job_filtered', filtered_df)
    except Exception:
        # NOTE(review): the original called traceback.format_exc() without
        # importing traceback, so the handler itself raised NameError;
        # `import traceback` is now at the top of the file.
        print(traceback.format_exc())
    print('Job scraping finished')