-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathupwork_best_matches_scraper.py
More file actions
229 lines (196 loc) · 9.43 KB
/
upwork_best_matches_scraper.py
File metadata and controls
229 lines (196 loc) · 9.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Copyright (c) 2026 roperi
import os
import sys
import time
from datetime import datetime
import logging
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from utils.job_helpers import parse_job_details
from utils.database import create_db, connect_to_db
from settings import config
# LOGGING
# Module-level logger; the handlers below echo every record (DEBUG and up)
# both to a logfile next to this script and to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Resolve the log directory alongside this script, creating it on first run.
script_dir = os.path.dirname(os.path.abspath(__file__))
log_dir = os.path.join(script_dir, 'log')
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
log_path = os.path.join(log_dir, 'upwork_best_matches_scraper.log')

# One handler per destination, both at DEBUG, sharing a single formatter.
log_format = logging.Formatter('[%(levelname)s. %(name)s, (line #%(lineno)d) - %(asctime)s] %(message)s')
file_handler = logging.FileHandler(log_path)
console_handler = logging.StreamHandler(sys.stdout)
for handler in (file_handler, console_handler):
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(log_format)
    logger.addHandler(handler)
# FUNCTIONS
def get_driver_with_retry(chrome_versions, max_attempts=3):
    """
    Launch an undetected Chrome driver, retrying across versions and attempts.

    Args:
        chrome_versions (iterable[int]): Chrome major versions to try, in order.
        max_attempts (int): Number of full passes over ``chrome_versions``.

    Returns:
        uc.Chrome | None: A live driver on success, or None when every
        version failed on every attempt.
    """
    for attempt in range(max_attempts):
        for chrome_version in chrome_versions:
            logger.info(f'Trying with Chrome version {chrome_version}')
            try:
                logger.info(f'Attempt #{attempt+1}/{max_attempts}')
                options = uc.ChromeOptions()
                # Run with a visible window (NOTE(review): presumably because
                # headless sessions are detected/blocked by the site — confirm).
                options.headless = False
                return uc.Chrome(options=options, version_main=chrome_version)
            except Exception as e:
                # Include the actual launch error — the original bound `e` but
                # never logged it, so the failure reason was lost.
                logger.error(f"Failed to launch Chrome driver with version {chrome_version}: {e}. Retrying...")
    # Only reached when every version failed on every attempt.
    logger.error(f"All attempts failed for all Chrome versions within {max_attempts} attempts. Unable to launch "
                 f"Chrome driver.")
    return None
def main():
    """
    Scrape job postings from the Upwork best-matches page into the database.

    Returns:
        bool: True if the scraping run completed successfully, False otherwise.

    Connects to the database, launches the browser, logs into the site,
    scrolls the page so every posting loads, then parses each job post and
    inserts it into the ``jobs`` table (or, when the job already exists,
    refreshes its proposal count). The driver, cursor and connection are
    always released in the ``finally`` block, even when an earlier step
    raises.
    """
    # Pre-bind resources so the cleanup in ``finally`` never hits a NameError
    # when an earlier step (e.g. connect_to_db or the driver launch) fails.
    conn = cursor = driver = None
    try:
        # Connect to database
        conn, cursor = connect_to_db()
        # Create table (if it does not exist)
        create_db(conn, cursor)
        # Configure the undetected_chromedriver options
        logger.info('Launching driver')
        driver = get_driver_with_retry(chrome_versions=config.CHROME_VERSIONS, max_attempts=config.MAX_ATTEMPTS)
        if driver is None:
            logger.error("Couldn't load driver")
            return False
        # Login
        user_login_page = 'https://www.upwork.com/ab/account-security/login'
        logger.info(f'Navigating to `{user_login_page}`')
        driver.get(user_login_page)
        logger.info('Pausing for windows to fully load')
        time.sleep(25)
        logger.info('Switching to main window')
        all_windows = driver.window_handles
        driver.switch_to.window(all_windows[-1])
        logger.info('Submitting username')
        username_input = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 "/html/body/div[3]/div/div/div/main/div/div/div[2]/div[2]/form/div/div/div[1]/div[3]/div/div/div/"
                 "div/input")
            )
        )
        username_input.send_keys(config.UPWORK_USERNAME)
        # Re-locate the same field before pressing ENTER (NOTE(review):
        # presumably to avoid a stale element reference after typing — confirm).
        username_field = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 "/html/body/div[3]/div/div/div/main/div/div/div[2]/div[2]/form/div/div/div[1]/div[3]/div/div/div/"
                 "div/input")
            )
        )
        username_field.send_keys(Keys.ENTER)
        time.sleep(4)
        logger.info('Submitting password')
        password_input = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 "/html/body/div[3]/div/div/div/main/div/div/div[2]/div[2]/div/form/div/div/div[1]/div[3]/div/div/"
                 "div/input")
            )
        )
        password_input.send_keys(config.UPWORK_PASSWORD)
        password_field = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 "/html/body/div[3]/div/div/div/main/div/div/div[2]/div[2]/div/form/div/div/div[1]/div[3]/div/div/"
                 "div/input")
            )
        )
        password_field.send_keys(Keys.ENTER)
        logger.info(f'Pausing for {config.VERIFICATION_PAUSE} seconds for credentials verification')
        time.sleep(config.VERIFICATION_PAUSE)
        # Scroll down using keyboard actions
        logger.info('Scrolling down page')
        body = driver.find_elements('xpath', "/html/body")
        for i in range(0, 12):  # Takes about 12 page downs to reach the bottom
            body[-1].send_keys(Keys.PAGE_DOWN)
            time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for footer to load
        timeout_wait = 120
        logger.info(f'Waiting for page footer (max {timeout_wait} seconds)...')
        wait = WebDriverWait(driver, timeout_wait)
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "footer")))
        # Get all text as a wall of text (including user's mini bio on the top-right panel)
        jobs_container_xpath = '/html/body/div[3]/div/div/div[1]/div[2]/div/div/main/div'
        jobs_containers = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.XPATH, jobs_container_xpath))
        )
        if not jobs_containers:
            logger.error("No jobs container found; stopping this run.")
            return False
        text = jobs_containers[-1].text
        # Get rid of the right panel
        text_1 = text.split(config.UPWORK_USER_NAME)[0]
        # Get rid of the top panel
        text_2 = text_1.split('Ordered by most relevant.')[-1]
        # Get all job posts
        job_posts = text_2.split('Posted')[1:]
        # Get urls, skipping skill-ontology and saved-search links
        job_links = driver.find_elements("xpath", "//a[contains(@href, '/jobs/')]")
        job_urls = [link.get_attribute("href") for link in job_links
                    if 'ontology_skill_uid' not in link.get_attribute("href")
                    and 'search/saved' not in link.get_attribute("href")
                    and 'search/jobs/saved' not in link.get_attribute("href")
                    ]
        # Scrape jobs
        print('Scraping jobs...')
        for counter, j in enumerate(job_posts):
            job_details = parse_job_details(j.split('\n'))
            # Check if the job ID already exists in the database
            job_id = job_details.get('job_id')
            # NOTE(review): assumes job_urls lines up one-to-one with
            # job_posts; an extra or missing link would mis-pair jobs or
            # raise IndexError — confirm against the live page markup.
            job_url = job_urls[counter].split('/?')[0]
            cursor.execute('SELECT COUNT(*) FROM jobs WHERE job_id = ?', (job_id,))
            count = cursor.fetchone()[0]
            if count > 0:
                logger.info(f' Job ID #{job_id} already exists. Updating job proposals...')
                updated_proposals = job_details.get('job_proposals')
                # Update the job_proposals column
                cursor.execute('UPDATE jobs SET job_proposals = ?, updated_at = ? WHERE job_id = ?', (
                    updated_proposals, datetime.now(), job_id))
            else:
                posted_date = job_details.get('posted_date')
                job_title = job_details.get('job_title')
                job_description = job_details.get('job_description')
                job_tags = job_details.get('job_tags')
                job_proposals = job_details.get('job_proposals')
                logger.info(f'Storing `{job_details.get("job_title")}` job in database')
                cursor.execute(
                    'INSERT INTO jobs (job_id, job_url, job_title, posted_date, job_description, job_tags, '
                    'job_proposals) VALUES (?, ?, ?, ?, ?, ?, ?)',
                    (job_id, job_url, job_title, posted_date, job_description, job_tags, job_proposals))
            conn.commit()
        # Original never returned True despite the documented contract.
        return True
    except Exception as e:
        # logger.exception keeps the traceback; logger.error(e) discarded it.
        logger.exception(e)
        return False
    finally:
        # Release everything that was actually acquired; guards prevent the
        # NameError/AttributeError the unconditional cleanup used to risk.
        if driver is not None:
            logger.info('Closing browser...')
            driver.quit()
        if cursor is not None:
            logger.info('Closing connection to database')
            cursor.close()
        if conn is not None:
            conn.close()
# Script entry point: run one scraping cycle when executed directly.
if __name__ == '__main__':
    main()