-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
86 lines (73 loc) · 2.59 KB
/
scrape.py
File metadata and controls
86 lines (73 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from bs4 import BeautifulSoup
import requests
from itertools import count
from urllib.parse import urljoin
import logging
from time import sleep
# Module-level logger; handlers/level are expected to be configured by the
# consuming application.
logger = logging.getLogger(__name__)
# Lazily-created shared HTTP session (see get_session()); None until first use.
_session = None
def get_session():
    """Return the shared HTTP session, creating it on first call.

    The session is cached in the module-level ``_session`` and carries a
    custom User-Agent so the tracker can identify this scraper.
    """
    global _session
    if _session is None:
        session = requests.Session()
        session.headers['User-Agent'] = "codls fun scraper/0"
        _session = session
    return _session
def get_and_parse_page(n=1):
    """Fetch page *n* of the donations list and return a list of absolute
    donation-detail URLs for donations that have a comment.

    Returns None when the page does not exist (HTTP 404).
    Raises RateLimited on HTTP 429 (carrying the Retry-After delay) and a
    generic Exception for any other error response.
    """
    DONATIONS_LIST = 'https://gamesdonequick.com/tracker/donations/'
    session = get_session()
    logger.info('Fetching page %s', n)
    resp = session.get(
        DONATIONS_LIST,
        params={'page': n})
    if not resp.ok:
        if resp.status_code == 404:
            logger.info('Page %s does not exist', n)
            return None
        if resp.status_code == 429:
            logger.info('Rate limited for %s seconds', resp.headers['retry-after'])
            raise RateLimited(int(resp.headers['retry-after']))
        raise Exception(resp)
    soup = BeautifulSoup(resp.text, 'html.parser')
    donation_urls = []
    # Walk only the direct <tr> children of the table; each row may yield a
    # relative link (or None when the donation carries no comment).
    for child in soup.table.children:
        if child.name != 'tr':
            continue
        href = parse_row(child)
        if href:
            donation_urls.append(urljoin(DONATIONS_LIST, href))
    return donation_urls
def parse_row(row_soup):
    """Return the relative URL of a donation's detail page, or None.

    A row is interesting only when its 4th column reads 'Yes' (the
    "has comment" flag); the link lives in the 3rd column's <a> tag.

    Fixes over the original: rows with fewer than four <td> cells (e.g.
    header rows using <th>) raised IndexError, and cells whose text was
    split across nested tags raised AttributeError because ``.string``
    is None there — ``get_text()`` handles both layouts.
    """
    cols = row_soup('td')
    if len(cols) < 4:
        return None
    if cols[3].get_text().strip() == 'Yes':
        link = cols[2].a
        if link is not None:
            return link['href']
    return None
def get_message(url, try_not_to_get_rate_limited=True):
    """Fetch one donation detail page and return its comment text.

    Returns None when the comment is still pending approval or was
    rejected (detected via the question/times-circle icons). Sleeps two
    seconds after each request (unless disabled) to stay polite.
    Raises RateLimited on HTTP 429, Exception on any other error.
    """
    resp = get_session().get(url)
    if try_not_to_get_rate_limited:
        sleep(2)
    if not resp.ok:
        if resp.status_code == 429:
            logger.info('Rate limited for %s seconds', resp.headers['retry-after'])
            raise RateLimited(int(resp.headers['retry-after']))
        raise Exception(resp)
    soup = BeautifulSoup(resp.text, 'html.parser')
    message = soup.table.find('td')
    pending = message.find('i', class_='fa-question-circle')
    rejected = message.find('i', class_='fa-times-circle')
    if pending or rejected:
        # message is pending approval or rejected
        return None
    return message.get_text().strip()
def get_donation_urls(start_at_page=1, known=None):
    """Yield donation-detail URLs not already present in *known*.

    Pages through the donation list from *start_at_page*, stopping at the
    first page that is missing or yields no URLs. *known* (any set-like of
    URLs, optional) seeds the de-duplication set; it is never mutated.
    """
    seen = frozenset() if not known else known
    page = start_at_page
    while True:
        logger.info('%s donations with messages found so far', len(seen))
        fetched = get_and_parse_page(page)
        if not fetched:
            break
        fresh = frozenset(fetched) - seen
        seen = seen | fresh
        yield from fresh
        page += 1
class RateLimited(Exception):
    """Raised when the tracker responds with HTTP 429 (Too Many Requests).

    Attributes:
        seconds: delay (from the Retry-After header) to wait before retrying.
    """

    def __init__(self, seconds):
        # Forward to Exception so .args and str()/repr() carry the delay;
        # the original skipped super().__init__, leaving the exception
        # message empty when printed or logged.
        super().__init__(seconds)
        self.seconds = seconds