-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinks.py
More file actions
30 lines (26 loc) · 1.08 KB
/
links.py
File metadata and controls
30 lines (26 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import logging
import os
import random
import time

import requests
from bs4 import BeautifulSoup
# Scrape the vote-sorted listing of Stack Overflow questions tagged "python"
# and save the question URLs of each listing page to out/<page>.links, one URL
# per line. Progress is logged to links.log, which is truncated on every run.

# Prefix prepended to each question href before it is written out.
url_base = "https://stackoverflow.com"
# Listing-page URL template; "{}" is filled with the 1-based page number.
url_unencoded = "https://stackoverflow.com/questions/tagged/python?tab=votes&page={}&pagesize=50"
# Last listing page to fetch (pages are numbered from 1).
max_pages = 26120
# Bounds, in seconds, of the random pause taken after each page.
min_sleep = 0.5
max_sleep = 2.5
# Seconds to wait on the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
request_timeout = 30

logging.basicConfig(filename='links.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

# open() does not create missing directories, so make sure the target exists.
os.makedirs("out", exist_ok=True)

for i in range(1, max_pages + 1):
    url_encoded = url_unencoded.format(i)
    logging.info("Getting from %s", url_encoded)

    response = requests.get(url_encoded, timeout=request_timeout)
    # Fail fast on an HTTP error status instead of parsing an error page and
    # silently producing a wrong or empty link file.
    response.raise_for_status()
    raw = response.text

    soup = BeautifulSoup(raw, features="html.parser")
    questions = soup.find("div", {"id": "questions"})
    if questions is None:
        # find() returns None when nothing matches; stop with a readable
        # message rather than an AttributeError on the next line.
        logging.error("\tNo element with id 'questions' in %s", url_encoded)
        raise RuntimeError(
            "no element with id 'questions' found in " + url_encoded
        )
    links = questions.find_all("a", {"class": "question-hyperlink"}, href=True)
    logging.info("\tFound %d links", len(links))

    out_file = "out/{}.links".format(i)
    # The context manager closes the file after every page; the loop would
    # otherwise leak one open handle per iteration.
    with open(out_file, "w", encoding="utf-8") as file:
        file.writelines(url_base + link["href"] + "\n" for link in links)

    # Pause for a random interval between min_sleep and max_sleep before
    # requesting the next page.
    sleep_time = random.uniform(min_sleep, max_sleep)
    logging.info("\tSleeping for %f seconds", sleep_time)
    time.sleep(sleep_time)