forked from newhouseb/SCPD-Scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
74 lines (60 loc) · 2.22 KB
/
scrape.py
File metadata and controls
74 lines (60 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
import re
import os
import getpass
import errno
import itertools
from subprocess import Popen
from mechanize import Browser
from multiprocessing import Pool
def make_sure_path_exists(path):
    """Create directory *path*, including missing parents, if it is absent.

    The path is echoed to stdout before the attempt.  An already existing
    directory is not an error.

    Raises:
        OSError: if the directory cannot be created, including the case
            where *path* already exists but is not a directory.
    """
    print(path)
    try:
        os.makedirs(path)
    except OSError as exception:
        # makedirs reports EEXIST for any existing entry, so a regular file
        # occupying the name must not be mistaken for a usable directory.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def download(url, direc):
    """Copy the contents of a file from a given URL to a local file.

    The file is stored in directory *direc* (resolved against the current
    working directory; an absolute *direc* is used as given) under the last
    path segment of *url*.  If that file already exists the download is
    skipped.

    The data is streamed to a temporary ``<name>.part`` file and renamed
    only after the transfer finished, so an interrupted download is never
    mistaken for a complete one on the next run.
    """
    # Function-scope imports, as in the original; urlopen moved in Python 3.
    import shutil
    from contextlib import closing
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    target_dir = os.path.join(os.getcwd(), direc)
    make_sure_path_exists(target_dir)

    filename = url.split('/')[-1]
    if not filename:
        # A URL ending in '/' would otherwise resolve to the directory
        # itself and be reported as already downloaded.
        print("No file name in URL, skipping %s" % url)
        return

    target = os.path.join(target_dir, filename)
    if os.path.exists(target):
        print("Already downloaded %s" % url)
        return

    partial = target + '.part'
    # Both handles are closed even if the transfer fails, and the body is
    # copied in chunks instead of being held in memory all at once.
    with closing(urlopen(url)) as web_file:
        with open(partial, 'wb') as local_file:
            shutil.copyfileobj(web_file, local_file)
    os.rename(partial, target)
def download_star(url_direc):
    """Call download() with its arguments packed into one iterable.

    *url_direc* holds the positional arguments for download(), i.e. the
    URL followed by the target directory.
    """
    # Pool.map hands each task a single object, so the arguments travel
    # together and are expanded here.
    positional = tuple(url_direc)
    download(*positional)
if __name__ == '__main__':
    # raw_input only exists on Python 2; fall back to input elsewhere.
    try:
        ask = raw_input
    except NameError:
        ask = input

    # Pretend we're just a regular old user (this is naughty, don't try this at home kids)
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    br.open(ask('Enter course page URL: '))
    # Explicit check rather than assert, which is stripped under "python -O".
    if not br.viewing_html():
        raise SystemExit('The course page URL did not return an HTML page.')

    # Fill in the login form with credentials typed at the prompt
    br.select_form(name="login")
    br["username"] = ask('Username: ')
    br["password"] = getpass.getpass()
    br.submit()

    # The following page again exposes a form named "login", this time
    # asking for the one-time code
    br.select_form(name="login")
    br["otp"] = ask('Enter Two-Factor Authentification: ')
    br.submit()

    direc = ask("Directory to store videos: ")

    # Build up a list of lectures
    links = []
    for link in br.links(text="Watch Now"):
        # NOTE(review): index 7 is tied to the attribute order of the current
        # page markup; presumably attrs holds (name, value) pairs -- confirm.
        attr_value = link.attrs[7][1]
        # SECURITY: eval() executes text taken from the fetched page as
        # Python code.  TODO: switch to ast.literal_eval or json.loads once
        # the exact format of this attribute has been confirmed.
        lecture = eval(attr_value)
        webm_url = lecture['url'][1]['webm-url']
        print(webm_url)
        links.append(webm_url)

    print("Downloading %d videos..." % len(links))

    # Make a pool of 10 worker processes, i.e. up to 10 downloads at a time
    pool = Pool(processes=10)
    try:
        pool.map(download_star, [(url, direc) for url in links])
    finally:
        # map() has either returned or raised by now, so no results are
        # pending; stop the workers explicitly instead of at interpreter exit.
        pool.terminate()
        pool.join()