forked from yet-another-account/openwebtext
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter.py
More file actions
118 lines (103 loc) · 2.07 KB
/
filter.py
File metadata and controls
118 lines (103 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import tldextract
import tqdm
# Domains that aren't scraper friendly. Do not include subdomains —
# should_exclude() matches on the registered (base) domain as well, so
# 'imgur.com' here also excludes 'i.imgur.com'.
# Set literal instead of set([...]): avoids building a throwaway list (C405).
exclude_domains = {
    # image & video hosting sites
    'imgur.com',
    'redd.it',
    'instagram.com',
    'discord.gg',
    'gfycat.com',
    'giphy.com',
    'reddituploads.com',
    'redditmedia.com',
    'twimg.com',
    'sli.mg',
    'magaimg.net',
    'flickr.com',
    'imgflip.com',
    'youtube.com',
    'youtu.be',
    'youtubedoubler.com',
    'vimeo.com',
    'twitch.tv',
    'streamable.com',
    'bandcamp.com',
    'soundcloud.com',
    # not scraper friendly
    'reddit.com',
    'gyazo.com',
    'github.com',
    'xkcd.com',
    'twitter.com',
    'spotify.com',
    'itunes.apple.com',
    'facebook.com',
    'gunprime.com',
    'strawpoll.me',
    'voyagefusion.com',
    'rollingstone.com',
    'google.com',
    'timeanddate.com',
    'walmart.com',
    'roanoke.com',
    'spotrac.com',
    # original paper excluded wikipedia
    'wikipedia.org',
    # lots of top posts for this one
    'battleforthenet.com',
}
# File extensions whose URLs are skipped entirely: binary media, documents,
# archives and executables — none of them yield usable corpus text.
# Kept as a tuple so it can be passed straight to str.endswith().
exclude_extensions = (
    # images
    '.png', '.jpg', '.jpeg', '.gif', '.gifv',
    # documents & audio/video
    '.pdf', '.mp4', '.mp3', '.ogv', '.webm',
    '.doc', '.docx', '.log', '.csv', '.dat',
    # disk images & executables
    '.iso', '.bin', '.exe', '.apk', '.jar', '.app',
    # presentations & markup
    '.ppt', '.pps', '.pptx', '.xml',
    # archives
    '.gz', '.xz', '.bz2', '.tgz', '.tar', '.zip',
    # more media
    '.wma', '.mov', '.wmv', '.3gp', '.svg',
)
def should_exclude(url):
    """Return True if *url* should be dropped from the corpus.

    A URL is excluded when:
      * its registered domain (e.g. 'imgur.com') or its full domain
        (e.g. 'i.imgur.com') appears in ``exclude_domains``, or
      * its path ends with one of ``exclude_extensions``.
    """
    ext = tldextract.extract(url)
    # Full domain with subdomain, skipping empty components.
    domain = '.'.join(part for part in ext if part)
    # Registered domain only (domain + suffix). Filtering empty parts fixes
    # the original trailing-dot bug: for suffix-less hosts ('localhost') the
    # old '.'.join(ext[-2:]) produced 'localhost.' which could never match.
    basedomain = '.'.join(part for part in ext[-2:] if part)
    if basedomain in exclude_domains or domain in exclude_domains:
        return True
    # Strip query string AND fragment before the extension check, and
    # compare case-insensitively so 'IMG.PNG' is caught like 'img.png'.
    path = url.split('?', 1)[0].split('#', 1)[0]
    if path.lower().endswith(exclude_extensions):
        return True
    return False
if __name__ == '__main__':
    # Stream urls.txt through the filter, writing surviving URLs to
    # urls-filtered.txt. The hard-coded total= only sizes the tqdm
    # progress bar; it does not limit how many lines are processed.
    with open('urls.txt') as src, open('urls-filtered.txt', 'w') as dst:
        for raw_line in tqdm.tqdm(src, total=84532938):
            url = raw_line.strip()
            if not should_exclude(url):
                dst.write(url + '\n')