forked from jwngr/sdow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprune_pages_file.py
More file actions
41 lines (32 loc) · 1.27 KB
/
prune_pages_file.py
File metadata and controls
41 lines (32 loc) · 1.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
Prunes the pages file by removing pages which are marked as redirects but have no corresponding
redirect in the redirects file.
Output is written to stdout.
"""
import io
import sys
import gzip
def load_redirect_ids(redirects_file):
    """Return the set of source page IDs found in the gzipped redirects file.

    Each line must contain exactly two tab-separated fields; only the first
    field (the source page ID) is kept. IDs are kept as bytes so they can be
    compared directly with the fields read from the pages file.

    Raises:
        ValueError: if a line does not have exactly two tab-separated fields.
    """
    redirect_ids = set()
    with io.BufferedReader(gzip.open(redirects_file, 'rb')) as reader:
        for line in reader:
            source_page_id, _ = line.rstrip(b'\n').split(b'\t')
            redirect_ids.add(source_page_id)
    return redirect_ids


def prune_pages(pages_file, redirect_ids, out=None):
    """Write the pages worth keeping from the gzipped pages file to `out`.

    A page is kept when it is not marked as a redirect (third field is `0`)
    or when its ID is present in `redirect_ids`. Pages marked as redirects
    with no corresponding entry in `redirect_ids` are dropped.

    Args:
        pages_file: Path to the gzipped, tab-separated pages file. Each line
            must contain exactly three fields: page ID, title, redirect flag.
        redirect_ids: Collection of page IDs (bytes) that have a redirect.
        out: Text stream to write to; defaults to the current `sys.stdout`.

    Raises:
        ValueError: if a line does not have exactly three tab-separated fields.
    """
    if out is None:
        out = sys.stdout

    with io.BufferedReader(gzip.open(pages_file, 'rb')) as reader:
        for line in reader:
            page_id, page_title, is_redirect = line.rstrip(b'\n').split(b'\t')

            # The file is read in binary mode, so the flag is bytes and must be
            # compared against b'0'. Comparing against the str '0' is always
            # False in Python 3 and would drop every non-redirect page.
            if is_redirect == b'0' or page_id in redirect_ids:
                print(b'\t'.join([page_id, page_title, is_redirect]).decode(), file=out)


def main(argv=None):
    """Validate the command-line arguments and print the pruned pages file.

    Args:
        argv: Argument list in `sys.argv` layout (program name first);
            defaults to `sys.argv`.

    Diagnostics go to stderr because stdout carries the pruned data, and the
    process exits with status 1 when the arguments are invalid.
    """
    if argv is None:
        argv = sys.argv

    # Validate input arguments.
    if len(argv) < 3:
        print('[ERROR] Not enough arguments provided!', file=sys.stderr)
        print('[INFO] Usage: {0} <pages_file> <redirects_file>'.format(argv[0]), file=sys.stderr)
        sys.exit(1)

    pages_file = argv[1]
    redirects_file = argv[2]

    if not pages_file.endswith('.gz'):
        print('[ERROR] Pages file must be gzipped.', file=sys.stderr)
        sys.exit(1)

    if not redirects_file.endswith('.gz'):
        print('[ERROR] Redirects file must be gzipped.', file=sys.stderr)
        sys.exit(1)

    # Collect the IDs of all pages which have a redirect, then print every page
    # that is either a regular page or a redirect with a known target.
    prune_pages(pages_file, load_redirect_ids(redirects_file))


if __name__ == '__main__':
    main()