forked from yet-another-account/openwebtext
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
63 lines (47 loc) · 1.29 KB
/
utils.py
File metadata and controls
63 lines (47 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Code taken in large part from https://github.com/jcpeterson/openwebtext
import os
import os.path as op
import tarfile
import re
import collections
def extract_month(url_file_name):
    """Return the ``RS_...YYYY-MM`` prefix found in a URL file's base name.

    Only the last path component of ``url_file_name`` is inspected, and the
    pattern is anchored at the start of that component.  When the name does
    not match, ``re.match`` returns ``None`` and the ``.group`` call raises
    ``AttributeError``.
    """
    base_name = op.basename(url_file_name)
    found = re.match(r"(RS_.*2\d{3}-\d{2})", base_name)
    return found.group(0)
def chunks(l, n, s=0):
    """Yield successive n-sized chunks from l, skipping the first s elements.

    Args:
        l: The items to split.  Anything iterable is consumed lazily and
            produces ``list`` chunks; an object that is not iterable but
            supports ``len()`` and slicing produces slices of itself.
        n: Number of elements per chunk.  The final chunk may be shorter.
        s: Number of leading elements (not chunks) to skip before chunking.

    Yields:
        One chunk of at most ``n`` elements at a time.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.  Imported locally so this function does not rely
    # on the submodule having been loaded somewhere else.
    from collections.abc import Iterable

    if isinstance(l, Iterable):
        chunk = []
        for index, elem in enumerate(l):
            if index < s:
                continue
            chunk.append(elem)
            if len(chunk) == n:
                yield chunk
                chunk = []
        # Emit the trailing partial chunk, if any.
        if chunk:
            yield chunk
    else:
        for start in range(s, len(l), n):
            yield l[start : start + n]
def extract_archive(archive_fp, outdir="."):
    """Unpack every member of the tar archive ``archive_fp`` into ``outdir``.

    The archive is opened in ``"r"`` mode and closed again when extraction
    finishes.  Returns ``outdir`` unchanged.
    """
    # NOTE(review): no extraction filter is passed, so how member paths are
    # sanitised depends on the running Python's default -- confirm that the
    # archives handled here come from a trusted source.
    with tarfile.open(archive_fp, mode="r") as archive:
        archive.extractall(path=outdir)
    return outdir
def mkdir(fp):
    """Create directory ``fp`` (and any missing parents) and return ``fp``.

    An already-existing directory is not an error.  If ``fp`` exists but is
    not a directory, ``FileExistsError`` is raised instead of being silently
    ignored, so callers never receive a path that only looks like a
    directory.
    """
    os.makedirs(fp, exist_ok=True)
    return fp
def linecount(filename):
    """Return the number of newline bytes (``b"\\n"``) in ``filename``.

    The file is read in binary mode in 1 MiB blocks through the unbuffered
    raw stream, so it is never loaded into memory as a whole.  Because only
    newline bytes are counted, a final line without a trailing newline does
    not add to the total, and an empty file yields 0.
    """
    buf_size = 1024 * 1024
    lines = 0
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, "rb") as f:
        read_f = f.raw.read
        buf = read_f(buf_size)
        while buf:
            lines += buf.count(b"\n")
            buf = read_f(buf_size)
    return lines