-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathusable_dataset_tar_extractor.py
More file actions
59 lines (50 loc) · 2.44 KB
/
usable_dataset_tar_extractor.py
File metadata and controls
59 lines (50 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import json
import os
import shutil
from typing import Optional

from tqdm import tqdm

from texparser.UsablePaperExtractor import UsablePaperExtractor
from texparser.tar_extractor import TarExtractor
class UsableDatasetTarExtractor:
    """Walk a folder of tar archives, extract each one and collect paper data.

    For every entry of ``dataset_folder_path`` (optionally starting at
    ``file_to_start``) the archive is untarred via ``TarExtractor``, its first
    sub-folder is expanded, ``UsablePaperExtractor.extract_all`` is run on
    that sub-folder and the returned data is written to JSON progress files.
    """

    def __init__(self, file_to_start: Optional[str], dataset_folder_path: str,
                 extract_folder_path: str = "",
                 extract_to_keep_path: Optional[str] = None) -> None:
        """Store the paths and build the two helper extractors.

        Args:
            file_to_start: Name of the entry in ``dataset_folder_path`` at
                which processing begins; ``None`` starts at the first entry.
            dataset_folder_path: Folder containing the archives. It is joined
                to file names by plain concatenation, so it must end with a
                path separator (e.g. ``"content/"``).
            extract_folder_path: Prefix of the temporary extraction location,
                also joined by plain concatenation.
            extract_to_keep_path: Passed unchanged to ``UsablePaperExtractor``.
        """
        self.file_to_start = file_to_start
        self.dataset_folder_path = dataset_folder_path
        self.extract_folder_path = extract_folder_path
        self.tarExtractor = TarExtractor(
            self.dataset_folder_path, self.extract_folder_path)
        self.usablePaperExtractor = UsablePaperExtractor(extract_to_keep_path)

    def run(self) -> None:
        """Process every archive from the start file onwards.

        Entries listed before ``file_to_start`` are skipped. An entry that
        cannot be extracted is reported and skipped instead of aborting the
        whole run.
        """
        self.tarExtractor.create_extract_folder_path()
        filenames = os.listdir(self.dataset_folder_path)
        # With no start file configured, processing begins immediately.
        found_start_file = self.file_to_start is None
        for count, filename in enumerate(tqdm(filenames)):
            if filename == self.file_to_start:
                print("found_startfile:", filename)
                found_start_file = True
            if not found_start_file:
                continue
            self._process_archive(filename, count)
        self.tarExtractor.delete_extract_folder_path()

    def _process_archive(self, filename: str, count: int) -> None:
        """Extract one archive, harvest its data and remove the extraction."""
        try:
            self.tarExtractor.untar_file_into_folder(
                self.dataset_folder_path + filename)
        except Exception as e:
            # Best effort: report the failure and let the checks below decide
            # whether anything usable was extracted.
            print(e)

        extract_folder = (self.extract_folder_path
                          + self.dataset_folder_path
                          + filename.replace(".tar", ""))
        if not os.path.isdir(extract_folder):
            # Nothing was extracted (failed untar or not an archive); the
            # directory listing below would otherwise raise and stop the run.
            print("skipping", filename, "- no extracted folder at",
                  extract_folder)
            return

        entries = os.listdir(extract_folder)
        if not entries:
            print("skipping", filename, "- extracted folder is empty")
            shutil.rmtree(extract_folder)
            return

        # Only the first entry of the extracted folder is processed.
        sub_extract_folder = extract_folder + "/" + entries[0]
        # The return value of extract_folder is not needed here.
        self.tarExtractor.extract_folder(
            sub_extract_folder, self.tarExtractor.untargz_file_into_folder)
        data = self.usablePaperExtractor.extract_all(sub_extract_folder)
        self.log_progress(data, count)
        shutil.rmtree(extract_folder)

    def log_progress(self, data: dict, count: int) -> None:
        """Write ``data`` as JSON to the progress files in the working directory.

        ``save.json`` is overwritten on every call. When ``count`` is a
        multiple of 20 the same JSON text is also appended to ``backup.json``.

        Args:
            data: JSON-serialisable data to persist.
            count: Index of the archive being processed.
        """
        serialized = json.dumps(data, indent=7)
        if count % 20 == 0:
            # NOTE: entries are appended back to back with no separator, so
            # backup.json is a concatenation of JSON documents.
            with open("backup.json", "a") as backup_file:
                backup_file.write(serialized)
        with open("save.json", "w") as save_file:
            save_file.write(serialized)
if __name__ == "__main__":
    # Script entry point: no start file is given, so every entry of
    # "content/" is processed, using "extract/" as the extraction prefix.
    parser = UsableDatasetTarExtractor(
        file_to_start=None,
        dataset_folder_path="content/",
        extract_folder_path="extract/",
        extract_to_keep_path="usable_dataset/",
    )
    print("Parser started")
    parser.run()