-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.py
More file actions
72 lines (54 loc) · 2.17 KB
/
index.py
File metadata and controls
72 lines (54 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import string
import sys
import os
import subprocess
from Inverter import HTMLInverter
from nltk.stem import PorterStemmer
import time
import shutil
def format_map_record(doc_id, file_name):
    """Return one ``map.txt`` line mapping a document id to its file name.

    The id is left-justified to 4 characters and the file name to 12,
    separated by ``|`` and terminated by a newline. Longer values are not
    truncated; they simply widen the record.
    """
    return str(doc_id).ljust(4) + '|' + str(file_name).ljust(12) + '\n'


def run_tokenizer(input_file, tokenizer='./tokenizer_files'):
    """Run the external tokenizer on *input_file* and return its tokens.

    Args:
        input_file: Path of the HTML file to tokenize.
        tokenizer: Path of the tokenizer executable.

    Returns:
        The tokenizer's stdout split into lines (one token per line), or
        ``None`` if the tokenizer could not be started, exited with a
        non-zero status, or produced output that is not valid UTF-8.
    """
    try:
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters from being split or interpreted.
        result = subprocess.run(
            [tokenizer, input_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as e:
        print(f"An error occurred: {e}")
        return None

    if result.returncode != 0:
        print("Error while running C++ script:")
        # stderr is only diagnostic, so never let a bad byte hide the message.
        print(result.stderr.decode('utf-8', errors='replace'))
        return None

    try:
        return result.stdout.decode('utf-8').splitlines()
    except UnicodeDecodeError as e:
        print(f"An error occurred: {e}")
        return None


def main(argv=None):
    """Tokenize, stem and index every ``.html`` file of an input directory.

    Args:
        argv: ``[input_dir, output_dir, ...]``; defaults to ``sys.argv[1:]``.
            Extra arguments are ignored.

    Returns:
        ``0`` on completion, ``2`` when the arguments are unusable.

    Side effects:
        Deletes and recreates ``output_dir``, writes ``map.txt`` into it and
        then hands the directory to ``HTMLInverter.finish``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print(f"usage: {sys.argv[0]} <input_dir> <output_dir>", file=sys.stderr)
        return 2
    input_dir, output_dir = args[0], args[1]

    # The output directory is wiped below; refuse to wipe the corpus itself.
    if os.path.realpath(input_dir) == os.path.realpath(output_dir):
        print("input and output directories must differ", file=sys.stderr)
        return 2

    # Only .html files are indexed, so only they count as documents.
    # os.listdir returns entries in arbitrary order; sorting makes the
    # doc id assignment reproducible between runs.
    html_files = sorted(
        name for name in os.listdir(input_dir) if name.endswith(".html")
    )
    stemmer = PorterStemmer()
    inverter = HTMLInverter(len(html_files))

    # Start from an empty output directory.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    doc_id = 0
    print("building global hashtable..")
    with open(os.path.join(output_dir, "map.txt"), 'w') as map_file:
        for file_name in html_files:
            tokens = run_tokenizer(os.path.join(input_dir, file_name))
            if tokens is None:
                # Skip the file instead of indexing stale tokens from the
                # previous document; no doc id is consumed for it.
                continue
            stems = [stemmer.stem(token) for token in tokens]
            inverter.processTokens(stems, doc_id)
            map_file.write(format_map_record(doc_id, file_name))
            doc_id += 1
    print("finished table..")

    print(f"writing files to {output_dir}..")
    inverter.finish(output_dir)
    print("finished writing output..")
    print("complete!!")
    return 0


if __name__ == "__main__":
    sys.exit(main())