-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathget_sent_ids.py
More file actions
32 lines (26 loc) · 866 Bytes
/
get_sent_ids.py
File metadata and controls
32 lines (26 loc) · 866 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sys
import argparse
import pickle
from index_sentences import s2hash
import re
def remove_initial_dash(s: str) -> str:
    """Remove a leading run of dashes (e.g. ``"- "``) from *s*.

    The match is anchored at the start of the string and covers optional
    whitespace, one or more ``-`` characters, and any whitespace that
    follows them. If *s* does not begin with such a run, it is returned
    unchanged (leading whitespace alone is not stripped).
    """
    # Raw string literal: in a plain literal "\s" is an invalid string escape
    # that newer Python versions warn about.
    return re.sub(r"^\s*-+\s*", "", s)
if __name__=="__main__":
    # Script entry point: read sentences from stdin (one per line), hash each
    # one with s2hash, and report which hashes are missing from the pickled
    # dictionary given via --s2i.
    parser = argparse.ArgumentParser(description='')
    # required=True: the path is opened unconditionally below, so a missing
    # option should be a clear usage error rather than a TypeError from open(None).
    parser.add_argument('--s2i', required=True, help='s2i hash to idx dict pickle, result of index_sentences.py')
    args = parser.parse_args()

    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point --s2i at files from a trusted source.
    with open(args.s2i,"rb") as f:
        sdict=pickle.load(f)
    print(f"Loaded dict of {len(sdict)} hashes")

    found=0  # input lines whose hash is present in sdict
    lost=0   # input lines whose hash is absent from sdict
    for qline in sys.stdin:
        # Normalize the line the same way before hashing: trim surrounding
        # whitespace, then drop any leading "- " marker.
        qline=remove_initial_dash(qline.strip())
        # s2hash takes an (id, sentence) pair and returns a pair whose second
        # element is the hash; 0 is passed as a placeholder id and the first
        # element of the result is discarded.
        _,linehash=s2hash((0,qline))
        if linehash in sdict:
            found+=1
        else:
            lost+=1
            print("LOST",qline)
    print(f"Found {found} lost {lost}")