-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathComputeBioSentVecAbstractEmbedding.py
More file actions
75 lines (61 loc) · 2.44 KB
/
ComputeBioSentVecAbstractEmbedding.py
File metadata and controls
75 lines (61 loc) · 2.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import glob
import pickle
import json
import argparse
def preprocess_sentence(text, stop_word_set=None):
    """Normalize a sentence for BioSentVec embedding.

    Pads common separators with spaces (so the tokenizer splits them),
    lowercases, tokenizes, and drops punctuation and stopword tokens.

    Args:
        text: raw sentence string.
        stop_word_set: optional set of stopwords to remove. Defaults to the
            module-global ``stop_words`` built in the ``__main__`` block;
            falls back to NLTK's English stopwords so the function no longer
            raises ``NameError`` when imported from another module (the
            original read the global unconditionally).

    Returns:
        A single space-joined string of the kept tokens.
    """
    if stop_word_set is None:
        # BUG FIX: the original referenced a global `stop_words` that is
        # only assigned inside the __main__ block; resolve it defensively.
        stop_word_set = globals().get('stop_words')
        if stop_word_set is None:
            stop_word_set = set(stopwords.words('english'))
    # Pad separators so word_tokenize treats them as standalone tokens.
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.lower()
    # NOTE: `token not in punctuation` is a substring test against the
    # punctuation string, so multi-character punctuation tokens (e.g. "...")
    # are kept — behavior preserved from the original.
    tokens = [token for token in word_tokenize(text)
              if token not in punctuation and token not in stop_word_set]
    return ' '.join(tokens)
if __name__ == "__main__":
    # Embed every corpus abstract (title + abstract sentences merged into one
    # paragraph, one vector per document) and every claim with BioSentVec,
    # then pickle the resulting {id -> vector} maps for downstream retrieval.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--claim_file', type=str)
    argparser.add_argument('--corpus_file', type=str)
    argparser.add_argument('--sentvec_path', type=str)
    argparser.add_argument('--corpus_embedding_pickle', type=str,
                           default="corpus_paragraph_biosentvec.pkl")
    argparser.add_argument('--claim_embedding_pickle', type=str,
                           default="claim_biosentvec.pkl")
    args = argparser.parse_args()

    # Corpus is JSONL: one abstract record per line, keyed by its doc_id.
    corpus = {}
    with open(args.corpus_file, encoding='utf-8') as f:
        for line in f:
            abstract = json.loads(line)
            corpus[str(abstract["doc_id"])] = abstract

    # Claims are JSONL as well: one claim record per line.
    with open(args.claim_file, encoding='utf-8') as f:
        claims = [json.loads(line) for line in f]

    model = sent2vec.Sent2vecModel()
    try:
        model.load_model(args.sentvec_path)
    except Exception as e:
        # BUG FIX: the original printed the exception and then announced
        # 'model successfully loaded' anyway, continuing with an unloaded
        # model (which would fail later with a confusing error). Fail fast.
        raise SystemExit(
            f"Failed to load sent2vec model from {args.sentvec_path}: {e}"
        ) from e
    print('model successfully loaded')

    # Must live at module scope: preprocess_sentence() reads this global.
    stop_words = set(stopwords.words('english'))

    # One paragraph embedding per document: preprocess each sentence
    # individually, then join and embed the whole paragraph at once.
    corpus_embeddings = {}
    for doc_id, doc in corpus.items():
        original_sentences = [doc['title']] + doc['abstract']
        processed_paragraph = " ".join(
            preprocess_sentence(sentence) for sentence in original_sentences)
        corpus_embeddings[doc_id] = model.embed_sentence(processed_paragraph)
    with open(args.corpus_embedding_pickle, "wb") as f:
        pickle.dump(corpus_embeddings, f)

    # One embedding per claim sentence, keyed by the claim id.
    claim_embeddings = {}
    for claim in claims:
        processed_sentence = preprocess_sentence(claim['claim'])
        claim_embeddings[claim["id"]] = model.embed_sentence(processed_sentence)
    with open(args.claim_embedding_pickle, "wb") as f:
        pickle.dump(claim_embeddings, f)