-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcalculation.py
More file actions
245 lines (209 loc) · 8.28 KB
/
calculation.py
File metadata and controls
245 lines (209 loc) · 8.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import logging
import numpy
import random
import copy
from sklearn.decomposition import PCA
from data_controller import simlex_vocab, wordsim_vocab
# Creates deep copies of one or more dictionaries
def create_duplicates(set1, set2=None, set3=None, set4=None):
    """Return deep copies of the leading run of supplied arguments.

    ``set1`` is always copied. ``set2``, ``set3`` and ``set4`` are copied in
    that order until the first one that is ``None``; anything after that first
    ``None`` is ignored.

    Returns:
        The bare copy of ``set1`` when no further copy was made, otherwise a
        tuple holding the copies in argument order.
    """
    duplicates = [copy.deepcopy(set1)]
    for optional in (set2, set3, set4):
        if optional is None:
            # Stop at the first gap: later arguments are not considered.
            break
        duplicates.append(copy.deepcopy(optional))
    if len(duplicates) == 1:
        return duplicates[0]
    return tuple(duplicates)
# Transforms a dictionary of vectors into a numpy array of its values
def transform_dict_to_list(dict1):
    """Collect the values of ``dict1`` into a numpy array.

    Each value is converted to a list and becomes one row, in the iteration
    order of ``dict1``. The keys are discarded.
    """
    return numpy.array([list(dict1[key]) for key in dict1])
# Extracts vectors from two, three or four dictionaries into vector lists
def transform_multiple_dicts_to_lists(dict1, dict2, dict3=None, dict4=None):
    """Convert two to four dictionaries with ``transform_dict_to_list``.

    ``dict1`` and ``dict2`` are always converted. ``dict3`` and ``dict4`` are
    converted in that order until the first one that is ``None``, so ``dict4``
    is ignored whenever ``dict3`` is ``None``.

    Returns:
        A tuple with one converted result per processed dictionary, in
        argument order (length 2, 3 or 4).
    """
    converted = [transform_dict_to_list(dict1), transform_dict_to_list(dict2)]
    for optional in (dict3, dict4):
        if optional is None:
            break
        converted.append(transform_dict_to_list(optional))
    return tuple(converted)
# Checks if sizes of vectors are equal, deletes random elements from the larger dict if not
def check_sizes(vector_set1, vector_set2):
    """Trim the larger of two dictionaries until both have the same size.

    Randomly chosen keys are deleted in place from whichever dictionary has
    more entries, and every deleted key is logged at INFO level. Nothing is
    changed when the sizes already match or when either dictionary is empty.

    Args:
        vector_set1: First dictionary; may be mutated.
        vector_set2: Second dictionary; may be mutated.

    Returns:
        The tuple ``(vector_set1, vector_set2)`` - the same objects passed in.
    """
    if len(vector_set1) == 0 or len(vector_set2) == 0:
        return vector_set1, vector_set2

    surplus = len(vector_set1) - len(vector_set2)
    if surplus > 0:
        larger, label = vector_set1, "1"
    else:
        larger, label = vector_set2, "2"

    for _ in range(abs(surplus)):
        key = random.choice(list(larger.keys()))
        del larger[key]
        # Name the dictionary that actually lost the key.
        logging.info("CM: Removed keys from dictionary %s: %s", label, key)
    return vector_set1, vector_set2
# Checks whether two vector sets have the same length
def check_set_sizes(vector_set1, vector_set2):
    """Equalize the sizes of two non-empty vector sets.

    When both arguments are non-empty and their lengths differ,
    ``make_set_size_equal`` is called on them. In every other case the
    arguments are returned untouched.

    Returns:
        The tuple ``(vector_set1, vector_set2)`` - the same objects passed in.
    """
    # ``and`` instead of ``&``: ``&`` binds tighter than ``>``, so the former
    # ``len(a) > 0 & len(b) > 0`` was evaluated as ``len(a) > 0 > 0`` and was
    # always False.
    both_populated = len(vector_set1) > 0 and len(vector_set2) > 0
    if both_populated and len(vector_set1) != len(vector_set2):
        make_set_size_equal(vector_set1, vector_set2)
    return vector_set1, vector_set2
# Removes random elements from the larger dictionary until both have the same size
def make_set_size_equal(vector_set1, vector_set2):
    """Delete random keys from the larger dictionary until sizes match.

    Both arguments are modified in place; one key is removed per loop
    iteration and each removal is logged at INFO level.

    Returns:
        The tuple ``(vector_set1, vector_set2)`` - the same objects passed in.
    """
    logging.info('CM: Making set sizes equal:')
    while len(vector_set1) != len(vector_set2):
        if len(vector_set1) > len(vector_set2):
            longer, prefix = vector_set1, "CM: REMOVED KEY from list 1: "
        else:
            longer, prefix = vector_set2, "CM: REMOVED KEY from list 2: "
        dropped = random.choice(list(longer.keys()))
        del longer[dropped]
        logging.info(prefix + str(dropped))
    return vector_set1, vector_set2
# Checks if two sets contain duplicates and removes them
def check_set_content(vector_set1, vector_set2):
    """Remove, in place, every word that occurs in both collections.

    Works on lists and sets (via ``remove``) as well as on dictionaries (by
    deleting the key). After the call no word is contained in both
    collections.

    Args:
        vector_set1: First collection of words; may be mutated.
        vector_set2: Second collection of words; may be mutated.

    Returns:
        The tuple ``(vector_set1, vector_set2)`` - the same objects passed in.
    """
    # Snapshot the overlap first so the collections are not mutated while
    # they are being iterated.
    shared_words = [word for word in vector_set1 if word in vector_set2]
    for word in shared_words:
        for collection in (vector_set1, vector_set2):
            # A word listed several times may already be gone from one side.
            if word not in collection:
                continue
            if isinstance(collection, dict):
                del collection[word]
            else:
                collection.remove(word)
    return vector_set1, vector_set2
# Converts a dict of vectors into a numpy array for easier calculation
def create_numpy_vector(vector_set):
    """Stack the values of ``vector_set`` into a single numpy array.

    Every value is turned into its own numpy array first; the rows follow the
    iteration order of ``vector_set`` and the keys are discarded.
    """
    rows = [numpy.array(list(vector_set[key])) for key in vector_set]
    return numpy.array(rows)
# Calculates the cosine similarity of two vectors
def cosine_similarity(vector1, vector2):
    """Return the dot product of the vectors divided by the product of their norms."""
    numerator = numpy.dot(vector1, vector2)
    denominator = numpy.linalg.norm(vector1) * numpy.linalg.norm(vector2)
    return numerator / denominator
# Concatenates two to four dictionaries into a single dictionary
def concatenate_dicts(dict1, dict2, dict3=None, dict4=None):
    """Merge two to four dictionaries into a new dictionary.

    The inputs are read in argument order, so for a key present in several of
    them the value from the last one wins. ``dict3`` and ``dict4`` are skipped
    when they are ``None``. The inputs themselves are not modified.
    """
    sources = [dict1, dict2]
    sources.extend(extra for extra in (dict3, dict4) if extra is not None)
    return {key: source[key] for source in sources for key in source}
# Calculates the euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    """Return the norm of the element-wise difference of the two vectors."""
    difference = numpy.array(vector1) - numpy.array(vector2)
    return numpy.linalg.norm(difference)
def create_vocab_and_vecs(t1, t2=None, a1=None, a2=None, aug1=None, aug2=None, lex_dict=None):
    """Merge the given dictionaries and split them into an index map and a value list.

    The dictionaries are merged in argument order (later ones override earlier
    ones for a shared key); arguments that are ``None`` are skipped.

    Returns:
        ``(vocab, vecs)`` where ``vocab`` maps each key to its position in
        ``vecs`` and ``vecs`` holds the merged values in insertion order.
    """
    merged = {}
    merged.update(t1)
    for optional in (t2, a1, a2, aug1, aug2, lex_dict):
        if optional is not None:
            merged.update(optional)
    vocab = {word: index for index, word in enumerate(merged)}
    vecs = list(merged.values())
    return vocab, vecs
def vocab_to_dict(vocab, vecs, lists):
    """Build a word-to-value dictionary for the words in ``lists``.

    Each word that is a key of ``vocab`` is mapped to ``vecs[vocab[word]]``;
    words missing from ``vocab`` are skipped.
    """
    return {word: vecs[vocab[word]] for word in lists if word in vocab}
def vocabs_to_dicts(vocab, vecs, t1_list, t2_list, a1_list, a2_list, aug1_list=None, aug2_list=None, lex=None):
    """Rebuild per-word-list dictionaries from ``vocab`` and ``vecs``.

    Each word list is converted with ``vocab_to_dict``.

    Returns:
        * ``(t1, t2, a1, a2, lex_dict)`` when ``lex`` is not ``None``;
          ``lex_dict`` is built from ``simlex_vocab`` for ``'simlex'``, from
          ``wordsim_vocab`` for ``'wordsim'`` and is empty for any other
          value. The augmentation lists are ignored in this case.
        * ``(t1, t2, a1, a2, aug1, aug2)`` when ``lex`` is ``None`` and both
          augmentation lists are given.
        * ``(t1, t2, a1, a2)`` otherwise.
    """
    base = tuple(
        vocab_to_dict(vocab, vecs, words)
        for words in (t1_list, t2_list, a1_list, a2_list)
    )

    if lex is not None:
        if lex == 'simlex':
            lex_dict = vocab_to_dict(vocab, vecs, simlex_vocab)
        elif lex == 'wordsim':
            lex_dict = vocab_to_dict(vocab, vecs, wordsim_vocab)
        else:
            lex_dict = {}
        return base + (lex_dict,)

    if aug1_list is None or aug2_list is None:
        return base

    aug1 = vocab_to_dict(vocab, vecs, aug1_list)
    aug2 = vocab_to_dict(vocab, vecs, aug2_list)
    return base + (aug1, aug2)
def dict_to_vocab_vecs(dictionary):
    """Split ``dictionary`` into an index map and a list of its values.

    Returns:
        ``(vocab, vecs)`` where ``vocab`` maps each key to its position in
        ``vecs`` and ``vecs`` holds the values in iteration order.
    """
    vocab = {word: position for position, word in enumerate(dictionary)}
    vecs = [dictionary[word] for word in dictionary]
    return vocab, vecs
# Computes a Principal Component Analysis (PCA) on up to four dictionaries
def principal_componant_analysis(vector_dict1, vector_dict2=None, vector_dict3=None, vector_dict4=None):
    """Run a two-component PCA over the vectors of up to four dictionaries.

    The vectors of every supplied dictionary are stacked (in argument order)
    into one matrix that is fitted and transformed with
    ``PCA(n_components=2)``. The inputs are deep-copied via
    ``create_duplicates`` before they are read.

    Returns:
        A dictionary mapping each word to its transformed row. A word present
        in more than one input contributes one row per occurrence to the fit,
        but only the row of its last occurrence is kept in the result.
    """
    logging.info("PCA: Principal composant analysis started")

    sources = [vector_dict1]
    sources.extend(
        extra for extra in (vector_dict2, vector_dict3, vector_dict4)
        if extra is not None
    )

    words = []
    rows = []
    for source in sources:
        duplicate = create_duplicates(source)
        for word in duplicate:
            words.append(word)
            rows.append(list(duplicate[word]))

    reducer = PCA(n_components=2)
    components = reducer.fit_transform(numpy.array(rows))
    projected = dict(zip(words, components))

    logging.info("PCA: Prinicipal composant analysis completed")
    return projected
def principal_componant_analysis2(vecs):
    """Fit ``PCA(n_components=2)`` on ``vecs`` and return the transformed array."""
    matrix = numpy.array(vecs)
    reducer = PCA(n_components=2)
    return reducer.fit_transform(matrix)