-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontent_and_elements_jaccard_comparison.py
More file actions
78 lines (62 loc) · 3.49 KB
/
content_and_elements_jaccard_comparison.py
File metadata and controls
78 lines (62 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""content_and_elements_jaccard_comparison.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Zcuo9ijl7SlsgphOOe_X69Hr6Uhmt8uJ
"""
# Four room descriptions, each a list of free-text attribute/furniture phrases.
# These are compared pairwise below, both as token-embedding vectors (cosine)
# and as sets of phrases (Jaccard).
vec1 = ['large', 'light double bed', 'light pouf', 'white bedside table', 'light round chandelier', 'white painting', '2 wall lamps']
vec2 = ['small', 'black bed', 'painting', 'wall lamp', 'green curtains']
vec3 = ['small', 'white double bed', 'white dresser', 'white mirror', 'light table lamp', 'small black TV', 'white ceiling fan', 'white blinds']
vec4 = ['large', 'light wood-colored double bed', '2 light wood-colored bedside tables', '2 light wood-colored dressers', 'light wood-colored wardrobe', 'white rug', 'black chair', 'chandelier', 'light yellow curtains']
# All descriptions collected in one list for pairwise iteration.
vecs = [vec1, vec2, vec3, vec4]
import numpy as np
# !pip install spacy
# !pip install gensim
import spacy
import gensim.downloader as api
# Load the 300-dimensional Google News word2vec embeddings (large download on
# first run) and the small English spaCy pipeline used only for tokenization.
word2vec_model = api.load("word2vec-google-news-300")
nlp = spacy.load("en_core_web_sm")
# Function to calculate cosine similarity between two vectors
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0.0 when either vector has zero norm; the original expression
    divided by zero in that case, yielding nan (and a runtime warning).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product
# Function to calculate element similarity using Jaccard similarity
def calculate_element_similarity(set1, set2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Two empty sets are treated as identical (similarity 1.0); the original
    raised ZeroDivisionError in that case.
    """
    union = set1 | set2
    if not union:
        return 1.0
    return len(set1 & set2) / len(union)
# Tokenize each phrase with spaCy and concatenate the word2vec embedding of
# every token the model knows, producing one flat feature list per description.
# Tokens absent from the model's vocabulary are silently skipped, so the
# resulting length varies between descriptions.
def get_vector_representation(vector, word2vec_model):
    representation = []
    for phrase in vector:
        known_words = (token.text for token in nlp(phrase)
                       if token.text in word2vec_model)
        for word in known_words:
            representation.extend(word2vec_model[word])
    return representation
# Embed each room description.  Lengths differ between rooms because every
# recognised token contributes 300 floats.
vector1 = get_vector_representation(vec1, word2vec_model)
vector2 = get_vector_representation(vec2, word2vec_model)
vector3 = get_vector_representation(vec3, word2vec_model)
vector4 = get_vector_representation(vec4, word2vec_model)
# Longest embedding length; all vectors are zero-padded to this below so that
# cosine similarity is defined between them.
max_dimension = max(len(vector1), len(vector2), len(vector3), len(vector4))
def pad_vector(vector, dimension):
    """Right-pad *vector* with zeros to length *dimension*; returns an ndarray."""
    values = np.asarray(vector)
    filler = np.zeros(dimension - values.size, dtype=values.dtype)
    return np.concatenate((values, filler))
# Zero-pad every embedding to the longest one so the cosine similarities
# below compare vectors of equal length.
vector1 = pad_vector(vector1, max_dimension)
vector2 = pad_vector(vector2, max_dimension)
vector3 = pad_vector(vector3, max_dimension)
vector4 = pad_vector(vector4, max_dimension)
# Jaccard similarity for every unordered pair of descriptions, in the order
# (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
element_similarities = []
for first in range(len(vecs)):
    for second in range(first + 1, len(vecs)):
        element_similarities.append(
            calculate_element_similarity(set(vecs[first]), set(vecs[second])))
# Cosine similarity over the padded embeddings, same pair order as
# element_similarities: (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
padded_vectors = [vector1, vector2, vector3, vector4]
vector_similarities = []
for first in range(len(padded_vectors)):
    for second in range(first + 1, len(padded_vectors)):
        vector_similarities.append(
            calculate_cosine_similarity(padded_vectors[first], padded_vectors[second]))
# Combine embedding (cosine) and phrase-set (Jaccard) similarity with equal weight.
overall_similarities = [0.5 * vector_sim + 0.5 * element_sim
                        for vector_sim, element_sim in zip(vector_similarities, element_similarities)]
# The six similarities correspond to the pairs (1,2),(1,3),(1,4),(2,3),(2,4),(3,4).
# The original labels used `1+i//3` / `2+i%3`, which mislabelled the last three
# pairs (it printed "vec2 and vec2", "vec2 and vec3", "vec2 and vec4").
pair_indices = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
for (a, b), similarity in zip(pair_indices, overall_similarities):
    print(f"Similarity between vec{a} and vec{b}: {similarity:.2f}")