-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimilarityCheck.py
More file actions
39 lines (30 loc) · 845 Bytes
/
similarityCheck.py
File metadata and controls
39 lines (30 loc) · 845 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from difflib import SequenceMatcher
import itertools
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def removeSimilars(topTen):
    """Filter out near-empty entries and the weaker entry of each similar pair.

    Every entry must be indexable: element 3 is the sequence that is
    length-checked and passed to ``similar()``, and element 4 is the value
    compared with ``max()`` to decide which entry of a similar pair survives
    (presumably the text and its score -- confirm against the caller).

    Rules:
      * An entry whose element 3 has fewer than 2 items is always dropped and
        is never used in a similarity comparison.
      * For every remaining pair that ``similar()`` reports as alike, the
        entry with the larger element 4 is kept; on a tie the earlier entry
        is kept.

    NOTE: an entry that has already lost one comparison still takes part in
    later comparisons, so it can still eliminate another entry it is similar
    to.

    The list is filtered in place and the same list object is returned, so
    callers that rely on either the return value or the mutation keep
    working. If nothing needs removing, the input is returned untouched.
    """
    # Work on a snapshot and track positions instead of values: this avoids
    # mutating a list while iterating it and keeps equal (duplicate) entries
    # distinguishable, so exactly the losing occurrence is dropped.
    entries = list(topTen)
    too_short = {pos for pos, entry in enumerate(entries) if len(entry[3]) < 2}
    dropped = set(too_short)

    for (i, p1), (j, p2) in itertools.combinations(enumerate(entries), 2):
        if i in too_short or j in too_short:
            continue
        if similar(p1[3], p2[3]):
            best = max(p1[4], p2[4])
            # The first entry wins ties because max() returns its first
            # argument when both compare equal.
            dropped.add(j if best == p1[4] else i)

    if not dropped:
        return topTen

    topTen[:] = [entry for pos, entry in enumerate(entries) if pos not in dropped]
    return topTen
def similar(seq1, seq2, threshold=0.7):
    """Return True when the two sequences are more alike than ``threshold``.

    The likeness measure is ``difflib.SequenceMatcher(...).ratio()``, a float
    in the range [0, 1]. The comparison is strict: a ratio exactly equal to
    ``threshold`` is not considered similar.

    ``threshold`` defaults to 0.7, the cutoff this module has always used.
    """
    ratio = SequenceMatcher(a=seq1, b=seq2).ratio()
    return ratio > threshold