-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsimhash.py
More file actions
59 lines (52 loc) · 2.3 KB
/
csimhash.py
File metadata and controls
59 lines (52 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import hashlib
from nltk.corpus import stopwords
import ctoken
from collections import defaultdict
# Rebinds the imported name `stopwords`: the right-hand side still reads the
# NLTK corpus object, and afterwards `stopwords` is a plain list holding the
# first token ctoken.tokenize() returns for each English stopword.
# NOTE(review): presumably this normalizes stopwords the same way document
# words are tokenized - confirm against ctoken.
stopwords = [ctoken.tokenize(i)[0] for i in stopwords.words('english')]
# Width that hash() zero-pads each word's binary md5 digest to (md5 is 128 bit).
HASHBITS = 128
def hash(words):
    """
    Creates a hashcode using the simhash method, and md5 128 bit hashcodes.

    Every distinct word that is not a stopword is hashed with md5. Each bit of
    that digest votes for its column: +frequency for a 1 bit, -frequency for a
    0 bit. A column of the result is "1" only when its summed vote is strictly
    positive (ties become "0").

    The name shadows the builtin ``hash`` inside this module; it is kept
    because it is the function's public name.

    :param words: list of tokenized words
    :return: 128 bit hash code, as a string of "0"/"1" characters
    :raises IndexError: if words is empty, or if every word is a stopword
    """
    if not words:
        raise IndexError("cannot compute a simhash of an empty word list")

    # compute word frequencies; the frequency is the weight of a word's vote
    freqs = defaultdict(int)
    for word in words:
        freqs[word] += 1

    # one row per distinct non-stopword: the md5 bits turned into +/- weight
    weighted_rows = []
    for word, weight in freqs.items():
        if word in stopwords:
            continue
        digest = hashlib.md5(word.encode("utf-8")).hexdigest()
        # bin() drops leading zeros; zfill pads the row back out to HASHBITS
        bits = bin(int(digest, 16))[2:].zfill(HASHBITS)
        weighted_rows.append([weight if bit == "1" else -weight for bit in bits])

    # Without this guard there would be no rows to sum. Fail the same way as
    # for empty input instead of continuing with an undefined result.
    if not weighted_rows:
        raise IndexError("cannot compute a simhash: every word is a stopword")

    # sums the columns, then converts each sum back to a bit character
    column_sums = [sum(column) for column in zip(*weighted_rows)]
    return "".join("1" if total > 0 else "0" for total in column_sums)
def hamming_distance(hash1, hash2):
    """
    Counts the positions at which two hashes differ.

    The inputs are walked pairwise with zip, so when one is longer than the
    other the extra trailing elements are ignored.

    :param hash1: first hash, e.g. a string of "0"/"1" characters
    :param hash2: second hash
    :return: number of positions whose elements are not equal
    """
    mismatches = 0
    for left, right in zip(hash1, hash2):
        # if the bits do not match, increment hamming distance count
        if left != right:
            mismatches += 1
    return mismatches
def find_similar_to_doc(main_docID, hashdict, distance):
    """
    Finds the documents whose hash is close to the hash of main_docID.

    :param main_docID: key in hashdict of the reference document
    :param hashdict: mapping of docID to hash; the values are passed to
        hamming_distance
    :param distance: exclusive upper bound on the hamming distance of a match
    :return: list of (docID, hamming distance) tuples in hashdict iteration
        order; main_docID itself is included (at distance 0) when distance > 0
    :raises KeyError: if hashdict is not empty and main_docID is not a key
    """
    # An empty mapping has nothing to compare, even when main_docID is missing.
    if not hashdict:
        return []

    main_hash = hashdict[main_docID]
    ret = []
    for docID, doc_hash in hashdict.items():
        # Use a separate name for the measured distance so the `distance`
        # threshold is not overwritten.
        doc_distance = hamming_distance(main_hash, doc_hash)
        if doc_distance < distance:
            # list.append takes a single argument, so store the pair as a tuple
            ret.append((docID, doc_distance))
    return ret
if __name__ == '__main__':
    # Manual smoke check: two identical word lists are hashed and the hamming
    # distance between the two hashes is printed (expected to be 0).
    words = ["dingo", "ate", "me", "baby"]
    words6 = ["dingo", "ate", "me", "baby"]
    words2 = ["dingo", "ate", "me", "baby"]
    first_hash = hash(words)
    second_hash = hash(words2)
    print(hamming_distance(first_hash, second_hash))