-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtf-idf.py
More file actions
65 lines (57 loc) · 1.6 KB
/
tf-idf.py
File metadata and controls
65 lines (57 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# coding: UTF-8
import MeCab,math
# get article data from file
def get_articles():
    """Load the article texts from ``data/article.txt``.

    The whole file is read and split on commas. The final piece (whatever
    follows the last comma) is discarded.

    Returns:
        list of str: the comma-separated pieces, minus the last one.
    """
    # The context manager closes the handle even if read() raises, instead
    # of leaving the open file for the garbage collector to clean up.
    with open("data/article.txt", "r") as source:
        data = source.read()
    articles = data.split(',')
    # Drop the tail after the final comma -- presumably the empty/newline
    # remainder left by a trailing separator; confirm against the data file.
    articles.pop()
    return articles
# morphological analysis by mecab and count words
def mecab_parse(articles):
    """Tokenize each article with MeCab and count its words.

    Args:
        articles: iterable of article strings.

    Returns:
        list of dict: one ``{surface form: occurrence count}`` dict per
        article, in the same order as ``articles``.
    """
    tagger = MeCab.Tagger()
    wordCount = []
    for article in articles:
        # Counts are built from this article's own tokens only, so that
        # per-document term and document frequencies are meaningful.
        counts = {}
        # Assumes MeCab's default output layout: one "surface<TAB>features"
        # line per morpheme, terminated by a bare "EOS" line. A custom
        # output format configured for the Tagger would need other parsing.
        for line in tagger.parse(article).splitlines():
            if not line or line == 'EOS':
                continue
            surface = line.split('\t', 1)[0]
            counts[surface] = counts.get(surface, 0) + 1
        wordCount.append(counts)
    return wordCount
def calc_idf(wordCount):
    """Compute the inverse document frequency of every word.

    Args:
        wordCount: list of per-document ``{word: count}`` dicts.

    Returns:
        dict: ``{word: log2(N / df)}`` where ``N`` is the number of
        documents and ``df`` is the number of documents containing the word.
    """
    total_docs = len(wordCount)

    # Document frequency: each document contributes at most once per word
    # because a dict yields every key exactly once.
    doc_freq = {}
    for counts in wordCount:
        for term in counts:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # The 1.0 factor forces true division under Python 2 as well.
    return {
        term: math.log(1.0 * total_docs / seen_in, 2)
        for term, seen_in in doc_freq.items()
    }
# target the first article data (j = 0)
def calc_tf(wordCount, index=0):
    """Compute the term frequencies of one document.

    Args:
        wordCount: list of per-document ``{word: count}`` dicts.
        index: position of the document to score. Defaults to 0, the
            first document.

    Returns:
        dict: ``{word: count / total count of the document}``. Empty when
        the selected document has no words.

    Raises:
        IndexError: if ``wordCount`` has no document at ``index``.
    """
    counts = wordCount[index]
    total = sum(counts.values())
    # The 1.0 factor forces true division under Python 2 as well. An empty
    # document never reaches the division, so total == 0 is harmless here.
    return {word: 1.0 * occurrences / total
            for word, occurrences in counts.items()}
def calc_tf_idf(tf, idf):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tf: dict mapping a word to its term frequency.
        idf: dict mapping a word to its inverse document frequency; it
            must contain every key of ``tf``.

    Returns:
        dict: ``{word: tf[word] * idf[word]}`` for every word in ``tf``.

    Raises:
        KeyError: if a word of ``tf`` is missing from ``idf``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}
def output(td_idf):
    """Print every ``word: score`` pair, lowest score first.

    Args:
        td_idf: dict mapping a word (str) to its numeric score.
    """
    # sorted() is stable, so words with equal scores keep the dict's own
    # iteration order.
    for word, score in sorted(td_idf.items(), key=lambda pair: pair[1]):
        # Call form with a single argument: valid on Python 3 and prints
        # the same text under Python 2.
        print(word + ': ' + str(score))
def main():
    """Run the pipeline: load articles, count words, score, and print.

    Scores are computed for the first article only, because ``calc_tf``
    is called without selecting another document.
    """
    articles = get_articles()
    wordCount = mecab_parse(articles)
    idf = calc_idf(wordCount)
    tf = calc_tf(wordCount)
    output(calc_tf_idf(tf, idf))


# Run only when executed as a script, so importing this module (e.g. to
# reuse or test the helpers) does not read files or invoke MeCab.
if __name__ == "__main__":
    main()