-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrecalculate.py
More file actions
59 lines (50 loc) · 2.05 KB
/
recalculate.py
File metadata and controls
59 lines (50 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 19 16:40:46 2017
@author: yjiang
Calculate the term/phrase frequency based on the language model
"""
import gensim, logging, nltk, re, os, sys
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Phrases
# English stopwords from NLTK, held in a frozenset so that each
# `token in stopwords` test is O(1) instead of a linear scan of a list.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))
# English Snowball stemmer; no active code in this file calls it at present.
stemmer = SnowballStemmer("english")
# Previously saved gensim Phrases models.  The paths are relative, so they are
# resolved against the current working directory at import time.
bigram = Phrases.load('model/bigram')
trigram = Phrases.load('model/trigram')
def tokenize_stem_stop(text):
    """Split text into word tokens, dropping stopwords and symbol-only tokens.

    Despite the name, no stemming is applied: surviving tokens are returned
    exactly as nltk.word_tokenize produced them.

    text: the string to tokenize.
    Returns a list of the tokens that are not in the module-level stopwords
    collection and that contain at least one ASCII letter or digit.
    """
    # Tokens made only of punctuation/symbols have no match and are dropped.
    has_alnum = re.compile('[a-zA-Z0-9]').search
    return [
        tok
        for tok in nltk.word_tokenize(text)
        if tok not in stopwords and has_alnum(tok)
    ]
def recal(path):
    """Accumulate weighted term counts from a file of "<text> <count>" lines.

    Every non-blank line is split at its last space.  The left part is
    lower-cased, tokenized with tokenize_stem_stop() and passed through the
    bigram and then the trigram phrase model; the right part is parsed as an
    integer count.  Each resulting term that is not purely digits and is
    longer than two characters has the line's count added to its total.
    Terms containing '_' (presumably phrases joined by the models) that come
    from a line with more than four tokens are weighted 50 times higher.

    path: path of the input text file.
    Returns a dict mapping each kept term to its accumulated weighted count.
    Raises IndexError if a non-blank line contains no space, and ValueError
    if the text after the last space is not an integer.
    """
    result = {}
    # The context manager closes the file even when a line fails to parse.
    with open(path) as source:
        for line in source:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # Split once and reuse both halves.
            fields = line.rsplit(' ', 1)
            text = tokenize_stem_stop(fields[0].lower())
            parsed_text = trigram[bigram[text]]
            value = int(fields[1])
            for term in parsed_text:
                if term.isdigit() or len(term) <= 2:
                    continue
                # Boost phrase terms, but only for lines with more than 4 tokens.
                weight = 50 if len(text) > 4 and '_' in term else 1
                result[term] = result.get(term, 0) + value * weight
    return result
def hasNumbers(inputString):
    """Return True if at least one character of inputString is a digit.

    inputString: the string to inspect.
    Returns False for an empty string.
    """
    for ch in inputString:
        if ch.isdigit():
            # One digit is enough; stop scanning.
            return True
    return False
# Score every term found in the aggregated input file.
output = recal('/Users/yjiang/Documents/pythonWorkspace/freqCounter/data/agg_self_200.txt')

# Rank the (term, score) pairs from the highest score to the lowest.
sorted_output = list(output.items())
sorted_output.sort(key=lambda pair: pair[1], reverse=True)

# Write "term,score" rows, skipping terms that contain a digit or whose
# score is below 50.
with open("output/output_self_200.csv", "w") as csv_out:
    for term, total in sorted_output:
        if total < 50 or hasNumbers(term):
            continue
        csv_out.write("{0},{1}\n".format(term, total))
# sent = ['near', 'earth', 'object', 'planetary', 'society']
# print(trigram[bigram[sent]])