-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathlang_proc.py
More file actions
50 lines (36 loc) · 1.49 KB
/
lang_proc.py
File metadata and controls
50 lines (36 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import itertools
import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer

# Frozen set for O(1) membership tests in Term.is_stop_word;
# stopwords.words() returns a list, which would make each lookup O(n).
_stop_words = frozenset(stopwords.words('english'))
class Term(object):
    """A single token normalized to its Porter stem.

    Equality and hashing are stem-based, so inflected variants of the
    same word (e.g. "runs"/"running") compare equal, while the original
    surface form is kept in ``full_word``.
    """

    # Shared stemmer: the original built a new PorterStemmer for every
    # Term, which is pure overhead since stemming is stateless.
    _stemmer = PorterStemmer()

    def __init__(self, full_word):
        # Original token as it appeared in the text.
        self.full_word = full_word
        # TODO: Lemmatization requires downloads
        # wnl = WordNetLemmatizer()
        # lemmas = [wnl.lemmatize(token) for token in tokens]
        # Lowercased Porter stem; identity of the Term for eq/hash.
        self.stem = self._stemmer.stem(full_word).lower()

    def __eq__(self, other):
        # Return NotImplemented for foreign types (per the comparison
        # protocol) instead of raising AttributeError on other.stem.
        if not isinstance(other, Term):
            return NotImplemented
        return self.stem == other.stem

    def __hash__(self):
        return hash(self.stem)

    def __repr__(self):
        # No .encode('utf8') here: on Python 3 that produced bytes and
        # rendered as b'...' noise inside the formatted string.
        return "Term {}({})".format(self.stem, self.full_word)

    def __str__(self):
        return repr(self)

    def is_punctuation(self):
        # True for single punctuation characters such as ',' or '.'.
        return self.stem in string.punctuation

    def is_stop_word(self):
        # Checked against the surface form, not the stem, since the
        # stop-word list contains unstemmed English words.
        return self.full_word in _stop_words
def stem_and_tokenize_text(text):
    """Split *text* into sentences, word-tokenize each sentence, and
    return the tokens as Term objects with punctuation tokens removed.

    Returns a list rather than a lazy ``filter`` object: on Python 3 a
    filter is a single-use iterator, so a caller iterating the result
    twice would silently get nothing the second time.
    """
    # One tokenizer instance for all sentences (original rebuilt it
    # inside the loop for every sentence).
    tokenizer = TreebankWordTokenizer()
    tokens = itertools.chain.from_iterable(
        tokenizer.tokenize(sent) for sent in sent_tokenize(text))
    return [term for term in (Term(tok) for tok in tokens)
            if not term.is_punctuation()]
def to_query_terms(query_raw):
    """Convert a raw query string into a sequence of Term objects.

    Deliberately a thin alias of stem_and_tokenize_text: kept as its
    own entry point so query-side processing can diverge from the
    document side later without touching any callers.
    """
    return stem_and_tokenize_text(query_raw)
def to_doc_terms(doc_raw):
    """Convert raw document text into a sequence of Term objects.

    Deliberately a thin alias of stem_and_tokenize_text: kept as its
    own entry point so document-side processing can diverge from the
    query side later without touching any callers.
    """
    return stem_and_tokenize_text(doc_raw)