# 498KeywordExtraction/rake.py (master branch of bliminate/498KeywordExtraction)

import re
from collections import defaultdict

def get_stopwords(stopwords_file):
    """Read a stopword list from a text file holding one stopword per line.

    Args:
        stopwords_file: Path of the stopword file to read.

    Returns:
        A list of the stopwords in file order, each stripped of surrounding
        whitespace. Blank and whitespace-only lines are skipped.
    """
    with open(stopwords_file, 'r') as myfile:
        stripped_lines = (line.strip() for line in myfile)
        # Dropping empty entries matters: a file ending in a newline would
        # otherwise contribute '' as a "stopword", which turns into a
        # zero-width alternative once it is embedded in a regex.
        return [word for word in stripped_lines if word]

def build_regex(stopwords_file):
    """Compile one case-insensitive regex that matches any listed stopword.

    Each stopword must start at a word boundary and must not be followed by
    a word character or a hyphen, so stopwords embedded in longer words are
    not matched.

    Args:
        stopwords_file: Path of the stopword file, passed to get_stopwords.

    Returns:
        A compiled pattern. When the list holds no usable stopword the
        pattern never matches, so splitting on it leaves text untouched.
    """
    stopword_regex_list = []
    for stopword in get_stopwords(stopwords_file):
        stopword = stopword.strip()
        if not stopword:
            # An empty entry would yield a zero-width alternative, and
            # re.split (Python 3.7+) splits on zero-width matches.
            continue
        # Stopwords are literal text, so escape regex metacharacters.
        stopword_regex_list.append(
            r'\b' + re.escape(stopword) + r'(?![\w-])'
        )
    if not stopword_regex_list:
        # '|'.join([]) is '' and would match at every position; use an
        # always-failing lookahead instead.
        return re.compile(r'(?!)', re.IGNORECASE)
    return re.compile('|'.join(stopword_regex_list), re.IGNORECASE)

def get_sentences(text_file):
    """Read a text file and cut its contents at punctuation marks.

    The delimiters are . , ! ? : ; newline, double quote, single quote,
    parentheses and hyphen. Fragments are returned as-is: not stripped,
    and empty fragments between adjacent delimiters are kept.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A list of the text fragments found between delimiters.
    """
    with open(text_file, 'r') as handle:
        contents = handle.read()
    return re.split('[.,!?:;\n\"\'()-]', contents)

def get_candidate_keywords(stop_regex, sentences):
    """Split every sentence on stopwords and collect the remaining phrases.

    Args:
        stop_regex: Compiled pattern whose matches separate the phrases.
        sentences: Iterable of sentence strings.

    Returns:
        A list of the whitespace-stripped, non-empty pieces, in the order
        they occur. Repeated phrases are kept.
    """
    stripped_pieces = (
        piece.strip()
        for sentence in sentences
        for piece in stop_regex.split(sentence)
    )
    return [phrase for phrase in stripped_pieces if phrase != '']

def make_keyword_sentences_dict(keywords, sentences):
    """Map each keyword to the sentences that contain it.

    Matching is a plain substring test (``keyword in sentence``). A keyword
    listed more than once in ``keywords`` is processed only once, so its
    sentence list is not repeated for every duplicate.

    Args:
        keywords: Iterable of keyword strings; duplicates are allowed.
        sentences: Sequence of sentence strings; it is scanned once per
            distinct keyword.

    Returns:
        A defaultdict(list) mapping keyword to its whitespace-stripped
        matching sentences, in sentence order. Keywords that match no
        sentence get no entry.
    """
    keyword_sentences = defaultdict(list)
    # dict.fromkeys removes duplicate keywords and keeps first-seen order.
    for keyword in dict.fromkeys(keywords):
        matches = [
            sentence.strip() for sentence in sentences if keyword in sentence
        ]
        if matches:
            keyword_sentences[keyword].extend(matches)
    return keyword_sentences

def RAKE(stoplist_file, text_file):
    """Run the keyword-extraction pipeline over a text file.

    Builds the stopword regex, splits the text into sentences, extracts
    the candidate keywords, and maps each keyword to its sentences.

    Args:
        stoplist_file: Path of the stopword file.
        text_file: Path of the text file to analyse.

    Returns:
        The keyword-to-sentences mapping produced by
        make_keyword_sentences_dict.
    """
    splitter = build_regex(stoplist_file)
    text_sentences = get_sentences(text_file)
    return make_keyword_sentences_dict(
        get_candidate_keywords(splitter, text_sentences),
        text_sentences,
    )