tensor_decomp_embedding/gensim_utils.py at master · popcorncolonel/tensor_decomp_embedding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from nltk import bigrams
import gensim
import numpy as np

def get_context_matrix(model, word_vocabs, word_index, fixed_size=True, padding_words=False):
    """
    Collect the vocab indices of the words surrounding position `word_index`.

    `word_vocabs` is the sentence as a list of vocab entries (each exposing `.index`),
    and `word_index` is the position of the target word inside that list. The context
    spans `model.window` positions on each side; the target itself is left out.

    Positions falling outside the sentence are skipped, unless both `fixed_size` and
    `padding_words` are truthy. In that case `len(model.vocab)` is emitted for every
    slot before the sentence (<S> token) and `len(model.vocab) + 1` for every slot
    after it (</S> token).
    """
    window = model.window
    use_padding = fixed_size and padding_words
    neighbours = []
    for offset in range(-window, window + 1):
        if offset == 0:
            # never include the target word in its own context
            continue
        position = word_index + offset
        if position < 0:
            # slot lies before the start of the sentence
            if use_padding:
                neighbours.append(len(model.vocab))
        elif position >= len(word_vocabs):
            # slot lies past the end of the sentence
            if use_padding:
                neighbours.append(len(model.vocab) + 1)
        else:
            neighbours.append(word_vocabs[position].index)
    return neighbours


def get_target_y(word_vocabs, word_index):
    """
    Return the vocab index (`.index`) of the entry at position `word_index` in `word_vocabs`.
    """
    target = word_vocabs[word_index]
    return target.index


def batch_generator(model, sentences, batch_size=512, n_iters=1, fixed_size=True, stopwords=frozenset()):
    '''
    Yield batches (lists) of `(context_indices, target_index)` pairs built from `sentences`.

    Words that are not in `model.vocab`, or that are in `stopwords`, are dropped before
    contexts are computed.

    if `fixed_size` is True, only words that have a full `model.window` words on both sides
        are used as targets, so every context has exactly `2 * model.window` entries.
    otherwise, every remaining word in the sentence is used as a target; contexts near the
        sentence edges are then shorter, because no padding is requested from
        `get_context_matrix`.

    `n_iters` is the number of passes over `sentences`; a falsy value falls back to
    `model.iter`. Note that `sentences` is re-iterated on every pass, so a one-shot
    iterator only contributes to the first pass.

    The batch size is checked once per sentence, so a yielded batch may hold more than
    `batch_size` pairs. Whatever is left at the end of a pass is yielded as a final,
    possibly smaller, batch.
    '''
    if not n_iters:
        n_iters = model.iter
    batch = []
    for _ in range(n_iters):
        for sentence in sentences:
            word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and w not in stopwords]
            for pos, _word in enumerate(word_vocabs):
                if fixed_size:
                    if pos < model.window:
                        # not enough words to the left yet
                        continue
                    if pos + model.window >= len(word_vocabs):
                        # not enough words to the right; later positions are no better
                        break
                # the word at `pos` is the one we're trying to predict
                word_matrix = get_context_matrix(model, word_vocabs, pos, fixed_size=fixed_size)
                target_y = get_target_y(word_vocabs, pos)
                batch.append((word_matrix, target_y))
            if len(batch) >= batch_size:
                yield batch
                batch = []
        if batch:
            yield batch
            # Start the next pass with a fresh list. Without this, the list that was
            # just handed to the consumer would keep growing and its pairs would be
            # yielded a second time.
            batch = []

def batch_generator2(model, sentences, batch_size):
    '''
    Yield batches of sentence chunks. No word/context pairs or anything.

    Each sentence is mapped to the vocab indices of its in-vocabulary words and cut into
    consecutive chunks of `1 + 2*model.window` indices (the last chunk of a sentence may
    be shorter). A batch is yielded once it holds at least `batch_size` chunks; this is
    checked after each sentence, so batches may be larger than `batch_size`. Any leftover
    chunks are yielded at the end.
    '''
    pending = []
    for sentence in sentences:
        indices = [model.vocab[token].index for token in sentence if token in model.vocab]
        chunk_len = 1 + 2 * model.window
        pending.extend(
            [indices[begin:begin + chunk_len] for begin in range(0, len(indices), chunk_len)]
        )
        if len(pending) >= batch_size:
            yield pending
            pending = []
    if pending:
        yield pending

if __name__ == '__main__':
    def sentences_generator(num_sents=5e6):
        """
        Yield up to `num_sents` whitespace-tokenized sentences (lists of strings),
        one per line of the tokenized wiki dump.

        Lines with more than 1000 tokens are skipped and do not count towards
        `num_sents`. A progress message is printed roughly every 10% of `num_sents`,
        and the total token count is printed once the limit has been reached.
        """
        num_sents = int(num_sents)
        # Clamp to 1 so the modulus below cannot be zero when num_sents < 10.
        progress_every = max(1, num_sents // 10)
        tokenized_wiki = '../wikidump_2008.txt.randomized'  # already has stopwords removed
        count = 0
        n_tokens = 0
        with gensim.utils.smart_open(tokenized_wiki, 'r') as f:
            for line in f:
                if count % progress_every == 0 and count > 0:
                    print("Just hit sentence {} out of {} ({}%)".format(count, num_sents, 100*count / num_sents))
                if count >= num_sents:
                    print("{} total tokens".format(n_tokens))
                    # End the generator with a plain return: raising StopIteration
                    # inside a generator is turned into RuntimeError (PEP 479).
                    return
                sent = line.rstrip().split()
                if len(sent) > 200:
                    print(' '.join(sent))
                    # NOTE(review): debugging breakpoint - this halts unattended runs
                    # on any sentence longer than 200 tokens. Remove once no longer needed.
                    import pdb; pdb.set_trace()
                if len(sent) > 1000:
                    continue
                n_tokens += len(sent)
                count += 1
                yield sent

    sentences = sentences_generator()
    import dill
    # NOTE(review): dill.load executes arbitrary code from the file, like pickle -
    # only load model files from a trusted source.
    with open('wikimodel_5000000_1000', 'rb') as f:
        model = dill.load(f)
    batches = batch_generator2(model, sentences, batch_size=512)
    # Drain the generator; the batches themselves are discarded.
    for _ in batches:
        pass