'''
Tools to take a directory of txt files and convert them to TFRecords.
'''
from collections import defaultdict, Counter

import numpy as np
import tensorflow as tf

PAD = "<PAD>"
START = "<START>"
EOS = "<EOS>"


class Preppy():
    '''
    Class that converts text inputs to numpy arrays of ids.
    It assigns ids to tokens sequentially, on the fly.
    '''

    def __init__(self, tokenizer_fn):
        self.vocab = defaultdict(self.next_value)  # Maps tokens to ids. Automatically assigns the next id when needed
        self.token_counter = Counter()  # Counts the token frequency
        self.vocab[PAD] = 0
        self.vocab[START] = 1
        self.vocab[EOS] = 2
        self.next = 2  # After 2 comes 3
        self.tokenizer = tokenizer_fn
        self.reverse_vocab = {}

    def next_value(self):
        self.next += 1
        return self.next

    def sequence_to_tf_example(self, sequence):
        '''
        Gets a sequence (a text like "hello how are you") and returns a SequenceExample.
        :param sequence: Some text
        :return: A SequenceExample
        '''
        # Convert the text to a list of ids
        id_list = self.sentence_to_id_list(sequence)
        ex = tf.train.SequenceExample()
        # A non-sequential feature of our example
        sequence_length = len(id_list) + 2  # +2 for the start and end tokens
        # Add the context feature; here we just need the length
        ex.context.feature["length"].int64_list.value.append(sequence_length)
        # Feature lists for the sequential features of our example
        # Add the tokens. This is the core sequence.
        # You can add another sequence in the feature_list dictionary, for translation for instance
        fl_tokens = ex.feature_lists.feature_list["tokens"]
        # Prepend the start token
        fl_tokens.feature.add().int64_list.value.append(self.vocab[START])
        for token in id_list:
            # Add the tokens one by one
            fl_tokens.feature.add().int64_list.value.append(token)
        # Append the end token
        fl_tokens.feature.add().int64_list.value.append(self.vocab[EOS])
        return ex

    def ids_to_string(self, tokens, length=None):
        string = ''.join([self.reverse_vocab[x] for x in tokens[:length]])
        return string

    def convert_token_to_id(self, token):
        '''
        Gets a token and looks it up in the vocabulary. If it is not in the vocab yet, it is added with the next free id.
        Then we return the id.
        :param token: A single token
        :return: the token's id in the vocab
        '''
        self.token_counter[token] += 1
        return self.vocab[token]

    def sentence_to_tokens(self, sent):
        return self.tokenizer(sent)

    def tokens_to_id_list(self, tokens):
        return list(map(self.convert_token_to_id, tokens))

    def sentence_to_id_list(self, sent):
        tokens = self.sentence_to_tokens(sent)
        id_list = self.tokens_to_id_list(tokens)
        return id_list

    def sentence_to_numpy_array(self, sent):
        id_list = self.sentence_to_id_list(sent)
        return np.array(id_list)

    def update_reverse_vocab(self):
        self.reverse_vocab = {id_: token for token, id_ in self.vocab.items()}

    def id_list_to_text(self, id_list):
        tokens = ''.join(map(lambda x: self.reverse_vocab[x], id_list))
        return tokens

    @staticmethod
    def parse(ex):
        '''
        Explain to TF how to go from a serialized example back to tensors.
        :param ex: A serialized SequenceExample
        :return: A dictionary of tensors, in this case {seq: the sequence, length: the length of the sequence}
        '''
        context_features = {
            "length": tf.FixedLenFeature([], dtype=tf.int64)
        }
        sequence_features = {
            "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }
        # Parse the example (returns a dictionary of tensors)
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            context_features=context_features,
            sequence_features=sequence_features
        )
        return {"seq": sequence_parsed["tokens"],
                "length": context_parsed["length"]}


class BibPreppy(Preppy):
    '''
    An extension of Preppy suited for our Bible task.
    It adds
    1) Storing the book_id in the TFRecord
    2) A map from book_ids to book names so we can explore the results
    '''

    def __init__(self, tokenizer_fn):
        super(BibPreppy, self).__init__(tokenizer_fn)
        self.book_map = {}

    def sequence_to_tf_example(self, sequence, book_id):
        id_list = self.sentence_to_id_list(sequence)
        ex = tf.train.SequenceExample()
        # Non-sequential features of our example
        sequence_length = len(id_list)
        ex.context.feature["length"].int64_list.value.append(sequence_length + 2)  # +2 for the start and end tokens
        ex.context.feature["book_id"].int64_list.value.append(book_id)
        # Feature list for the sequential feature of our example
        fl_tokens = ex.feature_lists.feature_list["tokens"]
        fl_tokens.feature.add().int64_list.value.append(self.vocab[START])
        for token in id_list:
            fl_tokens.feature.add().int64_list.value.append(token)
        fl_tokens.feature.add().int64_list.value.append(self.vocab[EOS])
        return ex

    @staticmethod
    def parse(ex):
        '''
        Explain to TF how to go from a serialized example back to tensors.
        :param ex: A serialized SequenceExample
        :return: A dictionary of tensors, in this case {seq, length, book_id}
        '''
        context_features = {
            "length": tf.FixedLenFeature([], dtype=tf.int64),
            "book_id": tf.FixedLenFeature([], dtype=tf.int64)
        }
        sequence_features = {
            "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }
        # Parse the example (returns a dictionary of tensors)
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            context_features=context_features,
            sequence_features=sequence_features
        )
        return {"seq": sequence_parsed["tokens"], "length": context_parsed["length"],
                "book_id": context_parsed["book_id"]}