-
Notifications
You must be signed in to change notification settings - Fork 52
Expand file tree
/
Copy path__init__.py
More file actions
253 lines (192 loc) · 7.86 KB
/
__init__.py
File metadata and controls
253 lines (192 loc) · 7.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""English Word Segmentation in Python
Word segmentation is the process of dividing a phrase without spaces back
into its constituent parts. For example, consider a phrase like "thisisatest".
For humans, it's relatively easy to parse. This module makes it easy for
machines too. Use `segment` to parse a phrase into its parts:
>>> from wordsegment import load, segment
>>> load()
>>> segment('thisisatest')
['this', 'is', 'a', 'test']
In the code, 1024908267229 is the total number of words in the corpus. A
subset of this corpus is found in unigrams.txt and bigrams.txt which
should accompany this file. A copy of these files may be found at
http://norvig.com/ngrams/ under the names count_1w.txt and count_2w.txt
respectively.
Copyright (c) 2016 by Grant Jenks
Based on code from the chapter "Natural Language Corpus Data"
from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
http://oreilly.com/catalog/9780596157111/
Original Copyright (c) 2008-2009 by Peter Norvig
"""
import io
import math
import os.path as op
import sys
import re
import string
class Segmenter(object):
    """Segmenter

    Support for object-oriented programming and customization.

    Holds unigram and bigram counts and implements the memoized search
    used by `segment`. Call `load` before segmenting to populate the
    counts from the bundled data files.
    """

    # Characters kept by `clean`; every other character is stripped
    # before segmentation.
    ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
    UNIGRAMS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'unigrams.txt',
    )
    BIGRAMS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'bigrams.txt',
    )
    # Total number of words in the source corpus (see module docstring).
    TOTAL = 1024908267229.0
    # Longest candidate word considered by `divide`.
    LIMIT = 24
    WORDS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'words.txt',
    )

    def __init__(self):
        # Counts start empty; `load` populates them from disk. Keeping
        # them as instance attributes allows per-instance customization.
        self.unigrams = {}
        self.bigrams = {}
        self.total = 0.0
        self.limit = 0
        self.words = []

    def load(self):
        "Load unigram and bigram counts from disk."
        self.unigrams.update(self.parse(self.UNIGRAMS_FILENAME))
        self.bigrams.update(self.parse(self.BIGRAMS_FILENAME))
        self.total = self.TOTAL
        self.limit = self.LIMIT
        with io.open(self.WORDS_FILENAME, encoding='utf-8') as reader:
            text = reader.read()
            self.words.extend(text.splitlines())

    @staticmethod
    def parse(filename):
        "Read `filename` and parse tab-separated file of word and count pairs."
        with io.open(filename, encoding='utf-8') as reader:
            lines = (line.split('\t') for line in reader)
            return dict((word, float(number)) for word, number in lines)

    def score(self, word, previous=None):
        "Score `word` in the context of `previous` word."
        unigrams = self.unigrams
        bigrams = self.bigrams
        total = self.total
        if previous is None:
            if word in unigrams:
                # Probability of the given word.
                return unigrams[word] / total
            # Penalize words not found in the unigrams according
            # to their length, a crucial heuristic.
            return 10.0 / (total * 10 ** len(word))
        bigram = '{0} {1}'.format(previous, word)
        if bigram in bigrams and previous in unigrams:
            # Conditional probability of the word given the previous
            # word. The technical name is *stupid backoff* and it's
            # not a probability distribution but it works well in
            # practice.
            return bigrams[bigram] / total / self.score(previous)
        # Fall back to using the unigram probability.
        return self.score(word)

    def isegment(self, text):
        "Return iterator of words that is the best segmentation of `text`."
        memo = dict()

        def search(text, previous='<s>'):
            "Return max of candidates matching `text` given `previous` word."
            if text == '':
                return 0.0, []

            def candidates():
                "Generator of (score, words) pairs for all divisions of text."
                for prefix, suffix in self.divide(text):
                    prefix_score = math.log10(self.score(prefix, previous))
                    pair = (suffix, prefix)
                    if pair not in memo:
                        memo[pair] = search(suffix, prefix)
                    suffix_score, suffix_words = memo[pair]
                    yield (prefix_score + suffix_score, [prefix] + suffix_words)

            return max(candidates())

        # Avoid recursion limit issues by dividing text into chunks, segmenting
        # those chunks and combining the results together. Chunks may divide
        # words in the middle so prefix chunks with the last five words of the
        # previous result.
        clean_text = self.clean(text)
        size = 250
        prefix = ''
        for offset in range(0, len(clean_text), size):
            chunk = clean_text[offset:(offset + size)]
            _, chunk_words = search(prefix + chunk)
            prefix = ''.join(chunk_words[-5:])
            del chunk_words[-5:]
            for word in chunk_words:
                yield word
        _, prefix_words = search(prefix)
        for word in prefix_words:
            yield word

    def segment_ignore_digits(self, text):
        """Apply segmentation only to non-numeric text.

        Substrings containing a digit are passed through unmodified;
        everything else is segmented normally.
        """
        # Split on whitespace-delimited tokens that contain a digit or a
        # dot; the capturing group keeps those tokens in the result.
        segments = re.split(r'((?=\S*[\d.])\S*)', text)
        digit_checker = re.compile(r'\d')
        results = []
        for substring in segments:
            if digit_checker.search(substring) is not None:
                # Has digits, so append substring without modification.
                results.append(substring)
            else:
                results.extend(self.isegment(substring))
        return results

    def segment(self, text, ignore_digits=False):
        """Return list of words that is the best segmentation of `text`.

        When `ignore_digits` is true, substrings containing digits are
        passed through unmodified instead of being segmented.
        """
        if ignore_digits:
            return self.segment_ignore_digits(text)
        else:
            return list(self.isegment(text))

    def divide(self, text):
        "Yield `(prefix, suffix)` pairs from `text`."
        # Candidate prefixes are capped at `limit` characters; longer
        # words do not occur in the corpus.
        for pos in range(1, min(len(text), self.limit) + 1):
            yield (text[:pos], text[pos:])

    @classmethod
    def clean(cls, text):
        "Return `text` lower-cased with non-alphanumeric characters removed."
        alphabet = cls.ALPHABET
        text_lower = text.lower()
        letters = (letter for letter in text_lower if letter in alphabet)
        return ''.join(letters)
# Module-level singleton so the functional API (`load()`, `segment()`, ...)
# can be used without explicitly constructing a Segmenter.
_segmenter = Segmenter() # pylint: disable=invalid-name
# Bound-method aliases exposing the singleton's API as module functions.
clean = _segmenter.clean # pylint: disable=invalid-name
load = _segmenter.load # pylint: disable=invalid-name
isegment = _segmenter.isegment # pylint: disable=invalid-name
segment = _segmenter.segment # pylint: disable=invalid-name
# These alias the singleton's mutable containers; they are empty until
# `load()` populates them in place.
UNIGRAMS = _segmenter.unigrams
BIGRAMS = _segmenter.bigrams
WORDS = _segmenter.words
def main(arguments=()):
    """Command-line interface (CLI) entry-point. Parse `arguments` into in-file
    and out-file then read lines from in-file, segment the lines, and write the
    result to out-file. Input and output default to stdin and stdout
    respectively.
    """
    import argparse
    parser = argparse.ArgumentParser(description='English Word Segmentation')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)
    streams = parser.parse_args(arguments)
    load()
    # `iter(readline, '')` stops at EOF, which readline signals with ''.
    for line in iter(streams.infile.readline, ''):
        streams.outfile.write(' '.join(segment(line.strip())))
        # Write '\n', not os.linesep: text-mode streams translate '\n' to
        # the platform line ending, so os.linesep would yield '\r\r\n' on
        # Windows.
        streams.outfile.write('\n')
# Support direct execution: `python __init__.py [infile [outfile]]`.
if __name__ == '__main__':
    main(sys.argv[1:])
# Public API exported by `from wordsegment import *`.
__all__ = [
    'Segmenter',
    'load', 'isegment', 'segment',
    'UNIGRAMS', 'BIGRAMS', 'WORDS',
    'main'
]
# Package metadata.
__title__ = 'wordsegment'
__version__ = '1.3.1'
__build__ = 0x010301
__author__ = 'Grant Jenks'
__license__ = 'Apache 2.0'
__copyright__ = 'Copyright 2018 Grant Jenks'