-
Notifications
You must be signed in to change notification settings - Fork 52
Expand file tree
/
Copy path__init__.py
More file actions
253 lines (192 loc) · 7.86 KB
/
__init__.py
File metadata and controls
253 lines (192 loc) · 7.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""English Word Segmentation in Python
Word segmentation is the process of dividing a phrase without spaces back
into its constituent parts. For example, consider a phrase like "thisisatest".
For humans, it's relatively easy to parse. This module makes it easy for
machines too. Use `segment` to parse a phrase into its parts:
>>> from wordsegment import load, segment
>>> load()
>>> segment('thisisatest')
['this', 'is', 'a', 'test']
In the code, 1024908267229 is the total number of words in the corpus. A
subset of this corpus is found in unigrams.txt and bigrams.txt which
should accompany this file. A copy of these files may be found at
http://norvig.com/ngrams/ under the names count_1w.txt and count_2w.txt
respectively.
Copyright (c) 2016 by Grant Jenks
Based on code from the chapter "Natural Language Corpus Data"
from the book "Beautiful Data" (Segaran and Hammerbacher, 2009)
http://oreilly.com/catalog/9780596157111/
Original Copyright (c) 2008-2009 by Peter Norvig
"""
import io
import math
import os.path as op
import sys
import re
import string
class Segmenter(object):
    """Segmenter

    Support for object-oriented programming and customization.

    Holds unigram and bigram counts and implements the memoized search
    used by `segment`. Call `load` before segmenting to populate the
    counts from the bundled data files.
    """

    # Characters kept by `clean`; every other character is stripped
    # before segmentation.
    ALPHABET = set('abcdefghijklmnopqrstuvwxyz0123456789')
    UNIGRAMS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'unigrams.txt',
    )
    BIGRAMS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'bigrams.txt',
    )
    # Total number of words in the source corpus (see module docstring).
    TOTAL = 1024908267229.0
    # Longest candidate word considered by `divide`.
    LIMIT = 24
    WORDS_FILENAME = op.join(
        op.dirname(op.realpath(__file__)),
        'words.txt',
    )

    def __init__(self):
        # Counts start empty; `load` populates them from disk. Keeping
        # them as instance attributes allows per-instance customization.
        self.unigrams = {}
        self.bigrams = {}
        self.total = 0.0
        self.limit = 0
        self.words = []

    def load(self):
        "Load unigram and bigram counts from disk."
        self.unigrams.update(self.parse(self.UNIGRAMS_FILENAME))
        self.bigrams.update(self.parse(self.BIGRAMS_FILENAME))
        self.total = self.TOTAL
        self.limit = self.LIMIT
        with io.open(self.WORDS_FILENAME, encoding='utf-8') as reader:
            text = reader.read()
            self.words.extend(text.splitlines())

    @staticmethod
    def parse(filename):
        "Read `filename` and parse tab-separated file of word and count pairs."
        with io.open(filename, encoding='utf-8') as reader:
            lines = (line.split('\t') for line in reader)
            return dict((word, float(number)) for word, number in lines)

    def score(self, word, previous=None):
        "Score `word` in the context of `previous` word."
        unigrams = self.unigrams
        bigrams = self.bigrams
        total = self.total
        if previous is None:
            if word in unigrams:
                # Probability of the given word.
                return unigrams[word] / total
            # Penalize words not found in the unigrams according
            # to their length, a crucial heuristic.
            return 10.0 / (total * 10 ** len(word))
        bigram = '{0} {1}'.format(previous, word)
        if bigram in bigrams and previous in unigrams:
            # Conditional probability of the word given the previous
            # word. The technical name is *stupid backoff* and it's
            # not a probability distribution but it works well in
            # practice.
            return bigrams[bigram] / total / self.score(previous)
        # Fall back to using the unigram probability.
        return self.score(word)

    def isegment(self, text):
        "Return iterator of words that is the best segmentation of `text`."
        memo = dict()

        def search(text, previous='<s>'):
            "Return max of candidates matching `text` given `previous` word."
            if text == '':
                return 0.0, []

            def candidates():
                "Generator of (score, words) pairs for all divisions of text."
                for prefix, suffix in self.divide(text):
                    prefix_score = math.log10(self.score(prefix, previous))
                    pair = (suffix, prefix)
                    if pair not in memo:
                        memo[pair] = search(suffix, prefix)
                    suffix_score, suffix_words = memo[pair]
                    yield (prefix_score + suffix_score, [prefix] + suffix_words)

            return max(candidates())

        # Avoid recursion limit issues by dividing text into chunks, segmenting
        # those chunks and combining the results together. Chunks may divide
        # words in the middle so prefix chunks with the last five words of the
        # previous result.
        clean_text = self.clean(text)
        size = 250
        prefix = ''
        for offset in range(0, len(clean_text), size):
            chunk = clean_text[offset:(offset + size)]
            _, chunk_words = search(prefix + chunk)
            prefix = ''.join(chunk_words[-5:])
            del chunk_words[-5:]
            for word in chunk_words:
                yield word
        _, prefix_words = search(prefix)
        for word in prefix_words:
            yield word

    def segment_ignore_digits(self, text):
        """Apply segmentation only to non-numeric text.

        Substrings containing a digit are passed through unmodified;
        everything else is segmented normally.
        """
        # Split on whitespace-delimited tokens that contain a digit or a
        # dot; the capturing group keeps those tokens in the result.
        segments = re.split(r'((?=\S*[\d.])\S*)', text)
        digit_checker = re.compile(r'\d')
        results = []
        for substring in segments:
            if digit_checker.search(substring) is not None:
                # Has digits, so append substring without modification.
                results.append(substring)
            else:
                results.extend(self.isegment(substring))
        return results

    def segment(self, text, ignore_digits=False):
        """Return list of words that is the best segmentation of `text`.

        When `ignore_digits` is true, substrings containing digits are
        passed through unmodified instead of being segmented.
        """
        if ignore_digits:
            return self.segment_ignore_digits(text)
        else:
            return list(self.isegment(text))

    def divide(self, text):
        "Yield `(prefix, suffix)` pairs from `text`."
        # Candidate prefixes are capped at `limit` characters; longer
        # words do not occur in the corpus.
        for pos in range(1, min(len(text), self.limit) + 1):
            yield (text[:pos], text[pos:])

    @classmethod
    def clean(cls, text):
        "Return `text` lower-cased with non-alphanumeric characters removed."
        alphabet = cls.ALPHABET
        text_lower = text.lower()
        letters = (letter for letter in text_lower if letter in alphabet)
        return ''.join(letters)
# Module-level singleton so the functional API (`load()`, `segment()`, ...)
# can be used without explicitly constructing a Segmenter.
_segmenter = Segmenter() # pylint: disable=invalid-name
# Bound-method aliases exposing the singleton's API as module functions.
clean = _segmenter.clean # pylint: disable=invalid-name
load = _segmenter.load # pylint: disable=invalid-name
isegment = _segmenter.isegment # pylint: disable=invalid-name
segment = _segmenter.segment # pylint: disable=invalid-name
# These alias the singleton's mutable containers; they are empty until
# `load()` populates them in place.
UNIGRAMS = _segmenter.unigrams
BIGRAMS = _segmenter.bigrams
WORDS = _segmenter.words
def main(arguments=()):
    """Command-line interface (CLI) entry-point. Parse `arguments` into in-file
    and out-file then read lines from in-file, segment the lines, and write the
    result to out-file. Input and output default to stdin and stdout
    respectively.
    """
    import argparse
    parser = argparse.ArgumentParser(description='English Word Segmentation')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout)
    streams = parser.parse_args(arguments)
    load()
    # `iter(readline, '')` stops at EOF, which readline signals with ''.
    for line in iter(streams.infile.readline, ''):
        streams.outfile.write(' '.join(segment(line.strip())))
        # Write '\n', not os.linesep: text-mode streams translate '\n' to
        # the platform line ending, so os.linesep would yield '\r\r\n' on
        # Windows.
        streams.outfile.write('\n')
# Support direct execution: `python __init__.py [infile [outfile]]`.
if __name__ == '__main__':
    main(sys.argv[1:])
# Public API exported by `from wordsegment import *`.
__all__ = [
    'Segmenter',
    'load', 'isegment', 'segment',
    'UNIGRAMS', 'BIGRAMS', 'WORDS',
    'main'
]
# Package metadata.
__title__ = 'wordsegment'
__version__ = '1.3.1'
__build__ = 0x010301
__author__ = 'Grant Jenks'
__license__ = 'Apache 2.0'
__copyright__ = 'Copyright 2018 Grant Jenks'