#### PATTERN | VECTOR ##############################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern
####################################################################################################
# Vector space model, based on cosine similarity using tf-idf.
# Documents (e.g., a sentence or a text) are represented as bag-of-words:
# the unordered words in the document and their (relative) frequency.
# The dictionary of word => frequency items is called the document vector.
# The frequency weight is either TF or TF-IDF (term frequency-inverse document frequency, i.e.,
# the relevance of a word in a document offset by the frequency of the word in all documents).
# Documents can be grouped in a Model to calculate TF-IDF and cosine similarity,
# which measures similarity (0.0-1.0) between documents based on the cosine distance metric.
# A document can have a type (or label). A model of labeled documents can be used to train
# a classifier. A classifier can be used to predict the label of unlabeled documents.
# This is called supervised machine learning (since we provide labeled training examples).
# Unsupervised machine learning or clustering can be used to group unlabeled documents
# into subsets based on their similarity.
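# A minimal usage sketch (illustrative only; it assumes the Document and Model
# classes defined below, and the exact similarity value depends on the corpus):
# >>> d1 = Document("A tiger is a big cat.", type="tiger")
# >>> d2 = Document("A cheetah is a fast cat.", type="cheetah")
# >>> m = Model(documents=[d1, d2], weight=TFIDF)
# >>> print(m.similarity(d1, d2)) # cosine similarity, 0.0-1.0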
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range, next
from collections import OrderedDict
from . import stemmer
_stemmer = stemmer
import sys
import os
import re
import glob
import heapq
import codecs
import tempfile
try:
# Python 2
import cPickle as pickle
except ImportError:
# Python 3
import pickle
import gzip
import types
from math import log, exp, sqrt, tanh
from time import time
from random import random, randint, uniform, choice, sample, seed
from itertools import chain
from bisect import insort
from operator import itemgetter
from collections import defaultdict
from io import open
import numpy as np
import scipy
try:
MODULE = os.path.dirname(os.path.realpath(__file__))
except:
MODULE = ""
try:
from pattern.text import singularize, predicative, conjugate, tokenize
except:
try:
import sys
sys.path.insert(0, os.path.join(MODULE, ".."))
from text import singularize, predicative, conjugate, tokenize
except:
singularize = lambda w, **k: w
predicative = lambda w, **k: w
conjugate = lambda w, t, **k: w
tokenize = lambda s: list(filter(len,
re.split(r"(.*?[\.|\?|\!])",
re.sub(r"(\.|\?|\!|,|;|:)", " \\1", s))))
from pattern.helpers import encode_string, decode_string
decode_utf8 = decode_string
encode_utf8 = encode_string
def shi(i, base="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"):
""" Returns a short string hash for a given int.
"""
s = []
while i > 0:
i, r = divmod(i, len(base))
s.append(base[r])
return "".join(reversed(s))
#--- LIST FUNCTIONS --------------------------------------------------------------------------------
def shuffled(iterable, **kwargs):
""" Returns a copy of the given list with the items in random order.
"""
seed(kwargs.get("seed"))
return sorted(list(iterable), key=lambda x: random())
def chunk(iterable, n):
""" Returns an iterator of n successive equal-sized chunks from the given list.
"""
# list(chunk([1, 2, 3, 4], n=2)) => [[1, 2], [3, 4]]
a = list(iterable)
n = int(n)
i = 0
j = 0
result = []
for m in range(n):
j = i + len(a[m::n])
# yield a[i:j]
result.append(a[i:j])
i = j
return result
def mix(iterables=[], n=10):
""" Returns an iterator that alternates the given lists, in n chunks.
"""
# list(mix([[1, 2, 3, 4], ["a", "b"]], n=2)) => [1, 2, "a", 3, 4, "b"]
a = [list(chunk(x, n)) for x in iterables]
result = []
for i in range(int(n)):
for x in a:
for item in x[i]:
# yield item
result.append(item)
return result
def bin(iterable, key=lambda x: x, value=lambda x: x):
""" Returns a dictionary with items in the given list grouped by the given key.
"""
# bin([["a", 1], ["a", 2], ["b", 3]], key=lambda x: x[0]) =>
# {"a": [["a", 1], ["a", 2]],
# "b": [["b", 3]]
# }
m = defaultdict(list)
for x in iterable:
m[key(x)].append(value(x))
return m
def pimap(iterable, function, *args, **kwargs):
""" Returns an iterator of function(x, *args, **kwargs) for the iterable (x1, x2, x3, ...).
The function is applied in parallel over available CPU cores.
"""
from multiprocessing import Pool
global worker
def worker(x):
return function(x, *args, **kwargs)
return Pool(processes=None).imap(worker, iterable)
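# For example (a sketch; this relies on fork-based multiprocessing, since the
# global worker() closes over function and args and may not pickle on platforms
# that spawn processes instead of forking):
# list(pimap([1, 2, 3], lambda x, e: x ** e, e=2)) => [1, 4, 9]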
#--- READ-ONLY DICTIONARY --------------------------------------------------------------------------
class ReadOnlyError(Exception):
pass
class readonlyodict(OrderedDict):
def __init__(self, *args, **kwargs):
self._f = False
super(readonlyodict, self).__init__(*args, **kwargs)
self._f = True
@classmethod
def fromkeys(cls, k, default=None):
return readonlyodict((k, default) for k in k)
def __setitem__(self, *args, **kwargs):
if self._f:
raise ReadOnlyError
return super(readonlyodict, self).__setitem__(*args, **kwargs)
def __delitem__(self, *args, **kwargs):
if self._f:
raise ReadOnlyError
return super(readonlyodict, self).__delitem__(*args, **kwargs)
def pop(self, *args, **kwargs):
if self._f:
raise ReadOnlyError
return super(readonlyodict, self).pop(*args, **kwargs)
def popitem(self, *args, **kwargs):
if self._f:
raise ReadOnlyError
return super(readonlyodict, self).popitem(*args, **kwargs)
def update(self, *args, **kwargs):
if self._f:
raise ReadOnlyError
return super(readonlyodict, self).update(*args, **kwargs)
# Read-only dictionary, used for Document.terms and Document.vector
# (updating these directly invalidates the Document and Model cache).
class readonlydict(dict):
def __init__(self, *args, **kwargs):
dict.__init__(self, *args, **kwargs)
@classmethod
def fromkeys(cls, k, default=None):
return readonlydict((k, default) for k in k)
def copy(self):
return readonlydict(self)
def __setitem__(self, k, v):
raise ReadOnlyError
def __delitem__(self, k):
raise ReadOnlyError
def pop(self, k, default=None):
raise ReadOnlyError
def popitem(self, kv):
raise ReadOnlyError
def clear(self):
raise ReadOnlyError
def update(self, kv):
raise ReadOnlyError
def setdefault(self, k, default=None):
if k in self:
return self[k]
raise ReadOnlyError
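# For example:
# readonlydict({"cat": 1})["cat"] => 1
# readonlydict({"cat": 1})["dog"] = 2 => raises ReadOnlyError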
# Read-only list, used for Model.documents.
class readonlylist(list):
def __init__(self, *args, **kwargs):
list.__init__(self, *args, **kwargs)
def __setitem__(self, i, v):
raise ReadOnlyError
def __delitem__(self, i):
raise ReadOnlyError
def append(self, v):
raise ReadOnlyError
def extend(self, v):
raise ReadOnlyError
def insert(self, i, v):
raise ReadOnlyError
def remove(self, v):
raise ReadOnlyError
def pop(self, i):
raise ReadOnlyError
#### DOCUMENT ######################################################################################
#--- STOP WORDS ------------------------------------------------------------------------------------
# A dictionary of (language, words)-items of function words, for example: {"en": {"the": True}}.
# - de: 950+, Marco Götze & Steffen Geyer
# - en: 550+, Martin Porter (http://snowball.tartarus.org)
# - es: 300+, Martin Porter
# - fr: 550+, Martin Porter, Audrey Baneyx
# - nl: 100+, Martin Porter, Damien van Holten
stopwords = _stopwords = {}
for f in glob.glob(os.path.join(MODULE, "stopwords-*.txt")):
language = os.path.basename(f)[-6:-4] # stopwords-[en].txt
w = codecs.open(f, encoding="utf-8")
w = (w.strip() for w in w.read().split(","))
stopwords[language] = dict.fromkeys(w, True)
# The following English words could also be meaningful nouns:
#from pattern.vector import stopwords
#for w in ["mine", "us", "will", "can", "may", "might"]:
# stopwords["en"].pop(w)
#--- WORD COUNT ------------------------------------------------------------------------------------
# Simple bag-of-word models are often made up of word frequencies or character trigram frequencies.
PUNCTUATION = ".,;:!?()[]{}`'\"@#$^&*+-|=~_"
def words(string, filter=lambda w: w.strip("'").isalnum(), punctuation=PUNCTUATION, **kwargs):
""" Returns a list of words (alphanumeric character sequences) from the given string.
Common punctuation marks are stripped from words.
"""
string = decode_utf8(string)
string = re.sub(r"([a-z|A-Z])'(m|s|ve|re|ll|d)", "\\1 <QUOTE/>\\2", string)
string = re.sub(r"(c|d|gl|j|l|m|n|s|t|un)'([a-z|A-Z])", "\\1<QUOTE/> \\2", string)
words = (w.strip(punctuation).replace("<QUOTE/>", "'", 1) for w in string.split())
words = (w for w in words if filter is None or filter(w) is not False)
words = [w for w in words if w]
return words
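# For example (the clitic 's is split off as a separate word):
# words("The cat's fur is wet!") => ["The", "cat", "'s", "fur", "is", "wet"]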
PORTER, LEMMA = "porter", "lemma"
def stem(word, stemmer=PORTER, **kwargs):
""" Returns the base form of the word when counting words in count().
With stemmer=PORTER, the Porter2 stemming algorithm is used.
With stemmer=LEMMA, either Word.lemma or inflect.singularize() is used
(with the optional parameter language="en", pattern.en.inflect is used).
"""
if hasattr(word, "string") and stemmer in (PORTER, None):
word = word.string
if isinstance(word, str):
word = decode_utf8(word.lower())
if stemmer is None:
return word.lower()
if stemmer == PORTER:
return _stemmer.stem(word, **kwargs)
if stemmer == LEMMA:
if hasattr(word, "lemma"): # pattern.en.Word
w = word.string.lower()
if word.lemma is not None:
return word.lemma
if word.pos == "NNS":
return singularize(w)
if word.pos.startswith(("VB", "MD")):
return conjugate(w, "infinitive") or w
if word.pos.startswith(("JJ",)):
return predicative(w)
if word.pos.startswith(("DT", "PR", "WP")):
return singularize(w, pos=word.pos)
return w
return singularize(word, pos=kwargs.get("pos", "NN"))
if hasattr(stemmer, "__call__"):
return decode_utf8(stemmer(word))
return word.lower()
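# For example (LEMMA falls back to a no-op if pattern.text is unavailable):
# stem("cats", stemmer=PORTER) => "cat"
# stem("cats", stemmer=LEMMA) => "cat"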
def count(words=[], top=None, threshold=0, stemmer=None, exclude=[], stopwords=False, language=None, **kwargs):
""" Returns a dictionary of (word, count)-items, in lowercase.
Words in the exclude list and stop words (by default, English) are not counted.
Words whose count falls below (or equals) the given threshold are excluded.
Words that are not in the given top most counted are excluded.
"""
# An optional dict-parameter can be used to specify a subclass of dict,
# e.g., count(words, dict=readonlydict) as used in Document.
count = kwargs.get("dict", dict)()
for w in words:
w1 = w
w2 = w
if hasattr(w, "string"): # pattern.en.Word
w1 = w.string.lower()
if isinstance(w, str):
w1 = w.lower()
w2 = w.lower()
if (stopwords or w1 not in _stopwords.get(language or "en", ())) and w1 not in exclude:
if stemmer is not None:
w2 = stem(w2, stemmer, **kwargs).lower()
dict.__setitem__(count, w2, (w2 in count) and count[w2] + 1 or 1)
for k in list(count.keys()):
if count[k] <= threshold:
dict.__delitem__(count, k)
if top is not None:
count = count.__class__(heapq.nsmallest(top, list(count.items()), key=lambda kv: (-kv[1], kv[0])))
return count
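# For example (assuming the bundled English stop word list is present, so
# "the" and "on" are not counted):
# count(words("The cat sat on the mat.")) => {"cat": 1, "sat": 1, "mat": 1}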
def character_ngrams(string="", n=3, top=None, threshold=0, exclude=[], **kwargs):
""" Returns a dictionary of (character n-gram, count)-items.
N-grams in the exclude list are not counted.
N-grams whose count falls below (or equals) the given threshold are excluded.
N-grams that are not in the given top most counted are excluded.
"""
# An optional dict-parameter can be used to specify a subclass of dict,
# e.g., character_ngrams(s, dict=readonlydict) as used in Document.
count = defaultdict(int)
if n > 0:
for i in range(len(string) - n + 1):
w = string[i:i + n]
if w not in exclude:
count[w] += 1
if threshold > 0:
count = dict((k, v) for k, v in count.items() if v > threshold)
if top is not None:
count = dict(heapq.nsmallest(top, list(count.items()), key=lambda kv: (-kv[1], kv[0])))
return kwargs.get("dict", dict)(count)
chngrams = character_ngrams
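# For example:
# character_ngrams("banana", n=2) => {"ba": 1, "an": 2, "na": 2}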
#--- DOCUMENT --------------------------------------------------------------------------------------
# A Document is a bag of words in which each word is a feature.
# A Document is represented as a vector of weighted (TF-IDF) features.
# A Document can be part of a training model used for learning (i.e., clustering or classification).
_UID = 0
_SESSION = shi(int(time() * 1000)) # Avoid collision with pickled documents.
def _uid():
""" Returns a string id, for example: "NPIJYaS-1", "NPIJYaS-2", ...
The string part is based on the current time, the number suffix is auto-incremental.
"""
global _UID
_UID += 1
return _SESSION + "-" + str(_UID)
# Term relevance weight:
TF, TFIDF, TF_IDF, BINARY = \
"tf", "tf-idf", "tf-idf", "binary"
class Document(object):
# Document(string = "",
# filter = lambda w: w.lstrip("'").isalnum(),
# punctuation = PUNCTUATION,
# top = None,
# threshold = 0,
# stemmer = None,
# exclude = [],
# stopwords = False,
# name = None,
# type = None,
# language = None,
# description = None
# )
def __init__(self, string="", **kwargs):
""" An unordered bag-of-words representation of the given string, list, dict or Sentence.
Lists can contain tuples (of), strings or numbers.
Dicts can contain tuples (of), strings or numbers as keys, and floats as values.
Document.words stores a dict of (word, count)-items.
Document.vector stores a dict of (word, weight)-items,
where weight is the term frequency normalized (0.0-1.0) to remove document length bias.
Punctuation marks are stripped from the words.
Stop words in the exclude list are excluded from the document.
Only top words whose count exceeds the threshold are included in the document.
"""
kwargs.setdefault("filter", lambda w: w.lstrip("'").isalnum())
kwargs.setdefault("threshold", 0)
kwargs.setdefault("dict", readonlydict)
# A string of words: map to read-only dict of (word, count)-items.
if string is None:
w = kwargs["dict"]()
v = None
elif isinstance(string, str):
w = words(string, **kwargs)
w = count(w, **kwargs)
v = None
# A list of words: map to read-only dict of (word, count)-items.
elif isinstance(string, (list, tuple)) and not string.__class__.__name__ == "Text":
w = string
w = count(w, **kwargs)
v = None
# A set of unique words: map to read-only dict of (word, 1)-items.
elif isinstance(string, set):
w = string
w = kwargs["dict"].fromkeys(w, 1)
v = None
# A Vector of (word, weight)-items: copy as document vector.
elif isinstance(string, Vector):
w = string
w = kwargs["dict"](w)
v = Vector(w)
# A dict of (word, count)-items: make read-only.
elif isinstance(string, dict):
w = string
w = kwargs["dict"](w)
v = None
# pattern.en.Sentence with Word objects: can use stemmer=LEMMA.
elif string.__class__.__name__ == "Sentence":
w = string.words
w = [w for w in w if kwargs["filter"](w.string)]
w = count(w, **kwargs)
v = None
# pattern.en.Text with Sentence objects, can use stemmer=LEMMA.
elif string.__class__.__name__ == "Text":
w = []
[w.extend(sentence.words) for sentence in string]
w = [w for w in w if kwargs["filter"](w.string)]
w = count(w, **kwargs)
v = None
# Another Document: copy words, wordcount, name and type.
elif isinstance(string, Document):
for k in ("name", "type", "label", "language", "description"):
if hasattr(string, k):
kwargs.setdefault(k, getattr(string, k))
w = string.terms
w = kwargs["dict"](w)
v = None
else:
raise TypeError("document string is not str, unicode, list, dict, Vector, Sentence or Text.")
self._id = _uid() # Document ID, used when comparing objects.
self._name = kwargs.get("name") # Name that describes the document content.
self._type = kwargs.get("type", # Type that describes the category or class of the document.
kwargs.get("label"))
self._language = kwargs.get("language")
self._description = kwargs.get("description", "")
self._terms = w # Dictionary of (word, count)-items.
self._vector = v # Cached tf-idf vector.
self._count = None # Total number of words (minus stop words).
self._model = None # Parent Model.
@classmethod
def load(cls, path):
""" Returns a new Document from the given text file path.
The given text file must be generated with Document.save().
"""
# Open unicode file.
s = open(path, "rb").read()
s = s.lstrip(codecs.BOM_UTF8)
s = decode_utf8(s)
a = {}
v = {}
# Parse document name and type.
# Parse document terms and frequency.
for s in s.splitlines():
if s.startswith("#"): # comment
a["description"] = a.get("description", "") + s.lstrip("#").strip() + "\n"
elif s.startswith("@name:"):
a["name"] = s[len("@name:") + 1:].replace("\\n", "\n")
elif s.startswith("@type:"):
a["type"] = s[len("@type:") + 1:].replace("\\n", "\n")
elif s.startswith("@language:"):
a["lang"] = s[len("@lang:") + 1:].replace("\\n", "\n")
else:
s = s.split(" ")
w, f = " ".join(s[:-1]), s[-1]
if f.isdigit():
v[w] = int(f)
else:
v[w] = float(f)
return cls(v, name = a.get("name"),
type = a.get("type"),
language = a.get("lang"),
description = a.get("description").rstrip("\n"))
def save(self, path):
""" Saves the document as a text file at the given path.
The file content has the following format:
# Cat document.
@name: cat
@type: animal
a 3
cat 2
catch 1
claw 1
...
"""
s = []
# Parse document description.
for x in self.description.split("\n"):
s.append("# %s" % x)
# Parse document name, type and language.
for k, v in (("@name:", self.name), ("@type:", self.type), ("@lang:", self.language)):
if v is not None:
s.append("%s %s" % (k, v.replace("\n", "\\n")))
# Parse document terms and frequency.
for w, f in sorted(self.terms.items()):
if isinstance(f, int):
s.append("%s %i" % (w, f))
if isinstance(f, float):
s.append("%s %.3f" % (w, f))
s = "\n".join(s)
s = encode_utf8(s)
# Save unicode file.
f = open(path, "wb")
f.write(codecs.BOM_UTF8)
f.write(s)
f.close()
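# For example (a sketch; round-trips a document through the text format above):
# >>> d = Document("A cat and another cat.", name="cat", type="animal")
# >>> d.save("cat.txt")
# >>> print(Document.load("cat.txt").terms)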
def _get_model(self):
return self._model
def _set_model(self, model):
self._vector = None
self._model and self._model._update()
self._model = model
self._model and self._model._update()
model = corpus = property(_get_model, _set_model)
@property
def id(self):
return self._id
@property
def name(self):
return self._name
@property
def type(self):
return self._type
@property
def label(self):
return self._type
@property
def language(self):
return self._language
@property
def description(self):
return self._description
@property
def terms(self):
return self._terms
@property
def words(self):
return self._terms
@property
def features(self):
return list(self._terms.keys())
@property
def count(self):
# Yields the number of words in the document representation.
# Cache the word count so we can reuse it when calculating tf.
if not self._count:
self._count = sum(self.terms.values())
return self._count
@property
def wordcount(self):
return self._count
def __len__(self):
return len(self.terms)
def __iter__(self):
return iter(self.terms)
def __contains__(self, word):
return word in self.terms
def __getitem__(self, word):
return self.terms.__getitem__(word)
def get(self, word, default=None):
return self.terms.get(word, default)
def term_frequency(self, word):
""" Returns the term frequency of a given word in the document (0.0-1.0).
tf = number of occurrences of the word / number of words in document.
The more occurrences of the word, the higher its relative tf weight.
"""
return float(self.terms.get(word, 0)) / (self.count or 1)
tf = term_frequency
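# For example ("the" is a stop word and is not counted, so "cat" is 1 of 2 words):
# Document("the cat sat").tf("cat") => 0.5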
def term_frequency_inverse_document_frequency(self, word, weight=TFIDF):
""" Returns the word relevance as tf * idf (0.0-1.0).
The relevance is a measure of how frequent the word occurs in the document,
compared to its frequency in other documents in the model.
If the document is not incorporated in a model, simply returns tf weight.
"""
if self.model is not None and weight == TFIDF:
# Use tf if no model, or idf==None (happens when the word is not in the model).
idf = self.model.idf(word)
idf = idf is None and 1 or idf
return self.tf(word) * idf
return self.tf(word)
tf_idf = tfidf = term_frequency_inverse_document_frequency
def information_gain(self, word):
""" Returns the information gain for the given word (0.0-1.0).
"""
if self.model is not None:
return self.model.ig(word)
return 0.0
ig = infogain = information_gain
def gain_ratio(self, word):
""" Returns the information gain ratio for the given word (0.0-1.0).
"""
if self.model is not None:
return self.model.gr(word)
return 0.0
gr = gainratio = gain_ratio
@property
def vector(self):
""" Yields the document vector, a dictionary of (word, relevance)-items from the document.
The relevance is tf, tf * idf, infogain or binary if the document is part of a Model,
based on the value of Model.weight (TF, TFIDF, IG, GR, BINARY, None).
The document vector is used to calculate similarity between two documents,
for example in a clustering or classification algorithm.
"""
if not self._vector:
# See the Vector class below: a dict with extra functionality (copy, norm).
# When a document is added/deleted from a model, the cached vector is deleted.
w = getattr(self.model, "weight", TF)
if w not in (TF, TFIDF, IG, INFOGAIN, GR, GAINRATIO, BINARY):
f = lambda w: float(self._terms[w])
w = None
if w == BINARY:
f = lambda w: int(self._terms[w] > 0)
if w == TF:
f = self.tf
if w == TFIDF:
f = self.tf_idf
if w in (IG, INFOGAIN):
f = self.model.ig
if w in (GR, GAINRATIO):
f = self.model.gr
self._vector = Vector(((w, f(w)) for w in self.terms), weight=w)
return self._vector
@property
def concepts(self):
""" Yields the document concept vector if the document is part of an LSA model.
"""
return self.model and self.model.lsa and self.model.lsa.concepts.get(self.id) or None
def keywords(self, top=10, normalized=True):
""" Returns a sorted list of (relevance, word)-tuples that are top keywords in the document.
With normalized=True, weights are normalized between 0.0 and 1.0 (their sum will be 1.0).
"""
n = normalized and sum(self.vector.values()) or 1.0
v = ((f / n, w) for w, f in self.vector.items())
v = heapq.nsmallest(top, v, key=lambda v: (-v[0], v[1]))
return v
def cosine_similarity(self, document):
""" Returns the similarity between the two documents as a number between 0.0-1.0.
If both documents are part of the same model the calculations are cached for reuse.
"""
if self.model is not None:
return self.model.cosine_similarity(self, document)
if document.model is not None:
return document.model.cosine_similarity(self, document)
return cosine_similarity(self.vector, document.vector)
similarity = cosine_similarity
def copy(self):
d = Document(None, name=self.name, type=self.type, description=self.description)
dict.update(d.terms, self.terms)
return d
def __eq__(self, document):
return isinstance(document, Document) and self.id == document.id
def __ne__(self, document):
return not self.__eq__(document)
def __repr__(self):
return "Document(id=%s%s%s)" % (
repr(self._id),
self.name and ", name=%s" % repr(self.name) or "",
self.type and ", type=%s" % repr(self.type) or "")
# This is required because we overwrite the parent's __eq__() method.
# Otherwise objects will be unhashable in Python 3.
# More information: http://docs.python.org/3.6/reference/datamodel.html#object.__hash__
__hash__ = object.__hash__
Bag = BagOfWords = BOW = Document
#--- VECTOR ----------------------------------------------------------------------------------------
# A Vector represents document terms (called features) and their tf or tf * idf relevance weight.
# A Vector is a sparse representation: i.e., a dictionary with only those features > 0.
# This is fast, usually also faster than LSA, which creates a full vector space with non-zero values.
# Document vectors can be used to calculate similarity between documents,
# for example in a clustering or classification algorithm.
# To find the average feature length in a model:
# sum(len(d.vector) for d in model.documents) / float(len(model))
class Vector(readonlydict):
id = 0
def __init__(self, *args, **kwargs):
""" A dictionary of (feature, weight)-items of the features (terms, words) in a Document.
A vector can be used to compare the document to another document with a distance metric.
For example, vectors with 2 features (x, y) can be compared using 2D Euclidean distance.
Vectors that represent text documents can be compared using cosine similarity.
"""
s = kwargs.pop("sparse", True)
f = ()
w = None
if len(args) > 0:
# From a Vector (copy weighting scheme).
if isinstance(args[0], Vector):
w = args[0].weight
# From a dict.
if isinstance(args[0], dict):
f = list(args[0].items())
# From an iterator.
elif hasattr(args[0], "__iter__"):
f = iter(args[0])
Vector.id += 1
self.id = Vector.id # Unique ID.
self.weight = kwargs.pop("weight", w) # TF, TFIDF, IG, BINARY or None.
self._norm = None # Cached L2-norm.
# Exclude zero weights (sparse=True).
f = chain(f, list(kwargs.items()))
f = ((k, v) for k, v in f if not s or v != 0)
readonlydict.__init__(self, f)
@classmethod
def fromkeys(cls, k, default=None, **kwargs):
return Vector(((k, default) for k in k), **kwargs)
@property
def features(self):
return list(self.keys())
@property
def l2_norm(self):
""" Yields the Frobenius matrix norm (cached).
n = the square root of the sum of the absolute squares of the values.
The matrix norm is used to normalize (0.0-1.0) cosine similarity between documents.
"""
if self._norm is None:
self._norm = sum(w * w for w in self.values()) ** 0.5
return self._norm
norm = l2 = L2 = L2norm = l2norm = L2_norm = l2_norm
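# For example: Vector({"x": 3, "y": 4}).l2_norm => 5.0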
def copy(self):
return Vector(self, weight=self.weight, sparse=False)
def __call__(self, vector={}):
""" Vector(vector) returns a new vector updated with values from the given vector.
No new features are added. For example: Vector({1:1, 2:2})({1:0, 3:3}) => {1:0, 2:2}.
"""
if isinstance(vector, (Document, Model)):
vector = vector.vector
v = self.copy()
s = dict.__setitem__
for f, w in vector.items():
if f in v:
s(v, f, w)
return v
#--- VECTOR DISTANCE -------------------------------------------------------------------------------
# The "distance" between two vectors can be calculated using different metrics.
# For vectors that represent text, cosine similarity is a good metric.
# For more information, see Domain Similarity Measures (Vincent Van Asch, 2012).
# The following functions can be used if you work with Vectors or plain dictionaries,
# instead of Documents and Models (which use caching for cosine similarity).
def features(vectors=[]):
""" Returns the set of unique features for all given vectors.
"""
return set(chain(*vectors))
_features = features
def sparse(v):
""" Returns the vector with features that have weight 0 removed.
"""
for f, w in list(v.items()):
if w == 0:
del v[f]
return v
def relative(v):
""" Returns the vector with feature weights normalized so that their sum is 1.0 (in-place).
"""
n = float(sum(v.values())) or 1.0
s = dict.__setitem__
for f in v: # Modified in-place.
s(v, f, v[f] / n)
return v
normalize = rel = relative
def l2_norm(v):
""" Returns the L2-norm of the given vector.
"""
if isinstance(v, Vector):
return v.l2_norm
return sum(w * w for w in v.values()) ** 0.5
norm = l2 = L2 = L2norm = l2norm = L2_norm = l2_norm
def cosine_similarity(v1, v2):
""" Returns the cosine similarity of the given vectors.
"""
s = sum(v1.get(f, 0) * w for f, w in v2.items())
s = float(s) / (l2_norm(v1) * l2_norm(v2) or 1)
return s
cos = cosine_similarity
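# For example:
# cos({"x": 1}, {"x": 2}) => 1.0 (parallel vectors)
# cos({"x": 1}, {"y": 1}) => 0.0 (orthogonal vectors)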
def tf_idf(vectors=[], base=2.71828): # Euler's number
""" Calculates tf * idf on the vector feature weights (in-place).
"""
df = {}
for v in vectors:
for f in v:
if v[f] != 0:
df[f] = df[f] + 1 if f in df else 1.0
n = len(vectors)
s = dict.__setitem__
for v in vectors:
for f in v: # Modified in-place.
s(v, f, v[f] * (log(n / df[f], base)))
return vectors
tfidf = tf_idf
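# For example, with two vectors that both contain "cat" but only one "dog":
# v1, v2 = {"cat": 0.5, "dog": 0.5}, {"cat": 1.0}
# tf_idf([v1, v2]) => "cat" weight becomes 0.0 in both (log(2/2) == 0),
# "dog" weight becomes 0.5 * log(2/1) ~ 0.35 (natural log, since base ~ e).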
COSINE, EUCLIDEAN, MANHATTAN, CHEBYSHEV, HAMMING = \
"cosine", "euclidean", "manhattan", "chebyshev", "hamming"
def distance(v1, v2, method=COSINE):
""" Returns the distance between two vectors.
"""
if method == COSINE:
return 1 - cosine_similarity(v1, v2)
if method == EUCLIDEAN: # Squared Euclidean distance is used (1.5x faster).
return sum((v1.get(w, 0) - v2.get(w, 0)) ** 2 for w in set(chain(v1, v2)))
if method == MANHATTAN:
return sum(abs(v1.get(w, 0) - v2.get(w, 0)) for w in set(chain(v1, v2)))
if method == CHEBYSHEV:
return max(abs(v1.get(w, 0) - v2.get(w, 0)) for w in set(chain(v1, v2)))
if method == HAMMING:
d = sum(not (w in v1 and w in v2 and v1[w] == v2[w]) for w in set(chain(v1, v2)))
d = d / float(max(len(v1), len(v2)) or 1)
return d
if isinstance(method, type(distance)):
# Given method is a function of the form: distance(v1, v2) => float.
return method(v1, v2)
_distance = distance
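# For example:
# distance({"x": 0, "y": 0}, {"x": 3, "y": 4}, method=EUCLIDEAN) => 25 (squared)
# distance({"x": 1}, {"x": 2}, method=MANHATTAN) => 1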
def entropy(p=[], base=None):
""" Returns the Shannon entropy for the given list of probabilities
as a value between 0.0-1.0, where higher values indicate uncertainty.
"""
# entropy([1.0]) => 0.0, one possible outcome with a 100% chance
# entropy([0.5, 0.5]) => 1.0, two outcomes with a 50% chance each (random).
p = list(p)
s = float(sum(p)) or 1.0
s = s if len(p) > 1 else max(s, 1.0)
b = base or max(len(p), 2)
return -sum(x / s * log(x / s, b) for x in p if x != 0) or 0.0
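# entropy([0.25, 0.25, 0.25, 0.25]) => 1.0, four equally likely outcomes
# (the default base = number of outcomes keeps the result in 0.0-1.0).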
#### MODEL #########################################################################################
#--- MODEL -----------------------------------------------------------------------------------------
# A Model is a representation of a collection of documents as bag-of-words.
# A Model is a matrix (or vector space) with features as columns and documents as rows,
# where each document is a vector of features (e.g., words) and feature weights (e.g., frequency).
# The matrix is used to calculate adjusted weights (e.g., tf * idf), document similarity and LSA.
# Export formats:
ORANGE, WEKA = "orange", "weka"
# LSA reduction methods:
NORM, L1, L2, TOP300 = "norm", "L1", "L2", "top300"
# Feature selection methods:
INFOGAIN, GAINRATIO, CHISQUARE, CHISQUARED = "infogain", "gainratio", "chisquare", "chisquared"
IG, GR, X2, DF = "ig", "gr", "x2", "df"
# Clustering methods:
KMEANS, HIERARCHICAL = "k-means", "hierarchical"
# Resampling methods:
MINORITY, MAJORITY = "minority", "majority"
class Model(object):
def __init__(self, documents=[], weight=TFIDF):
""" A model is a bag-of-word representation of a corpus of documents,
where each document vector is a bag of (word, relevance)-items.