-
Notifications
You must be signed in to change notification settings - Fork 100
Expand file tree
/
Copy pathsarray_mode.py
More file actions
69 lines (49 loc) · 2.07 KB
/
sarray_mode.py
File metadata and controls
69 lines (49 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import graphlab as gl
def mode_sa(sa, single_mode=True):
    """Find the most frequent value(s) of SArray sa by exact counting.

    sa: the SArray whose mode(s) to find; must not be empty.
    single_mode: if True (default) return just one mode value; if False
        return an SArray holding every value that ties for the highest count.

    Raises ValueError if sa is empty.
    """
    if len(sa) == 0:
        raise ValueError("Can't find mode(s) in empty SArray")

    # One row per distinct value, paired with its number of occurrences.
    tallies = gl.SFrame({"value": sa}).groupby(
        "value", {"count": gl.aggregate.COUNT()})
    # Row (a dict of column -> cell) holding the largest count.
    top_row = tallies[tallies["count"].argmax()]

    if not single_mode:
        # Keep every value whose count equals the maximum, so ties survive.
        is_top = tallies["count"] == top_row["count"]
        return tallies[is_top]["value"]
    return top_row["value"]
# Example usage: build an SArray in which two values (2 and 3) tie for most common.
sa = gl.SArray([1, 2, 2, 3, 3])
# Ask for a single mode: only one of the tied values comes back.
single_mode = mode_sa(sa) # returns 2
# Ask for every mode: all tied values come back together.
all_modes = mode_sa(sa, single_mode=False)
# all_modes prints as:
# dtype: int
# Rows: 2
# [2, 3]
# A faster (albeit possibly less accurate) way to find the mode value is to use sa.sketch_summary().frequent_items().
# There are two caveats to this approach:
# 1. it won't work for very low-frequency mode values, and
# 2. it won't necessarily give the correct result if there are multiple likely candidates.
def sketch_mode_sa(sa, single_mode=True):
    """Fast (albeit less accurate) way to find the mode value(s) of SArray sa.

    The answer is read from sa.sketch_summary().frequent_items(), so it is
    only as good as that sketch: very low-frequency modes may be missed and
    close candidates may be ranked incorrectly.

    sa: the SArray whose mode(s) to estimate; must not be empty.
    single_mode: if True (default) return one mode value; if False return a
        list of every item sharing the highest sketched count.

    Raises ValueError if sa is empty.
    Raises IndexError if single_mode is True and the sketch reports no
    frequent items at all (with single_mode=False an empty list is returned).
    """
    if len(sa) == 0:
        raise ValueError("Can't find mode(s) in empty SArray")

    frequent_items_sketch = sa.sketch_summary().frequent_items()

    if frequent_items_sketch:
        # Compute the top count once instead of once per item.
        # items()/values() are used because they exist on both Python 2 and
        # Python 3 dicts, unlike iteritems()/itervalues().
        top_count = max(frequent_items_sketch.values())
        modes_sketch = [item for item, count in frequent_items_sketch.items()
                        if count == top_count]
    else:
        modes_sketch = []

    if not single_mode:
        return modes_sketch
    if not modes_sketch:
        # Same exception type a bare modes_sketch[0] would raise, with a
        # message that says what actually went wrong.
        raise IndexError("Sketch reported no frequent items; "
                         "can't pick a single mode")
    return modes_sketch[0]
# Example usage of the sketch-based variant on the same two-mode SArray `sa`.
sketch_mode_sa(sa) # returns 2
sketch_mode_sa(sa, single_mode=False) # returns the list [2, 3]
# Empty input: neither function can name a mode for an SArray with no
# elements, so each one raises ValueError. Exercise both and swallow the
# expected error to show the script keeps running.
for _find_mode in (mode_sa, sketch_mode_sa):
    try:
        _find_mode(gl.SArray([]))
    except ValueError:
        pass