-
Notifications
You must be signed in to change notification settings - Fork 32
Expand file tree
/
Copy pathstats.rb
More file actions
148 lines (120 loc) · 4.69 KB
/
stats.rb
File metadata and controls
148 lines (120 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
class Stats
class << self
# Welch's t-test (two-tailed). Returns the p-value, or nil if
# either sample is too small to compute a meaningful test.
def welch_p_value(a, b)
return nil if a.size < 2 || b.size < 2
stats_a = new(a)
stats_b = new(b)
n_a = a.size.to_f
n_b = b.size.to_f
var_a = stats_a.sample_variance
var_b = stats_b.sample_variance
se_sq = var_a / n_a + var_b / n_b
if se_sq == 0.0
# Both samples have zero variance — if means match they're
# indistinguishable, otherwise they're trivially different.
return stats_a.mean == stats_b.mean ? 1.0 : 0.0
end
t = (stats_a.mean - stats_b.mean) / Math.sqrt(se_sq)
# Welch-Satterthwaite degrees of freedom
df = se_sq ** 2 / ((var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1))
# Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
x = df / (df + t * t)
regularized_incomplete_beta(x, df / 2.0, 0.5)
end
private
# Regularized incomplete beta function I_x(alpha, beta) via continued fraction (Lentz's method).
# Returns the probability that a Beta(alpha, beta)-distributed variable is <= x.
def regularized_incomplete_beta(x, alpha, beta)
return 0.0 if x <= 0.0
return 1.0 if x >= 1.0
# Symmetry relation: pick the side that converges faster
if x > (alpha + 1.0) / (alpha + beta + 2.0)
return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
end
# B(alpha, beta) * x^alpha * (1-x)^beta — computed in log-space to avoid overflow
ln_normalizer = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
alpha * Math.log(x) + beta * Math.log(1.0 - x)
normalizer = Math.exp(ln_normalizer)
normalizer * beta_continued_fraction(x, alpha, beta) / alpha
end
# Evaluates the continued fraction for I_x(alpha, beta) using Lentz's algorithm.
# Each iteration computes two sub-steps (even and odd terms of the fraction).
def beta_continued_fraction(x, alpha, beta)
floor = 1.0e-30 # prevent division by zero in Lentz's method
converged = false
numerator_term = 1.0
denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
denominator_term = floor if denominator_term.abs < floor
denominator_term = 1.0 / denominator_term
fraction = denominator_term
(1..200).each do |iteration|
two_i = 2 * iteration
# Even sub-step: d_{2m} coefficient of the continued fraction
coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
denominator_term = 1.0 + coeff * denominator_term
denominator_term = floor if denominator_term.abs < floor
numerator_term = 1.0 + coeff / numerator_term
numerator_term = floor if numerator_term.abs < floor
denominator_term = 1.0 / denominator_term
fraction *= denominator_term * numerator_term
# Odd sub-step: d_{2m+1} coefficient of the continued fraction
coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
denominator_term = 1.0 + coeff * denominator_term
denominator_term = floor if denominator_term.abs < floor
numerator_term = 1.0 + coeff / numerator_term
numerator_term = floor if numerator_term.abs < floor
denominator_term = 1.0 / denominator_term
correction = denominator_term * numerator_term
fraction *= correction
if (correction - 1.0).abs < 1.0e-10
converged = true
break
end
end
warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
fraction
end
end
def initialize(data)
@data = data
end
def min
@data.min
end
def max
@data.max
end
def mean
@data.sum(0.0) / @data.size
end
# Population standard deviation (N denominator) — describes these specific values.
def stddev
mean = self.mean
diffs_squared = @data.map { |v| (v-mean) * (v-mean) }
mean_squared = diffs_squared.sum(0.0) / @data.size
Math.sqrt(mean_squared)
end
# Unbiased sample variance (N-1 denominator, Bessel's correction) — for inference.
def sample_variance
m = mean
@data.sum { |v| (v - m) ** 2 } / (@data.size - 1).to_f
end
def median
compute_median(@data)
end
def median_absolute_deviation(median)
compute_median(@data.map { |v| (v - median).abs })
end
private
def compute_median(data)
size = data.size
sorted = data.sort
if size.odd?
sorted[size/2]
else
(sorted[size/2-1] + sorted[size/2]) / 2.0
end
end
end