datasketches-java/src/main/java/org/apache/datasketches/thetacommon/BinomialBoundsN.java at ad3f0618f9c8e84a1b1ad7e3c3945bf6cb5a91d7 · ZENOTME/datasketches-java · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.thetacommon;

import org.apache.datasketches.common.SketchesArgumentException;

/**
 * This class enables the estimation of error bounds given a sample set size, the sampling
 * probability theta, the number of standard deviations and a simple noDataSeen flag.  This can
 * be used to estimate error bounds for fixed threshold sampling as well as the error bounds
 * calculations for sketches.
 *
 * @author Kevin Lang
 */
// BTW, the suffixes "NStar", "NPrimeB", and "NPrimeF" correspond to variables in the formal
// writeup of this scheme.
public final class BinomialBoundsN {

  private BinomialBoundsN() {}

  private static final double[] deltaOfNumSDev =
  {
    0.5000000000000000000, // = 0.5 (1 + erf(0)
    0.1586553191586026479, // = 0.5 (1 + erf((-1/sqrt(2))))
    0.0227502618904135701, // = 0.5 (1 + erf((-2/sqrt(2))))
    0.0013498126861731796  // = 0.5 (1 + erf((-3/sqrt(2))))
  };

  // our "classic" bounds, but now with continuity correction

  private static double contClassicLB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF - 0.5) / theta;
    final double b = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat));
    final double center = nHat + (0.5 * (b * b));
    return (center - d);
  }

  private static double contClassicUB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF + 0.5) / theta;
    final double b   = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat));
    final double center = nHat + (0.5 * (b * b));
    return (center + d);
  }

  // This is a special purpose calculator for NStar, using a computational
  // strategy inspired by its Bayesian definition. It is only appropriate
  // for a very limited set of inputs. However, the procedure computeApproxBinoLB ()
  // below does in fact only call it for suitably limited inputs.
  // Outside of this limited range, two different bad things will happen.
  // First, because we are not using logarithms, the values of intermediate
  // quantities will exceed the dynamic range of doubles. Second, even if that
  // problem were fixed, the running time of this procedure is essentially linear
  // in est = (numSamples / p), and that can be Very, Very Big.

  private static long specialNStar(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue((0.0 < p) && (p < 1.0));
    assertTrue((0.0 < delta) && (delta < 1.0));
    q = 1.0 - p;
    numSamplesF = numSamplesI;
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue((numSamplesF / p) < 500.0);
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot <= delta) { // this test can fail even the first time
      curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI);
      tot += curTerm;
      m += 1;
    }
    // we have reached a state where tot > delta, so back up one
    return (m - 1);
  }

  //  The following procedure has very limited applicability.
  //  The above remarks about specialNStar() also apply here.
  private static long specialNPrimeB(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF, oneMinusDelta;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue((0.0 < p) && (p < 1.0));
    assertTrue((0.0 < delta) && (delta < 1.0));
    q = 1.0 - p;
    oneMinusDelta = 1.0 - delta;
    numSamplesF = numSamplesI;
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot < oneMinusDelta) {
      curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI);
      tot += curTerm;
      m += 1;
    }
    return (m); // don't need to back up
  }

  private static long specialNPrimeF(final long numSamplesI, final double p, final double delta) {
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue(((numSamplesI) / p) < 500.0); //A super-small delta could also make it slow.
    return (specialNPrimeB(numSamplesI + 1, p, delta));
  }

  // The following computes an approximation to the lower bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribution.
  private static double computeApproxBinoLB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return (numSamplesI);
    }

    else if (numSamplesI == 0) {
      return (0.0);
    }

    else if (numSamplesI == 1) {
      final double delta = deltaOfNumSDev[numSDev];
      final double rawLB = (Math.log(1.0 - delta)) / (Math.log(1.0 - theta));
      return (Math.floor(rawLB)); // round down
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawLB = contClassicLB( numSamplesI, theta, numSDev);
      return (rawLB - 0.5); // fake round down
    }

    // at this point we know 2 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) {  // empirically-determined threshold
      return (numSamplesI);
    }

    else if (theta < ((numSamplesI) / 360.0)) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawLB;
      index = (3 * ((int) numSamplesI)) + (numSDev - 1);
      rawLB = contClassicLB(numSamplesI, theta, EquivTables.getLB(index));
      return (rawLB - 0.5); // fake round down
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" LB.
      // We know that est <= 360, so specialNStar() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nstar = specialNStar(numSamplesI, theta, delta);
      return (nstar); // don't need to round
    }
  }

  // The following computes an approximation to the upper bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribution.
  private static double computeApproxBinoUB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return (numSamplesI);
    }

    else if (numSamplesI == 0) {
      final double delta = deltaOfNumSDev[numSDev];
      final double rawUB = (Math.log(delta)) / (Math.log(1.0 - theta));
      return (Math.ceil(rawUB)); // round up
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawUB = contClassicUB(numSamplesI, theta, numSDev);
      return (rawUB + 0.5); // fake round up
    }

    // at this point we know 1 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) { // empirically-determined threshold
      return (numSamplesI + 1);
    }

    else if (theta < ((numSamplesI) / 360.0)) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawUB;
      index = (3 * ((int) numSamplesI)) + (numSDev - 1);
      rawUB = contClassicUB(numSamplesI, theta, EquivTables.getUB(index));
      return (rawUB + 0.5); // fake round up
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" UB.
      // We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nprimef = specialNPrimeF(numSamplesI, theta, delta);
      return (nprimef); // don't need to round
    }
  }

  // The following two procedures enforce some extra rules that help
  // to prevent the return of bounds that might be confusing to users.
  /**
   * Returns the approximate lower bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean for the tail bounds.
   * This must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples
   * and a theta &lt; 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate lower bound value
   */
  public static double getLowerBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double lb = computeApproxBinoLB(numSamples, theta, numSDev);
    final double numSamplesF = numSamples;
    final double est = numSamplesF / theta;
    return (Math.min(est, Math.max(numSamplesF, lb)));
  }

  /**
   * Returns the approximate upper bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean for the tail bounds.
   * This must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples
   * and a theta &lt; 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate upper bound value
   */
  public static double getUpperBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double ub = computeApproxBinoUB(numSamples, theta, numSDev);
    final double numSamplesF = numSamples;
    final double est = numSamplesF / theta;
    return (Math.max(est, ub));
  }

  //exposed only for test
  static void checkArgs(final long numSamples, final double theta, final int numSDev) {
    if ((numSDev | (numSDev - 1) | (3 - numSDev) | numSamples) < 0) {
      throw new SketchesArgumentException(
          "numSDev must only be 1,2, or 3 and numSamples must >= 0: numSDev="
              + numSDev + ", numSamples=" + numSamples);
    }
    if ((theta <= 0.0) || (theta > 1.0)) {
      throw new SketchesArgumentException("0.0 < theta <= 1.0: " + theta);
    }
  }

  private static void assertTrue(final boolean truth) {
    assert (truth);
  }

} // end of class "BinomialBoundsN"