Skip to content

Commit fdb38b8

Browse files
ZENOTME and tisonkun authored
feat: add theta sketch (part 2) (#59)
* feat: add theta sketch (part 2) * add binomial_bounds to support calculate lower_bound&&upper_bound * add get_lower_bound&&get_upper_bound in ThetaSketch * refine NumStdDev and some interface * refine code: remove unnecessary pub * better common layout Signed-off-by: tison <wander4096@gmail.com> --------- Signed-off-by: tison <wander4096@gmail.com> Co-authored-by: tison <wander4096@gmail.com>
1 parent 1640af4 commit fdb38b8

19 files changed

Lines changed: 1090 additions & 36 deletions

datasketches/src/common/binomial_bounds.rs

Lines changed: 773 additions & 0 deletions
Large diffs are not rendered by default.

datasketches/src/common/mod.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Data structures and functions that may be used across all the sketch families.
19+
20+
// public common components for datasketches crate
21+
mod num_std_dev;
22+
mod resize;
23+
pub use self::num_std_dev::NumStdDev;
24+
pub use self::resize::ResizeFactor;
25+
26+
// private to datasketches crate
27+
pub(crate) mod binomial_bounds;
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Standard deviation enums for confidence bounds
19+
//!
20+
//! This module provides types for specifying confidence levels when computing
21+
//! upper and lower bounds for sketch estimates.
22+
23+
/// One-sided tail probabilities of the standard normal distribution.
///
/// Entry `k` holds `0.5 * (1 + erf(-k / sqrt(2)))`, i.e. the probability mass
/// lying more than `k` standard deviations below the mean. Entry `0` is kept so
/// that the position in the table equals the number of standard deviations; no
/// [`NumStdDev`] variant maps to it.
// The literals are written with more digits than an `f64` can represent, which
// is exactly what `clippy::excessive_precision` flags; the digits are kept as
// written, so the lint is silenced for this table only.
#[allow(clippy::excessive_precision)]
const DELTA_OF_NUM_STD_DEVS: [f64; 4] = [
    0.5000000000000000000, // = 0.5 (1 + erf(0))
    0.1586553191586026479, // = 0.5 (1 + erf((-1/sqrt(2))))
    0.0227502618904135701, // = 0.5 (1 + erf((-2/sqrt(2))))
    0.0013498126861731796, // = 0.5 (1 + erf((-3/sqrt(2))))
];

/// Number of standard deviations for confidence bounds
///
/// This enum specifies the number of standard deviations to use when computing
/// upper and lower bounds for cardinality estimates. Higher values provide wider
/// confidence intervals with greater certainty that the true cardinality falls
/// within the bounds.
///
/// The discriminant of each variant equals the number of standard deviations it
/// represents, so `NumStdDev::Two as u8 == 2`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumStdDev {
    /// One standard deviation (~68% confidence interval)
    One = 1,
    /// Two standard deviations (~95% confidence interval)
    Two = 2,
    /// Three standard deviations (~99.7% confidence interval)
    Three = 3,
}

impl NumStdDev {
    /// Returns the tail probability (delta) for this confidence level
    ///
    /// The value is the one-sided tail mass of the standard normal distribution
    /// beyond this many standard deviations: roughly `0.1587` for
    /// [`NumStdDev::One`], `0.0228` for [`NumStdDev::Two`] and `0.00135` for
    /// [`NumStdDev::Three`].
    pub const fn tail_probability(&self) -> f64 {
        // An exhaustive match with constant indices replaces indexing by the raw
        // discriminant: the compiler checks every index against the table length,
        // and adding a variant without a table entry becomes a compile error
        // instead of a potential out-of-bounds panic at run time.
        match *self {
            Self::One => DELTA_OF_NUM_STD_DEVS[1],
            Self::Two => DELTA_OF_NUM_STD_DEVS[2],
            Self::Three => DELTA_OF_NUM_STD_DEVS[3],
        }
    }
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
/// # Examples
3737
///
3838
/// ```
39-
/// # use datasketches::ResizeFactor;
39+
/// # use datasketches::common::ResizeFactor;
4040
/// let factor = ResizeFactor::X4;
4141
/// assert_eq!(factor.value(), 4);
4242
/// assert_eq!(factor.lg_value(), 2);

datasketches/src/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ impl Error {
9191

9292
// Convenience constructors for invalid-argument and deserialization errors
9393
impl Error {
94+
pub(crate) fn invalid_argument(msg: impl Into<String>) -> Self {
95+
Self::new(ErrorKind::InvalidArgument, msg)
96+
}
97+
9498
pub(crate) fn deserial(msg: impl Into<String>) -> Self {
9599
Self::new(ErrorKind::InvalidData, msg)
96100
}

datasketches/src/hll/array4.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
use super::aux_map::AuxMap;
2424
use crate::codec::SketchBytes;
2525
use crate::codec::SketchSlice;
26+
use crate::common::NumStdDev;
2627
use crate::error::Error;
27-
use crate::hll::NumStdDev;
2828
use crate::hll::estimator::HipEstimator;
2929
use crate::hll::get_slot;
3030
use crate::hll::get_value;

datasketches/src/hll/array6.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
2424
use crate::codec::SketchBytes;
2525
use crate::codec::SketchSlice;
26+
use crate::common::NumStdDev;
2627
use crate::error::Error;
27-
use crate::hll::NumStdDev;
2828
use crate::hll::estimator::HipEstimator;
2929
use crate::hll::get_slot;
3030
use crate::hll::get_value;

datasketches/src/hll/array8.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
2323
use crate::codec::SketchBytes;
2424
use crate::codec::SketchSlice;
25+
use crate::common::NumStdDev;
2526
use crate::error::Error;
26-
use crate::hll::NumStdDev;
2727
use crate::hll::estimator::HipEstimator;
2828
use crate::hll::get_slot;
2929
use crate::hll::get_value;

datasketches/src/hll/container.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
//! Provides a simple array-based storage for coupons (hash values) with
2121
//! cubic interpolation-based cardinality estimation and confidence bounds.
2222
23+
use crate::common::NumStdDev;
2324
use crate::hll::COUPON_RSE;
24-
use crate::hll::NumStdDev;
2525
use crate::hll::coupon_mapping::X_ARR;
2626
use crate::hll::coupon_mapping::Y_ARR;
2727
use crate::hll::cubic_interpolation::using_x_and_y_tables;

datasketches/src/hll/estimator.rs

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
//! This is more accurate than the standard HLL estimator, especially for
2323
//! moderate cardinalities.
2424
25+
use crate::common::NumStdDev;
2526
use crate::hll::composite_interpolation;
2627
use crate::hll::cubic_interpolation;
2728
use crate::hll::harmonic_numbers;
@@ -324,22 +325,6 @@ fn inv_pow2(value: u8) -> f64 {
324325
}
325326
}
326327

327-
/// Number of standard deviations for confidence bounds
328-
///
329-
/// This enum specifies the number of standard deviations to use when computing
330-
/// upper and lower bounds for cardinality estimates. Higher values provide wider
331-
/// confidence intervals with greater certainty that the true cardinality falls
332-
/// within the bounds.
333-
#[repr(u8)]
334-
pub enum NumStdDev {
335-
/// One standard deviation (\~68% confidence interval)
336-
One = 1,
337-
/// Two standard deviations (\~95% confidence interval)
338-
Two = 2,
339-
/// Three standard deviations (\~99.7% confidence interval)
340-
Three = 3,
341-
}
342-
343328
/// Get relative error for HLL estimates
344329
///
345330
/// This matches the implementation in datasketches-cpp HllUtil.hpp and RelativeErrorTables.hpp

0 commit comments

Comments
 (0)