Skip to content

Commit 78fe6bd

Browse files
committed
fix(bloom): align builder parameters to java limits
1 parent e407d53 commit 78fe6bd

3 files changed

Lines changed: 75 additions & 58 deletions

File tree

datasketches/src/bloom/builder.rs

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::codec::family::Family;
1918
use super::BloomFilter;
19+
use crate::codec::family::Family;
2020
use crate::hash::DEFAULT_UPDATE_SEED;
2121

22-
const MIN_NUM_BITS: u64 = 1;
23-
const MAX_NUM_BITS: u64 = (i32::MAX as u64 - Family::BLOOMFILTER.max_pre_longs as u64) * 64;
22+
pub const MIN_NUM_BITS: u64 = 1;
23+
pub const MAX_NUM_BITS: u64 = (i32::MAX as u64 - Family::BLOOMFILTER.max_pre_longs as u64) * 64;
24+
pub const MIN_NUM_HASHES: u16 = 1;
25+
pub const MAX_NUM_HASHES: u16 = i16::MAX as u16;
2426

2527
/// Builder for creating [`BloomFilter`] instances.
2628
///
2729
/// Provides two construction modes:
2830
/// - [`with_accuracy()`](Self::with_accuracy): Specify target items and false positive rate
2931
/// (recommended)
30-
/// - [`with_size()`](Self::with_size): Specify exact bit count and hash functions (manual)
32+
/// - [`with_size()`](Self::with_size): Specify requested bit count and hash functions (manual)
3133
#[derive(Debug, Clone)]
3234
pub struct BloomFilterBuilder {
3335
num_bits: u64,
@@ -48,7 +50,7 @@ impl BloomFilterBuilder {
4850
///
4951
/// # Panics
5052
///
51-
/// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0).
53+
/// Panics if `max_items` is 0 or `fpp` is not in (0.0, 1.0].
5254
///
5355
/// # Examples
5456
///
@@ -62,8 +64,8 @@ impl BloomFilterBuilder {
6264
pub fn with_accuracy(max_items: u64, fpp: f64) -> Self {
6365
assert!(max_items > 0, "max_items must be greater than 0");
6466
assert!(
65-
fpp > 0.0 && fpp < 1.0,
66-
"fpp must be between 0.0 and 1.0 (exclusive)"
67+
fpp > 0.0 && fpp <= 1.0,
68+
"fpp must be between 0.0 and 1.0 (inclusive of 1.0)"
6769
);
6870

6971
let num_bits = Self::suggest_num_bits(max_items, fpp);
@@ -78,9 +80,12 @@ impl BloomFilterBuilder {
7880

7981
/// Creates a builder with manual size specification.
8082
///
81-
/// Use this when you want precise control over the filter size,
83+
/// Use this when you want precise control over the requested filter size,
8284
/// or when working with pre-calculated parameters.
8385
///
86+
/// The underlying storage is word-based, so the actual capacity is rounded
87+
/// up to the next multiple of 64 bits.
88+
///
8489
/// # Arguments
8590
///
8691
/// - `num_bits`: Total number of bits in the filter
@@ -90,7 +95,7 @@ impl BloomFilterBuilder {
9095
///
9196
/// Panics if any of:
9297
/// - `num_bits` < MIN_NUM_BITS or `num_bits` > MAX_NUM_BITS
93-
/// - `num_hashes` < 1 or `num_hashes` > 100
98+
/// - `num_hashes` < MIN_NUM_HASHES or `num_hashes` > MIN_NUM_HASHES
9499
///
95100
/// # Examples
96101
///
@@ -109,8 +114,16 @@ impl BloomFilterBuilder {
109114
"num_bits must not exceed {}",
110115
MAX_NUM_BITS
111116
);
112-
assert!(num_hashes >= 1, "num_hashes must be at least 1");
113-
assert!(num_hashes <= 100, "num_hashes must not exceed 100");
117+
assert!(
118+
num_hashes >= MIN_NUM_HASHES,
119+
"num_hashes must be at least {}",
120+
MIN_NUM_HASHES
121+
);
122+
assert!(
123+
num_hashes <= MAX_NUM_HASHES,
124+
"num_hashes must not exceed {}",
125+
MAX_NUM_HASHES
126+
);
114127

115128
BloomFilterBuilder {
116129
num_bits,
@@ -142,16 +155,13 @@ impl BloomFilterBuilder {
142155
///
143156
/// Panics if neither `with_accuracy()` nor `with_size()` was called.
144157
pub fn build(self) -> BloomFilter {
145-
let capacity_bits = self.num_bits;
146158
let num_hashes = self.num_hashes;
147-
148-
let num_words = capacity_bits.div_ceil(64) as usize;
149-
let bit_array = vec![0u64; num_words];
159+
let num_words = self.num_bits.div_ceil(64) as usize;
160+
let bit_array = vec![0u64; num_words].into_boxed_slice();
150161

151162
BloomFilter {
152163
seed: self.seed,
153164
num_hashes,
154-
capacity_bits,
155165
num_bits_set: 0,
156166
bit_array,
157167
}
@@ -176,9 +186,6 @@ impl BloomFilterBuilder {
176186

177187
let bits = (-n * p.ln() / ln2_squared).ceil() as u64;
178188

179-
// Round up to multiple of 64 for efficiency
180-
let bits = bits.div_ceil(64) * 64;
181-
182189
bits.clamp(MIN_NUM_BITS, MAX_NUM_BITS)
183190
}
184191

@@ -198,9 +205,9 @@ impl BloomFilterBuilder {
198205
let m = num_bits as f64;
199206
let n = max_items as f64;
200207

201-
let k = (m / n * std::f64::consts::LN_2).round();
202-
203-
(k as u16).clamp(1, 100) // Reasonable bounds
208+
// Ceil to avoid selecting too few hashes.
209+
let k = (m / n * std::f64::consts::LN_2).ceil();
210+
k.clamp(f64::from(MIN_NUM_HASHES), f64::from(MAX_NUM_HASHES)) as u16
204211
}
205212

206213
/// Suggests optimal number of hash functions from target FPP.
@@ -216,7 +223,9 @@ impl BloomFilterBuilder {
216223
/// assert_eq!(hashes, 7); // -log2(0.01) ≈ 6.64
217224
/// ```
218225
pub fn suggest_num_hashes_from_fpp(fpp: f64) -> u16 {
226+
// Ceil to avoid selecting too few hashes.
219227
let k = -fpp.log2();
220-
(k.round() as u16).clamp(1, 100)
228+
k.ceil()
229+
.clamp(f64::from(MIN_NUM_HASHES), f64::from(MAX_NUM_HASHES)) as u16
221230
}
222231
}

datasketches/src/bloom/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
//!
7373
//! ## By Size (Manual)
7474
//!
75-
//! Specify exact bit count and hash functions:
75+
//! Specify requested bit count and hash functions (rounded up to a multiple of 64 bits):
7676
//!
7777
//! ```rust
7878
//! # use datasketches::bloom::BloomFilterBuilder;

datasketches/src/bloom/sketch.rs

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,10 @@ pub struct BloomFilter {
4242
pub(super) seed: u64,
4343
/// Number of hash functions to use (k)
4444
pub(super) num_hashes: u16,
45-
/// Total number of bits in the filter (m)
46-
pub(super) capacity_bits: u64,
4745
/// Count of bits set to 1 (for statistics)
4846
pub(super) num_bits_set: u64,
4947
/// Bit array packed into u64 words
50-
/// Length = ceil(capacity_bits / 64)
51-
pub(super) bit_array: Vec<u64>,
48+
pub(super) bit_array: Box<[u64]>,
5249
}
5350

5451
impl BloomFilter {
@@ -246,26 +243,10 @@ impl BloomFilter {
246243
/// // Now "apple" probably returns false, and most other items return true
247244
/// ```
248245
pub fn invert(&mut self) {
249-
// Invert bits and count during operation (single pass)
250-
let mut num_bits_set = 0;
251246
for word in &mut self.bit_array {
252247
*word = !*word;
253-
num_bits_set += word.count_ones() as u64;
254-
}
255-
256-
// Mask off excess bits in the last word
257-
let excess_bits = self.capacity_bits % 64;
258-
if excess_bits != 0 {
259-
let last_idx = self.bit_array.len() - 1;
260-
let old_count = self.bit_array[last_idx].count_ones() as u64;
261-
let mask = (1u64 << excess_bits) - 1;
262-
self.bit_array[last_idx] &= mask;
263-
let new_count = self.bit_array[last_idx].count_ones() as u64;
264-
// Adjust count for masked-off bits
265-
num_bits_set = num_bits_set - old_count + new_count;
266248
}
267-
268-
self.num_bits_set = num_bits_set;
249+
self.num_bits_set = self.capacity() as u64 - self.num_bits_set;
269250
}
270251

271252
/// Returns whether the filter is empty (no items inserted).
@@ -281,8 +262,8 @@ impl BloomFilter {
281262
}
282263

283264
/// Returns the total number of bits in the filter (capacity).
284-
pub fn capacity(&self) -> u64 {
285-
self.capacity_bits
265+
pub fn capacity(&self) -> usize {
266+
self.bit_array.len() * 64
286267
}
287268

288269
/// Returns the number of hash functions used.
@@ -300,7 +281,7 @@ impl BloomFilter {
300281
/// Values near 0.5 indicate the filter is approaching saturation.
301282
/// Values above 0.5 indicate degraded false positive rates.
302283
pub fn load_factor(&self) -> f64 {
303-
self.num_bits_set as f64 / self.capacity_bits as f64
284+
self.num_bits_set as f64 / self.capacity() as f64
304285
}
305286

306287
/// Estimates the current false positive probability.
@@ -327,8 +308,8 @@ impl BloomFilter {
327308
/// - Capacity (number of bits)
328309
/// - Number of hash functions
329310
/// - Seed
330-
pub fn is_compatible(&self, other: &BloomFilter) -> bool {
331-
self.capacity_bits == other.capacity_bits
311+
pub fn is_compatible(&self, other: &Self) -> bool {
312+
self.bit_array.len() == other.bit_array.len()
332313
&& self.num_hashes == other.num_hashes
333314
&& self.seed == other.seed
334315
}
@@ -374,8 +355,8 @@ impl BloomFilter {
374355

375356
bytes.write_u64_le(self.seed);
376357

377-
// Bit array capacity stored as number of 64-bit words (int32) + unused padding (uint32)
378-
let num_longs = (self.capacity_bits / 64) as i32;
358+
// Bit array capacity is stored as number of 64-bit words (int32) + unused padding (uint32).
359+
let num_longs = self.bit_array.len() as i32;
379360
bytes.write_i32_le(num_longs);
380361
bytes.write_u32_le(0); // unused
381362

@@ -454,6 +435,13 @@ impl BloomFilter {
454435
let num_hashes = cursor
455436
.read_u16_le()
456437
.map_err(|_| Error::insufficient_data("num_hashes"))?;
438+
if num_hashes == 0 || num_hashes > i16::MAX as u16 {
439+
return Err(Error::deserial(format!(
440+
"invalid num_hashes: expected [1, {}], got {}",
441+
i16::MAX,
442+
num_hashes
443+
)));
444+
}
457445
// Bytes 6-7: unused (u16)
458446
let _unused = cursor
459447
.read_u16_le()
@@ -462,17 +450,23 @@ impl BloomFilter {
462450
.read_u64_le()
463451
.map_err(|_| Error::insufficient_data("seed"))?;
464452

465-
// Bit array capacity stored as number of 64-bit words (int32) + unused padding (uint32)
453+
// Bit array capacity is stored as number of 64-bit words (int32) + unused padding (uint32).
466454
let num_longs = cursor
467455
.read_i32_le()
468-
.map_err(|_| Error::insufficient_data("num_longs"))? as u64;
456+
.map_err(|_| Error::insufficient_data("num_longs"))?;
469457
let _unused = cursor
470458
.read_u32_le()
471459
.map_err(|_| Error::insufficient_data("unused"))?;
472460

473-
let capacity_bits = num_longs * 64; // Convert longs to bits
461+
if num_longs <= 0 {
462+
return Err(Error::deserial(format!(
463+
"invalid num_longs: expected at least 1, got {}",
464+
num_longs
465+
)));
466+
}
467+
474468
let num_words = num_longs as usize;
475-
let mut bit_array = vec![0u64; num_words];
469+
let mut bit_array = vec![0u64; num_words].into_boxed_slice();
476470
let num_bits_set;
477471

478472
if is_empty {
@@ -493,14 +487,21 @@ impl BloomFilter {
493487
if raw_num_bits_set == DIRTY_BITS_VALUE {
494488
num_bits_set = bit_array.iter().map(|w| w.count_ones() as u64).sum();
495489
} else {
490+
let raw_num_words_set = raw_num_bits_set.div_ceil(64) as usize;
491+
if raw_num_words_set > num_words {
492+
return Err(Error::deserial(format!(
493+
"invalid num_bits_set: expected <= {}, got {}",
494+
num_words * 64,
495+
raw_num_bits_set
496+
)));
497+
}
496498
num_bits_set = raw_num_bits_set;
497499
}
498500
}
499501

500502
Ok(BloomFilter {
501503
seed,
502504
num_hashes,
503-
capacity_bits,
504505
num_bits_set,
505506
bit_array,
506507
})
@@ -554,7 +555,7 @@ impl BloomFilter {
554555
/// The right shift by 1 improves bit distribution. The index `i` is 1-based.
555556
fn compute_bit_index(&self, h0: u64, h1: u64, i: u16) -> u64 {
556557
let hash = h0.wrapping_add(u64::from(i).wrapping_mul(h1));
557-
(hash >> 1) % self.capacity_bits
558+
(hash >> 1) % self.capacity() as u64
558559
}
559560

560561
/// Gets the value of a single bit.
@@ -598,6 +599,13 @@ mod tests {
598599
assert_eq!(filter.num_hashes(), 5);
599600
}
600601

602+
#[test]
603+
fn test_builder_with_size_rounds_to_word_boundary() {
604+
let filter = BloomFilterBuilder::with_size(1, 3).build();
605+
assert_eq!(filter.capacity(), 64);
606+
assert_eq!(filter.num_hashes(), 3);
607+
}
608+
601609
#[test]
602610
fn test_insert_and_contains() {
603611
let mut filter = BloomFilterBuilder::with_accuracy(100, 0.01).build();

0 commit comments

Comments
 (0)