11"""Contiguous array storage implementation for DDSketch using offset-based indexing.
22
33Optimized for high throughput by using Python lists instead of numpy arrays
4- and adopting Datadog's chunk-based dynamic growth pattern.
4+ and chunk-based dynamic growth pattern.
55"""
66
import bisect
import itertools
import math
import warnings

from .base import Storage
1010
1111
12- # Chunk size for dynamic growth (matches Datadog's default)
12+ # Chunk size for dynamic growth
1313CHUNK_SIZE = 128
1414
1515
@@ -32,7 +32,8 @@ class ContiguousStorage(Storage):
3232
3333 __slots__ = ('count' , 'bins' , 'min_key' , 'max_key' ,
3434 'offset' , 'collapse_count' , 'bin_limit' ,
35- 'chunk_size' , 'is_collapsed' )
35+ 'chunk_size' , 'is_collapsed' ,
36+ '_cumulative_sums' , '_cumulative_valid' )
3637
3738 def __init__ (self , bin_limit : int = 2048 , chunk_size : int = CHUNK_SIZE , max_buckets : int = None ):
3839 """
@@ -51,7 +52,7 @@ def __init__(self, bin_limit: int = 2048, chunk_size: int = CHUNK_SIZE, max_buck
5152 raise ValueError ("bin_limit must be positive for ContiguousStorage" )
5253
5354 # Don't call super().__init__ to avoid overhead - inline what we need
54- self .count = 0.0 # Use float like Datadog for weighted values
55+ self .count = 0.0 # Use float for weighted values
5556 self .bins = [] # Start empty, grow dynamically
5657 self .bin_limit = bin_limit
5758 self .chunk_size = chunk_size
@@ -60,6 +61,9 @@ def __init__(self, bin_limit: int = 2048, chunk_size: int = CHUNK_SIZE, max_buck
6061 self .offset = 0
6162 self .collapse_count = 0
6263 self .is_collapsed = False
64+ # Lazy cumulative sums for O(log n) quantile queries
65+ self ._cumulative_sums = []
66+ self ._cumulative_valid = False
6367
6468 @property
6569 def total_count (self ):
@@ -118,19 +122,29 @@ def add(self, key, weight=1.0):
118122 idx = self ._get_index (key )
119123 self .bins [idx ] += weight
120124 self .count += weight
125+ self ._cumulative_valid = False
121126
122127 def _get_index (self , key ):
123- """Calculate the bin index for the key, extending the range if necessary."""
124- if self .min_key is None :
128+ """Calculate the bin index for the key, extending the range if necessary.
129+
130+ Optimized for the common case where key is within the existing range.
131+ """
132+ # Fast path: key is within existing range (most common case)
133+ min_key = self .min_key
134+ if min_key is not None and min_key <= key <= self .max_key :
135+ return key - self .offset
136+
137+ # Slow path: need to extend range or handle edge cases
138+ if min_key is None :
125139 # First insertion
126140 self ._extend_range (key )
127- elif key < self . min_key :
141+ elif key < min_key :
128142 if self .is_collapsed :
129143 return 0
130144 self ._extend_range (key )
131145 if self .is_collapsed :
132146 return 0
133- elif key > self .max_key :
147+ else : # key > self.max_key
134148 self ._extend_range (key )
135149
136150 return key - self .offset
@@ -241,6 +255,7 @@ def remove(self, bucket_index: int, count: int = 1) -> bool:
241255
242256 self .bins [pos ] = max (0 , old_count - count )
243257 self .count = max (0 , self .count - count )
258+ self ._cumulative_valid = False
244259
245260 # Update min/max keys if we emptied a boundary bucket
246261 if old_count > 0 and self .bins [pos ] == 0 :
@@ -282,11 +297,27 @@ def get_count(self, bucket_index: int) -> int:
282297 return 0
283298 return int (self .bins [pos ])
284299
300+ def _rebuild_cumulative_sums (self ):
301+ """Rebuild cumulative sums array for O(log n) rank queries."""
302+ bins = self .bins
303+ n = len (bins )
304+ if n == 0 :
305+ self ._cumulative_sums = []
306+ else :
307+ # Build cumulative sums
308+ cumsum = [0.0 ] * n
309+ running = 0.0
310+ for i in range (n ):
311+ running += bins [i ]
312+ cumsum [i ] = running
313+ self ._cumulative_sums = cumsum
314+ self ._cumulative_valid = True
315+
285316 def key_at_rank (self , rank , lower = True ):
286317 """
287318 Return the key for the value at given rank.
288319
289- This method is compatible with Datadog's interface .
320+ Uses lazy cumulative sums and binary search for O(log n) performance .
290321
291322 Args:
292323 rank: The rank to find.
@@ -296,11 +327,37 @@ def key_at_rank(self, rank, lower=True):
296327 Returns:
297328 The key at the specified rank.
298329 """
299- running_ct = 0.0
300- for i , bin_ct in enumerate (self .bins ):
301- running_ct += bin_ct
302- if (lower and running_ct > rank ) or (not lower and running_ct >= rank + 1 ):
303- return i + self .offset
330+ if not self ._cumulative_valid :
331+ self ._rebuild_cumulative_sums ()
332+
333+ cumsum = self ._cumulative_sums
334+ n = len (cumsum )
335+ if n == 0 :
336+ return self .max_key if self .max_key is not None else 0
337+
338+ # Use binary search for O(log n) lookup
339+ # Binary search to find first index where condition is true
340+ lo , hi = 0 , n
341+ if lower :
342+ # Find first index where cumsum[i] > rank
343+ while lo < hi :
344+ mid = (lo + hi ) >> 1
345+ if cumsum [mid ] > rank :
346+ hi = mid
347+ else :
348+ lo = mid + 1
349+ else :
350+ # Find first index where cumsum[i] >= rank + 1
351+ target = rank + 1
352+ while lo < hi :
353+ mid = (lo + hi ) >> 1
354+ if cumsum [mid ] >= target :
355+ hi = mid
356+ else :
357+ lo = mid + 1
358+
359+ if lo < n :
360+ return lo + self .offset
304361
305362 return self .max_key if self .max_key is not None else 0
306363
@@ -329,6 +386,7 @@ def merge(self, other: 'ContiguousStorage'):
329386 self .bins [self_idx ] += other .bins [other_idx ]
330387
331388 self .count += other .count
389+ self ._cumulative_valid = False
332390
333391 def copy (self , store : 'ContiguousStorage' ):
334392 """Copy another storage into this one."""
@@ -339,3 +397,4 @@ def copy(self, store: 'ContiguousStorage'):
339397 self .offset = store .offset
340398 self .is_collapsed = store .is_collapsed
341399 self .collapse_count = store .collapse_count
400+ self ._cumulative_valid = False
0 commit comments