Skip to content

Commit 2f94e47

Browse files
committed
add ArrayAndStats
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent a083533 commit 2f94e47

12 files changed

Lines changed: 334 additions & 412 deletions

File tree

vortex-btrblocks/src/builder.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[
1717
// Integer schemes.
1818
&crate::compressor::integer::UncompressedScheme as &dyn Scheme,
1919
&crate::compressor::integer::ConstantScheme,
20+
// NOTE: FoR must precede BitPacking to avoid unnecessary patches.
2021
&crate::compressor::integer::FORScheme,
21-
&crate::compressor::integer::ZigZagScheme,
2222
&crate::compressor::integer::BitPackingScheme,
23+
&crate::compressor::integer::ZigZagScheme,
2324
&crate::compressor::integer::SparseScheme,
2425
&crate::compressor::integer::DictScheme,
2526
&crate::compressor::integer::RunEndScheme,

vortex-btrblocks/src/canonical_compressor.rs

Lines changed: 17 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ use vortex_array::arrays::ExtensionArray;
1616
use vortex_array::arrays::FixedSizeListArray;
1717
use vortex_array::arrays::ListArray;
1818
use vortex_array::arrays::ListViewArray;
19-
use vortex_array::arrays::Primitive;
2019
use vortex_array::arrays::StructArray;
2120
use vortex_array::arrays::TemporalArray;
2221
use vortex_array::arrays::listview::list_from_list_view;
@@ -30,18 +29,14 @@ use vortex_array::scalar::Scalar;
3029
use vortex_array::vtable::ValidityHelper;
3130
use vortex_error::VortexResult;
3231

32+
use crate::ArrayAndStats;
3333
use crate::BtrBlocksCompressorBuilder;
3434
use crate::CompressorContext;
35-
use crate::CompressorStats;
3635
use crate::GenerateStatsOptions;
3736
use crate::Scheme;
3837
use crate::SchemeId;
39-
use crate::StatsCache;
4038
use crate::compressor::decimal::compress_decimal;
41-
use crate::compressor::float::FloatStats;
4239
use crate::compressor::integer::DictScheme as IntDictScheme;
43-
use crate::compressor::integer::IntegerStats;
44-
use crate::compressor::string::StringStats;
4540
use crate::compressor::temporal::compress_temporal;
4641

4742
/// The main compressor type implementing BtrBlocks-inspired compression.
@@ -223,69 +218,41 @@ impl BtrBlocksCompressor {
223218
}
224219

225220
let before_nbytes = array.nbytes();
226-
let needs_distinct = eligible.iter().any(|s| s.needs_distinct_values());
227-
let mut cache = StatsCache::new();
228-
229-
// Pre-populate the stats cache with the right `count_distinct_values` setting.
230-
// This matches the old `gen_stats` behavior where distinct values were only computed
231-
// when Dict was in the scheme list.
232-
if let Some(prim) = array.as_opt::<Primitive>() {
233-
let prim = prim.to_primitive();
234-
if prim.ptype().is_int() {
235-
cache.get_or_insert_with::<IntegerStats>(|| {
236-
IntegerStats::generate_opts(
237-
&prim,
238-
GenerateStatsOptions {
239-
count_distinct_values: needs_distinct,
240-
},
241-
)
242-
});
243-
} else {
244-
cache.get_or_insert_with::<FloatStats>(|| {
245-
FloatStats::generate_opts(
246-
&prim,
247-
GenerateStatsOptions {
248-
count_distinct_values: needs_distinct,
249-
},
250-
)
251-
});
252-
}
253-
} else if array.as_opt::<vortex_array::arrays::VarBinView>().is_some() {
254-
cache.get_or_insert_with::<StringStats>(|| {
255-
StringStats::generate_opts(
256-
&array.to_varbinview(),
257-
GenerateStatsOptions {
258-
count_distinct_values: needs_distinct,
259-
},
260-
)
221+
let merged_opts = eligible
222+
.iter()
223+
.fold(GenerateStatsOptions::default(), |acc, s| {
224+
acc.merge(s.stats_options())
261225
});
262-
}
263226

264-
if let Some(winner) = self.choose_scheme(&eligible, &array, ctx, &mut cache, excludes)? {
265-
let compressed = winner.compress(self, &array, ctx, &mut cache, excludes)?;
227+
let mut ctx = ctx;
228+
ctx.stats_options = merged_opts;
229+
230+
let mut data = ArrayAndStats::new(array, merged_opts);
231+
232+
if let Some(winner) = self.choose_scheme(&eligible, &mut data, ctx, excludes)? {
233+
let compressed = winner.compress(self, &mut data, ctx, excludes)?;
266234
if compressed.nbytes() < before_nbytes {
267235
return Ok(compressed);
268236
}
269237
}
270238

271239
// No scheme improved on the original.
272-
Ok(array)
240+
Ok(data.into_array())
273241
}
274242

275243
/// Evaluates each candidate scheme and returns the one with the best compression ratio
276244
/// (must be > 1.0).
277245
fn choose_scheme(
278246
&self,
279247
schemes: &[&'static dyn Scheme],
280-
array: &ArrayRef,
248+
data: &mut ArrayAndStats,
281249
ctx: CompressorContext,
282-
cache: &mut StatsCache,
283250
excludes: &[SchemeId],
284251
) -> VortexResult<Option<&'static dyn Scheme>> {
285252
let mut best: Option<(&'static dyn Scheme, f64)> = None;
286253

287254
for &scheme in schemes {
288-
let ratio = self.evaluate_scheme(scheme, array, ctx, cache, excludes)?;
255+
let ratio = self.evaluate_scheme(scheme, data, ctx, excludes)?;
289256
if is_valid_ratio(ratio) && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) {
290257
best = Some((scheme, ratio));
291258
}
@@ -298,12 +265,11 @@ impl BtrBlocksCompressor {
298265
fn evaluate_scheme(
299266
&self,
300267
scheme: &'static dyn Scheme,
301-
array: &ArrayRef,
268+
data: &mut ArrayAndStats,
302269
ctx: CompressorContext,
303-
cache: &mut StatsCache,
304270
excludes: &[SchemeId],
305271
) -> VortexResult<f64> {
306-
let ratio = scheme.expected_compression_ratio(self, array, ctx, cache, excludes)?;
272+
let ratio = scheme.expected_compression_ratio(self, data, ctx, excludes)?;
307273

308274
tracing::debug!(
309275
scheme = %scheme.id(),

0 commit comments

Comments
 (0)