From afe48ee811cade2489643adfb7bbb0c3a215621c Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sat, 25 Apr 2026 22:27:40 +0100 Subject: [PATCH 1/2] Report column size stat for primitive columns in df Signed-off-by: Adam Gutglick --- vortex-datafusion/src/persistent/format.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index cb24be972c2..bad455e6955 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -511,6 +511,9 @@ impl FileFormat for VortexFormat { }; let mut column_statistics = Vec::with_capacity(table_schema.fields().len()); + let num_rows = usize::try_from(row_count) + .map_err(|_| vortex_err!("Row count overflow")) + .vortex_expect("Row count overflow"); for field in table_schema.fields().iter() { // If the column does not exist, continue. This can happen if the schema has evolved @@ -523,8 +526,15 @@ impl FileFormat for VortexFormat { let (stats_set, stats_dtype) = file_stats.get(col_idx); // Update the total size in bytes. 
- let column_size = - stats_set.get_as::<usize>(Stat::UncompressedSizeInBytes, &PType::U64.into()); + + let column_size = stats_set + .get_as::<usize>(Stat::UncompressedSizeInBytes, &PType::U64.into()) + .or_else(|| { + field + .data_type() + .primitive_width() + .map(|width| stats::Precision::Exact(width * num_rows)) + }); let target_dtype = DType::from_arrow(field.as_ref()); let min = scalar_stat_to_df( @@ -562,11 +572,7 @@ impl FileFormat for VortexFormat { .fold(Precision::Exact(0), |acc, cs| acc.add(&cs.byte_size)); Ok(Statistics { - num_rows: Precision::Exact( - usize::try_from(row_count) - .map_err(|_| vortex_err!("Row count overflow")) - .vortex_expect("Row count overflow"), - ), + num_rows: Precision::Exact(num_rows), total_byte_size, column_statistics, }) From ca6942fe6101455d338a5008a46662068cb72768 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sat, 25 Apr 2026 22:39:47 +0100 Subject: [PATCH 2/2] checked mul Signed-off-by: Adam Gutglick --- vortex-datafusion/src/persistent/format.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index bad455e6955..2802bdc8616 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -530,10 +530,9 @@ impl FileFormat for VortexFormat { let column_size = stats_set .get_as::<usize>(Stat::UncompressedSizeInBytes, &PType::U64.into()) .or_else(|| { - field - .data_type() - .primitive_width() - .map(|width| stats::Precision::Exact(width * num_rows)) + field.data_type().primitive_width().and_then(|width| { + width.checked_mul(num_rows).map(stats::Precision::Exact) + }) }); let target_dtype = DType::from_arrow(field.as_ref());