Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions vortex-duckdb/cpp/expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ extern "C" duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr ffi_expr
return static_cast<duckdb_vx_expr_class>(expr->GetExpressionClass());
}

extern "C" duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr ffi_expr) {
if (!ffi_expr) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use D_ASSERT(ffi_expr); instead of if here.
I'll migrate other code to this macro as well.

return nullptr;
}
auto expr = reinterpret_cast<Expression *>(ffi_expr);
return reinterpret_cast<duckdb_logical_type>(&expr->return_type);
}

extern "C" const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr ffi_expr) {
if (!ffi_expr) {
return nullptr;
Expand Down
4 changes: 4 additions & 0 deletions vortex-duckdb/cpp/include/duckdb_vx/expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ typedef enum DUCKDB_VX_EXPR_TYPE {

duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr expr);

/// Return the (bound) return type of the expression. The logical type is borrowed from the
/// expression and must not be freed.
duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr expr);

const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr expr);

duckdb_value duckdb_vx_expr_bound_constant_get_value(duckdb_vx_expr expr);
Expand Down
46 changes: 46 additions & 0 deletions vortex-duckdb/src/convert/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use vortex::expr::get_item;
use vortex::expr::is_not_null;
use vortex::expr::is_null;
use vortex::expr::list_contains;
use vortex::expr::list_length;
use vortex::expr::lit;
use vortex::expr::not;
use vortex::expr::or_collect;
Expand All @@ -37,6 +38,7 @@ use vortex::scalar_fn::fns::like::LikeOptions;
use vortex::scalar_fn::fns::literal::Literal;
use vortex::scalar_fn::fns::operators::Operator;

use crate::cpp::DUCKDB_TYPE;
use crate::cpp::DUCKDB_VX_EXPR_TYPE;
use crate::duckdb;
use crate::duckdb::BoundFunction;
Expand All @@ -57,6 +59,15 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult<String> {
}
}

/// Returns true if the expression's bound return type is a `LIST` or fixed-size `ARRAY`. Used to
/// disambiguate the overloaded `len`/`length` functions, which also apply to strings and bits.
fn is_list_typed(expr: &duckdb::ExpressionRef) -> bool {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

returns_a_list? Let's move the comment about overloaded functions to the invocation point.

matches!(
expr.return_type().as_type_id(),
DUCKDB_TYPE::DUCKDB_TYPE_LIST | DUCKDB_TYPE::DUCKDB_TYPE_ARRAY
)
}

fn try_from_bound_function(
func: &BoundFunction,
col_sub: Option<&Expression>,
Expand Down Expand Up @@ -115,6 +126,25 @@ fn try_from_bound_function(
};
Like.new_expr(LikeOptions::default(), [value, lit(pattern)])
}
// `array_length` is list-only, but `len`/`length` are also defined for strings and bits,
// so we gate on the argument's bound return type being a list or fixed-size array.
"len" | "length" | "array_length" => {
let children: Vec<_> = func.children().collect();
// Only the single-argument form maps to list_length; the two-argument
// (dimension) form of array_length has different semantics.
if children.len() != 1 || !is_list_typed(children[0]) {
return Ok(None);
}
let Some(col) = try_from_expression_inner(children[0], col_sub)? else {
return Ok(None);
};
let col = list_length(col);
// list_length returns u64, len()/array_length() return i64. We don't know the
// column's nullability here, so we set it to nullable; for a non-nullable
// column the validity will be AllValid so it's a marginal cost.
let dtype = DType::Primitive(PType::I64, Nullability::Nullable);
cast(col, dtype)
}
_ => {
debug!("bound function {}", func.scalar_function.name());
return Ok(None);
Expand Down Expand Up @@ -173,6 +203,10 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
|| name == "~~"
|| name == "!~~"
|| name == "strlen"
|| (matches!(name, "len" | "length" | "array_length") && {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's move this to a separate function?

let children: Vec<_> = func.children().collect();
children.len() == 1 && is_list_typed(children[0])
})
}
ExpressionClass::BoundOperator(op) => {
if !matches!(
Expand Down Expand Up @@ -208,6 +242,18 @@ pub fn try_from_projection_expression(
let col = cast(col, dtype);
Some(col)
}
// `len`/`length`/`array_length` over a list column. The column dtype is known here, so

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can reference the comment in can_pushdown_* without duplicating it

// unlike the filter path we can safely accept the overloaded `len`/`length` names by
// gating on the field being a list. Only the single-argument form is supported.
"len" | "length" | "array_length"
if matches!(field.dtype, DType::List(..) | DType::FixedSizeList(..))
&& func.children().count() == 1 =>
{
let col = list_length(get_item(field.name.as_str(), root()));
// list_length returns u64, len()/array_length() return i64 (BIGINT).
let dtype = DType::Primitive(PType::I64, field.dtype.nullability());
Some(cast(col, dtype))
}
_ => None,
})
}
Expand Down
7 changes: 7 additions & 0 deletions vortex-duckdb/src/duckdb/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use std::ptr;
use crate::cpp;
use crate::cpp::duckdb_vx_expr_class;
use crate::duckdb::DDBString;
use crate::duckdb::LogicalType;
use crate::duckdb::LogicalTypeRef;
use crate::duckdb::ScalarFunction;
use crate::duckdb::ScalarFunctionRef;
use crate::duckdb::Value;
Expand All @@ -33,6 +35,11 @@ impl ExpressionRef {
unsafe { cpp::duckdb_vx_expr_get_class(self.as_ptr()) }
}

/// The (bound) return type of this expression. Borrowed from the expression.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we care whether the return type is bound?
Filter pushdown takes place after bind() is called, so everything you can get is bound, not sure it's worth mentioning specifically.

pub fn return_type(&self) -> &LogicalTypeRef {
unsafe { LogicalType::borrow(cpp::duckdb_vx_expr_get_return_type(self.as_ptr())) }
}

/// Match the subclass of the expression.
pub fn as_class(&self) -> Option<ExpressionClass<'_>> {
Some(
Expand Down
98 changes: 98 additions & 0 deletions vortex-duckdb/src/e2e_test/vortex_scan_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -992,3 +992,101 @@ fn test_geometry() {
let area = vec.as_slice_with_len::<f64>(chunk.len().as_())[0];
assert_eq!(area, 1000.0);
}

/// `SELECT array_length(list)` / `len(list)` / `length(list)` should push the list-length
/// computation into the Vortex scan (computed from offsets, without materializing the list
/// elements) and return the per-row element counts.
#[test]
fn test_vortex_scan_list_length_projection() {
let file = RUNTIME.block_on(async {
let integers = PrimitiveArray::from_iter([
10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
]);
// Variable-length lists with 3, 4, 1, 5, 2 elements respectively.
let offsets = buffer![0i32, 3, 7, 8, 13, 15];
let list_array = ListArray::try_new(
integers.into_array(),
offsets.into_array(),
Validity::AllValid,
)
.unwrap();

write_single_column_vortex_file("int_list", list_array).await
});

let conn = database_connection();
let file_path = file.path().to_string_lossy();

// `len`/`length` bind to the same DuckDB function set as `array_length` for list arguments.
for func in ["array_length", "len", "length"] {
let result = conn
.query(&format!("SELECT {func}(int_list) FROM '{file_path}'"))
.unwrap();

let mut lengths = Vec::new();
for chunk in result {
let len = chunk.len().as_();
let vec = chunk.get_vector(0);
lengths.extend_from_slice(vec.as_slice_with_len::<i64>(len));
}

assert_eq!(lengths, vec![3, 4, 1, 5, 2], "{func}(int_list) mismatch");
}
}

/// `WHERE array_length(list) >= k` should push down as a complex filter.
#[test]
fn test_vortex_scan_list_length_filter() {
let file = RUNTIME.block_on(async {
let integers = PrimitiveArray::from_iter([
10i32, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
]);
// Variable-length lists with 3, 4, 1, 5, 2 elements respectively.
let offsets = buffer![0i32, 3, 7, 8, 13, 15];
let list_array = ListArray::try_new(
integers.into_array(),
offsets.into_array(),
Validity::AllValid,
)
.unwrap();

write_single_column_vortex_file("int_list", list_array).await
});

// Lists with length >= 4: the 4-element and 5-element lists => 2 rows.
let count = scan_vortex_file_single_row::<i64, i64>(
file,
"SELECT COUNT(*) FROM ? WHERE array_length(int_list) >= 4",
0,
);
assert_eq!(count, 2);
}

/// `array_length`/`len`/`length` over a FixedSizeList column. The length is the fixed list size.
#[test]
fn test_vortex_scan_fixed_size_list_length_projection() {
let file = RUNTIME.block_on(async {
// 6 fixed-size lists of 4 i32 elements each.
let elements = (0..24i32).collect::<PrimitiveArray>();
let fsl = FixedSizeListArray::new(elements.into_array(), 4, Validity::AllValid, 6);
write_single_column_vortex_file("int_lists", fsl).await
});

let conn = database_connection();
let file_path = file.path().to_string_lossy();

for func in ["array_length", "len", "length"] {
let result = conn
.query(&format!("SELECT {func}(int_lists) FROM '{file_path}'"))
.unwrap();

let mut lengths = Vec::new();
for chunk in result {
let len = chunk.len().as_();
let vec = chunk.get_vector(0);
lengths.extend_from_slice(vec.as_slice_with_len::<i64>(len));
}

assert_eq!(lengths, vec![4i64; 6], "{func}(int_lists) mismatch");
}
}
Loading