From 25478ef29db75bbb4984a3951c5809585657e996 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 6 Mar 2026 14:32:52 -0500 Subject: [PATCH 1/4] add UUID extension type Signed-off-by: Connor Tsui --- Cargo.lock | 1 + vortex-array/Cargo.toml | 1 + vortex-array/public-api.lock | 174 +++++++++ vortex-array/src/extension/mod.rs | 1 + vortex-array/src/extension/uuid/metadata.rs | 14 + vortex-array/src/extension/uuid/mod.rs | 20 ++ vortex-array/src/extension/uuid/vtable.rs | 205 +++++++++++ vortex-array/src/scalar_fn/fns/mod.rs | 1 + .../src/scalar_fn/fns/uuid_from_string.rs | 330 ++++++++++++++++++ 9 files changed, 747 insertions(+) create mode 100644 vortex-array/src/extension/uuid/metadata.rs create mode 100644 vortex-array/src/extension/uuid/mod.rs create mode 100644 vortex-array/src/extension/uuid/vtable.rs create mode 100644 vortex-array/src/scalar_fn/fns/uuid_from_string.rs diff --git a/Cargo.lock b/Cargo.lock index 4bd2006c54e..ce332cca287 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9731,6 +9731,7 @@ dependencies = [ "tabled", "termtree", "tracing", + "uuid", "vortex-array", "vortex-buffer", "vortex-error", diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index e915c8e2002..575a41459b9 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -67,6 +67,7 @@ tabled = { workspace = true, optional = true, default-features = false, features ] } termtree = { workspace = true } tracing = { workspace = true } +uuid = { workspace = true } vortex-buffer = { workspace = true, features = ["arrow"] } vortex-error = { workspace = true, features = ["flatbuffers"] } vortex-flatbuffers = { workspace = true, features = ["array", "dtype"] } diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 3faed612fa6..05d01075f47 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -7038,6 +7038,24 @@ pub fn vortex_array::extension::datetime::Timestamp::validate_dtype(&self, _meta pub fn vortex_array::extension::datetime::Timestamp::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> +impl vortex_array::dtype::extension::ExtVTable for vortex_array::extension::uuid::Uuid + +pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension::uuid::UuidMetadata + +pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId + +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult> + +pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, _metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> + +pub fn vortex_array::extension::uuid::Uuid::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> + pub trait vortex_array::dtype::extension::Matcher pub type vortex_array::dtype::extension::Matcher::Match<'a> @@ -11002,6 +11020,82 @@ pub fn vortex_array::extension::datetime::TimestampOptions::hash<__H: core::hash impl core::marker::StructuralPartialEq for vortex_array::extension::datetime::TimestampOptions +pub mod vortex_array::extension::uuid + +pub struct vortex_array::extension::uuid::Uuid + +impl core::clone::Clone for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::clone(&self) -> vortex_array::extension::uuid::Uuid + +impl core::cmp::Eq for vortex_array::extension::uuid::Uuid + +impl core::cmp::PartialEq for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::eq(&self, other: &vortex_array::extension::uuid::Uuid) -> bool + +impl core::default::Default for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::default() -> vortex_array::extension::uuid::Uuid + +impl core::fmt::Debug for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::StructuralPartialEq for vortex_array::extension::uuid::Uuid + +impl vortex_array::dtype::extension::ExtVTable for vortex_array::extension::uuid::Uuid + +pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension::uuid::UuidMetadata + +pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId + +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult> + +pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, _metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> + +pub fn vortex_array::extension::uuid::Uuid::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> + +pub struct vortex_array::extension::uuid::UuidMetadata + +impl core::clone::Clone for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::clone(&self) -> vortex_array::extension::uuid::UuidMetadata + +impl core::cmp::Eq for vortex_array::extension::uuid::UuidMetadata + +impl core::cmp::PartialEq for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::eq(&self, other: &vortex_array::extension::uuid::UuidMetadata) -> bool + +impl core::default::Default for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::default() -> vortex_array::extension::uuid::UuidMetadata + +impl core::fmt::Debug for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::StructuralPartialEq for vortex_array::extension::uuid::UuidMetadata + pub struct vortex_array::extension::EmptyMetadata impl core::clone::Clone for vortex_array::extension::EmptyMetadata @@ -15098,6 +15192,50 @@ pub fn vortex_array::scalar_fn::fns::select::Select::stat_falsification(&self, o pub fn vortex_array::scalar_fn::fns::select::Select::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> +pub mod vortex_array::scalar_fn::fns::uuid_from_string + +pub struct vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString + +impl core::clone::Clone for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::clone(&self) -> vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString + +impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString + +pub type vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::Options = vortex_array::scalar_fn::EmptyOptions + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::child_name(&self, _options: &Self::Options, child_idx: usize) -> vortex_array::scalar_fn::ChildName + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::execute(&self, _options: &Self::Options, args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::fmt_sql(&self, _options: &Self::Options, expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_fallible(&self, _options: &Self::Options) -> bool + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_null_sensitive(&self, _options: &Self::Options) -> bool + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::return_dtype(&self, _options: &Self::Options, arg_dtypes: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult>> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::validity(&self, _options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> + pub mod vortex_array::scalar_fn::fns::zip pub struct vortex_array::scalar_fn::fns::zip::Zip @@ -16126,6 +16264,42 @@ pub fn vortex_array::scalar_fn::fns::select::Select::stat_falsification(&self, o pub fn vortex_array::scalar_fn::fns::select::Select::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> +impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString + +pub type vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::Options = vortex_array::scalar_fn::EmptyOptions + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::child_name(&self, _options: &Self::Options, child_idx: usize) -> vortex_array::scalar_fn::ChildName + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::execute(&self, _options: &Self::Options, args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::fmt_sql(&self, _options: &Self::Options, expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::id(&self) -> vortex_array::scalar_fn::ScalarFnId + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_fallible(&self, _options: &Self::Options) -> bool + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_null_sensitive(&self, _options: &Self::Options) -> bool + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::return_dtype(&self, _options: &Self::Options, arg_dtypes: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult>> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult> + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option + +pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::validity(&self, _options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> + impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::zip::Zip pub type vortex_array::scalar_fn::fns::zip::Zip::Options = vortex_array::scalar_fn::EmptyOptions diff --git a/vortex-array/src/extension/mod.rs b/vortex-array/src/extension/mod.rs index 5c29154ded0..9f81e7fb310 100644 --- a/vortex-array/src/extension/mod.rs +++ b/vortex-array/src/extension/mod.rs @@ -6,6 +6,7 @@ use std::fmt; pub mod datetime; +pub mod uuid; #[cfg(test)] mod tests; diff --git a/vortex-array/src/extension/uuid/metadata.rs b/vortex-array/src/extension/uuid/metadata.rs new file mode 100644 index 00000000000..133646eba41 --- /dev/null +++ b/vortex-array/src/extension/uuid/metadata.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; + +/// Metadata for the UUID extension type, which is empty. +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct UuidMetadata; + +impl fmt::Display for UuidMetadata { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "UUID") + } +} diff --git a/vortex-array/src/extension/uuid/mod.rs b/vortex-array/src/extension/uuid/mod.rs new file mode 100644 index 00000000000..e4347c2513c --- /dev/null +++ b/vortex-array/src/extension/uuid/mod.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! UUID extension type for Vortex. +//! +//! Provides a UUID extension type backed by `FixedSizeList(Primitive(U8), 16)` storage. Each UUID +//! is stored as 16 bytes in big-endian (network) byte order, matching [RFC 4122] and Arrow's +//! [canonical UUID extension]. +//! +//! [RFC 4122]: https://www.rfc-editor.org/rfc/rfc4122 +//! [canonical UUID extension]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid + +mod metadata; +pub use metadata::UuidMetadata; + +pub(crate) mod vtable; + +/// The VTable for the UUID extension type. +#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] +pub struct Uuid; diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs new file mode 100644 index 00000000000..9ea35dac217 --- /dev/null +++ b/vortex-array/src/extension/uuid/vtable.rs @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use uuid; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_ensure_eq; + +use crate::dtype::DType; +use crate::dtype::PType; +use crate::dtype::extension::ExtId; +use crate::dtype::extension::ExtVTable; +use crate::extension::uuid::Uuid; +use crate::extension::uuid::UuidMetadata; +use crate::scalar::PValue; +use crate::scalar::ScalarValue; + +/// The number of bytes in a UUID. +pub(crate) const UUID_BYTE_LEN: usize = 16; + +impl ExtVTable for Uuid { + type Metadata = UuidMetadata; + type NativeValue<'a> = uuid::Uuid; + + fn id(&self) -> ExtId { + ExtId::new_ref("vortex.uuid") + } + + fn serialize_metadata(&self, _metadata: &Self::Metadata) -> VortexResult> { + Ok(Vec::new()) + } + + fn deserialize_metadata(&self, _metadata: &[u8]) -> VortexResult { + Ok(UuidMetadata) + } + + fn validate_dtype( + &self, + _metadata: &Self::Metadata, + storage_dtype: &DType, + ) -> VortexResult<()> { + let DType::FixedSizeList(element_dtype, list_size, _nullability) = storage_dtype else { + vortex_bail!("UUID storage dtype must be a FixedSizeList, got {storage_dtype}"); + }; + + vortex_ensure_eq!( + *list_size as usize, + UUID_BYTE_LEN, + "UUID storage FixedSizeList must have size {UUID_BYTE_LEN}, got {list_size}" + ); + + let DType::Primitive(ptype, elem_nullability) = element_dtype.as_ref() else { + vortex_bail!("UUID element dtype must be Primitive(U8), got {element_dtype}"); + }; + + vortex_ensure_eq!( + *ptype, + PType::U8, + "UUID element dtype must be U8, got {ptype}" + ); + vortex_ensure!( + !elem_nullability.is_nullable(), + "UUID element dtype must be non-nullable" + ); + + Ok(()) + } + + fn unpack_native<'a>( + &self, + _metadata: &'a Self::Metadata, + _storage_dtype: &'a DType, + storage_value: &'a ScalarValue, + ) -> VortexResult> { + let elements = storage_value.as_list(); + vortex_ensure_eq!( + elements.len(), + UUID_BYTE_LEN, + "UUID scalar must have exactly {UUID_BYTE_LEN} bytes, got {}", + elements.len() + ); + + let mut bytes = [0u8; UUID_BYTE_LEN]; + for (i, elem) in elements.iter().enumerate() { + let Some(scalar_value) = elem else { + vortex_bail!("UUID byte at index {i} must not be null"); + }; + let PValue::U8(b) = scalar_value.as_primitive() else { + vortex_bail!("UUID byte at index {i} must be U8"); + }; + bytes[i] = *b; + } + + Ok(uuid::Uuid::from_bytes(bytes)) + } +} + +#[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits both usize and u32" +)] +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_error::VortexResult; + + use crate::dtype::DType; + use crate::dtype::Nullability; + use crate::dtype::PType; + use crate::dtype::extension::ExtVTable; + use crate::extension::uuid::Uuid; + use crate::extension::uuid::UuidMetadata; + use crate::extension::uuid::vtable::UUID_BYTE_LEN; + use crate::scalar::Scalar; + + #[test] + fn roundtrip_metadata() -> VortexResult<()> { + let vtable = Uuid; + let bytes = vtable.serialize_metadata(&UuidMetadata)?; + let deserialized = vtable.deserialize_metadata(&bytes)?; + assert_eq!(deserialized, UuidMetadata); + Ok(()) + } + + #[rstest] + #[case::non_nullable(Nullability::NonNullable)] + #[case::nullable(Nullability::Nullable)] + fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> { + let storage_dtype = uuid_storage_dtype(nullability); + Uuid.validate_dtype(&UuidMetadata, &storage_dtype) + } + + #[test] + fn validate_rejects_wrong_list_size() { + let storage_dtype = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + 8, + Nullability::NonNullable, + ); + assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + } + + #[test] + fn validate_rejects_wrong_element_type() { + let storage_dtype = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U64, Nullability::NonNullable)), + UUID_BYTE_LEN as u32, + Nullability::NonNullable, + ); + assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + } + + #[test] + fn validate_rejects_nullable_elements() { + let storage_dtype = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U8, Nullability::Nullable)), + UUID_BYTE_LEN as u32, + Nullability::NonNullable, + ); + assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + } + + #[test] + fn validate_rejects_non_fsl() { + let storage_dtype = DType::Primitive(PType::U8, Nullability::NonNullable); + assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + } + + #[test] + fn unpack_native_uuid() -> VortexResult<()> { + let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + + let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); + let children: Vec = expected + .as_bytes() + .iter() + .map(|&b| Scalar::primitive(b, Nullability::NonNullable)) + .collect(); + let storage_scalar = Scalar::fixed_size_list( + DType::Primitive(PType::U8, Nullability::NonNullable), + children, + Nullability::NonNullable, + ); + + let storage_value = storage_scalar + .value() + .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; + let result = Uuid.unpack_native(&UuidMetadata, &storage_dtype, storage_value)?; + assert_eq!(result, expected); + assert_eq!(result.to_string(), "550e8400-e29b-41d4-a716-446655440000"); + Ok(()) + } + + fn uuid_storage_dtype(nullability: Nullability) -> DType { + DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + UUID_BYTE_LEN as u32, + nullability, + ) + } +} diff --git a/vortex-array/src/scalar_fn/fns/mod.rs b/vortex-array/src/scalar_fn/fns/mod.rs index 94fc8fb0384..a3cfd487dfc 100644 --- a/vortex-array/src/scalar_fn/fns/mod.rs +++ b/vortex-array/src/scalar_fn/fns/mod.rs @@ -19,4 +19,5 @@ pub mod operators; pub mod pack; pub mod root; pub mod select; +pub mod uuid_from_string; pub mod zip; diff --git a/vortex-array/src/scalar_fn/fns/uuid_from_string.rs b/vortex-array/src/scalar_fn/fns/uuid_from_string.rs new file mode 100644 index 00000000000..08fbffaf20f --- /dev/null +++ b/vortex-array/src/scalar_fn/fns/uuid_from_string.rs @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scalar function to parse UTF-8 strings into [`Uuid`] extension arrays. + +use std::fmt::Formatter; +use std::sync::Arc; + +use uuid; +use vortex_buffer::Buffer; +use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; + +use crate::ArrayRef; +use crate::DynArray; +use crate::ExecutionCtx; +use crate::IntoArray; +use crate::arrays::ExtensionArray; +use crate::arrays::FixedSizeListArray; +use crate::dtype::DType; +use crate::dtype::Nullability; +use crate::dtype::PType; +use crate::dtype::extension::ExtDType; +use crate::expr::Expression; +use crate::extension::uuid::Uuid; +use crate::extension::uuid::UuidMetadata; +use crate::extension::uuid::vtable::UUID_BYTE_LEN; +use crate::scalar_fn::Arity; +use crate::scalar_fn::ChildName; +use crate::scalar_fn::EmptyOptions; +use crate::scalar_fn::ExecutionArgs; +use crate::scalar_fn::ScalarFnId; +use crate::scalar_fn::ScalarFnVTable; + +/// Parses a UTF-8 string column into a [`Uuid`] extension array. +/// +/// Accepts any standard UUID string format (hyphenated, simple, braced, URN). Invalid strings +/// cause an error. +#[derive(Clone)] +pub struct UuidFromString; + +#[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits both usize and u32" +)] +impl ScalarFnVTable for UuidFromString { + type Options = EmptyOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::new_ref("vortex.uuid_from_string") + } + + fn arity(&self, _options: &Self::Options) -> Arity { + Arity::Exact(1) + } + + fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName { + match child_idx { + 0 => ChildName::from("input"), + _ => unreachable!("uuid_from_string must have exactly one child"), + } + } + + fn fmt_sql( + &self, + _options: &Self::Options, + expr: &Expression, + f: &mut Formatter<'_>, + ) -> std::fmt::Result { + write!(f, "uuid_from_string(")?; + expr.child(0).fmt_sql(f)?; + write!(f, ")") + } + + fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult { + debug_assert_eq!(arg_dtypes.len(), 1); + + let input = &arg_dtypes[0]; + vortex_ensure!( + input.is_utf8(), + "uuid_from_string requires a Utf8 input, got {input}" + ); + + let nullability = input.nullability(); + + let storage_dtype = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), + UUID_BYTE_LEN as u32, + nullability, + ); + + let ext_dtype = ExtDType::::try_new(UuidMetadata, storage_dtype)?.erased(); + + Ok(DType::Extension(ext_dtype)) + } + + fn execute( + &self, + _options: &Self::Options, + args: &dyn ExecutionArgs, + _ctx: &mut ExecutionCtx, + ) -> VortexResult { + let input = args.get(0)?; + let row_count = args.row_count(); + + let varbinview = input + .to_canonical() + .map_err(|e| vortex_err!("uuid_from_string: failed to canonicalize input: {e}"))? + .into_varbinview(); + + let validity = varbinview.validity()?; + + let mut bytes = vec![0u8; row_count * UUID_BYTE_LEN]; + + for i in 0..row_count { + if !validity.is_valid(i)? { + continue; + } + + let str_bytes = varbinview.bytes_at(i); + let s = std::str::from_utf8(&str_bytes) + .map_err(|e| vortex_err!("uuid_from_string: invalid UTF-8 at row {i}: {e}"))?; + + let parsed = uuid::Uuid::parse_str(s) + .map_err(|e| vortex_err!("uuid_from_string: invalid UUID at row {i}: {e}"))?; + + bytes[i * UUID_BYTE_LEN..(i + 1) * UUID_BYTE_LEN].copy_from_slice(parsed.as_bytes()); + } + + // Build the flat u8 elements array. + let elements: ArrayRef = Buffer::copy_from(&bytes).into_array(); + + // Wrap in FixedSizeList and Extension. + let fsl = FixedSizeListArray::new(elements, UUID_BYTE_LEN as u32, validity, row_count); + let ext_dtype = ExtDType::::try_new(UuidMetadata, fsl.dtype().clone())?.erased(); + + Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) + } + + fn validity( + &self, + _options: &Self::Options, + expression: &Expression, + ) -> VortexResult> { + // Output validity is the same as the input. + Ok(Some(expression.child(0).validity()?)) + } + + fn is_null_sensitive(&self, _options: &Self::Options) -> bool { + false + } + + fn is_fallible(&self, _options: &Self::Options) -> bool { + // Invalid UUID strings cause errors. + true + } +} + +#[cfg(test)] +mod tests { + use vortex_error::VortexResult; + + use crate::ArrayRef; + use crate::DynArray; + use crate::IntoArray; + use crate::ToCanonical; + use crate::arrays::ScalarFnArray; + use crate::arrays::VarBinViewArray; + use crate::dtype::DType; + use crate::dtype::Nullability; + use crate::dtype::PType; + use crate::dtype::extension::ExtVTable; + use crate::extension::uuid::Uuid; + use crate::extension::uuid::UuidMetadata; + use crate::extension::uuid::vtable::UUID_BYTE_LEN; + use crate::scalar_fn::EmptyOptions; + use crate::scalar_fn::ScalarFn; + use crate::scalar_fn::fns::uuid_from_string::UuidFromString; + + /// Builds a string array from the given values, with nullable support. + fn string_array(values: &[Option<&str>]) -> ArrayRef { + VarBinViewArray::from_iter_nullable_str(values.iter().copied()).into_array() + } + + /// Evaluates `uuid_from_string` and returns the resulting extension array. + fn eval_uuid_from_string(input: ArrayRef, len: usize) -> VortexResult { + let scalar_fn = ScalarFn::new(UuidFromString, EmptyOptions).erased(); + let result = ScalarFnArray::try_new(scalar_fn, vec![input], len)?; + result.to_canonical().map(|c| c.into_array()) + } + + /// Extracts the flat u8 bytes from a UUID extension array. + fn extract_uuid_bytes(array: &ArrayRef) -> Vec { + let ext = array.to_extension(); + let fsl = ext.storage().to_fixed_size_list(); + let prim = fsl.elements().to_primitive(); + prim.as_slice::().to_vec() + } + + #[test] + fn parse_single_uuid() -> VortexResult<()> { + let input = string_array(&[Some("550e8400-e29b-41d4-a716-446655440000")]); + let result = eval_uuid_from_string(input, 1)?; + + let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + + let bytes = extract_uuid_bytes(&result); + assert_eq!(&bytes, expected.as_bytes()); + Ok(()) + } + + #[test] + fn parse_multiple_uuids() -> VortexResult<()> { + let uuids = [ + "550e8400-e29b-41d4-a716-446655440000", + "6ba7b810-9dad-11d1-80b4-00c04fd430c8", + "f47ac10b-58cc-4372-a567-0e02b2c3d479", + ]; + let input = string_array(&uuids.iter().map(|s| Some(*s)).collect::>()); + let result = eval_uuid_from_string(input, 3)?; + + let bytes = extract_uuid_bytes(&result); + for (i, uuid_str) in uuids.iter().enumerate() { + let expected = + uuid::Uuid::parse_str(uuid_str).map_err(|e| vortex_error::vortex_err!("{e}"))?; + assert_eq!(&bytes[i * 16..(i + 1) * 16], expected.as_bytes()); + } + Ok(()) + } + + #[test] + fn parse_invalid_uuid_errors() { + let input = string_array(&[Some("not-a-uuid")]); + let result = eval_uuid_from_string(input, 1); + assert!(result.is_err()); + } + + #[test] + fn parse_null_input_produces_null() -> VortexResult<()> { + let input = string_array(&[ + Some("550e8400-e29b-41d4-a716-446655440000"), + None, + Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), + ]); + let result = eval_uuid_from_string(input, 3)?; + + // Row 1 should be null. + assert!(result.is_valid(0)?); + assert!(result.is_invalid(1)?); + assert!(result.is_valid(2)?); + Ok(()) + } + + #[expect( + clippy::cast_possible_truncation, + reason = "UUID_BYTE_LEN always fits both usize and u32" + )] + #[test] + fn storage_array_structure() -> VortexResult<()> { + // Note that this test assumes that the storage type is a `FixedSizeList`. That will likely + // change in the future. + + let input = string_array(&[ + Some("550e8400-e29b-41d4-a716-446655440000"), + None, + Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), + ]); + let result = eval_uuid_from_string(input, 3)?; + + // The result should be an extension array. + let ext = result.to_extension(); + assert_eq!(ext.ext_dtype().id().as_ref(), "vortex.uuid"); + assert_eq!(ext.len(), 3); + + // The storage should be a FixedSizeList of u8 with size 16. + let fsl = ext.storage().to_fixed_size_list(); + assert_eq!(fsl.len(), 3); + assert_eq!(fsl.list_size(), UUID_BYTE_LEN as u32); + + // The elements should be a flat u8 primitive array of length 3 * 16 = 48. + let prim = fsl.elements().to_primitive(); + assert_eq!(prim.len(), 3 * UUID_BYTE_LEN); + assert_eq!( + prim.dtype(), + &DType::Primitive(PType::U8, Nullability::NonNullable) + ); + + // Validity on the FSL should match the input: valid, null, valid. + assert!(fsl.is_valid(0)?); + assert!(fsl.is_invalid(1)?); + assert!(fsl.is_valid(2)?); + + // Verify the byte content of the two valid UUIDs. + let bytes = prim.as_slice::(); + let expected_0 = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + let expected_2 = uuid::Uuid::parse_str("6ba7b810-9dad-11d1-80b4-00c04fd430c8") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + assert_eq!(&bytes[0..UUID_BYTE_LEN], expected_0.as_bytes()); + assert_eq!( + &bytes[2 * UUID_BYTE_LEN..3 * UUID_BYTE_LEN], + expected_2.as_bytes() + ); + + Ok(()) + } + + #[test] + fn unpack_native_from_parsed() -> VortexResult<()> { + let input = string_array(&[Some("550e8400-e29b-41d4-a716-446655440000")]); + let result = eval_uuid_from_string(input, 1)?; + + let scalar = result.scalar_at(0)?; + let ext_scalar = scalar.as_extension(); + let storage_scalar = ext_scalar.to_storage_scalar(); + let storage_value = storage_scalar + .value() + .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; + + let native = Uuid.unpack_native( + &UuidMetadata, + ext_scalar.ext_dtype().storage_dtype(), + storage_value, + )?; + assert_eq!(native.to_string(), "550e8400-e29b-41d4-a716-446655440000"); + Ok(()) + } +} From 48c165f6d9d1d97ad98ebf14a2227232d2f9be18 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 6 Mar 2026 15:04:18 -0500 Subject: [PATCH 2/4] remove UuidMetadata Signed-off-by: Connor Tsui --- vortex-array/src/extension/uuid/metadata.rs | 14 ----------- vortex-array/src/extension/uuid/mod.rs | 3 --- vortex-array/src/extension/uuid/vtable.rs | 24 +++++++++---------- .../src/scalar_fn/fns/uuid_from_string.rs | 10 ++++---- 4 files changed, 17 insertions(+), 34 deletions(-) delete mode 100644 vortex-array/src/extension/uuid/metadata.rs diff --git a/vortex-array/src/extension/uuid/metadata.rs b/vortex-array/src/extension/uuid/metadata.rs deleted file mode 100644 index 133646eba41..00000000000 --- a/vortex-array/src/extension/uuid/metadata.rs +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::fmt; - -/// Metadata for the UUID extension type, which is empty. -#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] -pub struct UuidMetadata; - -impl fmt::Display for UuidMetadata { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "UUID") - } -} diff --git a/vortex-array/src/extension/uuid/mod.rs b/vortex-array/src/extension/uuid/mod.rs index e4347c2513c..6df102ce852 100644 --- a/vortex-array/src/extension/uuid/mod.rs +++ b/vortex-array/src/extension/uuid/mod.rs @@ -10,9 +10,6 @@ //! [RFC 4122]: https://www.rfc-editor.org/rfc/rfc4122 //! [canonical UUID extension]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid -mod metadata; -pub use metadata::UuidMetadata; - pub(crate) mod vtable; /// The VTable for the UUID extension type. diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs index 9ea35dac217..9fbfff125e1 100644 --- a/vortex-array/src/extension/uuid/vtable.rs +++ b/vortex-array/src/extension/uuid/vtable.rs @@ -11,8 +11,8 @@ use crate::dtype::DType; use crate::dtype::PType; use crate::dtype::extension::ExtId; use crate::dtype::extension::ExtVTable; +use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; -use crate::extension::uuid::UuidMetadata; use crate::scalar::PValue; use crate::scalar::ScalarValue; @@ -20,7 +20,7 @@ use crate::scalar::ScalarValue; pub(crate) const UUID_BYTE_LEN: usize = 16; impl ExtVTable for Uuid { - type Metadata = UuidMetadata; + type Metadata = EmptyMetadata; type NativeValue<'a> = uuid::Uuid; fn id(&self) -> ExtId { @@ -32,7 +32,7 @@ impl ExtVTable for Uuid { } fn deserialize_metadata(&self, _metadata: &[u8]) -> VortexResult { - Ok(UuidMetadata) + Ok(EmptyMetadata) } fn validate_dtype( @@ -111,17 +111,17 @@ mod tests { use crate::dtype::Nullability; use crate::dtype::PType; use crate::dtype::extension::ExtVTable; + use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; - use crate::extension::uuid::UuidMetadata; use crate::extension::uuid::vtable::UUID_BYTE_LEN; use crate::scalar::Scalar; #[test] fn roundtrip_metadata() -> VortexResult<()> { let vtable = Uuid; - let bytes = vtable.serialize_metadata(&UuidMetadata)?; + let bytes = vtable.serialize_metadata(&EmptyMetadata)?; let deserialized = vtable.deserialize_metadata(&bytes)?; - assert_eq!(deserialized, UuidMetadata); + assert_eq!(deserialized, EmptyMetadata); Ok(()) } @@ -130,7 +130,7 @@ mod tests { #[case::nullable(Nullability::Nullable)] fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> { let storage_dtype = uuid_storage_dtype(nullability); - Uuid.validate_dtype(&UuidMetadata, &storage_dtype) + Uuid.validate_dtype(&EmptyMetadata, &storage_dtype) } #[test] @@ -140,7 +140,7 @@ mod tests { 8, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); } #[test] @@ -150,7 +150,7 @@ mod tests { UUID_BYTE_LEN as u32, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); } #[test] @@ -160,13 +160,13 @@ mod tests { UUID_BYTE_LEN as u32, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); } #[test] fn validate_rejects_non_fsl() { let storage_dtype = DType::Primitive(PType::U8, Nullability::NonNullable); - assert!(Uuid.validate_dtype(&UuidMetadata, &storage_dtype).is_err()); + assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); } #[test] @@ -189,7 +189,7 @@ mod tests { let storage_value = storage_scalar .value() .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; - let result = Uuid.unpack_native(&UuidMetadata, &storage_dtype, storage_value)?; + let result = Uuid.unpack_native(&EmptyMetadata, &storage_dtype, storage_value)?; assert_eq!(result, expected); assert_eq!(result.to_string(), "550e8400-e29b-41d4-a716-446655440000"); Ok(()) diff --git a/vortex-array/src/scalar_fn/fns/uuid_from_string.rs b/vortex-array/src/scalar_fn/fns/uuid_from_string.rs index 08fbffaf20f..1d198a2d3e1 100644 --- a/vortex-array/src/scalar_fn/fns/uuid_from_string.rs +++ b/vortex-array/src/scalar_fn/fns/uuid_from_string.rs @@ -23,8 +23,8 @@ use crate::dtype::Nullability; use crate::dtype::PType; use crate::dtype::extension::ExtDType; use crate::expr::Expression; +use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; -use crate::extension::uuid::UuidMetadata; use crate::extension::uuid::vtable::UUID_BYTE_LEN; use crate::scalar_fn::Arity; use crate::scalar_fn::ChildName; @@ -90,7 +90,7 @@ impl ScalarFnVTable for UuidFromString { nullability, ); - let ext_dtype = ExtDType::::try_new(UuidMetadata, storage_dtype)?.erased(); + let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage_dtype)?.erased(); Ok(DType::Extension(ext_dtype)) } @@ -133,7 +133,7 @@ impl ScalarFnVTable for UuidFromString { // Wrap in FixedSizeList and Extension. let fsl = FixedSizeListArray::new(elements, UUID_BYTE_LEN as u32, validity, row_count); - let ext_dtype = ExtDType::::try_new(UuidMetadata, fsl.dtype().clone())?.erased(); + let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) } @@ -171,8 +171,8 @@ mod tests { use crate::dtype::Nullability; use crate::dtype::PType; use crate::dtype::extension::ExtVTable; + use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; - use crate::extension::uuid::UuidMetadata; use crate::extension::uuid::vtable::UUID_BYTE_LEN; use crate::scalar_fn::EmptyOptions; use crate::scalar_fn::ScalarFn; @@ -320,7 +320,7 @@ mod tests { .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; let native = Uuid.unpack_native( - &UuidMetadata, + &EmptyMetadata, ext_scalar.ext_dtype().storage_dtype(), storage_value, )?; From f1cdeef7dd62aaf6ec75cd6fbaeec41e6428d4a0 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 6 Mar 2026 16:43:44 -0500 Subject: [PATCH 3/4] address comments Signed-off-by: Connor Tsui --- vortex-array/public-api.lock | 108 +----- vortex-array/src/extension/uuid/metadata.rs | 70 ++++ vortex-array/src/extension/uuid/mod.rs | 3 + vortex-array/src/extension/uuid/vtable.rs | 192 ++++++++-- vortex-array/src/scalar_fn/fns/mod.rs | 1 - .../src/scalar_fn/fns/uuid_from_string.rs | 330 ------------------ 6 files changed, 258 insertions(+), 446 deletions(-) create mode 100644 vortex-array/src/extension/uuid/metadata.rs delete mode 100644 vortex-array/src/scalar_fn/fns/uuid_from_string.rs diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 05d01075f47..8f2a8705104 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -7044,13 +7044,13 @@ pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid -pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId -pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult> +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> -pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, _metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult +pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> @@ -11054,13 +11054,13 @@ pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid -pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId -pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult> +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> -pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, _metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult +pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> @@ -11068,6 +11068,12 @@ pub fn vortex_array::extension::uuid::Uuid::validate_scalar_value(&self, metadat pub struct vortex_array::extension::uuid::UuidMetadata +pub vortex_array::extension::uuid::UuidMetadata::version: core::option::Option + +impl vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::any() -> Self + impl core::clone::Clone for vortex_array::extension::uuid::UuidMetadata pub fn vortex_array::extension::uuid::UuidMetadata::clone(&self) -> vortex_array::extension::uuid::UuidMetadata @@ -11076,11 +11082,7 @@ impl core::cmp::Eq for vortex_array::extension::uuid::UuidMetadata impl core::cmp::PartialEq for vortex_array::extension::uuid::UuidMetadata -pub fn vortex_array::extension::uuid::UuidMetadata::eq(&self, other: &vortex_array::extension::uuid::UuidMetadata) -> bool - -impl core::default::Default for vortex_array::extension::uuid::UuidMetadata - -pub fn vortex_array::extension::uuid::UuidMetadata::default() -> vortex_array::extension::uuid::UuidMetadata +pub fn vortex_array::extension::uuid::UuidMetadata::eq(&self, other: &Self) -> bool impl core::fmt::Debug for vortex_array::extension::uuid::UuidMetadata @@ -11092,9 +11094,7 @@ pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt impl core::hash::Hash for vortex_array::extension::uuid::UuidMetadata -pub fn vortex_array::extension::uuid::UuidMetadata::hash<__H: core::hash::Hasher>(&self, state: &mut __H) - -impl core::marker::StructuralPartialEq for vortex_array::extension::uuid::UuidMetadata +pub fn vortex_array::extension::uuid::UuidMetadata::hash(&self, state: &mut H) pub struct vortex_array::extension::EmptyMetadata @@ -15192,50 +15192,6 @@ pub fn vortex_array::scalar_fn::fns::select::Select::stat_falsification(&self, o pub fn vortex_array::scalar_fn::fns::select::Select::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> -pub mod vortex_array::scalar_fn::fns::uuid_from_string - -pub struct vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString - -impl core::clone::Clone for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::clone(&self) -> vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString - -impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString - -pub type vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::Options = vortex_array::scalar_fn::EmptyOptions - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::child_name(&self, _options: &Self::Options, child_idx: usize) -> vortex_array::scalar_fn::ChildName - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::execute(&self, _options: &Self::Options, args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::fmt_sql(&self, _options: &Self::Options, expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::id(&self) -> vortex_array::scalar_fn::ScalarFnId - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_fallible(&self, _options: &Self::Options) -> bool - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_null_sensitive(&self, _options: &Self::Options) -> bool - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::return_dtype(&self, _options: &Self::Options, arg_dtypes: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult>> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::validity(&self, _options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> - pub mod vortex_array::scalar_fn::fns::zip pub struct vortex_array::scalar_fn::fns::zip::Zip @@ -16264,42 +16220,6 @@ pub fn vortex_array::scalar_fn::fns::select::Select::stat_falsification(&self, o pub fn vortex_array::scalar_fn::fns::select::Select::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> -impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString - -pub type vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::Options = vortex_array::scalar_fn::EmptyOptions - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::child_name(&self, _options: &Self::Options, child_idx: usize) -> vortex_array::scalar_fn::ChildName - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::execute(&self, _options: &Self::Options, args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::fmt_sql(&self, _options: &Self::Options, expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::id(&self) -> vortex_array::scalar_fn::ScalarFnId - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_fallible(&self, _options: &Self::Options) -> bool - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::is_null_sensitive(&self, _options: &Self::Options) -> bool - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::return_dtype(&self, _options: &Self::Options, arg_dtypes: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult>> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult> - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option - -pub fn vortex_array::scalar_fn::fns::uuid_from_string::UuidFromString::validity(&self, _options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult> - impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::fns::zip::Zip pub type vortex_array::scalar_fn::fns::zip::Zip::Options = vortex_array::scalar_fn::EmptyOptions diff --git a/vortex-array/src/extension/uuid/metadata.rs b/vortex-array/src/extension/uuid/metadata.rs new file mode 100644 index 00000000000..812214985cc --- /dev/null +++ b/vortex-array/src/extension/uuid/metadata.rs @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::hash::Hash; +use std::hash::Hasher; + +use uuid::Version; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +/// Converts a `u8` discriminant back to a [`uuid::Version`]. +pub(crate) fn u8_to_version(b: u8) -> VortexResult { + match b { + 0 => Ok(Version::Nil), + 1 => Ok(Version::Mac), + 2 => Ok(Version::Dce), + 3 => Ok(Version::Md5), + 4 => Ok(Version::Random), + 5 => Ok(Version::Sha1), + 6 => Ok(Version::SortMac), + 7 => Ok(Version::SortRand), + 8 => Ok(Version::Custom), + 0xff => Ok(Version::Max), + _ => vortex_bail!("unknown UUID version discriminant: {b}"), + } +} + +/// Metadata for the UUID extension type. +/// +/// Optionally records which UUID version the column contains (e.g. v4 random, v7 +/// sort-random). When `None`, the column may contain any mix of versions. +#[derive(Clone, Debug)] +pub struct UuidMetadata { + /// The UUID version, if known. + pub version: Option, +} + +impl UuidMetadata { + /// Creates metadata with no version constraint. + pub fn any() -> Self { + Self { version: None } + } +} + +impl fmt::Display for UuidMetadata { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.version { + None => write!(f, "UUID"), + Some(v) => write!(f, "UUID(v{})", v as u8), + } + } +} + +// `uuid::Version` derives `PartialEq` but not `Eq` or `Hash`, so we implement these +// manually using the `#[repr(u8)]` discriminant. + +impl PartialEq for UuidMetadata { + fn eq(&self, other: &Self) -> bool { + self.version.map(|v| v as u8) == other.version.map(|v| v as u8) + } +} + +impl Eq for UuidMetadata {} + +impl Hash for UuidMetadata { + fn hash(&self, state: &mut H) { + self.version.map(|v| v as u8).hash(state); + } +} diff --git a/vortex-array/src/extension/uuid/mod.rs b/vortex-array/src/extension/uuid/mod.rs index 6df102ce852..e4347c2513c 100644 --- a/vortex-array/src/extension/uuid/mod.rs +++ b/vortex-array/src/extension/uuid/mod.rs @@ -10,6 +10,9 @@ //! [RFC 4122]: https://www.rfc-editor.org/rfc/rfc4122 //! [canonical UUID extension]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid +mod metadata; +pub use metadata::UuidMetadata; + pub(crate) mod vtable; /// The VTable for the UUID extension type. diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs index 9fbfff125e1..87ffac83f5b 100644 --- a/vortex-array/src/extension/uuid/vtable.rs +++ b/vortex-array/src/extension/uuid/vtable.rs @@ -6,13 +6,15 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_ensure_eq; +use vortex_error::vortex_err; use crate::dtype::DType; use crate::dtype::PType; use crate::dtype::extension::ExtId; use crate::dtype::extension::ExtVTable; -use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; +use crate::extension::uuid::UuidMetadata; +use crate::extension::uuid::metadata::u8_to_version; use crate::scalar::PValue; use crate::scalar::ScalarValue; @@ -20,19 +22,28 @@ use crate::scalar::ScalarValue; pub(crate) const UUID_BYTE_LEN: usize = 16; impl ExtVTable for Uuid { - type Metadata = EmptyMetadata; + type Metadata = UuidMetadata; type NativeValue<'a> = uuid::Uuid; fn id(&self) -> ExtId { ExtId::new_ref("vortex.uuid") } - fn serialize_metadata(&self, _metadata: &Self::Metadata) -> VortexResult> { - Ok(Vec::new()) + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + match metadata.version { + None => Ok(Vec::new()), + Some(v) => Ok(vec![v as u8]), + } } - fn deserialize_metadata(&self, _metadata: &[u8]) -> VortexResult { - Ok(EmptyMetadata) + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + let version = match metadata.len() { + 0 => None, + 1 => Some(u8_to_version(metadata[0])?), + other => vortex_bail!("UUID metadata must be 0 or 1 bytes, got {other}"), + }; + + Ok(UuidMetadata { version }) } fn validate_dtype( @@ -69,7 +80,7 @@ impl ExtVTable for Uuid { fn unpack_native<'a>( &self, - _metadata: &'a Self::Metadata, + metadata: &'a Self::Metadata, _storage_dtype: &'a DType, storage_value: &'a ScalarValue, ) -> VortexResult> { @@ -92,7 +103,24 @@ impl ExtVTable for Uuid { bytes[i] = *b; } - Ok(uuid::Uuid::from_bytes(bytes)) + let parsed = uuid::Uuid::from_bytes(bytes); + + // Verify the parsed UUID matches the expected version, if one is set. + if let Some(expected) = metadata.version { + let expected = expected as u8; + let actual = parsed + .get_version() + .ok_or_else(|| vortex_err!("UUID has unrecognized version nibble"))? + as u8; + + vortex_ensure_eq!( + expected, + actual, + "UUID version mismatch: expected v{expected}, got v{actual}", + ); + } + + Ok(parsed) } } @@ -105,32 +133,61 @@ mod tests { use std::sync::Arc; use rstest::rstest; + use uuid::Version; use vortex_error::VortexResult; use crate::dtype::DType; use crate::dtype::Nullability; use crate::dtype::PType; use crate::dtype::extension::ExtVTable; - use crate::extension::EmptyMetadata; use crate::extension::uuid::Uuid; + use crate::extension::uuid::UuidMetadata; use crate::extension::uuid::vtable::UUID_BYTE_LEN; use crate::scalar::Scalar; + use crate::scalar::ScalarValue; - #[test] - fn roundtrip_metadata() -> VortexResult<()> { - let vtable = Uuid; - let bytes = vtable.serialize_metadata(&EmptyMetadata)?; - let deserialized = vtable.deserialize_metadata(&bytes)?; - assert_eq!(deserialized, EmptyMetadata); + #[rstest] + #[case::no_version(None)] + #[case::v4_random(Some(Version::Random))] + #[case::v7_sort_rand(Some(Version::SortRand))] + #[case::nil(Some(Version::Nil))] + #[case::max(Some(Version::Max))] + fn roundtrip_metadata(#[case] version: Option) -> VortexResult<()> { + let metadata = UuidMetadata { version }; + let bytes = Uuid.serialize_metadata(&metadata)?; + let expected_len = if version.is_none() { 0 } else { 1 }; + assert_eq!(bytes.len(), expected_len); + let deserialized = Uuid.deserialize_metadata(&bytes)?; + assert_eq!(deserialized, metadata); Ok(()) } + #[test] + fn metadata_display_no_version() { + let metadata = UuidMetadata { version: None }; + assert_eq!(metadata.to_string(), "UUID"); + } + + #[test] + fn metadata_display_with_version() { + let metadata = UuidMetadata { + version: Some(Version::Random), + }; + assert_eq!(metadata.to_string(), "UUID(v4)"); + + let metadata = UuidMetadata { + version: Some(Version::SortRand), + }; + assert_eq!(metadata.to_string(), "UUID(v7)"); + } + #[rstest] #[case::non_nullable(Nullability::NonNullable)] #[case::nullable(Nullability::Nullable)] fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> { + let metadata = UuidMetadata::any(); let storage_dtype = uuid_storage_dtype(nullability); - Uuid.validate_dtype(&EmptyMetadata, &storage_dtype) + Uuid.validate_dtype(&metadata, &storage_dtype) } #[test] @@ -140,7 +197,10 @@ mod tests { 8, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); + assert!( + Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + .is_err() + ); } #[test] @@ -150,7 +210,10 @@ mod tests { UUID_BYTE_LEN as u32, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); + assert!( + Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + .is_err() + ); } #[test] @@ -160,13 +223,19 @@ mod tests { UUID_BYTE_LEN as u32, Nullability::NonNullable, ); - assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); + assert!( + Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + .is_err() + ); } #[test] fn validate_rejects_non_fsl() { let storage_dtype = DType::Primitive(PType::U8, Nullability::NonNullable); - assert!(Uuid.validate_dtype(&EmptyMetadata, &storage_dtype).is_err()); + assert!( + Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + .is_err() + ); } #[test] @@ -174,6 +243,7 @@ mod tests { let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; + let metadata = UuidMetadata::any(); let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); let children: Vec = expected .as_bytes() @@ -189,12 +259,92 @@ mod tests { let storage_value = storage_scalar .value() .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; - let result = Uuid.unpack_native(&EmptyMetadata, &storage_dtype, storage_value)?; + let result = Uuid.unpack_native(&metadata, &storage_dtype, storage_value)?; assert_eq!(result, expected); assert_eq!(result.to_string(), "550e8400-e29b-41d4-a716-446655440000"); Ok(()) } + #[test] + fn unpack_native_rejects_version_mismatch() -> VortexResult<()> { + // This is a v4 UUID. + let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + assert_eq!(v4_uuid.get_version(), Some(Version::Random)); + + // Metadata says v7, but the UUID is v4. + let metadata = UuidMetadata { + version: Some(Version::SortRand), + }; + let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); + let children: Vec = v4_uuid + .as_bytes() + .iter() + .map(|&b| Scalar::primitive(b, Nullability::NonNullable)) + .collect(); + let storage_scalar = Scalar::fixed_size_list( + DType::Primitive(PType::U8, Nullability::NonNullable), + children, + Nullability::NonNullable, + ); + + let storage_value = storage_scalar + .value() + .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; + assert!( + Uuid.unpack_native(&metadata, &storage_dtype, storage_value) + .is_err() + ); + Ok(()) + } + + /// Builds a [`ScalarValue`] for a UUID's 16 bytes, suitable for passing to `unpack_native`. + fn uuid_storage_scalar(uuid: &uuid::Uuid) -> ScalarValue { + let children: Vec = uuid + .as_bytes() + .iter() + .map(|&b| Scalar::primitive(b, Nullability::NonNullable)) + .collect(); + let scalar = Scalar::fixed_size_list( + DType::Primitive(PType::U8, Nullability::NonNullable), + children, + Nullability::NonNullable, + ); + scalar.value().unwrap().clone() + } + + #[test] + fn unpack_native_accepts_matching_version() -> VortexResult<()> { + // This is a v4 UUID. + let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + + let metadata = UuidMetadata { + version: Some(Version::Random), + }; + let storage_value = uuid_storage_scalar(&v4_uuid); + let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); + + let result = Uuid.unpack_native(&metadata, &storage_dtype, &storage_value)?; + assert_eq!(result, v4_uuid); + Ok(()) + } + + #[test] + fn unpack_native_any_version_accepts_all() -> VortexResult<()> { + // A v4 UUID should be accepted when metadata has no version constraint. + let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") + .map_err(|e| vortex_error::vortex_err!("{e}"))?; + + let metadata = UuidMetadata::any(); + let storage_value = uuid_storage_scalar(&v4_uuid); + let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); + + let result = Uuid.unpack_native(&metadata, &storage_dtype, &storage_value)?; + assert_eq!(result, v4_uuid); + Ok(()) + } + fn uuid_storage_dtype(nullability: Nullability) -> DType { DType::FixedSizeList( Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), diff --git a/vortex-array/src/scalar_fn/fns/mod.rs b/vortex-array/src/scalar_fn/fns/mod.rs index a3cfd487dfc..94fc8fb0384 100644 --- a/vortex-array/src/scalar_fn/fns/mod.rs +++ b/vortex-array/src/scalar_fn/fns/mod.rs @@ -19,5 +19,4 @@ pub mod operators; pub mod pack; pub mod root; pub mod select; -pub mod uuid_from_string; pub mod zip; diff --git a/vortex-array/src/scalar_fn/fns/uuid_from_string.rs b/vortex-array/src/scalar_fn/fns/uuid_from_string.rs deleted file mode 100644 index 1d198a2d3e1..00000000000 --- a/vortex-array/src/scalar_fn/fns/uuid_from_string.rs +++ /dev/null @@ -1,330 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Scalar function to parse UTF-8 strings into [`Uuid`] extension arrays. - -use std::fmt::Formatter; -use std::sync::Arc; - -use uuid; -use vortex_buffer::Buffer; -use vortex_error::VortexResult; -use vortex_error::vortex_ensure; -use vortex_error::vortex_err; - -use crate::ArrayRef; -use crate::DynArray; -use crate::ExecutionCtx; -use crate::IntoArray; -use crate::arrays::ExtensionArray; -use crate::arrays::FixedSizeListArray; -use crate::dtype::DType; -use crate::dtype::Nullability; -use crate::dtype::PType; -use crate::dtype::extension::ExtDType; -use crate::expr::Expression; -use crate::extension::EmptyMetadata; -use crate::extension::uuid::Uuid; -use crate::extension::uuid::vtable::UUID_BYTE_LEN; -use crate::scalar_fn::Arity; -use crate::scalar_fn::ChildName; -use crate::scalar_fn::EmptyOptions; -use crate::scalar_fn::ExecutionArgs; -use crate::scalar_fn::ScalarFnId; -use crate::scalar_fn::ScalarFnVTable; - -/// Parses a UTF-8 string column into a [`Uuid`] extension array. -/// -/// Accepts any standard UUID string format (hyphenated, simple, braced, URN). Invalid strings -/// cause an error. -#[derive(Clone)] -pub struct UuidFromString; - -#[expect( - clippy::cast_possible_truncation, - reason = "UUID_BYTE_LEN always fits both usize and u32" -)] -impl ScalarFnVTable for UuidFromString { - type Options = EmptyOptions; - - fn id(&self) -> ScalarFnId { - ScalarFnId::new_ref("vortex.uuid_from_string") - } - - fn arity(&self, _options: &Self::Options) -> Arity { - Arity::Exact(1) - } - - fn child_name(&self, _options: &Self::Options, child_idx: usize) -> ChildName { - match child_idx { - 0 => ChildName::from("input"), - _ => unreachable!("uuid_from_string must have exactly one child"), - } - } - - fn fmt_sql( - &self, - _options: &Self::Options, - expr: &Expression, - f: &mut Formatter<'_>, - ) -> std::fmt::Result { - write!(f, "uuid_from_string(")?; - expr.child(0).fmt_sql(f)?; - write!(f, ")") - } - - fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult { - debug_assert_eq!(arg_dtypes.len(), 1); - - let input = &arg_dtypes[0]; - vortex_ensure!( - input.is_utf8(), - "uuid_from_string requires a Utf8 input, got {input}" - ); - - let nullability = input.nullability(); - - let storage_dtype = DType::FixedSizeList( - Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), - UUID_BYTE_LEN as u32, - nullability, - ); - - let ext_dtype = ExtDType::::try_new(EmptyMetadata, storage_dtype)?.erased(); - - Ok(DType::Extension(ext_dtype)) - } - - fn execute( - &self, - _options: &Self::Options, - args: &dyn ExecutionArgs, - _ctx: &mut ExecutionCtx, - ) -> VortexResult { - let input = args.get(0)?; - let row_count = args.row_count(); - - let varbinview = input - .to_canonical() - .map_err(|e| vortex_err!("uuid_from_string: failed to canonicalize input: {e}"))? - .into_varbinview(); - - let validity = varbinview.validity()?; - - let mut bytes = vec![0u8; row_count * UUID_BYTE_LEN]; - - for i in 0..row_count { - if !validity.is_valid(i)? { - continue; - } - - let str_bytes = varbinview.bytes_at(i); - let s = std::str::from_utf8(&str_bytes) - .map_err(|e| vortex_err!("uuid_from_string: invalid UTF-8 at row {i}: {e}"))?; - - let parsed = uuid::Uuid::parse_str(s) - .map_err(|e| vortex_err!("uuid_from_string: invalid UUID at row {i}: {e}"))?; - - bytes[i * UUID_BYTE_LEN..(i + 1) * UUID_BYTE_LEN].copy_from_slice(parsed.as_bytes()); - } - - // Build the flat u8 elements array. - let elements: ArrayRef = Buffer::copy_from(&bytes).into_array(); - - // Wrap in FixedSizeList and Extension. - let fsl = FixedSizeListArray::new(elements, UUID_BYTE_LEN as u32, validity, row_count); - let ext_dtype = ExtDType::::try_new(EmptyMetadata, fsl.dtype().clone())?.erased(); - - Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array()) - } - - fn validity( - &self, - _options: &Self::Options, - expression: &Expression, - ) -> VortexResult> { - // Output validity is the same as the input. - Ok(Some(expression.child(0).validity()?)) - } - - fn is_null_sensitive(&self, _options: &Self::Options) -> bool { - false - } - - fn is_fallible(&self, _options: &Self::Options) -> bool { - // Invalid UUID strings cause errors. - true - } -} - -#[cfg(test)] -mod tests { - use vortex_error::VortexResult; - - use crate::ArrayRef; - use crate::DynArray; - use crate::IntoArray; - use crate::ToCanonical; - use crate::arrays::ScalarFnArray; - use crate::arrays::VarBinViewArray; - use crate::dtype::DType; - use crate::dtype::Nullability; - use crate::dtype::PType; - use crate::dtype::extension::ExtVTable; - use crate::extension::EmptyMetadata; - use crate::extension::uuid::Uuid; - use crate::extension::uuid::vtable::UUID_BYTE_LEN; - use crate::scalar_fn::EmptyOptions; - use crate::scalar_fn::ScalarFn; - use crate::scalar_fn::fns::uuid_from_string::UuidFromString; - - /// Builds a string array from the given values, with nullable support. - fn string_array(values: &[Option<&str>]) -> ArrayRef { - VarBinViewArray::from_iter_nullable_str(values.iter().copied()).into_array() - } - - /// Evaluates `uuid_from_string` and returns the resulting extension array. - fn eval_uuid_from_string(input: ArrayRef, len: usize) -> VortexResult { - let scalar_fn = ScalarFn::new(UuidFromString, EmptyOptions).erased(); - let result = ScalarFnArray::try_new(scalar_fn, vec![input], len)?; - result.to_canonical().map(|c| c.into_array()) - } - - /// Extracts the flat u8 bytes from a UUID extension array. - fn extract_uuid_bytes(array: &ArrayRef) -> Vec { - let ext = array.to_extension(); - let fsl = ext.storage().to_fixed_size_list(); - let prim = fsl.elements().to_primitive(); - prim.as_slice::().to_vec() - } - - #[test] - fn parse_single_uuid() -> VortexResult<()> { - let input = string_array(&[Some("550e8400-e29b-41d4-a716-446655440000")]); - let result = eval_uuid_from_string(input, 1)?; - - let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") - .map_err(|e| vortex_error::vortex_err!("{e}"))?; - - let bytes = extract_uuid_bytes(&result); - assert_eq!(&bytes, expected.as_bytes()); - Ok(()) - } - - #[test] - fn parse_multiple_uuids() -> VortexResult<()> { - let uuids = [ - "550e8400-e29b-41d4-a716-446655440000", - "6ba7b810-9dad-11d1-80b4-00c04fd430c8", - "f47ac10b-58cc-4372-a567-0e02b2c3d479", - ]; - let input = string_array(&uuids.iter().map(|s| Some(*s)).collect::>()); - let result = eval_uuid_from_string(input, 3)?; - - let bytes = extract_uuid_bytes(&result); - for (i, uuid_str) in uuids.iter().enumerate() { - let expected = - uuid::Uuid::parse_str(uuid_str).map_err(|e| vortex_error::vortex_err!("{e}"))?; - assert_eq!(&bytes[i * 16..(i + 1) * 16], expected.as_bytes()); - } - Ok(()) - } - - #[test] - fn parse_invalid_uuid_errors() { - let input = string_array(&[Some("not-a-uuid")]); - let result = eval_uuid_from_string(input, 1); - assert!(result.is_err()); - } - - #[test] - fn parse_null_input_produces_null() -> VortexResult<()> { - let input = string_array(&[ - Some("550e8400-e29b-41d4-a716-446655440000"), - None, - Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), - ]); - let result = eval_uuid_from_string(input, 3)?; - - // Row 1 should be null. - assert!(result.is_valid(0)?); - assert!(result.is_invalid(1)?); - assert!(result.is_valid(2)?); - Ok(()) - } - - #[expect( - clippy::cast_possible_truncation, - reason = "UUID_BYTE_LEN always fits both usize and u32" - )] - #[test] - fn storage_array_structure() -> VortexResult<()> { - // Note that this test assumes that the storage type is a `FixedSizeList`. That will likely - // change in the future. - - let input = string_array(&[ - Some("550e8400-e29b-41d4-a716-446655440000"), - None, - Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), - ]); - let result = eval_uuid_from_string(input, 3)?; - - // The result should be an extension array. - let ext = result.to_extension(); - assert_eq!(ext.ext_dtype().id().as_ref(), "vortex.uuid"); - assert_eq!(ext.len(), 3); - - // The storage should be a FixedSizeList of u8 with size 16. - let fsl = ext.storage().to_fixed_size_list(); - assert_eq!(fsl.len(), 3); - assert_eq!(fsl.list_size(), UUID_BYTE_LEN as u32); - - // The elements should be a flat u8 primitive array of length 3 * 16 = 48. - let prim = fsl.elements().to_primitive(); - assert_eq!(prim.len(), 3 * UUID_BYTE_LEN); - assert_eq!( - prim.dtype(), - &DType::Primitive(PType::U8, Nullability::NonNullable) - ); - - // Validity on the FSL should match the input: valid, null, valid. - assert!(fsl.is_valid(0)?); - assert!(fsl.is_invalid(1)?); - assert!(fsl.is_valid(2)?); - - // Verify the byte content of the two valid UUIDs. - let bytes = prim.as_slice::(); - let expected_0 = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") - .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let expected_2 = uuid::Uuid::parse_str("6ba7b810-9dad-11d1-80b4-00c04fd430c8") - .map_err(|e| vortex_error::vortex_err!("{e}"))?; - assert_eq!(&bytes[0..UUID_BYTE_LEN], expected_0.as_bytes()); - assert_eq!( - &bytes[2 * UUID_BYTE_LEN..3 * UUID_BYTE_LEN], - expected_2.as_bytes() - ); - - Ok(()) - } - - #[test] - fn unpack_native_from_parsed() -> VortexResult<()> { - let input = string_array(&[Some("550e8400-e29b-41d4-a716-446655440000")]); - let result = eval_uuid_from_string(input, 1)?; - - let scalar = result.scalar_at(0)?; - let ext_scalar = scalar.as_extension(); - let storage_scalar = ext_scalar.to_storage_scalar(); - let storage_value = storage_scalar - .value() - .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?; - - let native = Uuid.unpack_native( - &EmptyMetadata, - ext_scalar.ext_dtype().storage_dtype(), - storage_value, - )?; - assert_eq!(native.to_string(), "550e8400-e29b-41d4-a716-446655440000"); - Ok(()) - } -} From 32ccf9f5a212bc138776be117973d32ecbebc30f Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 6 Mar 2026 17:04:31 -0500 Subject: [PATCH 4/4] make default Signed-off-by: Connor Tsui --- vortex-array/public-api.lock | 8 ++++---- vortex-array/src/extension/uuid/metadata.rs | 13 +++---------- vortex-array/src/extension/uuid/vtable.rs | 20 ++++++++++---------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 8f2a8705104..9224b79a23f 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -11070,10 +11070,6 @@ pub struct vortex_array::extension::uuid::UuidMetadata pub vortex_array::extension::uuid::UuidMetadata::version: core::option::Option -impl vortex_array::extension::uuid::UuidMetadata - -pub fn vortex_array::extension::uuid::UuidMetadata::any() -> Self - impl core::clone::Clone for vortex_array::extension::uuid::UuidMetadata pub fn vortex_array::extension::uuid::UuidMetadata::clone(&self) -> vortex_array::extension::uuid::UuidMetadata @@ -11084,6 +11080,10 @@ impl core::cmp::PartialEq for vortex_array::extension::uuid::UuidMetadata pub fn vortex_array::extension::uuid::UuidMetadata::eq(&self, other: &Self) -> bool +impl core::default::Default for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::default() -> vortex_array::extension::uuid::UuidMetadata + impl core::fmt::Debug for vortex_array::extension::uuid::UuidMetadata pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-array/src/extension/uuid/metadata.rs b/vortex-array/src/extension/uuid/metadata.rs index 812214985cc..7e7dd8b16d8 100644 --- a/vortex-array/src/extension/uuid/metadata.rs +++ b/vortex-array/src/extension/uuid/metadata.rs @@ -30,24 +30,17 @@ pub(crate) fn u8_to_version(b: u8) -> VortexResult { /// /// Optionally records which UUID version the column contains (e.g. v4 random, v7 /// sort-random). When `None`, the column may contain any mix of versions. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct UuidMetadata { /// The UUID version, if known. pub version: Option, } -impl UuidMetadata { - /// Creates metadata with no version constraint. - pub fn any() -> Self { - Self { version: None } - } -} - impl fmt::Display for UuidMetadata { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.version { - None => write!(f, "UUID"), - Some(v) => write!(f, "UUID(v{})", v as u8), + None => write!(f, ""), + Some(v) => write!(f, "v{}", v as u8), } } } diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs index 87ffac83f5b..3564d0cf39f 100644 --- a/vortex-array/src/extension/uuid/vtable.rs +++ b/vortex-array/src/extension/uuid/vtable.rs @@ -165,7 +165,7 @@ mod tests { #[test] fn metadata_display_no_version() { let metadata = UuidMetadata { version: None }; - assert_eq!(metadata.to_string(), "UUID"); + assert_eq!(metadata.to_string(), ""); } #[test] @@ -173,19 +173,19 @@ mod tests { let metadata = UuidMetadata { version: Some(Version::Random), }; - assert_eq!(metadata.to_string(), "UUID(v4)"); + assert_eq!(metadata.to_string(), "v4"); let metadata = UuidMetadata { version: Some(Version::SortRand), }; - assert_eq!(metadata.to_string(), "UUID(v7)"); + assert_eq!(metadata.to_string(), "v7"); } #[rstest] #[case::non_nullable(Nullability::NonNullable)] #[case::nullable(Nullability::Nullable)] fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> { - let metadata = UuidMetadata::any(); + let metadata = UuidMetadata::default(); let storage_dtype = uuid_storage_dtype(nullability); Uuid.validate_dtype(&metadata, &storage_dtype) } @@ -198,7 +198,7 @@ mod tests { Nullability::NonNullable, ); assert!( - Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype) .is_err() ); } @@ -211,7 +211,7 @@ mod tests { Nullability::NonNullable, ); assert!( - Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype) .is_err() ); } @@ -224,7 +224,7 @@ mod tests { Nullability::NonNullable, ); assert!( - Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype) .is_err() ); } @@ -233,7 +233,7 @@ mod tests { fn validate_rejects_non_fsl() { let storage_dtype = DType::Primitive(PType::U8, Nullability::NonNullable); assert!( - Uuid.validate_dtype(&UuidMetadata::any(), &storage_dtype) + Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype) .is_err() ); } @@ -243,7 +243,7 @@ mod tests { let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let metadata = UuidMetadata::any(); + let metadata = UuidMetadata::default(); let storage_dtype = uuid_storage_dtype(Nullability::NonNullable); let children: Vec = expected .as_bytes() @@ -336,7 +336,7 @@ mod tests { let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000") .map_err(|e| vortex_error::vortex_err!("{e}"))?; - let metadata = UuidMetadata::any(); + let metadata = UuidMetadata::default(); let storage_value = uuid_storage_scalar(&v4_uuid); let storage_dtype = uuid_storage_dtype(Nullability::NonNullable);