diff --git a/Cargo.lock b/Cargo.lock index 4bd2006c54e..ce332cca287 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9731,6 +9731,7 @@ dependencies = [ "tabled", "termtree", "tracing", + "uuid", "vortex-array", "vortex-buffer", "vortex-error", diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index e915c8e2002..575a41459b9 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -67,6 +67,7 @@ tabled = { workspace = true, optional = true, default-features = false, features ] } termtree = { workspace = true } tracing = { workspace = true } +uuid = { workspace = true } vortex-buffer = { workspace = true, features = ["arrow"] } vortex-error = { workspace = true, features = ["flatbuffers"] } vortex-flatbuffers = { workspace = true, features = ["array", "dtype"] } diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 3faed612fa6..9224b79a23f 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -7038,6 +7038,24 @@ pub fn vortex_array::extension::datetime::Timestamp::validate_dtype(&self, _meta pub fn vortex_array::extension::datetime::Timestamp::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> +impl vortex_array::dtype::extension::ExtVTable for vortex_array::extension::uuid::Uuid + +pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension::uuid::UuidMetadata + +pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId + +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> + +pub fn 
vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> + +pub fn vortex_array::extension::uuid::Uuid::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> + pub trait vortex_array::dtype::extension::Matcher pub type vortex_array::dtype::extension::Matcher::Match<'a> @@ -11002,6 +11020,82 @@ pub fn vortex_array::extension::datetime::TimestampOptions::hash<__H: core::hash impl core::marker::StructuralPartialEq for vortex_array::extension::datetime::TimestampOptions +pub mod vortex_array::extension::uuid + +pub struct vortex_array::extension::uuid::Uuid + +impl core::clone::Clone for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::clone(&self) -> vortex_array::extension::uuid::Uuid + +impl core::cmp::Eq for vortex_array::extension::uuid::Uuid + +impl core::cmp::PartialEq for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::eq(&self, other: &vortex_array::extension::uuid::Uuid) -> bool + +impl core::default::Default for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::default() -> vortex_array::extension::uuid::Uuid + +impl core::fmt::Debug for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_array::extension::uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::StructuralPartialEq for 
vortex_array::extension::uuid::Uuid + +impl vortex_array::dtype::extension::ExtVTable for vortex_array::extension::uuid::Uuid + +pub type vortex_array::extension::uuid::Uuid::Metadata = vortex_array::extension::uuid::UuidMetadata + +pub type vortex_array::extension::uuid::Uuid::NativeValue<'a> = uuid::Uuid + +pub fn vortex_array::extension::uuid::Uuid::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::id(&self) -> vortex_array::dtype::extension::ExtId + +pub fn vortex_array::extension::uuid::Uuid::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> + +pub fn vortex_array::extension::uuid::Uuid::unpack_native<'a>(&self, metadata: &'a Self::Metadata, _storage_dtype: &'a vortex_array::dtype::DType, storage_value: &'a vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult + +pub fn vortex_array::extension::uuid::Uuid::validate_dtype(&self, _metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<()> + +pub fn vortex_array::extension::uuid::Uuid::validate_scalar_value(&self, metadata: &Self::Metadata, storage_dtype: &vortex_array::dtype::DType, storage_value: &vortex_array::scalar::ScalarValue) -> vortex_error::VortexResult<()> + +pub struct vortex_array::extension::uuid::UuidMetadata + +pub vortex_array::extension::uuid::UuidMetadata::version: core::option::Option + +impl core::clone::Clone for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::clone(&self) -> vortex_array::extension::uuid::UuidMetadata + +impl core::cmp::Eq for vortex_array::extension::uuid::UuidMetadata + +impl core::cmp::PartialEq for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::eq(&self, other: &Self) -> bool + +impl core::default::Default for vortex_array::extension::uuid::UuidMetadata + +pub fn 
vortex_array::extension::uuid::UuidMetadata::default() -> vortex_array::extension::uuid::UuidMetadata + +impl core::fmt::Debug for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_array::extension::uuid::UuidMetadata + +pub fn vortex_array::extension::uuid::UuidMetadata::hash(&self, state: &mut H) + pub struct vortex_array::extension::EmptyMetadata impl core::clone::Clone for vortex_array::extension::EmptyMetadata diff --git a/vortex-array/src/extension/mod.rs b/vortex-array/src/extension/mod.rs index 5c29154ded0..9f81e7fb310 100644 --- a/vortex-array/src/extension/mod.rs +++ b/vortex-array/src/extension/mod.rs @@ -6,6 +6,7 @@ use std::fmt; pub mod datetime; +pub mod uuid; #[cfg(test)] mod tests; diff --git a/vortex-array/src/extension/uuid/metadata.rs b/vortex-array/src/extension/uuid/metadata.rs new file mode 100644 index 00000000000..7e7dd8b16d8 --- /dev/null +++ b/vortex-array/src/extension/uuid/metadata.rs @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::hash::Hash; +use std::hash::Hasher; + +use uuid::Version; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +/// Converts a `u8` discriminant back to a [`uuid::Version`]. 
+pub(crate) fn u8_to_version(b: u8) -> VortexResult<Version> {
+    match b {
+        0 => Ok(Version::Nil),
+        1 => Ok(Version::Mac),
+        2 => Ok(Version::Dce),
+        3 => Ok(Version::Md5),
+        4 => Ok(Version::Random),
+        5 => Ok(Version::Sha1),
+        6 => Ok(Version::SortMac),
+        7 => Ok(Version::SortRand),
+        8 => Ok(Version::Custom),
+        0xff => Ok(Version::Max),
+        _ => vortex_bail!("unknown UUID version discriminant: {b}"),
+    }
+}
+
+/// Metadata for the UUID extension type.
+///
+/// Optionally records which UUID version the column contains (e.g. v4 random, v7
+/// sort-random). When `None`, the column may contain any mix of versions.
+#[derive(Clone, Debug, Default)]
+pub struct UuidMetadata {
+    /// The UUID version, if known.
+    pub version: Option<Version>,
+}
+
+impl fmt::Display for UuidMetadata {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.version {
+            None => write!(f, ""),
+            Some(v) => write!(f, "v{}", v as u8),
+        }
+    }
+}
+
+// `uuid::Version` derives `PartialEq` but not `Eq` or `Hash`, so we implement these
+// manually using the `#[repr(u8)]` discriminant.
+
+impl PartialEq for UuidMetadata {
+    fn eq(&self, other: &Self) -> bool {
+        self.version.map(|v| v as u8) == other.version.map(|v| v as u8)
+    }
+}
+
+impl Eq for UuidMetadata {}
+
+impl Hash for UuidMetadata {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.version.map(|v| v as u8).hash(state);
+    }
+}
diff --git a/vortex-array/src/extension/uuid/mod.rs b/vortex-array/src/extension/uuid/mod.rs
new file mode 100644
index 00000000000..e4347c2513c
--- /dev/null
+++ b/vortex-array/src/extension/uuid/mod.rs
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! UUID extension type for Vortex.
+//!
+//! Provides a UUID extension type backed by `FixedSizeList(Primitive(U8), 16)` storage. Each UUID
+//! is stored as 16 bytes in big-endian (network) byte order, matching [RFC 4122] and Arrow's
+//! [canonical UUID extension].
+//!
+//! [RFC 4122]: https://www.rfc-editor.org/rfc/rfc4122
+//! [canonical UUID extension]: https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid
+
+mod metadata;
+pub use metadata::UuidMetadata;
+
+pub(crate) mod vtable;
+
+/// The VTable for the UUID extension type.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
+pub struct Uuid;
diff --git a/vortex-array/src/extension/uuid/vtable.rs b/vortex-array/src/extension/uuid/vtable.rs
new file mode 100644
index 00000000000..3564d0cf39f
--- /dev/null
+++ b/vortex-array/src/extension/uuid/vtable.rs
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use uuid;
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_error::vortex_ensure;
+use vortex_error::vortex_ensure_eq;
+use vortex_error::vortex_err;
+
+use crate::dtype::DType;
+use crate::dtype::PType;
+use crate::dtype::extension::ExtId;
+use crate::dtype::extension::ExtVTable;
+use crate::extension::uuid::Uuid;
+use crate::extension::uuid::UuidMetadata;
+use crate::extension::uuid::metadata::u8_to_version;
+use crate::scalar::PValue;
+use crate::scalar::ScalarValue;
+
+/// The number of bytes in a UUID.
+pub(crate) const UUID_BYTE_LEN: usize = 16;
+
+impl ExtVTable for Uuid {
+    type Metadata = UuidMetadata;
+    type NativeValue<'a> = uuid::Uuid;
+
+    fn id(&self) -> ExtId {
+        ExtId::new_ref("vortex.uuid")
+    }
+
+    fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult<Vec<u8>> {
+        match metadata.version {
+            None => Ok(Vec::new()),
+            Some(v) => Ok(vec![v as u8]),
+        }
+    }
+
+    fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult<Self::Metadata> {
+        let version = match metadata.len() {
+            0 => None,
+            1 => Some(u8_to_version(metadata[0])?),
+            other => vortex_bail!("UUID metadata must be 0 or 1 bytes, got {other}"),
+        };
+
+        Ok(UuidMetadata { version })
+    }
+
+    fn validate_dtype(
+        &self,
+        _metadata: &Self::Metadata,
+        storage_dtype: &DType,
+    ) -> VortexResult<()> {
+        let DType::FixedSizeList(element_dtype, list_size, _nullability) = storage_dtype else {
+            vortex_bail!("UUID storage dtype must be a FixedSizeList, got {storage_dtype}");
+        };
+
+        vortex_ensure_eq!(
+            *list_size as usize,
+            UUID_BYTE_LEN,
+            "UUID storage FixedSizeList must have size {UUID_BYTE_LEN}, got {list_size}"
+        );
+
+        let DType::Primitive(ptype, elem_nullability) = element_dtype.as_ref() else {
+            vortex_bail!("UUID element dtype must be Primitive(U8), got {element_dtype}");
+        };
+
+        vortex_ensure_eq!(
+            *ptype,
+            PType::U8,
+            "UUID element dtype must be U8, got {ptype}"
+        );
+        vortex_ensure!(
+            !elem_nullability.is_nullable(),
+            "UUID element dtype must be non-nullable"
+        );
+
+        Ok(())
+    }
+
+    fn unpack_native<'a>(
+        &self,
+        metadata: &'a Self::Metadata,
+        _storage_dtype: &'a DType,
+        storage_value: &'a ScalarValue,
+    ) -> VortexResult<Self::NativeValue<'a>> {
+        let elements = storage_value.as_list();
+        vortex_ensure_eq!(
+            elements.len(),
+            UUID_BYTE_LEN,
+            "UUID scalar must have exactly {UUID_BYTE_LEN} bytes, got {}",
+            elements.len()
+        );
+
+        let mut bytes = [0u8; UUID_BYTE_LEN];
+        for (i, elem) in elements.iter().enumerate() {
+            let Some(scalar_value) = elem else {
+                vortex_bail!("UUID byte at index {i} must not be null");
+            };
+            let PValue::U8(b) = scalar_value.as_primitive() else {
+                vortex_bail!("UUID byte at index {i} must be U8");
+            };
+            bytes[i] = *b;
+        }
+
+        let parsed = uuid::Uuid::from_bytes(bytes);
+
+        // Verify the parsed UUID matches the expected version, if one is set.
+        if let Some(expected) = metadata.version {
+            let expected = expected as u8;
+            let actual = parsed
+                .get_version()
+                .ok_or_else(|| vortex_err!("UUID has unrecognized version nibble"))?
+                as u8;
+
+            vortex_ensure_eq!(
+                expected,
+                actual,
+                "UUID version mismatch: expected v{expected}, got v{actual}",
+            );
+        }
+
+        Ok(parsed)
+    }
+}
+
+#[expect(
+    clippy::cast_possible_truncation,
+    reason = "UUID_BYTE_LEN always fits both usize and u32"
+)]
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use rstest::rstest;
+    use uuid::Version;
+    use vortex_error::VortexResult;
+
+    use crate::dtype::DType;
+    use crate::dtype::Nullability;
+    use crate::dtype::PType;
+    use crate::dtype::extension::ExtVTable;
+    use crate::extension::uuid::Uuid;
+    use crate::extension::uuid::UuidMetadata;
+    use crate::extension::uuid::vtable::UUID_BYTE_LEN;
+    use crate::scalar::Scalar;
+    use crate::scalar::ScalarValue;
+
+    #[rstest]
+    #[case::no_version(None)]
+    #[case::v4_random(Some(Version::Random))]
+    #[case::v7_sort_rand(Some(Version::SortRand))]
+    #[case::nil(Some(Version::Nil))]
+    #[case::max(Some(Version::Max))]
+    fn roundtrip_metadata(#[case] version: Option<Version>) -> VortexResult<()> {
+        let metadata = UuidMetadata { version };
+        let bytes = Uuid.serialize_metadata(&metadata)?;
+        let expected_len = if version.is_none() { 0 } else { 1 };
+        assert_eq!(bytes.len(), expected_len);
+        let deserialized = Uuid.deserialize_metadata(&bytes)?;
+        assert_eq!(deserialized, metadata);
+        Ok(())
+    }
+
+    #[test]
+    fn metadata_display_no_version() {
+        let metadata = UuidMetadata { version: None };
+        assert_eq!(metadata.to_string(), "");
+    }
+
+    #[test]
+    fn metadata_display_with_version() {
+        let metadata = UuidMetadata {
+            version: Some(Version::Random),
+        };
+        assert_eq!(metadata.to_string(), "v4");
+
+        let metadata = UuidMetadata {
+            version: Some(Version::SortRand),
+        };
+        assert_eq!(metadata.to_string(), "v7");
+    }
+
+    #[rstest]
+    #[case::non_nullable(Nullability::NonNullable)]
+    #[case::nullable(Nullability::Nullable)]
+    fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> {
+        let metadata = UuidMetadata::default();
+        let storage_dtype = uuid_storage_dtype(nullability);
+        Uuid.validate_dtype(&metadata, &storage_dtype)
+    }
+
+    #[test]
+    fn validate_rejects_wrong_list_size() {
+        let storage_dtype = DType::FixedSizeList(
+            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
+            8,
+            Nullability::NonNullable,
+        );
+        assert!(
+            Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype)
+                .is_err()
+        );
+    }
+
+    #[test]
+    fn validate_rejects_wrong_element_type() {
+        let storage_dtype = DType::FixedSizeList(
+            Arc::new(DType::Primitive(PType::U64, Nullability::NonNullable)),
+            UUID_BYTE_LEN as u32,
+            Nullability::NonNullable,
+        );
+        assert!(
+            Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype)
+                .is_err()
+        );
+    }
+
+    #[test]
+    fn validate_rejects_nullable_elements() {
+        let storage_dtype = DType::FixedSizeList(
+            Arc::new(DType::Primitive(PType::U8, Nullability::Nullable)),
+            UUID_BYTE_LEN as u32,
+            Nullability::NonNullable,
+        );
+        assert!(
+            Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype)
+                .is_err()
+        );
+    }
+
+    #[test]
+    fn validate_rejects_non_fsl() {
+        let storage_dtype = DType::Primitive(PType::U8, Nullability::NonNullable);
+        assert!(
+            Uuid.validate_dtype(&UuidMetadata::default(), &storage_dtype)
+                .is_err()
+        );
+    }
+
+    #[test]
+    fn unpack_native_uuid() -> VortexResult<()> {
+        let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
+            .map_err(|e| vortex_error::vortex_err!("{e}"))?;
+
+        let metadata = UuidMetadata::default();
+        let storage_dtype = uuid_storage_dtype(Nullability::NonNullable);
+        let children: Vec<Scalar> = expected
+            .as_bytes()
+            .iter()
+            .map(|&b| Scalar::primitive(b, Nullability::NonNullable))
+            .collect();
+        let storage_scalar = Scalar::fixed_size_list(
+            DType::Primitive(PType::U8, Nullability::NonNullable),
+            children,
+            Nullability::NonNullable,
+        );
+
+        let storage_value = storage_scalar
+            .value()
+            .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?;
+        let result = Uuid.unpack_native(&metadata, &storage_dtype, storage_value)?;
+        assert_eq!(result, expected);
+        assert_eq!(result.to_string(), "550e8400-e29b-41d4-a716-446655440000");
+        Ok(())
+    }
+
+    #[test]
+    fn unpack_native_rejects_version_mismatch() -> VortexResult<()> {
+        // This is a v4 UUID.
+        let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
+            .map_err(|e| vortex_error::vortex_err!("{e}"))?;
+        assert_eq!(v4_uuid.get_version(), Some(Version::Random));
+
+        // Metadata says v7, but the UUID is v4.
+        let metadata = UuidMetadata {
+            version: Some(Version::SortRand),
+        };
+        let storage_dtype = uuid_storage_dtype(Nullability::NonNullable);
+        let children: Vec<Scalar> = v4_uuid
+            .as_bytes()
+            .iter()
+            .map(|&b| Scalar::primitive(b, Nullability::NonNullable))
+            .collect();
+        let storage_scalar = Scalar::fixed_size_list(
+            DType::Primitive(PType::U8, Nullability::NonNullable),
+            children,
+            Nullability::NonNullable,
+        );
+
+        let storage_value = storage_scalar
+            .value()
+            .ok_or_else(|| vortex_error::vortex_err!("expected non-null scalar"))?;
+        assert!(
+            Uuid.unpack_native(&metadata, &storage_dtype, storage_value)
+                .is_err()
+        );
+        Ok(())
+    }
+
+    /// Builds a [`ScalarValue`] for a UUID's 16 bytes, suitable for passing to `unpack_native`.
+    fn uuid_storage_scalar(uuid: &uuid::Uuid) -> ScalarValue {
+        let children: Vec<Scalar> = uuid
+            .as_bytes()
+            .iter()
+            .map(|&b| Scalar::primitive(b, Nullability::NonNullable))
+            .collect();
+        let scalar = Scalar::fixed_size_list(
+            DType::Primitive(PType::U8, Nullability::NonNullable),
+            children,
+            Nullability::NonNullable,
+        );
+        scalar.value().unwrap().clone()
+    }
+
+    #[test]
+    fn unpack_native_accepts_matching_version() -> VortexResult<()> {
+        // This is a v4 UUID.
+        let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
+            .map_err(|e| vortex_error::vortex_err!("{e}"))?;
+
+        let metadata = UuidMetadata {
+            version: Some(Version::Random),
+        };
+        let storage_value = uuid_storage_scalar(&v4_uuid);
+        let storage_dtype = uuid_storage_dtype(Nullability::NonNullable);
+
+        let result = Uuid.unpack_native(&metadata, &storage_dtype, &storage_value)?;
+        assert_eq!(result, v4_uuid);
+        Ok(())
+    }
+
+    #[test]
+    fn unpack_native_any_version_accepts_all() -> VortexResult<()> {
+        // A v4 UUID should be accepted when metadata has no version constraint.
+        let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
+            .map_err(|e| vortex_error::vortex_err!("{e}"))?;
+
+        let metadata = UuidMetadata::default();
+        let storage_value = uuid_storage_scalar(&v4_uuid);
+        let storage_dtype = uuid_storage_dtype(Nullability::NonNullable);
+
+        let result = Uuid.unpack_native(&metadata, &storage_dtype, &storage_value)?;
+        assert_eq!(result, v4_uuid);
+        Ok(())
+    }
+
+    fn uuid_storage_dtype(nullability: Nullability) -> DType {
+        DType::FixedSizeList(
+            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
+            UUID_BYTE_LEN as u32,
+            nullability,
+        )
+    }
+}