diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs index 3759739..638259f 100644 --- a/datasketches/src/bloom/sketch.rs +++ b/datasketches/src/bloom/sketch.rs @@ -20,13 +20,11 @@ use std::hash::Hasher; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::hash::XxHash64; // Serialization constants -const PREAMBLE_LONGS_EMPTY: u8 = 3; -const PREAMBLE_LONGS_STANDARD: u8 = 4; -const BLOOM_FAMILY_ID: u8 = 21; // Bloom filter family ID const SERIAL_VERSION: u8 = 1; const EMPTY_FLAG_MASK: u8 = 1 << 2; @@ -353,9 +351,9 @@ impl BloomFilter { pub fn serialize(&self) -> Vec { let is_empty = self.is_empty(); let preamble_longs = if is_empty { - PREAMBLE_LONGS_EMPTY + Family::BLOOMFILTER.min_pre_longs } else { - PREAMBLE_LONGS_STANDARD + Family::BLOOMFILTER.max_pre_longs }; let capacity = 8 * preamble_longs as usize @@ -369,7 +367,7 @@ impl BloomFilter { // Preamble bytes.write_u8(preamble_longs); // Byte 0 bytes.write_u8(SERIAL_VERSION); // Byte 1 - bytes.write_u8(BLOOM_FAMILY_ID); // Byte 2 + bytes.write_u8(Family::BLOOMFILTER.id); // Byte 2 bytes.write_u8(if is_empty { EMPTY_FLAG_MASK } else { 0 }); // Byte 3: flags bytes.write_u16_le(self.num_hashes); // Bytes 4-5 bytes.write_u16_le(0); // Bytes 6-7: unused @@ -432,24 +430,22 @@ impl BloomFilter { .map_err(|_| Error::insufficient_data("flags"))?; // Validate - if family_id != BLOOM_FAMILY_ID { - return Err(Error::invalid_family( - BLOOM_FAMILY_ID, - family_id, - "BloomFilter", - )); - } + Family::BLOOMFILTER.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, serial_version, )); } - if preamble_longs != PREAMBLE_LONGS_EMPTY && preamble_longs != PREAMBLE_LONGS_STANDARD { - return Err(Error::invalid_preamble_longs( - PREAMBLE_LONGS_STANDARD, - preamble_longs, - )); + if !(Family::BLOOMFILTER.min_pre_longs..=Family::BLOOMFILTER.max_pre_longs) + .contains(&preamble_longs) + { + return Err(Error::deserial(format!( + "invalid preamble longs: expected [{}, {}], got {}", + Family::BLOOMFILTER.min_pre_longs, + Family::BLOOMFILTER.max_pre_longs, + preamble_longs + ))); } let is_empty = (flags & EMPTY_FLAG_MASK) != 0; diff --git a/datasketches/src/codec.rs b/datasketches/src/codec/decode.rs similarity index 67% rename from datasketches/src/codec.rs rename to datasketches/src/codec/decode.rs index 4df7b22..37c1523 100644 --- a/datasketches/src/codec.rs +++ b/datasketches/src/codec/decode.rs @@ -15,226 +15,154 @@ // specific language governing permissions and limitations // under the License. -#![allow(dead_code)] - use std::io; use std::io::Cursor; use std::io::Read; -pub(crate) struct SketchBytes { - bytes: Vec, -} - -impl SketchBytes { - pub fn with_capacity(capacity: usize) -> Self { - Self { - bytes: Vec::with_capacity(capacity), - } - } - - pub fn into_bytes(self) -> Vec { - self.bytes - } - - pub fn write(&mut self, buf: &[u8]) { - self.bytes.extend_from_slice(buf); - } - - pub fn write_u8(&mut self, n: u8) { - self.bytes.push(n); - } - - pub fn write_i8(&mut self, n: i8) { - self.bytes.push(n as u8); - } - - pub fn write_u16_le(&mut self, n: u16) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u16_be(&mut self, n: u16) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i16_le(&mut self, n: i16) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i16_be(&mut self, n: i16) { - self.write(&n.to_be_bytes()); - } - - pub fn write_u32_le(&mut self, n: u32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u32_be(&mut self, n: u32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i32_le(&mut self, n: i32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i32_be(&mut self, n: i32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_u64_le(&mut self, n: u64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_u64_be(&mut self, n: u64) { - self.write(&n.to_be_bytes()); - } - - pub fn write_i64_le(&mut self, n: i64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_i64_be(&mut self, n: i64) { - self.write(&n.to_be_bytes()); - } - - pub fn write_f32_le(&mut self, n: f32) { - self.write(&n.to_le_bytes()); - } - - pub fn write_f32_be(&mut self, n: f32) { - self.write(&n.to_be_bytes()); - } - - pub fn write_f64_le(&mut self, n: f64) { - self.write(&n.to_le_bytes()); - } - - pub fn write_f64_be(&mut self, n: f64) { - self.write(&n.to_be_bytes()); - } -} - -pub(crate) struct SketchSlice<'a> { +/// A wrapper around a byte slice that provides methods for reading various types of data from it. +pub struct SketchSlice<'a> { slice: Cursor<&'a [u8]>, } impl SketchSlice<'_> { + /// Creates a new `SketchSlice` from the given byte slice. pub fn new(slice: &[u8]) -> SketchSlice<'_> { SketchSlice { slice: Cursor::new(slice), } } + /// Advances the position of the slice by `n` bytes. pub fn advance(&mut self, n: u64) { let pos = self.slice.position(); self.slice.set_position(pos + n); } + /// Reads exactly `buf.len()` bytes from the slice into `buf`. pub fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { self.slice.read_exact(buf) } + /// Reads a single byte from the slice and returns it as a `u8`. pub fn read_u8(&mut self) -> io::Result { let mut buf = [0u8; 1]; self.read_exact(&mut buf)?; Ok(buf[0]) } + /// Reads a single byte from the slice and returns it as an `i8`. pub fn read_i8(&mut self) -> io::Result { let mut buf = [0u8; 1]; self.read_exact(&mut buf)?; Ok(buf[0] as i8) } + /// Reads a 16-bit unsigned integer from the slice in little-endian byte order. pub fn read_u16_le(&mut self) -> io::Result { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(u16::from_le_bytes(buf)) } + /// Reads a 16-bit unsigned integer from the slice in big-endian byte order. pub fn read_u16_be(&mut self) -> io::Result { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(u16::from_be_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in little-endian byte order. pub fn read_i16_le(&mut self) -> io::Result { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(i16::from_le_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in big-endian byte order. pub fn read_i16_be(&mut self) -> io::Result { let mut buf = [0u8; 2]; self.read_exact(&mut buf)?; Ok(i16::from_be_bytes(buf)) } + /// Reads a 32-bit unsigned integer from the slice in little-endian byte order. pub fn read_u32_le(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(u32::from_le_bytes(buf)) } + /// Reads a 32-bit unsigned integer from the slice in big-endian byte order. pub fn read_u32_be(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(u32::from_be_bytes(buf)) } + /// Reads a 32-bit signed integer from the slice in little-endian byte order. pub fn read_i32_le(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(i32::from_le_bytes(buf)) } + /// Reads a 32-bit signed integer from the slice in big-endian byte order. pub fn read_i32_be(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(i32::from_be_bytes(buf)) } + /// Reads a 16-bit unsigned integer from the slice in little-endian byte order. pub fn read_u64_le(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(u64::from_le_bytes(buf)) } + /// Reads a 16-bit unsigned integer from the slice in big-endian byte order. pub fn read_u64_be(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(u64::from_be_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in little-endian byte order. pub fn read_i64_le(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(i64::from_le_bytes(buf)) } + /// Reads a 16-bit signed integer from the slice in big-endian byte order. pub fn read_i64_be(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(i64::from_be_bytes(buf)) } + /// Reads a 32-bit floating-point number from the slice in little-endian byte order. pub fn read_f32_le(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(f32::from_le_bytes(buf)) } + /// Reads a 32-bit floating-point number from the slice in big-endian byte order. pub fn read_f32_be(&mut self) -> io::Result { let mut buf = [0u8; 4]; self.read_exact(&mut buf)?; Ok(f32::from_be_bytes(buf)) } + /// Reads a 64-bit floating-point number from the slice in little-endian byte order. pub fn read_f64_le(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; Ok(f64::from_le_bytes(buf)) } + /// Reads a 64-bit floating-point number from the slice in big-endian byte order. pub fn read_f64_be(&mut self) -> io::Result { let mut buf = [0u8; 8]; self.read_exact(&mut buf)?; diff --git a/datasketches/src/codec/encode.rs b/datasketches/src/codec/encode.rs new file mode 100644 index 0000000..60e85c8 --- /dev/null +++ b/datasketches/src/codec/encode.rs @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// A simple wrapper around a `Vec` that provides methods for writing various types of data. +pub struct SketchBytes { + bytes: Vec, +} + +impl SketchBytes { + /// Constructs an empty `SketchBytes` with at least the specified capacity. + pub fn with_capacity(capacity: usize) -> Self { + Self { + bytes: Vec::with_capacity(capacity), + } + } + + /// Consumes the `SketchBytes` and returns the underlying `Vec`. + pub fn into_bytes(self) -> Vec { + self.bytes + } + + /// Writes the given byte slice to the `SketchBytes`. + pub fn write(&mut self, buf: &[u8]) { + self.bytes.extend_from_slice(buf); + } + + /// Writes a single byte to the `SketchBytes`. + pub fn write_u8(&mut self, n: u8) { + self.bytes.push(n); + } + + /// Writes a single byte to the `SketchBytes`. + pub fn write_i8(&mut self, n: i8) { + self.bytes.push(n as u8); + } + + /// Writes a 16-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u16_le(&mut self, n: u16) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 16-bit unsigned integer to the `SketchBytes` in big-endian byte order. + pub fn write_u16_be(&mut self, n: u16) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 16-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i16_le(&mut self, n: i16) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 16-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i16_be(&mut self, n: i16) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u32_le(&mut self, n: u32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit unsigned integer to the `SketchBytes` in big-endian byte order. + pub fn write_u32_be(&mut self, n: u32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i32_le(&mut self, n: i32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i32_be(&mut self, n: i32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit unsigned integer to the `SketchBytes` in little-endian byte order. + pub fn write_u64_le(&mut self, n: u64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit unsigned integer to the `SketchBytes` in big-endian byte order. + pub fn write_u64_be(&mut self, n: u64) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit signed integer to the `SketchBytes` in little-endian byte order. + pub fn write_i64_le(&mut self, n: i64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit signed integer to the `SketchBytes` in big-endian byte order. + pub fn write_i64_be(&mut self, n: i64) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 32-bit floating-point number to the `SketchBytes` in little-endian byte order. + pub fn write_f32_le(&mut self, n: f32) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 32-bit floating-point number to the `SketchBytes` in big-endian byte order. + pub fn write_f32_be(&mut self, n: f32) { + self.write(&n.to_be_bytes()); + } + + /// Writes a 64-bit floating-point number to the `SketchBytes` in little-endian byte order. + pub fn write_f64_le(&mut self, n: f64) { + self.write(&n.to_le_bytes()); + } + + /// Writes a 64-bit floating-point number to the `SketchBytes` in big-endian byte order. + pub fn write_f64_be(&mut self, n: f64) { + self.write(&n.to_be_bytes()); + } +} diff --git a/datasketches/src/codec/family.rs b/datasketches/src/codec/family.rs new file mode 100644 index 0000000..11efcc5 --- /dev/null +++ b/datasketches/src/codec/family.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error; + +/// Defines the various families of sketch and set operation classes. +/// +/// A family defines a set of classes that share fundamental algorithms and behaviors. The classes +/// within a family may still differ by how they are stored and accessed. +pub struct Family { + /// The byte ID for this family. + pub id: u8, + /// The name for this family. + pub name: &'static str, + /// The minimum preamble size for this family in longs (8-bytes integer). + pub min_pre_longs: u8, + /// The maximum preamble size for this family in longs (8-bytes integer). + pub max_pre_longs: u8, +} + +impl Family { + /// The HLL family of sketches. + pub const HLL: Family = Family { + id: 7, + name: "HLL", + min_pre_longs: 1, + max_pre_longs: 1, + }; + + /// The Frequency family of sketches. + pub const FREQUENCY: Family = Family { + id: 10, + name: "FREQUENCY", + min_pre_longs: 1, + max_pre_longs: 4, + }; + + /// Compressed Probabilistic Counting (CPC) Sketch. + pub const CPC: Family = Family { + id: 16, + name: "CPC", + min_pre_longs: 1, + max_pre_longs: 5, + }; + + /// CountMin Sketch + pub const COUNTMIN: Family = Family { + id: 18, + name: "COUNTMIN", + min_pre_longs: 2, + max_pre_longs: 2, + }; + + /// T-Digest for estimating quantiles and ranks. + pub const TDIGEST: Family = Family { + id: 20, + name: "TDIGEST", + min_pre_longs: 1, + max_pre_longs: 2, + }; + + /// Bloom Filter. + pub const BLOOMFILTER: Family = Family { + id: 21, + name: "BLOOMFILTER", + min_pre_longs: 3, + max_pre_longs: 4, + }; +} + +impl Family { + pub fn validate_id(&self, family_id: u8) -> Result<(), Error> { + if family_id != self.id { + Err(Error::invalid_family(self.id, family_id, self.name)) + } else { + Ok(()) + } + } +} diff --git a/datasketches/src/codec/mod.rs b/datasketches/src/codec/mod.rs new file mode 100644 index 0000000..3c003c6 --- /dev/null +++ b/datasketches/src/codec/mod.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Codec utilities for datasketches crate. + +// public common codec utilities for datasketches crate +mod decode; +mod encode; +pub use self::decode::SketchSlice; +pub use self::encode::SketchBytes; + +// private to datasketches crate +pub(crate) mod family; diff --git a/datasketches/src/countmin/serialization.rs b/datasketches/src/countmin/serialization.rs index 7d10f59..4f078a9 100644 --- a/datasketches/src/countmin/serialization.rs +++ b/datasketches/src/countmin/serialization.rs @@ -17,6 +17,5 @@ pub(super) const PREAMBLE_LONGS_SHORT: u8 = 2; pub(super) const SERIAL_VERSION: u8 = 1; -pub(super) const COUNTMIN_FAMILY_ID: u8 = 18; pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; pub(super) const LONG_SIZE_BYTES: usize = 8; diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs index ad014a1..0618d4c 100644 --- a/datasketches/src/countmin/sketch.rs +++ b/datasketches/src/countmin/sketch.rs @@ -20,9 +20,9 @@ use std::hash::Hasher; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::countmin::CountMinValue; use crate::countmin::UnsignedCountMinValue; -use crate::countmin::serialization::COUNTMIN_FAMILY_ID; use crate::countmin::serialization::FLAGS_IS_EMPTY; use crate::countmin::serialization::LONG_SIZE_BYTES; use crate::countmin::serialization::PREAMBLE_LONGS_SHORT; @@ -279,7 +279,7 @@ impl CountMinSketch { bytes.write_u8(PREAMBLE_LONGS_SHORT); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(COUNTMIN_FAMILY_ID); + bytes.write_u8(Family::COUNTMIN.id); bytes.write_u8(if self.is_empty() { FLAGS_IS_EMPTY } else { 0 }); bytes.write_u32_le(0); // unused @@ -349,13 +349,7 @@ impl CountMinSketch { let flags = cursor.read_u8().map_err(make_error("flags"))?; cursor.read_u32_le().map_err(make_error(""))?; - if family_id != COUNTMIN_FAMILY_ID { - return Err(Error::invalid_family( - COUNTMIN_FAMILY_ID, - family_id, - "CountMinSketch", - )); - } + Family::COUNTMIN.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 233c486..ae3de67 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -19,6 +19,7 @@ use std::hash::Hash; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::common::canonical_double; use crate::common::inv_pow2_table::INVERSE_POWERS_OF_2; @@ -435,7 +436,6 @@ impl CpcSketch { } const SERIAL_VERSION: u8 = 1; -const CPC_FAMILY_ID: u8 = 16; const FLAG_COMPRESSED: u8 = 1; const FLAG_HAS_HIP: u8 = 2; const FLAG_HAS_TABLE: u8 = 3; @@ -455,7 +455,7 @@ impl CpcSketch { let preamble_ints = make_preamble_ints(self.num_coupons, has_hip, has_table, has_window); bytes.write_u8(preamble_ints); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(CPC_FAMILY_ID); + bytes.write_u8(Family::CPC.id); bytes.write_u8(self.lg_k); bytes.write_u8(self.first_interesting_column); let flags = (1 << FLAG_COMPRESSED) @@ -517,9 +517,7 @@ impl CpcSketch { let preamble_ints = cursor.read_u8().map_err(make_error("preamble_ints"))?; let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; let family_id = cursor.read_u8().map_err(make_error("family_id"))?; - if family_id != CPC_FAMILY_ID { - return Err(Error::invalid_family(CPC_FAMILY_ID, family_id, "TDigest")); - } + Family::CPC.validate_id(family_id)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs index 44d2891..ed7a898 100644 --- a/datasketches/src/frequencies/serialization.rs +++ b/datasketches/src/frequencies/serialization.rs @@ -21,8 +21,6 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; -/// Family ID for frequency sketches. -pub const FREQUENCY_FAMILY_ID: u8 = 10; /// Serialization version. pub const SERIAL_VERSION: u8 = 1; diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index 1534448..b49b391 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -21,6 +21,7 @@ use std::hash::Hash; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::frequencies::reverse_purge_item_hash_map::ReversePurgeItemHashMap; use crate::frequencies::serialization::*; @@ -409,7 +410,7 @@ impl FrequentItemsSketch { let mut bytes = SketchBytes::with_capacity(8); bytes.write_u8(PREAMBLE_LONGS_EMPTY); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(FREQUENCY_FAMILY_ID); + bytes.write_u8(Family::FREQUENCY.id); bytes.write_u8(self.lg_max_map_size); bytes.write_u8(self.hash_map.lg_length()); bytes.write_u8(EMPTY_FLAG_MASK); @@ -425,7 +426,7 @@ impl FrequentItemsSketch { let mut bytes = SketchBytes::with_capacity(total_bytes); bytes.write_u8(PREAMBLE_LONGS_NONEMPTY); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(FREQUENCY_FAMILY_ID); + bytes.write_u8(Family::FREQUENCY.id); bytes.write_u8(self.lg_max_map_size); bytes.write_u8(self.hash_map.lg_length()); bytes.write_u8(0); // flags @@ -462,21 +463,13 @@ impl FrequentItemsSketch { let flags = cursor.read_u8().map_err(make_error("flags"))?; cursor.read_u16_le().map_err(make_error(""))?; + Family::FREQUENCY.validate_id(family)?; if serial_version != SERIAL_VERSION { return Err(Error::unsupported_serial_version( SERIAL_VERSION, serial_version, )); } - - if family != FREQUENCY_FAMILY_ID { - return Err(Error::invalid_family( - FREQUENCY_FAMILY_ID, - family, - "FrequentItemsSketch", - )); - } - if lg_cur > lg_max { return Err(Error::deserial("lg_cur_map_size exceeds lg_max_map_size")); } diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs index d55883c..a17b4da 100644 --- a/datasketches/src/hll/array4.rs +++ b/datasketches/src/hll/array4.rs @@ -23,6 +23,7 @@ use super::aux_map::AuxMap; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::error::Error; use crate::hll::estimator::HipEstimator; @@ -376,7 +377,7 @@ impl Array4 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/array6.rs b/datasketches/src/hll/array6.rs index 4e77e0b..0bdb6eb 100644 --- a/datasketches/src/hll/array6.rs +++ b/datasketches/src/hll/array6.rs @@ -23,6 +23,7 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::error::Error; use crate::hll::estimator::HipEstimator; @@ -229,7 +230,7 @@ impl Array6 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs index 530d18e..00faf16 100644 --- a/datasketches/src/hll/array8.rs +++ b/datasketches/src/hll/array8.rs @@ -22,6 +22,7 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::error::Error; use crate::hll::estimator::HipEstimator; @@ -301,7 +302,7 @@ impl Array8 { // Write standard header bytes.write_u8(HLL_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(0); // unused for HLL mode diff --git a/datasketches/src/hll/hash_set.rs b/datasketches/src/hll/hash_set.rs index 874d3a4..cbe99ff 100644 --- a/datasketches/src/hll/hash_set.rs +++ b/datasketches/src/hll/hash_set.rs @@ -22,6 +22,7 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::hll::HllType; use crate::hll::KEY_MASK_26; @@ -149,7 +150,7 @@ impl HashSet { // Write preamble bytes.write_u8(HASH_SET_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(lg_arr as u8); diff --git a/datasketches/src/hll/list.rs b/datasketches/src/hll/list.rs index 1abf699..6cd92f8 100644 --- a/datasketches/src/hll/list.rs +++ b/datasketches/src/hll/list.rs @@ -22,6 +22,7 @@ use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::hll::HllType; use crate::hll::container::COUPON_EMPTY; @@ -111,7 +112,7 @@ impl List { // Write preamble bytes.write_u8(LIST_PREINTS); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(HLL_FAMILY_ID); + bytes.write_u8(Family::HLL.id); bytes.write_u8(lg_config_k); bytes.write_u8(lg_arr as u8); diff --git a/datasketches/src/hll/serialization.rs b/datasketches/src/hll/serialization.rs index 5fdb2b3..014b890 100644 --- a/datasketches/src/hll/serialization.rs +++ b/datasketches/src/hll/serialization.rs @@ -20,9 +20,6 @@ //! This module contains all constants related to the Apache DataSketches //! binary serialization format, shared across all sketch modes. -/// Family ID for HLL sketches in DataSketches format -pub const HLL_FAMILY_ID: u8 = 7; - /// Current serialization version pub const SERIAL_VERSION: u8 = 1; diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index bc1ce42..911f2c0 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -23,6 +23,7 @@ use std::hash::Hash; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::common::NumStdDev; use crate::error::Error; use crate::hll::HllType; @@ -277,9 +278,7 @@ impl HllSketch { let mode_byte = cursor.read_u8().map_err(make_error("mode"))?; // Verify family ID - if family_id != HLL_FAMILY_ID { - return Err(Error::invalid_family(HLL_FAMILY_ID, family_id, "HLL")); - } + Family::HLL.validate_id(family_id)?; // Verify serialization version if serial_version != SERIAL_VERSION { diff --git a/datasketches/src/lib.rs b/datasketches/src/lib.rs index 17701ab..02dc692 100644 --- a/datasketches/src/lib.rs +++ b/datasketches/src/lib.rs @@ -31,6 +31,7 @@ compile_error!("datasketches does not support big-endian targets"); pub mod bloom; +pub mod codec; pub mod common; pub mod countmin; pub mod cpc; @@ -40,5 +41,4 @@ pub mod hll; pub mod tdigest; pub mod theta; -mod codec; mod hash; diff --git a/datasketches/src/tdigest/serialization.rs b/datasketches/src/tdigest/serialization.rs index e5b9788..407e2ac 100644 --- a/datasketches/src/tdigest/serialization.rs +++ b/datasketches/src/tdigest/serialization.rs @@ -18,7 +18,6 @@ pub(super) const PREAMBLE_LONGS_EMPTY_OR_SINGLE: u8 = 1; pub(super) const PREAMBLE_LONGS_MULTIPLE: u8 = 2; pub(super) const SERIAL_VERSION: u8 = 1; -pub(super) const TDIGEST_FAMILY_ID: u8 = 20; pub(super) const FLAGS_IS_EMPTY: u8 = 1 << 0; pub(super) const FLAGS_IS_SINGLE_VALUE: u8 = 1 << 1; pub(super) const FLAGS_REVERSE_MERGE: u8 = 1 << 2; diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index a9ef093..fde9fbc 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -21,6 +21,7 @@ use std::num::NonZeroU64; use crate::codec::SketchBytes; use crate::codec::SketchSlice; +use crate::codec::family::Family; use crate::error::Error; use crate::tdigest::serialization::*; @@ -428,7 +429,7 @@ impl TDigestMut { _ => PREAMBLE_LONGS_MULTIPLE, }); bytes.write_u8(SERIAL_VERSION); - bytes.write_u8(TDIGEST_FAMILY_ID); + bytes.write_u8(Family::TDIGEST.id); bytes.write_u16_le(self.k); bytes.write_u8({ let mut flags = 0; @@ -493,15 +494,11 @@ impl TDigestMut { let preamble_longs = cursor.read_u8().map_err(make_error("preamble_longs"))?; let serial_version = cursor.read_u8().map_err(make_error("serial_version"))?; let family_id = cursor.read_u8().map_err(make_error("family_id"))?; - if family_id != TDIGEST_FAMILY_ID { + if let Err(err) = Family::TDIGEST.validate_id(family_id) { return if preamble_longs == 0 && serial_version == 0 && family_id == 0 { Self::deserialize_compat(bytes) } else { - Err(Error::invalid_family( - TDIGEST_FAMILY_ID, - family_id, - "TDigest", - )) + Err(err) }; } if serial_version != SERIAL_VERSION {