1313#include " bucket.h"
1414#include " sketch_types.h"
1515
16+ #pragma pack(push,1)
17+ struct SparseBucket {
18+ uint8_t next; // index of next sparse bucket in this column
19+ uint8_t row; // row of sparse bucket
20+ Bucket bkt; // actual bucket content
21+ };
22+ #pragma pack(pop)
23+
// TODO: Do we want to use row-major or column-major order?
// The advantage of row-major is that we can update faster: most updates will only touch the
// first few rows of the data structure. However, it could slow down queries (although most
// query answers will probably be in the sparse data structure). Also, range_merge is important
// here: with column-major order the column we are merging is contiguous; with row-major order
// it is not.
// A: Keep column-major for the moment; evaluate performance later.
2230
23-
24- // TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()?
25- // These functions are nice for performance because we can skip serialization but aren't
26- // strictly necessary.
27- // A: Make function to get size in bytes of bucket data and have the 'hash table' be contig with
28- // the bucket data. This way we can still use these functions.
29-
30-
31- // TODO: It would be nice to preallocate the structure if we know how big its probably going to be.
32- // This would be helpful for delta sketches for example.
33- // A: Yeah do this
34-
35-
36- // TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of
37- // buckets array. Could also be upperbound on the size.
38- // A: Need two variables. Both the current number of buckets (rows) allocated AND the maximum.
39-
40- // A strategy that could work well would be to allocate a chunk of memory some of which is given to
41- // the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region.
42- // 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position)
43-
44- /* Memory Allocation of a Sketch. Contiguous
45- _________________________________________________________________________________________
46- | Dense | Sparse |
47- | Sketch | Bucket |
48- | Buckets | Region (hash-table) |
49- | log n * log z buckets | clog n buckets |
50- |__________________________________________________________|______________________________|
31+ /* Memory Allocation of a SparseSketch. Contiguous (only roughly to scale).
32+ Where z is number of non-zero elements in vector we are sketching.
33+ _________________________________________________________________________________________________
34+ | Dense | Sparse | Linked List |
35+ | Bucket | Bucket | Metadata |
36+ | Region | Region | for Sparse bkts |
37+ | log n * log z buckets | clog n buckets | clogn/16 buckets |
38+ |_________________________________________________|____________________________|__________________|
5139*/
5240
5341/* *
54- * Sketch for graph processing, either CubeSketch or CameoSketch.
42+ * SparseSketch for graph processing
5543 * Sub-linear representation of a vector.
5644 */
5745class SparseSketch {
5846 private:
59- const uint64_t seed; // seed for hash functions
60- size_t num_samples; // number of samples we can perform
61- size_t cols_per_sample; // number of columns to use on each sample
62- size_t num_columns; // Total number of columns. (product of above 2)
63- size_t bkt_per_col; // maximum number of buckets per column (max number of rows)
64- size_t num_buckets; // number of total buckets
65- // (either product of above two or col * dense_rows + sparse_capacity)
47+ const uint64_t seed; // seed for hash functions
48+ const size_t num_samples; // number of samples we can perform
49+ const size_t cols_per_sample; // number of columns to use on each sample
50+ const size_t num_columns; // Total number of columns. (product of above 2)
51+ const size_t bkt_per_col; // maximum number of buckets per column (max number of rows)
6652
67- size_t sample_idx = 0 ; // number of samples performed so far
53+ size_t num_buckets; // number of total buckets (col * dense_rows + sparse_capacity)
54+ size_t sample_idx = 0 ; // number of samples performed so far
6855
6956 // Allocated buckets
7057 Bucket* buckets;
@@ -74,8 +61,9 @@ class SparseSketch {
7461
7562 // Variables for sparse representation of lower levels of bucket Matrix
7663 // TODO: evaluate implications of this constant
77- static constexpr double sparse_bucket_constant = 3 ; // constant factor c (see above )
64+ static constexpr double sparse_bucket_constant = 3 ; // constant factor c (see diagram )
7865 SparseBucket* sparse_buckets; // a pointer into the buckets array
66+ uint8_t *ll_metadata; // pointer to heads of column LLs
7967 size_t number_of_sparse_buckets = 0 ; // cur number of sparse buckets
8068 size_t sparse_capacity = sparse_bucket_constant * num_columns; // max number of sparse buckets
8169
@@ -85,11 +73,12 @@ class SparseSketch {
8573 void reallocate_if_needed (int delta);
8674 void dense_realloc (size_t new_num_dense_rows);
8775
88- // This variable lets us know how many Buckets to allocate to make space for the SparseBuckets
89- // that will be using that space
76+ // These variables let us know how many Buckets to allocate to make space for the SparseBuckets
77+ // and the LL metadata that will use that space
9078 size_t sparse_data_size = ceil(double (sparse_capacity) * sizeof (SparseBucket) / sizeof (Bucket));
79+ size_t ll_metadata_size = ceil((double (num_columns) + 1 ) * sizeof (uint8_t ) / sizeof (Bucket));
9180
92- void update_sparse (SparseBucket to_add, bool realloc_if_needed = true );
81+ void update_sparse (uint8_t col, SparseBucket to_add, bool realloc_if_needed = true );
9382 SketchSample sample_sparse (size_t first_col, size_t end_col);
9483
9584 inline Bucket& deterministic_bucket () {
@@ -113,6 +102,23 @@ class SparseSketch {
113102 return buckets[position_func (col, row, num_dense_rows)];
114103 }
115104
105+ size_t calc_num_buckets (size_t new_num_dense_rows) {
106+ return num_columns * new_num_dense_rows + sparse_data_size + ll_metadata_size + 1 ;
107+ }
108+
109+ size_t calc_sparse_index (size_t rows) {
110+ return num_columns * rows + 1 ;
111+ }
112+
113+ size_t calc_metadata_index (size_t rows) {
114+ return num_columns * rows + sparse_data_size + 1 ;
115+ }
116+
117+ void upd_sparse_ptrs () {
118+ sparse_buckets = (SparseBucket *) &buckets[calc_sparse_index (num_dense_rows)];
119+ ll_metadata = (uint8_t *) &buckets[calc_metadata_index (num_dense_rows)];
120+ }
121+
116122 public:
117123 /* *
118124 * The below constructors use vector length as their input. However, in graph sketching our input
0 commit comments