Skip to content

Commit 6931393

Browse files
committed
optimize and make work
1 parent 2711d7f commit 6931393

File tree

4 files changed

+264
-197
lines changed

4 files changed

+264
-197
lines changed

include/bucket.h

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,6 @@ struct Bucket {
99
vec_t alpha;
1010
vec_hash_t gamma;
1111
};
12-
struct SparseBucket {
13-
uint16_t position; // (col << 8) | row
14-
Bucket bkt;
15-
16-
// TODO: Use these functions and also maybe optimize
17-
inline uint16_t col() const {
18-
return position >> 8;
19-
}
20-
inline uint16_t row() const {
21-
return position & 0xFF;
22-
}
23-
inline void set_col(uint16_t col) {
24-
position = (col << 8) + row();
25-
}
26-
inline void set_row(uint16_t row) {
27-
position = (col() << 8) + row;
28-
}
29-
};
3012
#pragma pack(pop)
3113

3214
namespace Bucket_Boruvka {

include/sparse_sketch.h

Lines changed: 47 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -13,58 +13,45 @@
1313
#include "bucket.h"
1414
#include "sketch_types.h"
1515

16+
#pragma pack(push,1)
17+
// A bucket stored in the sparse region of a SparseSketch, tagged with its row and linked to the
// next sparse bucket of the same column. Declared under #pragma pack(push,1).
// NOTE(review): `next` is an 8-bit index, so at most 256 sparse buckets are addressable -- confirm
// that sparse_capacity (sparse_bucket_constant * num_columns) always fits in that range.
struct SparseBucket {
18+
uint8_t next; // index of the next sparse bucket in this column's linked list
19+
uint8_t row; // row of this sparse bucket
20+
Bucket bkt; // actual bucket content
21+
};
22+
#pragma pack(pop)
23+
1624
// TODO: Do we want to use row major or column major order?
1725
// So the advantage of row-major is that we can update faster. Most updates will only touch
1826
// first few rows of the data structure. However, this could slow down queries. (Although most query
1927
// answers will probably be in the sparse data structure.) Also, range_merge is important here:
2028
// if column-major, then the column we are merging is contiguous; otherwise it is not.
2129
// A: Keep column-major for the moment, performance evaluation later.
2230

23-
24-
// TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()?
25-
// These functions are nice for performance because we can skip serialization but aren't
26-
// strictly necessary.
27-
// A: Make function to get size in bytes of bucket data and have the 'hash table' be contig with
28-
// the bucket data. This way we can still use these functions.
29-
30-
31-
// TODO: It would be nice to preallocate the structure if we know how big its probably going to be.
32-
// This would be helpful for delta sketches for example.
33-
// A: Yeah do this
34-
35-
36-
// TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of
37-
// buckets array. Could also be upperbound on the size.
38-
// A: Need two variables. Both the current number of buckets (rows) allocated AND the maximum.
39-
40-
// A strategy that could work well would be to allocate a chunk of memory some of which is given to
41-
// the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region.
42-
// 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position)
43-
44-
/* Memory Allocation of a Sketch. Contiguous
45-
_________________________________________________________________________________________
46-
| Dense | Sparse |
47-
| Sketch | Bucket |
48-
| Buckets | Region (hash-table) |
49-
| log n * log z buckets | clog n buckets |
50-
|__________________________________________________________|______________________________|
31+
/* Memory Allocation of a SparseSketch. Contiguous (only roughly to scale).
32+
Where z is the number of non-zero elements in the vector we are sketching.
33+
_________________________________________________________________________________________________
34+
| Dense | Sparse | Linked List |
35+
| Bucket | Bucket | Metadata |
36+
| Region | Region | for Sparse bkts |
37+
| log n * log z buckets | clog n buckets | clogn/16 buckets |
38+
|_________________________________________________|____________________________|__________________|
5139
*/
5240

5341
/**
54-
* Sketch for graph processing, either CubeSketch or CameoSketch.
42+
* SparseSketch for graph processing
5543
* Sub-linear representation of a vector.
5644
*/
5745
class SparseSketch {
5846
private:
59-
const uint64_t seed; // seed for hash functions
60-
size_t num_samples; // number of samples we can perform
61-
size_t cols_per_sample; // number of columns to use on each sample
62-
size_t num_columns; // Total number of columns. (product of above 2)
63-
size_t bkt_per_col; // maximum number of buckets per column (max number of rows)
64-
size_t num_buckets; // number of total buckets
65-
// (either product of above two or col * dense_rows + sparse_capacity)
47+
const uint64_t seed; // seed for hash functions
48+
const size_t num_samples; // number of samples we can perform
49+
const size_t cols_per_sample; // number of columns to use on each sample
50+
const size_t num_columns; // Total number of columns. (product of above 2)
51+
const size_t bkt_per_col; // maximum number of buckets per column (max number of rows)
6652

67-
size_t sample_idx = 0; // number of samples performed so far
53+
size_t num_buckets; // number of total buckets (col * dense_rows + sparse_capacity)
54+
size_t sample_idx = 0; // number of samples performed so far
6855

6956
// Allocated buckets
7057
Bucket* buckets;
@@ -74,8 +61,9 @@ class SparseSketch {
7461

7562
// Variables for sparse representation of lower levels of bucket Matrix
7663
// TODO: evaluate implications of this constant
77-
static constexpr double sparse_bucket_constant = 3; // constant factor c (see above)
64+
static constexpr double sparse_bucket_constant = 3; // constant factor c (see diagram)
7865
SparseBucket* sparse_buckets; // a pointer into the buckets array
66+
uint8_t *ll_metadata; // pointer to heads of column LLs
7967
size_t number_of_sparse_buckets = 0; // cur number of sparse buckets
8068
size_t sparse_capacity = sparse_bucket_constant * num_columns; // max number of sparse buckets
8169

@@ -85,11 +73,12 @@ class SparseSketch {
8573
void reallocate_if_needed(int delta);
8674
void dense_realloc(size_t new_num_dense_rows);
8775

88-
// This variable lets us know how many Buckets to allocate to make space for the SparseBuckets
89-
// that will be using that space
76+
// These variables let us know how many Buckets to allocate to make space for the SparseBuckets
77+
// and the LL metadata that will use that space
9078
size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket));
79+
size_t ll_metadata_size = ceil((double(num_columns) + 1) * sizeof(uint8_t) / sizeof(Bucket));
9180

92-
void update_sparse(SparseBucket to_add, bool realloc_if_needed = true);
81+
void update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed = true);
9382
SketchSample sample_sparse(size_t first_col, size_t end_col);
9483

9584
inline Bucket& deterministic_bucket() {
@@ -113,6 +102,23 @@ class SparseSketch {
113102
return buckets[position_func(col, row, num_dense_rows)];
114103
}
115104

105+
size_t calc_num_buckets(size_t new_num_dense_rows) {
106+
return num_columns * new_num_dense_rows + sparse_data_size + ll_metadata_size + 1;
107+
}
108+
109+
size_t calc_sparse_index(size_t rows) {
110+
return num_columns * rows + 1;
111+
}
112+
113+
size_t calc_metadata_index(size_t rows) {
114+
return num_columns * rows + sparse_data_size + 1;
115+
}
116+
117+
void upd_sparse_ptrs() {
118+
sparse_buckets = (SparseBucket *) &buckets[calc_sparse_index(num_dense_rows)];
119+
ll_metadata = (uint8_t *) &buckets[calc_metadata_index(num_dense_rows)];
120+
}
121+
116122
public:
117123
/**
118124
* The below constructors use vector length as their input. However, in graph sketching our input

0 commit comments

Comments
 (0)