Got it working: 16-bit sparse bucket links, paired column depths per hash, mid-batch subgraph buffer flushes

etwest · etwest · commit 58d7edfb52b2 · 2025-08-08T21:59:50.000-04:00
diff --git a/include/bucket.h b/include/bucket.h
@@ -3,6 +3,7 @@
 
 #include <iostream>
 #include <vector>
+#include <bitset>
 
 #include "types.h"
 
@@ -42,13 +43,12 @@ inline static Depths get_index_depths(vec_t update_idx, size_t seed, col_hash_t
   uint64_t depth_hash = col_hash(&update_idx, sizeof(vec_t), seed);
   Depths ret;
 
-  depth_hash |= (1ull << max_depth);  // assert not > max_depth by ORing
-  ret[0] = __builtin_ctzll(depth_hash);
+  // OR in a sentinel bit at max_depth so the trailing-zero count can never exceed max_depth
+  ret[0] = __builtin_ctzll(depth_hash | (1ull << max_depth));
 
-  // shift hash over and reassert max_depth
-  depth_hash >>= 32;
-  depth_hash |= (1ull << max_depth);
-  ret[1] = __builtin_ctzll(depth_hash);
+  // discard the low max_depth bits used above, re-apply the sentinel bit, and take a second depth
+  depth_hash >>= max_depth;
+  ret[1] = __builtin_ctzll(depth_hash | (1ull << max_depth));
 
   return ret;
 }
diff --git a/include/cc_sketch_alg.h b/include/cc_sketch_alg.h
@@ -187,6 +187,17 @@ class CCSketchAlg {
   void apply_update_batch(int thr_id, node_id_t src_vertex,
                           const std::vector<node_id_t> &dst_vertices);
 
+  /**
+   * Update the sketches for a particular vertex, given a batch of edge indices. These indices must
+   * be constructed using concat_pairing_fn() and must all be associated with a particular graph
+   * vertex.
+   * param: thr_id         The id of the thread performing the update, in [0, num_threads).
+   * param: src_vertex     The vertex where the edges originate.
+   * param: idxs           A vector of edge indices, each built with concat_pairing_fn().
+   */
+  void apply_concat_update_batch(int thr_id, node_id_t src_vertex,
+                                 const std::vector<edge_id_t> &idxs);
+
   /**
    * Return if we have cached an answer to query.
    * This allows the driver to avoid flushing the gutters before calling query functions.
diff --git a/include/dense_sketch.h b/include/dense_sketch.h
@@ -13,6 +13,8 @@
 #include "bucket.h"
 #include "sketch_types.h"
 
+class SparseSketch;
+
 /**
  * Sketch for graph processing, either CubeSketch or CameoSketch.
  * Sub-linear representation of a vector.
@@ -151,6 +153,7 @@ class DenseSketch {
   void zero_contents();
 
   friend bool operator==(const DenseSketch& sketch1, const DenseSketch& sketch2);
+  friend bool operator==(const SparseSketch& sparse, const DenseSketch& dense);
   friend std::ostream& operator<<(std::ostream& os, const DenseSketch& sketch);
 
   /**
@@ -175,7 +178,7 @@ class DenseSketch {
 
   inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; }
   inline uint64_t get_seed() const { return seed; }
-  inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; }
+  inline size_t column_seed(size_t column_idx) const { return seed + (5 * column_idx); }
   inline size_t checksum_seed() const { return seed; }
   inline size_t get_columns() const { return num_columns; }
   inline size_t get_buckets() const { return num_buckets; }
diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h
@@ -13,11 +13,13 @@
 #include "bucket.h"
 #include "sketch_types.h"
 
+class DenseSketch;
+
 #pragma pack(push,1)
 struct SparseBucket {
-  uint8_t next; // index of next sparse bucket in this column
-  uint8_t row;  // row of sparse bucket
-  Bucket bkt;   // actual bucket content
+  uint16_t next; // index of next sparse bucket in this column
+  uint8_t row;   // row of sparse bucket
+  Bucket bkt;    // actual bucket content
 };
 #pragma pack(pop)
 
@@ -67,9 +69,10 @@ class SparseSketch {
   // TODO: evaluate implications of this constant
   static constexpr double sparse_bucket_constant = 3;            // constant factor c (see diagram)
   SparseBucket* sparse_buckets;                                  // a pointer into the buckets array
-  uint8_t *ll_metadata;                                          // pointer to heads of column LLs
+  uint16_t *ll_metadata;                                         // pointer to heads of column LLs
   size_t number_of_sparse_buckets = 0;                           // cur number of sparse buckets
   size_t sparse_capacity = sparse_bucket_constant * num_columns; // max number of sparse buckets
+	static constexpr size_t max_columns = uint16_t(-1) / sparse_bucket_constant - 1;
 
   /**
    * Reallocates the bucket array if necessary to either grow or shrink the dense region
@@ -80,43 +83,43 @@ class SparseSketch {
   // These variables let us know how many Buckets to allocate to make space for the SparseBuckets
   // and the LL metadata that will use that space
   size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket));
-  size_t ll_metadata_size = ceil((double(num_columns) + 1) * sizeof(uint8_t) / sizeof(Bucket));
+  size_t ll_metadata_size = ceil((double(num_columns) + 1) * sizeof(uint16_t) / sizeof(Bucket));
 
-  void update_sparse(uint8_t col, const SparseBucket &to_add);
+  void update_sparse(uint16_t col, const SparseBucket &to_add);
   SketchSample sample_sparse(size_t first_col, size_t end_col);
 
-  inline uint8_t remove_ll_head(size_t col) {
-    uint8_t temp = ll_metadata[col];
+  inline uint16_t remove_ll_head(size_t col) {
+    uint16_t temp = ll_metadata[col];
     ll_metadata[col] = sparse_buckets[ll_metadata[col]].next;
     return temp;
   }
-  inline uint8_t claim_free_bucket() {
-    assert(ll_metadata[num_columns] != uint8_t(-1));
+  inline uint16_t claim_free_bucket() {
+    assert(ll_metadata[num_columns] != uint16_t(-1));
     return remove_ll_head(num_columns);
   }
-  inline void insert_to_ll_head(size_t col, uint8_t add_idx) {
+  inline void insert_to_ll_head(size_t col, uint16_t add_idx) {
     sparse_buckets[add_idx].next = ll_metadata[col];
     ll_metadata[col] = add_idx;
   }
-  inline void free_bucket(uint8_t bkt_idx) {
+  inline void free_bucket(uint16_t bkt_idx) {
     sparse_buckets[bkt_idx].row = 0;
     sparse_buckets[bkt_idx].bkt = {0, 0};
     insert_to_ll_head(num_columns, bkt_idx);
   }
-  inline void insert_to_ll(uint8_t add_idx, SparseBucket &prev) {
+  inline void insert_to_ll(uint16_t add_idx, SparseBucket &prev) {
     sparse_buckets[add_idx].next = prev.next;
     prev.next = add_idx;
   }
   inline void remove_from_ll(SparseBucket& bkt_to_remove, SparseBucket &prev) {
     prev.next = bkt_to_remove.next;
   }
-  inline bool merge_sparse_bkt(uint8_t our_idx, const SparseBucket& oth, uint8_t prev_idx,
+  inline bool merge_sparse_bkt(uint16_t our_idx, const SparseBucket& oth, uint16_t prev_idx,
                                size_t col) {
     SparseBucket &ours = sparse_buckets[our_idx];
     ours.bkt.alpha ^= oth.bkt.alpha;
     ours.bkt.gamma ^= oth.bkt.gamma;
     if (SketchBucket::is_empty(ours.bkt)) {
-      if (prev_idx == uint8_t(-1)) 
+      if (prev_idx == uint16_t(-1)) 
         remove_ll_head(col);
       else 
         remove_from_ll(ours, sparse_buckets[prev_idx]);
@@ -162,11 +165,11 @@ class SparseSketch {
 
   void upd_sparse_ptrs() {
     sparse_buckets = (SparseBucket *) &buckets[calc_sparse_index(num_dense_rows)];
-    ll_metadata = (uint8_t *) &buckets[calc_metadata_index(num_dense_rows)];
+    ll_metadata = (uint16_t *) &buckets[calc_metadata_index(num_dense_rows)];
   }
 
   // given another SparseSketch column, merge it into ours
-  void merge_sparse_column(const SparseBucket* oth_sparse_buckets, const uint8_t* oth_ll_metadata,
+  void merge_sparse_column(const SparseBucket* oth_sparse_buckets, const uint16_t* oth_ll_metadata,
                            size_t col);
  public:
   /**
@@ -274,6 +277,7 @@ class SparseSketch {
   void zero_contents();
 
   friend bool operator==(const SparseSketch& sketch1, const SparseSketch& sketch2);
+  friend bool operator==(const SparseSketch& sparse, const DenseSketch& dense);
   friend std::ostream& operator<<(std::ostream& os, const SparseSketch& sketch);
 
   /**
@@ -294,7 +298,7 @@ class SparseSketch {
   // return the size of a sketch given vector size n and number of samples s
   static size_t estimate_bytes(size_t /*n*/, size_t s) {
     size_t num_cols = s * default_cols_per_sample;
-    size_t metadata_size = ceil(double(num_cols + 1) * sizeof(uint8_t) / sizeof(Bucket)) * sizeof(Bucket);
+    size_t metadata_size = ceil(double(num_cols + 1) * sizeof(uint16_t) / sizeof(Bucket)) * sizeof(Bucket);
     size_t sparse_size =
         ceil(double(num_cols) * sparse_bucket_constant * sizeof(SparseBucket) / sizeof(Bucket)) *
         sizeof(Bucket);
diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp
@@ -146,7 +146,7 @@ inline bool CCSketchAlg::sample_supernode(Sketch &skt) {
   Edge e = inv_concat_pairing_fn(sample.idx);
   SampleResult result_type = sample.result;
 
-  // std::cout << " " << result_type << " e:" << e.src << " " << e.dst << std::endl;
+  // std::cerr << " " << result_type << " e:" << e.src << " " << e.dst << std::endl;
 
   if (result_type == FAIL) {
     modified = true;
@@ -495,7 +495,7 @@ void CCSketchAlg::boruvka_emulation() {
   //             << std::endl;
 
   while (true) {
-    // std::cout << "   Round: " << round_num << std::endl;
+    // std::cerr << "   Round: " << round_num << std::endl;
     // start = std::chrono::steady_clock::now();
     modified = perform_boruvka_round(round_num, merge_instr, global_merges);
     // std::cout << "     perform_boruvka_round = "
@@ -637,12 +637,16 @@ std::vector<SpanningForest> CCSketchAlg::calc_disjoint_spanning_forests(size_t k
   size_t max_rounds = 0;
 
   for (size_t i = 0; i < k; i++) {
+    std::cout << " Spanning forest: " << i << std::endl;
     compute_dsu();
 
     SFs.emplace_back(num_vertices, spanning_forest);
     max_rounds = std::max(last_query_rounds, max_rounds);
 
     filter_sf_edges(SFs[SFs.size() - 1]);
+
+    std::cout << "Spanning Forest " << i << " size = " << SFs[SFs.size() - 1].get_edges().size() << std::endl;
+    std::cout << "Last query rounds = " << last_query_rounds << std::endl;
     if (SFs[SFs.size() - 1].get_edges().size() == 0) break;
   }
 
diff --git a/src/dense_sketch.cpp b/src/dense_sketch.cpp
@@ -63,10 +63,21 @@ void DenseSketch::update(const vec_t update_idx) {
   SketchBucket::update(deterministic_bucket(), update_idx, checksum);
 
   // Update higher depth buckets
-  for (unsigned i = 0; i < num_columns; ++i) {
-    col_hash_t depth = SketchBucket::get_index_depth(update_idx, column_seed(i), bkt_per_col);
+  SketchBucket::Depths depths;
+  for (size_t i = 0; i < num_columns - 1; i += 2) {
+    depths = SketchBucket::get_index_depths(update_idx, column_seed(i), bkt_per_col);
+    for (size_t j = 0; j < 2; j++) {
+      col_hash_t depth = depths[j];
+      likely_if(depth < bkt_per_col) {
+        SketchBucket::update(bucket(i + j, depth), update_idx, checksum);
+      }
+    }
+  }
+  if ((num_columns & 0x1) == 1) {
+    size_t col = num_columns - 1;
+    size_t depth = SketchBucket::get_index_depth(update_idx, column_seed(col), bkt_per_col);
     likely_if(depth < bkt_per_col) {
-      SketchBucket::update(bucket(i, depth), update_idx, checksum);
+      SketchBucket::update(bucket(col, depth), update_idx, checksum);
     }
   }
 }
@@ -223,7 +234,7 @@ bool operator==(const DenseSketch &sketch1, const DenseSketch &sketch2) {
 }
 
 std::ostream &operator<<(std::ostream &os, const DenseSketch &sketch) {
-  Bucket bkt = sketch.buckets[sketch.num_buckets - 1];
+  Bucket bkt = sketch.deterministic_bucket();
   bool good = SketchBucket::is_good(bkt, sketch.checksum_seed());
   vec_t a = bkt.alpha;
   vec_hash_t c = bkt.gamma;
diff --git a/src/min_cut_sketch_alg.cpp b/src/min_cut_sketch_alg.cpp
@@ -137,9 +137,16 @@ void MinCutSketchAlg::apply_update_batch(size_t thr_id, node_id_t src_vertex,
 
     std::fill(&num_mapped[0], &num_mapped[max_subgraphs - 1], 0);
     for (auto tagged_edge : batch.dsts_data) {
-      assert(tagged_edge.subgraph < cur_subgraphs);
-      
-      buffers[tagged_edge.subgraph][num_mapped[tagged_edge.subgraph]++] = tagged_edge.dst;
+      size_t subgraph = tagged_edge.subgraph;
+      assert(subgraph < cur_subgraphs);
+
+      buffers[subgraph][num_mapped[subgraph]++] = tagged_edge.dst;
+      assert(num_mapped[subgraph] <= buffers[subgraph].capacity());
+
+      unlikely_if (num_mapped[subgraph] >= buffer_elms) {
+        cc_sketches[subgraph]->apply_update_batch(thr_id, batch.src, buffers[subgraph]);
+        num_mapped[subgraph] = 0;
+      }
     }
 
     for (size_t i = 1; i < batch.edge_store_subgraph; i++) {
diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp
diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp
diff --git a/tools/benchmark/graphcc_bench.cpp b/tools/benchmark/graphcc_bench.cpp