GraphStreamingProject
diff --git a/‎CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎include/cc_sketch_alg.h‎
Lines changed: 1 addition & 2 deletions b/‎include/cc_sketch_alg.h‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎include/edge_store.h‎
Lines changed: 74 additions & 0 deletions b/‎include/edge_store.h‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎include/min_cut_sketch_alg.h‎
Lines changed: 162 additions & 0 deletions b/‎include/min_cut_sketch_alg.h‎
Lines changed: 162 additions & 0 deletions
@@ -88,6 +88,8 @@ FetchContent_MakeAvailable(GutterTree StreamingUtilities)
 
 add_library(GraphZeppelin
   src/cc_sketch_alg.cpp
+  src/edge_store.cpp
+  src/min_cut_sketch_alg.cpp
   src/return_types.cpp
   src/driver_configuration.cpp
   src/cc_alg_configuration.cpp
@@ -102,6 +104,8 @@ target_compile_definitions(GraphZeppelin PUBLIC XXH_INLINE_ALL)
 
 add_library(GraphZeppelinVerifyCC
   src/cc_sketch_alg.cpp
+  src/edge_store.cpp
+  src/min_cut_sketch_alg.cpp
   src/return_types.cpp
   src/driver_configuration.cpp
   src/cc_alg_configuration.cpp
@@ -119,6 +123,7 @@ if (BUILD_EXE)
   add_executable(tests
     test/test_runner.cpp
     test/cc_alg_test.cpp
+    test/min_cut_test.cpp
     test/sketch_test.cpp
     test/dsu_test.cpp
     test/util_test.cpp
 
@@ -60,6 +60,7 @@ struct alignas(64) GlobalMergeData {
 enum QueryCode {
   CONNECTIVITY,     // connected components and spanning forest of graph
   KSPANNINGFORESTS, // k disjoint spanning forests
+  MINIMUMCUT,       // approximate minimum cut of the graph (see MinCutSketchAlg::calc_minimum_cut)
 };
 
 /**
@@ -71,8 +72,6 @@ class CCSketchAlg {
   node_id_t num_vertices;
   size_t seed;
   bool update_locked = false;
-  // a set containing one "representative" from each supernode
-  std::set<node_id_t> *representatives;
   Sketch **sketches;
   // DSU representation of supernode relationship
   DisjointSetUnion_MT<node_id_t> dsu;
 
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <atomic>
+#include <iostream>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "types.h"
+
+class EdgeStore {  // keeps edges as SubgraphTaggedUpdate entries in mutex-guarded adjacency lists
+ private:
+  static constexpr size_t store_edge_bytes = sizeof(SubgraphTaggedUpdate);  // Bytes used to store one edge
+  static constexpr double contract_factor = 2; // switch to sketch when within this factor of max
+
+  size_t seed;
+  node_id_t num_vertices;
+  size_t num_subgraphs;
+  // NOTE(review): the two fields below are volatile, which gives no atomicity or ordering
+  // guarantees; if they are read and written by different threads consider std::atomic<size_t>.
+  volatile size_t cur_subgraph = 0;       // subgraph depth at which edges enter the edge store
+  volatile size_t true_min_subgraph = 0;  // the minimum subgraph of elements in the store
+
+  std::atomic<edge_id_t> num_edges;  // edge counter reported by get_num_edges() and get_footprint()
+  std::atomic<node_id_t> needs_contraction;  // NOTE(review): presumably a count of vertices awaiting contraction - confirm
+
+  std::vector<std::vector<SubgraphTaggedUpdate>> adjlist;  // the adjacency list; presumably one inner vector per vertex
+
+  // This is a vector of booleans BUT we don't want to use vector<bool> because it's not
+  // multithread friendly
+  std::vector<char> vertex_contracted;
+
+  size_t max_edges;                  // Bytes of sketch graph. NOTE(review): name suggests an edge count - confirm units
+  size_t default_buffer_allocation;  // size we allocate each buffer in adjlist to
+
+  // locks that protect the adjacency list
+  // we have a single lock for each vertex and a lock for handling contraction logic
+  std::mutex* adj_mutex;  // NOTE(review): raw pointer; presumably an array owned by this class and freed in ~EdgeStore - confirm
+  std::mutex contract_lock;
+
+  std::vector<SubgraphTaggedUpdate> vertex_contract(node_id_t src);  // NOTE(review): presumably contracts the data of vertex src - confirm
+  void check_if_too_big();
+
+#ifdef VERIFY_SAMPLES_F
+  void verify_contract_complete();
+  std::atomic<size_t> num_inserted;
+  std::atomic<size_t> num_duplicate;
+  std::atomic<size_t> num_returned;
+#endif
+ public:
+  // Constructor. start_subgraph defaults to 0; how sketch_bytes is used is defined in the
+  // implementation file (presumably it bounds the store size - TODO confirm).
+  EdgeStore(size_t seed, node_id_t num_vertices, size_t sketch_bytes, size_t num_subgraphs,
+            size_t start_subgraph = 0);
+  ~EdgeStore();
+
+  // functions for adding data to the edge store
+  // may return a batch of edges that the caller must then apply (presumably to the sketch
+  // subgraphs - TODO confirm)
+
+  // this first function is only called when there exist no sketch subgraphs
+  TaggedUpdateBatch insert_adj_edges(node_id_t src, const std::vector<node_id_t>& dst_vertices);
+
+  // this function is called when there are some sketch subgraphs.
+  // dst_data points to dst_data_size updates.
+  TaggedUpdateBatch insert_adj_edges(node_id_t src, node_id_t caller_first_es_subgraph,
+                                     SubgraphTaggedUpdate* dst_data, size_t dst_data_size);
+
+  // contract vertex data by removing all updates bound for lower subgraphs than the store
+  // is responsible for
+  TaggedUpdateBatch vertex_advance_subgraph(node_id_t cur_first_es_subgraph);
+
+  // Get methods
+  size_t get_num_edges() { return num_edges; }  // current value of the atomic edge counter
+  size_t get_footprint() { return num_edges * store_edge_bytes; }  // edge count times bytes per stored edge
+  size_t get_first_store_subgraph() { return cur_subgraph; }  // returns cur_subgraph
+  std::vector<Edge> get_edges();
+  bool contract_in_progress() { return true_min_subgraph < cur_subgraph; }  // true while true_min_subgraph is below cur_subgraph
+};
@@ -0,0 +1,162 @@
+#pragma once
+#include <iostream>
+#include <vector>
+#include <memory>
+
+#include "cc_sketch_alg.h"
+#include "edge_store.h"
+
+
+// Configuration options for the minimum cut sketch algorithm.
+// Each setter returns *this so that calls can be chained. batch_factor() and epsilon()
+// reject out-of-range (including NaN) arguments: they print a warning to stderr and keep
+// the currently stored value.
+class MCAlgConfiguration {
+ private:
+  // How large to make update batches as factor of sketch size
+  double _batch_factor = 1;
+
+  // Returned min-cut guaranteed to be a +/- epsilon multiplicative approx of the true min cut.
+  double _epsilon = 0.5;
+
+  // Number of subgraphs for which we use a delta sketch
+  // When applying sketch updates to other subgraphs, apply updates directly to sketch
+  size_t _num_subgraphs_use_delta = 2;
+
+  friend class MinCutSketchAlg;
+ public:
+  // setters
+
+  // Set the batch factor. Accepted only when batch_factor > 0.
+  MCAlgConfiguration& batch_factor(double batch_factor) {
+    // Written as a negated positive test so that NaN is rejected as well.
+    if (!(batch_factor > 0)) {
+      std::cerr << "WARNING: Batch factor in MCAlgConfiguration must be > 0." << std::endl;
+      std::cerr << "         Setting to default value: " << _batch_factor << std::endl;
+    } else {
+      _batch_factor = batch_factor;
+    }
+    return *this;
+  }
+
+  // Set the approximation parameter. Accepted only when epsilon is in (0, 1].
+  MCAlgConfiguration& epsilon(double epsilon) {
+    // Written as a negated in-range test so that NaN is rejected as well.
+    if (!(epsilon > 0 && epsilon <= 1)) {
+      std::cerr << "WARNING: MCAlgConfiguration epsilon must be in range (0, 1]." << std::endl;
+      std::cerr << "         Setting to default value: " << _epsilon << std::endl;
+    } else {
+      _epsilon = epsilon;
+    }
+    return *this;
+  }
+
+  // Set the number of subgraphs that use a delta sketch. No range check is applied.
+  MCAlgConfiguration& num_subgraphs_use_delta(size_t num_subgraphs) {
+    _num_subgraphs_use_delta = num_subgraphs;
+    return *this;
+  }
+
+  // getters (const so they can be used through a const reference)
+  double get_batch_factor() const { return _batch_factor; }
+  double get_epsilon() const { return _epsilon; }
+  size_t get_num_subgraphs_use_delta() const { return _num_subgraphs_use_delta; }
+
+  // Write every configuration option to out, one per line, and return out.
+  friend std::ostream& operator<< (std::ostream &out, const MCAlgConfiguration &conf) {
+    out << "Minimum Cut Algorithm Configuration:" << std::endl;
+    out << "  batch_factor = " << conf._batch_factor << std::endl;
+    out << "  epsilon = " << conf._epsilon << std::endl;
+    out << "  num_subgraphs_use_delta = " << conf._num_subgraphs_use_delta << std::endl;
+    return out;
+  }
+};
+
+// Minimum cut sketch algorithm class. Declares the update hooks (pre_insert,
+// apply_update_batch) and the calc_minimum_cut query; non-inline members are defined elsewhere.
+class MinCutSketchAlg {
+ private:
+  const node_id_t num_vertices;  // number of graph vertices, returned by get_num_vertices()
+  const size_t seed;  // seed to hash functions
+  MCAlgConfiguration config;  // algorithm options; printed by print_configuration()
+  const size_t max_subgraphs;  // NOTE(review): presumably the upper bound on cur_subgraphs - confirm
+  size_t cur_subgraphs;
+
+  const double sketch_factor;
+  const size_t sketch_samples;
+
+  CCSketchAlg **cc_sketches;  // array of CCSketchAlg pointers; element 0 serves has_cached_query()
+  EdgeStore edge_store;  // EdgeStore instance, declared in edge_store.h
+
+  Sketch *delta_sketches = nullptr;  // NOTE(review): presumably set up by allocate_worker_memory() - confirm
+  node_id_t **update_buffers = nullptr;  // NOTE(review): presumably set up by allocate_worker_memory() - confirm
+  size_t num_delta_sketches = 0;
+  size_t num_upd_buffers = 0;
+
+#ifdef VERIFY_SAMPLES_F
+  std::unique_ptr<GraphVerifier> verifier;
+#endif
+
+  CCAlgConfiguration cc_config;
+ public:
+  /**
+   * Construct an instance of the Minimum Cut Sketching Algorithm
+   * param _num_vertices  number of graph vertices
+   * param _seed          seed to hash functions
+   * param _config        Configuration options for minimum cut sketch algorithm
+   *                      (defaults to a default-constructed MCAlgConfiguration)
+   */
+  MinCutSketchAlg(node_id_t _num_vertices, size_t _seed,
+                  MCAlgConfiguration _config = MCAlgConfiguration());
+
+  ~MinCutSketchAlg();
+
+  /**
+   * Allocate memory for the worker threads to use when updating this algorithm's sketches
+   * param num_workers    presumably the number of worker threads - TODO confirm
+   */
+  void allocate_worker_memory(size_t num_workers);
+
+  /**
+   * Returns the number of buffered updates we would like to have in the update batches
+   */
+  size_t get_desired_updates_per_batch() {
+    return config._batch_factor; // TODO: Fill in correctly. For now this is only the double batch factor truncated to size_t (0 for factors below 1).
+  }
+
+  /**
+   * Action to take on an update before inserting it to the guttering system.
+   * We use this function to manage the eager dsu.
+   */
+  void pre_insert(GraphUpdate upd, node_id_t thr_id);
+
+
+  /**
+   * Update all the sketches for a vertex, given a batch of updates.
+   * param thr_id         The id of the thread performing the update [0, num_threads)
+   * param src_vertex     The vertex where the edges originate.
+   * param dst_vertices   A vector of destinations.
+   */
+  void apply_update_batch(size_t thr_id, node_id_t src_vertex,
+                          const std::vector<node_id_t> &dst_vertices);
+
+  /**
+   * Set the verifier this algorithm will use to check its correctness
+   * Only available when VERIFY_SAMPLES_F is defined; takes ownership of the verifier.
+   * TODO: What is the right way to use verifier for minimum cut?
+   */
+#ifdef VERIFY_SAMPLES_F
+  void set_verifier(std::unique_ptr<GraphVerifier> verifier) {
+    this->verifier = std::move(verifier);
+  }
+#endif
+
+  /**
+   * Main query routine of this algorithm.
+   * Returns an approximation of the minimum cut of the graph defined by the graph stream
+   * seen thus far. This approximation is guaranteed to be within 1 +/- epsilon of the true
+   * minimum cut.
+   */
+  size_t calc_minimum_cut();
+
+  /**
+   * Return if we have cached an answer to query.
+   * This allows the driver to avoid flushing the gutters before calling query functions.
+   * Any query type other than MINIMUMCUT is forwarded to cc_sketches[0];
+   * a MINIMUMCUT query is never reported as cached.
+   * TODO: Is there something intelligent we can do here for mincut/k-conn
+   */
+  bool has_cached_query(int query_type) {
+    if (query_type != MINIMUMCUT) return cc_sketches[0]->has_cached_query(query_type);
+    return false; 
+  }
+
+  /**
+   * Print the configuration of minimum cut graph sketching algorithm.
+   * Streams config to std::cout followed by std::endl.
+   */
+  void print_configuration() {
+    std::cout << config << std::endl;
+  }
+
+  node_id_t get_num_vertices() { return num_vertices; }  // number of graph vertices given at construction
+};