Skip to content

Commit 3a1b24c

Browse files
committed
removed folly dependency
1 parent 993c527 commit 3a1b24c

File tree

3 files changed

+260
-12
lines changed

3 files changed

+260
-12
lines changed

CMakeLists.txt

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,11 @@ FetchContent_Declare(
7171
# FetchContent_MakeAvailable(fast_float)
7272

7373

74-
FetchContent_Declare(
75-
folly
76-
GIT_REPOSITORY https://github.com/facebook/folly.git
77-
GIT_TAG v2025.07.14.00
78-
)
74+
# FetchContent_Declare(
75+
# folly
76+
# GIT_REPOSITORY https://github.com/facebook/folly.git
77+
# GIT_TAG v2025.07.14.00
78+
# )
7979

8080
# Get GutterTree Project
8181
FetchContent_Declare(
@@ -103,7 +103,7 @@ add_compile_definitions(GLOG_USE_GLOG_EXPORT)
103103

104104
# Get google highway and folly
105105
FetchContent_MakeAvailable(hwy)
106-
FetchContent_MakeAvailable(folly)
106+
# FetchContent_MakeAvailable(folly)
107107

108108
# Ensure highway target is explicitly added
109109
#add_library(highway INTERFACE IMPORTED)
@@ -146,8 +146,8 @@ add_library(GraphZeppelin
146146
src/sketch_columns.cpp
147147
src/recovery.cpp
148148
src/util.cpp)
149-
add_dependencies(GraphZeppelin GutterTree StreamingUtilities hwy folly)
150-
target_link_libraries(GraphZeppelin PUBLIC xxhash GutterTree StreamingUtilities hwy folly)
149+
add_dependencies(GraphZeppelin GutterTree StreamingUtilities hwy)
150+
target_link_libraries(GraphZeppelin PUBLIC xxhash GutterTree StreamingUtilities hwy)
151151
target_include_directories(GraphZeppelin PUBLIC include/)
152152
target_compile_options(GraphZeppelin PUBLIC -fopenmp)
153153
target_link_options(GraphZeppelin PUBLIC -fopenmp)
@@ -164,7 +164,7 @@ add_library(GraphZeppelinVerifyCC
164164
src/util.cpp
165165
test/util/graph_verifier.cpp)
166166
add_dependencies(GraphZeppelinVerifyCC GutterTree StreamingUtilities hwy)
167-
target_link_libraries(GraphZeppelinVerifyCC PUBLIC xxhash GutterTree StreamingUtilities hwy folly)
167+
target_link_libraries(GraphZeppelinVerifyCC PUBLIC xxhash GutterTree StreamingUtilities hwy)
168168
target_include_directories(GraphZeppelinVerifyCC PUBLIC include/ include/test/)
169169
target_compile_options(GraphZeppelinVerifyCC PUBLIC -fopenmp)
170170
target_link_options(GraphZeppelinVerifyCC PUBLIC -fopenmp)
@@ -203,5 +203,5 @@ if (BUILD_BENCH)
203203
add_executable(bench_cc
204204
tools/benchmark/graphcc_bench.cpp)
205205
add_dependencies(bench_cc GraphZeppelin benchmark)
206-
target_link_libraries(bench_cc GraphZeppelin benchmark::benchmark xxhash folly)
206+
target_link_libraries(bench_cc GraphZeppelin benchmark::benchmark xxhash)
207207
endif()

include/RWSpinLock.h

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
/*
18+
* N.B. You most likely do _not_ want to use RWSpinLock or any other
19+
* kind of spinlock. Use SharedMutex instead.
20+
*
21+
* In short, spinlocks in preemptive multi-tasking operating systems
22+
* have serious problems and fast mutexes like SharedMutex are almost
23+
* certainly the better choice, because letting the OS scheduler put a
24+
* thread to sleep is better for system responsiveness and throughput
25+
* than wasting a timeslice repeatedly querying a lock held by a
26+
* thread that's blocked, and you can't prevent userspace
27+
* programs blocking.
28+
*
29+
* Spinlocks in an operating system kernel make much more sense than
30+
* they do in userspace.
31+
*
32+
* -------------------------------------------------------------------
33+
*
34+
* Two Read-Write spin lock implementations.
35+
*
36+
* Ref: http://locklessinc.com/articles/locks
37+
*
38+
* Both locks here are faster than pthread_rwlock and have very low
39+
* overhead (usually 20-30ns). They don't use any system mutexes and
40+
* are very compact (4/8 bytes), so are suitable for per-instance
41+
* based locking, particularly when contention is not expected.
42+
*
43+
* For a spinlock, RWSpinLock is a reasonable choice. (See the note
44+
* about for why a spin lock is frequently a bad idea generally.)
45+
* RWSpinLock has minimal overhead, and comparable contention
46+
* performance when the number of competing threads is less than or
47+
* equal to the number of logical CPUs. Even as the number of
48+
* threads gets larger, RWSpinLock can still be very competitive in
49+
* READ, although it is slower on WRITE, and also inherently unfair
50+
* to writers.
51+
*
52+
* RWSpinLock handles 2^30 - 1 concurrent readers.
53+
*/
54+
55+
#pragma once
56+
57+
/*
58+
========================================================================
59+
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
60+
========================================================================
61+
62+
------------------------------------------------------------------------------
63+
1. Single thread benchmark (read/write lock + unlock overhead)
64+
Benchmark Iters Total t t/iter iter/sec
65+
-------------------------------------------------------------------------------
66+
* BM_RWSpinLockRead 100000 1.786 ms 17.86 ns 53.4M
67+
+30.5% BM_RWSpinLockWrite 100000 2.331 ms 23.31 ns 40.91M
68+
+ 175% BM_PThreadRWMutexRead 100000 4.917 ms 49.17 ns 19.4M
69+
+ 166% BM_PThreadRWMutexWrite 100000 4.757 ms 47.57 ns 20.05M
70+
71+
------------------------------------------------------------------------------
72+
2. Contention Benchmark 90% read 10% write
73+
Benchmark hits average min max sigma
74+
------------------------------------------------------------------------------
75+
---------- 8 threads ------------
76+
RWSpinLock Write 142666 220ns 78ns 40.8us 269ns
77+
RWSpinLock Read 1282297 222ns 80ns 37.7us 248ns
78+
pthread_rwlock_t Write 84248 2.48us 99ns 269us 8.19us
79+
pthread_rwlock_t Read 761646 933ns 101ns 374us 3.25us
80+
81+
---------- 16 threads ------------
82+
RWSpinLock Write 124236 237ns 78ns 261us 801ns
83+
RWSpinLock Read 1115807 236ns 78ns 2.27ms 2.17us
84+
pthread_rwlock_t Write 83363 7.12us 99ns 785us 28.1us
85+
pthread_rwlock_t Read 754978 2.18us 101ns 1.02ms 14.3us
86+
87+
---------- 50 threads ------------
88+
RWSpinLock Write 131142 1.37us 82ns 7.53ms 68.2us
89+
RWSpinLock Read 1181240 262ns 78ns 6.62ms 12.7us
90+
pthread_rwlock_t Write 80849 112us 103ns 4.52ms 263us
91+
pthread_rwlock_t Read 728698 24us 101ns 7.28ms 194us
92+
93+
*/
94+
95+
96+
#include <algorithm>
#include <atomic>
#include <cstdint>  // int32_t, uint_fast32_t, int64_t — previously pulled in only transitively
#include <thread>

namespace from_folly {

/*
 * A simple, small (4-bytes), but unfair rwlock.  Use it when you want
 * a nice writer and don't expect a lot of write/read contention, or
 * when you need small rwlocks since you are creating a large number
 * of them.
 *
 * Note that the unfairness here is extreme: if the lock is
 * continually accessed for read, writers will never get a chance.  If
 * the lock can be that highly contended this class is probably not an
 * ideal choice anyway.
 *
 * It currently implements most of the Lockable, SharedLockable and
 * UpgradeLockable concepts except the TimedLockable related locking/unlocking
 * interfaces.
 */
class RWSpinLock {
  // Bit layout of bits_:
  //   bit 0 (WRITER)   — exclusive writer holds the lock
  //   bit 1 (UPGRADED) — an upgrader has staked a claim (blocks new readers)
  //   bits 2..31       — reader count, in units of READER (4)
  enum : int32_t { READER = 4, UPGRADED = 2, WRITER = 1 };

 public:
  constexpr RWSpinLock() : bits_(0) {}

  // Non-copyable: the lock word must have a single identity.
  RWSpinLock(RWSpinLock const&) = delete;
  RWSpinLock& operator=(RWSpinLock const&) = delete;

  // Lockable Concept
  // Spin until exclusive (writer) access is acquired.  After ~1000
  // failed attempts, yield the timeslice on each retry so a blocked
  // holder can be scheduled and release the lock.
  void lock() {
    uint_fast32_t count = 0;
    while (!try_lock()) [[unlikely]] {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
  void unlock() {
    static_assert(READER > WRITER + UPGRADED, "wrong bits!");
    bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
  }

  // SharedLockable Concept
  // Spin until shared (reader) access is acquired; same backoff policy
  // as lock().
  void lock_shared() {
    uint_fast32_t count = 0;
    while (!try_lock_shared()) [[unlikely]] {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_shared() { bits_.fetch_add(-READER, std::memory_order_release); }

  // Downgrade the lock from writer status to reader status.
  void unlock_and_lock_shared() {
    bits_.fetch_add(READER, std::memory_order_acquire);
    unlock();
  }

  // UpgradeLockable Concept
  // Acquire the UPGRADED bit, spinning (with yield backoff) until no
  // other upgrader or writer holds the lock.
  void lock_upgrade() {
    uint_fast32_t count = 0;
    while (!try_lock_upgrade()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  void unlock_upgrade() {
    bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel);
  }

  // unlock upgrade and try to acquire write lock
  void unlock_upgrade_and_lock() {
    int64_t count = 0;
    while (!try_unlock_upgrade_and_lock()) {
      if (++count > 1000) {
        std::this_thread::yield();
      }
    }
  }

  // unlock upgrade and read lock atomically
  void unlock_upgrade_and_lock_shared() {
    bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel);
  }

  // write unlock and upgrade lock atomically
  void unlock_and_lock_upgrade() {
    // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
    // the same time when other threads are trying do try_lock_upgrade().
    bits_.fetch_or(UPGRADED, std::memory_order_acquire);
    bits_.fetch_add(-WRITER, std::memory_order_release);
  }

  // Attempt to acquire writer permission. Return false if we didn't get it.
  // Succeeds only when the lock word is completely clear (no readers,
  // no upgrader, no writer).
  bool try_lock() {
    int32_t expect = 0;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // Try to get reader permission on the lock. This can fail if we
  // find out someone is a writer or upgrader.
  // Setting the UPGRADED bit would allow a writer-to-be to indicate
  // its intention to write and block any new readers while waiting
  // for existing readers to finish and release their read locks. This
  // helps avoid starving writers (promoted from upgraders).
  bool try_lock_shared() {
    // fetch_add is considerably (100%) faster than compare_exchange,
    // so here we are optimizing for the common (lock success) case.
    int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
    if (value & (WRITER | UPGRADED)) [[unlikely]] {
      // A writer/upgrader beat us: roll back our optimistic increment.
      bits_.fetch_add(-READER, std::memory_order_release);
      return false;
    }
    return true;
  }

  // try to unlock upgrade and write lock atomically
  bool try_unlock_upgrade_and_lock() {
    int32_t expect = UPGRADED;
    return bits_.compare_exchange_strong(
        expect, WRITER, std::memory_order_acq_rel);
  }

  // try to acquire an upgradable lock.
  bool try_lock_upgrade() {
    int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

    // Note: when failed, we cannot flip the UPGRADED bit back,
    // as in this case there is either another upgrade lock or a write lock.
    // If it's a write lock, the bit will get cleared up when that lock's done
    // with unlock().
    return ((value & (UPGRADED | WRITER)) == 0);
  }

  // mainly for debugging purposes.
  int32_t bits() const { return bits_.load(std::memory_order_acquire); }

 private:
  std::atomic<int32_t> bits_;
};

}  // namespace from_folly

include/sketch/sketch_columns.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111

1212
#include <gtest/gtest.h>
1313

14-
#include <folly/synchronization/RWSpinLock.h>
14+
// #include /* <folly/synchronization/RWSpinLock.h> */
15+
#include "RWSpinLock.h"
1516

1617
/*
1718
* FOR NOW - simplest possible design
@@ -102,7 +103,7 @@ FRIEND_TEST(SketchColumnTestSuite, TestUpdateReallocation);
102103
Bucket *buckets;
103104
Bucket deterministic_bucket = {0, 0};
104105
uint64_t seed;
105-
folly::RWSpinLock lock;
106+
from_folly::RWSpinLock lock;
106107
uint8_t capacity;
107108
public:
108109
void set_seed(uint64_t new_seed) { seed = new_seed; };

0 commit comments

Comments
 (0)