-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathbench_latency.cpp
More file actions
185 lines (155 loc) · 6.01 KB
/
bench_latency.cpp
File metadata and controls
185 lines (155 loc) · 6.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#include <benchmark/benchmark.h>
#include <thread>
#include <atomic>
#include <vector>
#include <cstdio>
#include <hdr/hdr_histogram.h>
#include "daking/MPSC_queue.hpp"
// #include <moodycamel/concurrentqueue.h>
#if defined(__x86_64__) || defined(__i386__)
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <intrin.h>
#else
#include <sched.h>
#include <pthread.h>
#include <x86intrin.h>
#endif
void pin_thread(int cpu_id) {
#if defined(_WIN32) || defined(_WIN64)
HANDLE thread = GetCurrentThread();
DWORD_PTR mask = (static_cast<DWORD_PTR>(1) << cpu_id);
SetThreadAffinityMask(thread, mask);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpu_id, &cpuset);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
#endif
}
const double CYCLES_PER_NS = 3.992; // My CPU
const bool OUTPUT_DATA_FILE = false; // Get hdr file
// using TestQueue = moodycamel::ConcurrentQueue<int>;
using TestQueue = daking::MPSC_queue<int>;
static void BM_MPSC_PureEnqueueLatency(benchmark::State& state) {
TestQueue q;
hdr_histogram* hist;
hdr_init(1, 1000000, 3, &hist);
const int num_producers = (int)state.range(0);
std::atomic<bool> running{true};
std::atomic<bool> start_signal{false};
std::thread consumer([&]() {
pin_thread(0);
int val;
while (running.load(std::memory_order_relaxed)) {
q.try_dequeue(val);
}
});
std::vector<std::thread> other_producers;
for (int i = 1; i < num_producers; ++i) {
other_producers.emplace_back([&, i]() {
pin_thread(i + 1);
while (!start_signal.load(std::memory_order_acquire));
while (running.load(std::memory_order_relaxed)) {
q.enqueue(42);
}
});
}
pin_thread(1);
start_signal.store(true, std::memory_order_release);
for (auto _ : state) {
for (int i = 0; i < 10000; ++i) {
uint64_t start = __rdtsc();
q.enqueue(42);
uint64_t end = __rdtsc();
hdr_record_value(hist, end - start);
}
}
running = false;
for (auto& t : other_producers) t.join();
consumer.join();
if (OUTPUT_DATA_FILE) {
FILE* fp = fopen("mpsc_pure_enqueue_latency_dist.hgrm", "w");
if (fp) {
hdr_percentiles_print(hist, fp, 5, 1.0, CLASSIC);
fclose(fp);
}
}
state.counters["P99_ns"] = hdr_value_at_percentile(hist, 99.0) / CYCLES_PER_NS;
state.counters["P99.9_ns"] = hdr_value_at_percentile(hist, 99.9) / CYCLES_PER_NS;
hdr_close(hist);
}
static void BM_MPSC_PureDequeueLatency(benchmark::State& state) {
TestQueue q;
hdr_histogram* hist;
hdr_init(1, 1000000, 3, &hist);
pin_thread(0);
for (auto _ : state) {
state.PauseTiming();
for(int i=0; i<10000; ++i) q.enqueue(i);
state.ResumeTiming();
for(int i=0; i<10000; ++i) {
int val;
uint64_t start = __rdtsc();
if (q.try_dequeue(val)) {
uint64_t end = __rdtsc();
hdr_record_value(hist, end - start);
}
}
}
if (OUTPUT_DATA_FILE) {
FILE* fp = fopen("mpsc_pure_dequeue_latency_dist.hgrm", "w");
if (fp) {
hdr_percentiles_print(hist, fp, 5, 1.0, CLASSIC);
fclose(fp);
}
}
state.counters["P99_ns"] = hdr_value_at_percentile(hist, 99.0) / CYCLES_PER_NS;
state.counters["P99.9_ns"] = hdr_value_at_percentile(hist, 99.9) / CYCLES_PER_NS;
hdr_close(hist);
}
BENCHMARK(BM_MPSC_PureEnqueueLatency)->Arg(1)->Arg(2)->Arg(4)->Arg(8)->Arg(16)->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_MPSC_PureDequeueLatency)->Unit(benchmark::kMicrosecond);
BENCHMARK_MAIN();
/*
daking:
Run on (16 X 3992 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 1024 KiB (x8)
L3 Unified 16384 KiB (x1)
----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------------------------
BM_MPSC_PureEnqueueLatency/1 0.028 us 0.027 us 25077890 P99.9_ns=80.1603 P99_ns=40.0802
BM_MPSC_PureEnqueueLatency/2 0.071 us 0.066 us 10681086 P99.9_ns=190.381 P99_ns=100.2
BM_MPSC_PureEnqueueLatency/4 0.085 us 0.079 us 8979347 P99.9_ns=300.601 P99_ns=120.24
BM_MPSC_PureEnqueueLatency/8 0.180 us 0.140 us 5931389 P99.9_ns=400.802 P99_ns=250.501
BM_MPSC_PureEnqueueLatency/16 0.618 us 0.196 us 11219862 P99.9_ns=561.373 P99_ns=340.681
BM_MPSC_PureDequeueLatency 21.9 us 20.3 us 34313 P99.9_ns=20.0401 P99_ns=10.02
moodycamel:
moodycamel ConcurrentQueue is a MPMC queue, so this comparison is unfair.
Run on (16 X 3992 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 1024 KiB (x8)
L3 Unified 16384 KiB (x1)
----------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------------------------
BM_MPSC_PureEnqueueLatency/1 0.043 us 0.043 us 15210240 P99.9_ns=340.681 P99_ns=90.1804
BM_MPSC_PureEnqueueLatency/2 0.028 us 0.028 us 23881411 P99.9_ns=1.81438k P99_ns=30.0601
BM_MPSC_PureEnqueueLatency/4 0.028 us 0.028 us 25712723 P99.9_ns=1.86448k P99_ns=20.0401
BM_MPSC_PureEnqueueLatency/8 0.132 us 0.121 us 10000000 P99.9_ns=7.03783k P99_ns=190.381
BM_MPSC_PureEnqueueLatency/16 0.736 us 0.328 us 2836924 P99.9_ns=11.3985k P99_ns=300.601
BM_MPSC_PureDequeueLatency 25.4 us 23.5 us 28047 P99.9_ns=20.0401 P99_ns=20.0401
*/
#else
#include <iostream>
int main(int argc, char** argv) {
std::cout << "This test is only for x86." << std::endl;
return 0;
}
#endif