Skip to content

Commit 72eca2a

Browse files
authored
gh-145230: Update lockbench (gh-145231)
Remove PyThread_type_lock from the benchmark (it now uses PyMutex internally).

Add new benchmark options:
- work_inside/work_outside: control work inside and outside the critical section to vary contention levels
- num_locks: use multiple independent locks with threads assigned round-robin
- total_iters: fixed iteration count per thread instead of time-based, useful for measuring fairness
- num_acquisitions: lock acquisitions per loop iteration
- random_locks: acquire a random lock each iteration

Also return elapsed time from benchmark_locks() and switch lockbench.py to use argparse.
1 parent 4d89056 commit 72eca2a

File tree

3 files changed

+227
-92
lines changed

3 files changed

+227
-92
lines changed

Modules/_testinternalcapi/clinic/test_lock.c.h

Lines changed: 65 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_testinternalcapi/test_lock.c

Lines changed: 102 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -194,101 +194,143 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
194194
Py_RETURN_NONE;
195195
}
196196

197-
struct bench_data_locks {
198-
int stop;
199-
int use_pymutex;
200-
int critical_section_length;
197+
struct bench_lock {
201198
char padding[200];
202-
PyThread_type_lock lock;
203199
PyMutex m;
204200
double value;
205-
Py_ssize_t total_iters;
201+
};
202+
203+
struct bench_config {
204+
int stop;
205+
int work_inside;
206+
int work_outside;
207+
int num_acquisitions;
208+
int random_locks;
209+
Py_ssize_t target_iters;
210+
Py_ssize_t num_locks;
211+
struct bench_lock *locks;
206212
};
207213

208214
struct bench_thread_data {
209-
struct bench_data_locks *bench_data;
215+
struct bench_config *config;
216+
struct bench_lock *lock;
217+
uint64_t rng_state;
210218
Py_ssize_t iters;
211219
PyEvent done;
212220
};
213221

222+
static uint64_t
223+
splitmix64(uint64_t *state)
224+
{
225+
uint64_t z = (*state += 0x9e3779b97f4a7c15);
226+
z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
227+
z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
228+
return z ^ (z >> 31);
229+
}
230+
214231
static void
215232
thread_benchmark_locks(void *arg)
216233
{
217-
struct bench_thread_data *thread_data = arg;
218-
struct bench_data_locks *bench_data = thread_data->bench_data;
219-
int use_pymutex = bench_data->use_pymutex;
220-
int critical_section_length = bench_data->critical_section_length;
221-
234+
struct bench_thread_data *td = arg;
235+
struct bench_config *config = td->config;
236+
int work_inside = config->work_inside;
237+
int work_outside = config->work_outside;
238+
int num_acquisitions = config->num_acquisitions;
239+
Py_ssize_t target_iters = config->target_iters;
240+
uint64_t rng_state = td->rng_state;
241+
242+
double local_value = 0.0;
222243
double my_value = 1.0;
223244
Py_ssize_t iters = 0;
224-
while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) {
225-
if (use_pymutex) {
226-
PyMutex_Lock(&bench_data->m);
227-
for (int i = 0; i < critical_section_length; i++) {
228-
bench_data->value += my_value;
229-
my_value = bench_data->value;
245+
for (;;) {
246+
if (target_iters > 0) {
247+
if (iters >= target_iters) {
248+
break;
230249
}
231-
PyMutex_Unlock(&bench_data->m);
232250
}
233-
else {
234-
PyThread_acquire_lock(bench_data->lock, 1);
235-
for (int i = 0; i < critical_section_length; i++) {
236-
bench_data->value += my_value;
237-
my_value = bench_data->value;
251+
else if (_Py_atomic_load_int_relaxed(&config->stop)) {
252+
break;
253+
}
254+
struct bench_lock *lock = td->lock;
255+
if (config->random_locks) {
256+
uint32_t r = (uint32_t)splitmix64(&rng_state);
257+
// Fast modulo reduction to pick a random lock, adapted from:
258+
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
259+
Py_ssize_t idx = ((uint64_t)r * (uint32_t)config->num_locks) >> 32;
260+
lock = &config->locks[idx];
261+
}
262+
for (int acq = 0; acq < num_acquisitions; acq++) {
263+
PyMutex_Lock(&lock->m);
264+
for (int i = 0; i < work_inside; i++) {
265+
lock->value += my_value;
266+
my_value = lock->value;
238267
}
239-
PyThread_release_lock(bench_data->lock);
268+
PyMutex_Unlock(&lock->m);
240269
}
241-
iters++;
270+
for (int i = 0; i < work_outside; i++) {
271+
local_value += my_value;
272+
my_value = local_value;
273+
}
274+
iters += num_acquisitions;
242275
}
243276

244-
thread_data->iters = iters;
245-
_Py_atomic_add_ssize(&bench_data->total_iters, iters);
246-
_PyEvent_Notify(&thread_data->done);
277+
td->iters = iters;
278+
_PyEvent_Notify(&td->done);
247279
}
248280

249281
/*[clinic input]
250282
_testinternalcapi.benchmark_locks
251283
252284
num_threads: Py_ssize_t
253-
use_pymutex: bool = True
254-
critical_section_length: int = 1
285+
work_inside: int = 1
286+
work_outside: int = 0
255287
time_ms: int = 1000
288+
num_acquisitions: int = 1
289+
total_iters: Py_ssize_t = 0
290+
num_locks: Py_ssize_t = 1
291+
random_locks: bool = False
256292
/
257293
258294
[clinic start generated code]*/
259295

260296
static PyObject *
261297
_testinternalcapi_benchmark_locks_impl(PyObject *module,
262298
Py_ssize_t num_threads,
263-
int use_pymutex,
264-
int critical_section_length,
265-
int time_ms)
266-
/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
299+
int work_inside, int work_outside,
300+
int time_ms, int num_acquisitions,
301+
Py_ssize_t total_iters,
302+
Py_ssize_t num_locks,
303+
int random_locks)
304+
/*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
267305
{
268306
// Run from Tools/lockbench/lockbench.py
269307
// Based on the WebKit lock benchmarks:
270308
// https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
271309
// See also https://webkit.org/blog/6161/locking-in-webkit/
272310
PyObject *thread_iters = NULL;
273311
PyObject *res = NULL;
312+
struct bench_thread_data *thread_data = NULL;
274313

275-
struct bench_data_locks bench_data;
276-
memset(&bench_data, 0, sizeof(bench_data));
277-
bench_data.use_pymutex = use_pymutex;
278-
bench_data.critical_section_length = critical_section_length;
279-
280-
bench_data.lock = PyThread_allocate_lock();
281-
if (bench_data.lock == NULL) {
282-
return PyErr_NoMemory();
314+
struct bench_config config = {
315+
.work_inside = work_inside,
316+
.work_outside = work_outside,
317+
.num_acquisitions = num_acquisitions,
318+
.target_iters = total_iters,
319+
.num_locks = num_locks,
320+
.random_locks = random_locks,
321+
};
322+
323+
config.locks = PyMem_Calloc(num_locks, sizeof(*config.locks));
324+
if (config.locks == NULL) {
325+
PyErr_NoMemory();
326+
goto exit;
283327
}
284328

285-
struct bench_thread_data *thread_data = NULL;
286329
thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data));
287330
if (thread_data == NULL) {
288331
PyErr_NoMemory();
289332
goto exit;
290333
}
291-
292334
thread_iters = PyList_New(num_threads);
293335
if (thread_iters == NULL) {
294336
goto exit;
@@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
300342
}
301343

302344
for (Py_ssize_t i = 0; i < num_threads; i++) {
303-
thread_data[i].bench_data = &bench_data;
345+
thread_data[i].config = &config;
346+
thread_data[i].lock = &config.locks[i % num_locks];
347+
thread_data[i].rng_state = (uint64_t)i + 1;
304348
PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]);
305349
}
306350

307-
// Let the threads run for `time_ms` milliseconds
308-
pysleep(time_ms);
309-
_Py_atomic_store_int(&bench_data.stop, 1);
351+
if (total_iters == 0) {
352+
pysleep(time_ms);
353+
_Py_atomic_store_int(&config.stop, 1);
354+
}
310355

311-
// Wait for the threads to finish
312356
for (Py_ssize_t i = 0; i < num_threads; i++) {
313357
PyEvent_Wait(&thread_data[i].done);
314358
}
315359

316-
Py_ssize_t total_iters = bench_data.total_iters;
317360
if (PyTime_PerfCounter(&end) < 0) {
318361
goto exit;
319362
}
320363

321-
// Return the total number of acquisitions and the number of acquisitions
322-
// for each thread.
364+
Py_ssize_t sum_iters = 0;
323365
for (Py_ssize_t i = 0; i < num_threads; i++) {
324366
PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters);
325367
if (iter == NULL) {
326368
goto exit;
327369
}
328370
PyList_SET_ITEM(thread_iters, i, iter);
371+
sum_iters += thread_data[i].iters;
329372
}
330373

331374
assert(end != start);
332-
double rate = total_iters * 1e9 / (end - start);
333-
res = Py_BuildValue("(dO)", rate, thread_iters);
375+
PyTime_t elapsed_ns = end - start;
376+
double rate = sum_iters * 1e9 / elapsed_ns;
377+
res = Py_BuildValue("(dOL)", rate, thread_iters,
378+
(long long)elapsed_ns);
334379

335380
exit:
336-
PyThread_free_lock(bench_data.lock);
381+
PyMem_Free(config.locks);
337382
PyMem_Free(thread_data);
338383
Py_XDECREF(thread_iters);
339384
return res;
@@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
344389
{
345390
// Just make sure the benchmark runs without crashing
346391
PyObject *res = _testinternalcapi_benchmark_locks_impl(
347-
module, 1, 1, 1, 100);
392+
module, 1, 1, 0, 100, 1, 0, 1, 0);
348393
if (res == NULL) {
349394
return NULL;
350395
}

0 commit comments

Comments
 (0)