@@ -194,101 +194,143 @@ test_lock_counter_slow(PyObject *self, PyObject *obj)
194194 Py_RETURN_NONE ;
195195}
196196
197- struct bench_data_locks {
198- int stop ;
199- int use_pymutex ;
200- int critical_section_length ;
197+ struct bench_lock {
201198 char padding [200 ];
202- PyThread_type_lock lock ;
203199 PyMutex m ;
204200 double value ;
205- Py_ssize_t total_iters ;
201+ };
202+
203+ struct bench_config {
204+ int stop ;
205+ int work_inside ;
206+ int work_outside ;
207+ int num_acquisitions ;
208+ int random_locks ;
209+ Py_ssize_t target_iters ;
210+ Py_ssize_t num_locks ;
211+ struct bench_lock * locks ;
206212};
207213
208214struct bench_thread_data {
209- struct bench_data_locks * bench_data ;
215+ struct bench_config * config ;
216+ struct bench_lock * lock ;
217+ uint64_t rng_state ;
210218 Py_ssize_t iters ;
211219 PyEvent done ;
212220};
213221
/*
 * SplitMix64 step: advance *state by the 64-bit golden-ratio increment and
 * return the mixed result. All arithmetic is unsigned and wraps modulo 2^64.
 */
static uint64_t
splitmix64(uint64_t *state)
{
    uint64_t z = (*state += UINT64_C(0x9e3779b97f4a7c15));
    z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
    z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb);
    return z ^ (z >> 31);
}
230+
214231static void
215232thread_benchmark_locks (void * arg )
216233{
217- struct bench_thread_data * thread_data = arg ;
218- struct bench_data_locks * bench_data = thread_data -> bench_data ;
219- int use_pymutex = bench_data -> use_pymutex ;
220- int critical_section_length = bench_data -> critical_section_length ;
221-
234+ struct bench_thread_data * td = arg ;
235+ struct bench_config * config = td -> config ;
236+ int work_inside = config -> work_inside ;
237+ int work_outside = config -> work_outside ;
238+ int num_acquisitions = config -> num_acquisitions ;
239+ Py_ssize_t target_iters = config -> target_iters ;
240+ uint64_t rng_state = td -> rng_state ;
241+
242+ double local_value = 0.0 ;
222243 double my_value = 1.0 ;
223244 Py_ssize_t iters = 0 ;
224- while (!_Py_atomic_load_int_relaxed (& bench_data -> stop )) {
225- if (use_pymutex ) {
226- PyMutex_Lock (& bench_data -> m );
227- for (int i = 0 ; i < critical_section_length ; i ++ ) {
228- bench_data -> value += my_value ;
229- my_value = bench_data -> value ;
245+ for (;;) {
246+ if (target_iters > 0 ) {
247+ if (iters >= target_iters ) {
248+ break ;
230249 }
231- PyMutex_Unlock (& bench_data -> m );
232250 }
233- else {
234- PyThread_acquire_lock (bench_data -> lock , 1 );
235- for (int i = 0 ; i < critical_section_length ; i ++ ) {
236- bench_data -> value += my_value ;
237- my_value = bench_data -> value ;
251+ else if (_Py_atomic_load_int_relaxed (& config -> stop )) {
252+ break ;
253+ }
254+ struct bench_lock * lock = td -> lock ;
255+ if (config -> random_locks ) {
256+ uint32_t r = (uint32_t )splitmix64 (& rng_state );
257+ // Fast modulo reduction to pick a random lock, adapted from:
258+ // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
259+ Py_ssize_t idx = ((uint64_t )r * (uint32_t )config -> num_locks ) >> 32 ;
260+ lock = & config -> locks [idx ];
261+ }
262+ for (int acq = 0 ; acq < num_acquisitions ; acq ++ ) {
263+ PyMutex_Lock (& lock -> m );
264+ for (int i = 0 ; i < work_inside ; i ++ ) {
265+ lock -> value += my_value ;
266+ my_value = lock -> value ;
238267 }
239- PyThread_release_lock ( bench_data -> lock );
268+ PyMutex_Unlock ( & lock -> m );
240269 }
241- iters ++ ;
270+ for (int i = 0 ; i < work_outside ; i ++ ) {
271+ local_value += my_value ;
272+ my_value = local_value ;
273+ }
274+ iters += num_acquisitions ;
242275 }
243276
244- thread_data -> iters = iters ;
245- _Py_atomic_add_ssize (& bench_data -> total_iters , iters );
246- _PyEvent_Notify (& thread_data -> done );
277+ td -> iters = iters ;
278+ _PyEvent_Notify (& td -> done );
247279}
248280
249281/*[clinic input]
250282_testinternalcapi.benchmark_locks
251283
252284 num_threads: Py_ssize_t
253- use_pymutex: bool = True
254- critical_section_length : int = 1
285+ work_inside: int = 1
286+ work_outside : int = 0
255287 time_ms: int = 1000
288+ num_acquisitions: int = 1
289+ total_iters: Py_ssize_t = 0
290+ num_locks: Py_ssize_t = 1
291+ random_locks: bool = False
256292 /
257293
258294[clinic start generated code]*/
259295
260296static PyObject *
261297_testinternalcapi_benchmark_locks_impl (PyObject * module ,
262298 Py_ssize_t num_threads ,
263- int use_pymutex ,
264- int critical_section_length ,
265- int time_ms )
266- /*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/
299+ int work_inside , int work_outside ,
300+ int time_ms , int num_acquisitions ,
301+ Py_ssize_t total_iters ,
302+ Py_ssize_t num_locks ,
303+ int random_locks )
304+ /*[clinic end generated code: output=6258dc9de8cb9af1 input=d622cf4e1c4d008b]*/
267305{
    // Allocates `num_locks` zero-initialised bench_lock entries and
    // `num_threads` per-thread records, starts one thread_benchmark_locks
    // worker per record, waits for every worker's `done` event, and returns
    // the tuple built by Py_BuildValue("(dOL)"): rate, the list of per-thread
    // iteration counts, and the elapsed PerfCounter delta.
    // On any failure it jumps to `exit`, which frees both allocations and
    // returns `res` (still NULL at that point).
268306 // Run from Tools/lockbench/lockbench.py
269307 // Based on the WebKit lock benchmarks:
270308 // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp
271309 // See also https://webkit.org/blog/6161/locking-in-webkit/
272310 PyObject * thread_iters = NULL ;
273311 PyObject * res = NULL ;
312+ struct bench_thread_data * thread_data = NULL ;
274313
275- struct bench_data_locks bench_data ;
276- memset (& bench_data , 0 , sizeof (bench_data ));
277- bench_data .use_pymutex = use_pymutex ;
278- bench_data .critical_section_length = critical_section_length ;
279-
280- bench_data .lock = PyThread_allocate_lock ();
281- if (bench_data .lock == NULL ) {
282- return PyErr_NoMemory ();
    // Fields not named in the initializer (`stop`, `locks`) start as zero.
314+ struct bench_config config = {
315+ .work_inside = work_inside ,
316+ .work_outside = work_outside ,
317+ .num_acquisitions = num_acquisitions ,
318+ .target_iters = total_iters ,
319+ .num_locks = num_locks ,
320+ .random_locks = random_locks ,
321+ };
322+
    // NOTE(review): num_locks is not range-checked before use. With
    // num_locks == 0 the `i % num_locks` in the thread-start loop below is a
    // division by zero (undefined behaviour in C); rejecting num_locks < 1
    // with ValueError before this allocation would avoid it -- confirm that
    // no caller relies on passing 0.
323+ config .locks = PyMem_Calloc (num_locks , sizeof (* config .locks ));
324+ if (config .locks == NULL ) {
325+ PyErr_NoMemory ();
326+ goto exit ;
283327 }
284328
285- struct bench_thread_data * thread_data = NULL ;
286329 thread_data = PyMem_Calloc (num_threads , sizeof (* thread_data ));
287330 if (thread_data == NULL ) {
288331 PyErr_NoMemory ();
289332 goto exit ;
290333 }
291-
292334 thread_iters = PyList_New (num_threads );
293335 if (thread_iters == NULL ) {
294336 goto exit ;
@@ -300,40 +342,43 @@ _testinternalcapi_benchmark_locks_impl(PyObject *module,
300342 }
301343
    // Threads are spread over the locks round-robin; each gets a distinct
    // non-zero RNG seed (i + 1).
    // NOTE(review): the result of PyThread_start_new_thread is ignored; if a
    // thread fails to start, the wait loop below would presumably block on
    // its `done` event indefinitely -- confirm.
302344 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
303- thread_data [i ].bench_data = & bench_data ;
345+ thread_data [i ].config = & config ;
346+ thread_data [i ].lock = & config .locks [i % num_locks ];
347+ thread_data [i ].rng_state = (uint64_t )i + 1 ;
304348 PyThread_start_new_thread (thread_benchmark_locks , & thread_data [i ]);
305349 }
306350
307- // Let the threads run for `time_ms` milliseconds
308- pysleep (time_ms );
309- _Py_atomic_store_int (& bench_data .stop , 1 );
    // Timed mode: sleep for `time_ms`, then raise the stop flag. When
    // total_iters is non-zero no stop signal is sent from here.
    // NOTE(review): a negative total_iters skips this branch, yet the workers
    // only treat `target_iters > 0` as an iteration target and otherwise poll
    // `stop`, so the wait below would not return -- consider testing
    // `total_iters <= 0` here or rejecting negative values.
351+ if (total_iters == 0 ) {
352+ pysleep (time_ms );
353+ _Py_atomic_store_int (& config .stop , 1 );
354+ }
310355
311- // Wait for the threads to finish
312356 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
313357 PyEvent_Wait (& thread_data [i ].done );
314358 }
315359
316- Py_ssize_t total_iters = bench_data .total_iters ;
317360 if (PyTime_PerfCounter (& end ) < 0 ) {
318361 goto exit ;
319362 }
320363
321- // Return the total number of acquisitions and the number of acquisitions
322- // for each thread.
    // Fill the result list with each thread's count and total them.
364+ Py_ssize_t sum_iters = 0 ;
323365 for (Py_ssize_t i = 0 ; i < num_threads ; i ++ ) {
324366 PyObject * iter = PyLong_FromSsize_t (thread_data [i ].iters );
325367 if (iter == NULL ) {
326368 goto exit ;
327369 }
328370 PyList_SET_ITEM (thread_iters , i , iter );
371+ sum_iters += thread_data [i ].iters ;
329372 }
330373
    // rate = summed iterations scaled by 1e9 over the start/end delta
    // (the variable name suggests nanoseconds, giving iterations per second).
331374 assert (end != start );
332- double rate = total_iters * 1e9 / (end - start );
333- res = Py_BuildValue ("(dO)" , rate , thread_iters );
375+ PyTime_t elapsed_ns = end - start ;
376+ double rate = sum_iters * 1e9 / elapsed_ns ;
377+ res = Py_BuildValue ("(dOL)" , rate , thread_iters ,
378+ (long long )elapsed_ns );
334379
335380exit :
336- PyThread_free_lock ( bench_data . lock );
381+ PyMem_Free ( config . locks );
337382 PyMem_Free (thread_data );
338383 Py_XDECREF (thread_iters );
339384 return res ;
@@ -344,7 +389,7 @@ test_lock_benchmark(PyObject *module, PyObject *obj)
344389{
345390 // Just make sure the benchmark runs without crashing
346391 PyObject * res = _testinternalcapi_benchmark_locks_impl (
347- module , 1 , 1 , 1 , 100 );
392+ module , 1 , 1 , 0 , 100 , 1 , 0 , 1 , 0 );
348393 if (res == NULL ) {
349394 return NULL ;
350395 }
0 commit comments