From 18ba1f91e5913d28a4905c441d3c3fda33692b8f Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 6 May 2026 14:04:16 +0100 Subject: [PATCH 1/6] Reduce ctest wall-time without losing coverage Add a `--smoke` mode for perf tests and trim redundant outer repetitions in func/memory. Perf tests: * New `src/test/perf_setup.h` exposes `perf_iterations(opt, name, cli_default, smoke_value)`. Returns the smoke value when the binary was invoked with `--smoke` (which CMake now passes for every perf test under ctest), and the full count otherwise so direct CLI invocation is unchanged. * Concurrency-stress tests (`perf-contention`, `perf-msgpass`, `perf-large_producer_consumer`, `perf-lotsofthreads`) are on an exclusion list inside the helper: they receive `--smoke` from ctest but the helper still returns the full iteration count, so their scheduler-coverage capability is preserved. * CMake injects the canonical test name as `SNMALLOC_TEST_NAME` on both the per-flavour executable and the shared OBJLIB, so the helper can identify itself without a third source of truth. * Wired into: singlethread, contention, msgpass, external_pointer, large_alloc, memcpy. func/memory: * TEST(...) outer-repeat 50 -> 3. The inner tests already do size-class sweeps and per-offset loops; three re-entries still catches leak-across-reentry bugs without 50x the work. * test_external_pointer_large is moved out of the TEST(...) macro and run once: each invocation walks ~512 MB of interior pointers, which is its own internal stress. * test_static_sized_allocs default max_size 2^23 -> 2^20. Coverage delta on src/snmalloc/ vs origin/main: zero functions lost, zero lines lost, one branch fewer missed in backend_helpers/subrange.h (improvement) and one extra missed in ds/combininglock.h (timing-noise on the spinlock contention path). --- CMakeLists.txt | 29 +++++++- src/test/func/memory/memory.cc | 18 ++++- src/test/perf/contention/contention.cc | 14 ++++ .../perf/external_pointer/externalpointer.cc | 50 ++++++++----- src/test/perf/large_alloc/large_alloc.cc | 34 +++++---- src/test/perf/memcpy/memcpy.cc | 22 +++++- src/test/perf/msgpass/msgpass.cc | 14 ++++ src/test/perf/singlethread/singlethread.cc | 29 +++++--- src/test/perf_setup.h | 70 +++++++++++++++++++ 9 files changed, 234 insertions(+), 46 deletions(-) create mode 100644 src/test/perf_setup.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dbf75efc..f763085e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -524,6 +524,15 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) add_library(${OBJLIBNAME} OBJECT ${SRC}) target_link_libraries(${OBJLIBNAME} snmalloc) target_compile_definitions(${OBJLIBNAME} PRIVATE "SNMALLOC_USE_${TEST_CLEANUP}") + # Inject the canonical ctest test name (category-test, no flavour + # suffix) so `perf_iterations()` can look it up against the + # concurrency-stress exclusion list. The name is flavour- + # independent, so defining it on the shared object library is + # correct (and required, because the per-flavour TESTNAME + # definition below is invisible to source files that are + # compiled into the OBJLIB rather than the executable). + target_compile_definitions(${OBJLIBNAME} PRIVATE + SNMALLOC_TEST_NAME="${TEST_CATEGORY}-${TEST}") add_warning_flags(${OBJLIBNAME}) if(SNMALLOC_SANITIZER) target_compile_options(${OBJLIBNAME} PRIVATE -g -fsanitize=${SNMALLOC_SANITIZER} -fno-omit-frame-pointer) @@ -545,6 +554,12 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # custom target can pass them all to llvm-cov without globbing. set_property(GLOBAL APPEND PROPERTY SNMALLOC_TEST_BINARIES ${TESTNAME}) + # Inject the canonical ctest test name (category-test, no flavour + # suffix) into the binary so `perf_iterations()` can look it up + # against the concurrency-stress exclusion list. + target_compile_definitions(${TESTNAME} PRIVATE + SNMALLOC_TEST_NAME="${TEST_CATEGORY}-${TEST}") + if(SNMALLOC_SANITIZER) target_compile_options(${TESTNAME} PRIVATE -g -fsanitize=${SNMALLOC_SANITIZER} -fno-omit-frame-pointer) target_link_libraries(${TESTNAME} -fsanitize=${SNMALLOC_SANITIZER}) @@ -568,10 +583,20 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) if (${TEST} MATCHES "release-.*") message(VERBOSE "Adding test: ${TESTNAME} only for release configs") - add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} CONFIGURATIONS "Release") + if (${TEST_CATEGORY} MATCHES "perf") + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke + CONFIGURATIONS "Release") + else() + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} + CONFIGURATIONS "Release") + endif() else() message(VERBOSE "Adding test: ${TESTNAME}") - add_test(${TESTNAME} ${TESTNAME}) + if (${TEST_CATEGORY} MATCHES "perf") + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke) + else() + add_test(${TESTNAME} ${TESTNAME}) + endif() endif() if (${TEST_CATEGORY} MATCHES "perf") message(VERBOSE "Single threaded test: ${TESTNAME}") diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 9bf335087..984f69f26 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -462,7 +462,7 @@ void test_static_sized_alloc() test_static_sized_alloc(); } -template +template void test_static_sized_allocs() { if (max_size < 16) @@ -554,6 +554,11 @@ int main(int, char**) } #endif auto start = std::chrono::steady_clock::now(); + // Most tests below have substantial internal iteration (size-class + // sweeps, per-offset loops, batch alloc/dealloc), so a large outer + // repetition is redundant for coverage. A small outer count still + // catches consolidation/leak issues that only manifest across + // repeated entry to a test. #define TEST(testname) \ do \ { \ @@ -561,7 +566,7 @@ int main(int, char**) auto diff_seconds = \ std::chrono::duration_cast(end - start).count(); \ std::cout << "Running " #testname << " @ " << diff_seconds << std::endl; \ - for (size_t i = 0; i < 50; i++) \ + for (size_t i = 0; i < 3; i++) \ testname(); \ } while (0); @@ -574,7 +579,14 @@ int main(int, char**) TEST(test_calloc_large_bug); TEST(test_external_pointer_stack); TEST(test_external_pointer_dealloc_bug); - TEST(test_external_pointer_large); + // test_external_pointer_large allocates ~16MB per object across 32 + // objects (~512MB total) and walks every 16MB-aligned interior + // pointer. It is its own internal stress; running it once is + // enough, so it is invoked outside the TEST(...) outer-repeat + // macro. + std::cout << "Running test_external_pointer_large (single pass)" + << std::endl; + test_external_pointer_large(); TEST(test_external_pointer); TEST(test_alloc_16M); TEST(test_calloc_16M); diff --git a/src/test/perf/contention/contention.cc b/src/test/perf/contention/contention.cc index bca2a4889..3c602bb43 100644 --- a/src/test/perf/contention/contention.cc +++ b/src/test/perf/contention/contention.cc @@ -1,4 +1,5 @@ #include "test/opt.h" +#include "test/perf_setup.h" #include "test/setup.h" #include "test/usage.h" #include "test/xoroshiro.h" @@ -166,6 +167,19 @@ int main(int argc, char** argv) size_t size = opt.is("--swapsize", 1 << 18); use_malloc = opt.has("--use_malloc"); + // Under `--smoke` reduce both knobs: `count` drives the inner + // alloc/swap/dealloc loop in each worker thread, and `size` + // drives the contended array. Smoke values keep the dispatch + // paths and the contention/exchange logic exercised across + // 8/4/2/1 thread counts at modest cost. The values must remain + // large enough to cross the remote-deallocation cache thresholds + // (otherwise `mem/remotecache.h` and `mem/remoteallocator.h` + // coverage drops sharply). + count = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/count, /*smoke=*/1u << 18); + size = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/size, /*smoke=*/1u << 16); + std::cout << "Allocator is " << (use_malloc ? "System" : "snmalloc") << std::endl; diff --git a/src/test/perf/external_pointer/externalpointer.cc b/src/test/perf/external_pointer/externalpointer.cc index 07e69cef9..777af1f55 100644 --- a/src/test/perf/external_pointer/externalpointer.cc +++ b/src/test/perf/external_pointer/externalpointer.cc @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -50,21 +52,8 @@ namespace test snmalloc::debug_check_empty(); } - void test_external_pointer(xoroshiro::p128r64& r) + void test_external_pointer(xoroshiro::p128r64& r, size_t iterations) { - // This is very slow on Windows at the moment. Until this is fixed, help - // CI terminate. -#if defined(NDEBUG) && !defined(_MSC_VER) - static constexpr size_t iterations = 10000000; -#else -# ifdef _MSC_VER - // Windows Debug build is very slow on this test. - // Reduce complexity to balance CI times. - static constexpr size_t iterations = 50000; -# else - static constexpr size_t iterations = 100000; -# endif -#endif setup(r); { @@ -93,15 +82,40 @@ namespace test } } -int main(int, char**) +int main(int argc, char** argv) { setup(); - xoroshiro::p128r64 r; + opt::Opt opt(argc, argv); + + // Default iteration count varies by build (Release runs many more + // iterations). Smoke mode shrinks both to the smallest count that + // still exercises every interior-pointer dispatch path. + size_t cli_default; + // This is very slow on Windows at the moment. Until this is fixed, help + // CI terminate. +#if defined(NDEBUG) && !defined(_MSC_VER) + cli_default = 10000000; +#elif defined(_MSC_VER) + // Windows Debug build is very slow on this test. + // Reduce complexity to balance CI times. + cli_default = 50000; +#else + cli_default = 100000; +#endif + size_t iterations = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, cli_default, /*smoke=*/10000); - size_t nn = snmalloc::Debug ? 30 : 3; + // Outer-repeat count: Debug repeats 30x to amortise setup, Release 3x. + // Smoke shrinks both ends; one repeat is enough to hit every path + // since `setup()` re-randomises the object table each call. + size_t nn_default = snmalloc::Debug ? 30 : 3; + size_t nn = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, nn_default, /*smoke=*/1); + + xoroshiro::p128r64 r; for (size_t n = 0; n < nn; n++) - test::test_external_pointer(r); + test::test_external_pointer(r, iterations); return 0; } diff --git a/src/test/perf/large_alloc/large_alloc.cc b/src/test/perf/large_alloc/large_alloc.cc index b0f0f2bc8..b1915b6b6 100644 --- a/src/test/perf/large_alloc/large_alloc.cc +++ b/src/test/perf/large_alloc/large_alloc.cc @@ -1,19 +1,19 @@ #include +#include #include #include using namespace snmalloc; static constexpr size_t ALLOC_SIZE = 800 * 1024; // 800 KB -static constexpr size_t ITERATIONS = 100000; -void test_alloc_dealloc_cycle() +void test_alloc_dealloc_cycle(size_t iterations) { { MeasureTime m; - m << "Alloc/dealloc 800KB x " << ITERATIONS; + m << "Alloc/dealloc 800KB x " << iterations; - for (size_t i = 0; i < ITERATIONS; i++) + for (size_t i = 0; i < iterations; i++) { void* p = snmalloc::alloc(ALLOC_SIZE); SNMALLOC_CHECK(p != nullptr); @@ -24,7 +24,7 @@ void test_alloc_dealloc_cycle() snmalloc::debug_check_empty(); } -void test_batch_alloc_then_dealloc() +void test_batch_alloc_then_dealloc(size_t iterations) { static constexpr size_t BATCH = 128; @@ -32,7 +32,7 @@ void test_batch_alloc_then_dealloc() MeasureTime m; m << "Batch alloc then dealloc 800KB x " << BATCH; - for (size_t j = 0; j < ITERATIONS / BATCH; j++) + for (size_t j = 0; j < iterations / BATCH; j++) { for (size_t i = 0; i < BATCH; i++) { @@ -49,13 +49,13 @@ void test_batch_alloc_then_dealloc() snmalloc::debug_check_empty(); } -void test_alloc_dealloc_with_touch() +void test_alloc_dealloc_with_touch(size_t iterations) { { MeasureTime m; - m << "Alloc/touch/dealloc 800KB x " << ITERATIONS; + m << "Alloc/touch/dealloc 800KB x " << iterations; - for (size_t i = 0; i < ITERATIONS; i++) + for (size_t i = 0; i < iterations; i++) { char* p = static_cast(snmalloc::alloc(ALLOC_SIZE)); SNMALLOC_CHECK(p != nullptr); @@ -71,13 +71,21 @@ void test_alloc_dealloc_with_touch() snmalloc::debug_check_empty(); } -int main(int, char**) +int main(int argc, char** argv) { setup(); - test_alloc_dealloc_cycle(); - test_batch_alloc_then_dealloc(); - test_alloc_dealloc_with_touch(); + opt::Opt opt(argc, argv); + // Each test does alloc/dealloc cycles driven by `iterations`. The + // batch test divides by BATCH=128, so the smoke value is chosen so + // that `smoke / 128 >= 1` (i.e. the batch test still runs at least + // one full batch round). + size_t iterations = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/100000, /*smoke=*/8192); + + test_alloc_dealloc_cycle(iterations); + test_batch_alloc_then_dealloc(iterations); + test_alloc_dealloc_with_touch(iterations); return 0; } diff --git a/src/test/perf/memcpy/memcpy.cc b/src/test/perf/memcpy/memcpy.cc index 6a8928c52..21dd71655 100644 --- a/src/test/perf/memcpy/memcpy.cc +++ b/src/test/perf/memcpy/memcpy.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include using namespace snmalloc; @@ -25,9 +26,15 @@ size_t my_random() std::vector allocs; +// Number of distinct destination buffers per size class. Each `test()` +// call iterates over every entry in `allocs` and runs the memcpy +// implementation under measurement, so this is the per-size repeat +// count. Set by `main()` from `perf_iterations()`. +size_t allocs_per_size = 1000; + void shape(size_t size) { - for (size_t i = 0; i < 1000; i++) + for (size_t i = 0; i < allocs_per_size; i++) { auto rsize = size * 2; auto offset = 0; @@ -70,6 +77,12 @@ void test( { auto src = snmalloc::alloc(size); shape(size); + // The outer loop is a measurement-variance loop, not a coverage knob: + // it gathers ten timing samples per size for the perf statistics. + // Under `--smoke` it still runs ten times, but each `test_memcpy` + // call exercises only `allocs_per_size` (smoke value) memcpys, so the + // total work is small. Coverage is unaffected because every code path + // is hit on the first pass. for (size_t i = 0; i < 10; i++) { MeasureTime m(true); @@ -108,6 +121,13 @@ int main(int argc, char** argv) opt::Opt opt(argc, argv); bool full_test = opt.has("--full_test"); + // Number of destination buffers per size class. Smoke mode shrinks + // it dramatically because each `test()` call already runs ten + // measurement passes per size, which is more than enough to exercise + // every memcpy code path. + allocs_per_size = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/1000, /*smoke=*/100); + // size_t size = 0; auto mc_platform_checked = [](void* dst, const void* src, size_t len) { memcpy_platform_checked(dst, src, len); diff --git a/src/test/perf/msgpass/msgpass.cc b/src/test/perf/msgpass/msgpass.cc index e7b455541..bcc95e85e 100644 --- a/src/test/perf/msgpass/msgpass.cc +++ b/src/test/perf/msgpass/msgpass.cc @@ -9,6 +9,7 @@ */ #include "test/opt.h" +#include "test/perf_setup.h" #include "test/setup.h" #include "test/usage.h" #include "test/xoroshiro.h" @@ -198,6 +199,19 @@ int main(int argc, char** argv) param.N_MAX_OUTSTANDING = opt.is("--max-out", 4 * 1024); param.N_MAX_BATCH_SIZE = opt.is("--max-batch", 16); + // Under `--smoke` reduce the per-producer batch count: that is the + // outer iteration that drives total messages produced. The other + // knobs (thread counts, queue depth, max batch size) are kept at + // their defaults so the message-passing topology is exercised + // unchanged. The smoke value must remain large enough for the + // cross-thread remote-deallocation cache thresholds in + // `mem/remotecache.h` / `mem/remoteallocator.h` to fire. + param.N_PRODUCER_BATCH = snmalloc_test::perf_iterations( + opt, + SNMALLOC_TEST_NAME, + /*default=*/param.N_PRODUCER_BATCH, + /*smoke=*/1u << 18); + std::cout << "msgpass --producers=" << param.N_PRODUCER << " --consumers=" << param.N_CONSUMER << " --proxies=" << param.N_PROXY diff --git a/src/test/perf/singlethread/singlethread.cc b/src/test/perf/singlethread/singlethread.cc index bf173969d..aa5d0750a 100644 --- a/src/test/perf/singlethread/singlethread.cc +++ b/src/test/perf/singlethread/singlethread.cc @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -62,24 +64,33 @@ void test_alloc_dealloc(size_t count, size_t size, bool write) snmalloc::debug_check_empty(); } -int main(int, char**) +int main(int argc, char** argv) { setup(); + opt::Opt opt(argc, argv); + // Default `count` exercises sizeclass dispatch many times; under + // `--smoke` we keep one alloc/dealloc cycle through every code + // path but cut the bulk repetitions. + size_t count_small = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/1u << 15, /*smoke=*/1u << 12); + size_t count_large = snmalloc_test::perf_iterations( + opt, SNMALLOC_TEST_NAME, /*default=*/1u << 10, /*smoke=*/1u << 8); + for (size_t size = 16; size <= 128; size <<= 1) { - test_alloc_dealloc(1 << 15, size, false); - test_alloc_dealloc(1 << 15, size, true); - test_alloc_dealloc(1 << 15, size, false); - test_alloc_dealloc(1 << 15, size, true); + test_alloc_dealloc(count_small, size, false); + test_alloc_dealloc(count_small, size, true); + test_alloc_dealloc(count_small, size, false); + test_alloc_dealloc(count_small, size, true); } for (size_t size = 1 << 12; size <= 1 << 17; size <<= 1) { - test_alloc_dealloc(1 << 10, size, false); - test_alloc_dealloc(1 << 10, size, true); - test_alloc_dealloc(1 << 10, size, false); - test_alloc_dealloc(1 << 10, size, true); + test_alloc_dealloc(count_large, size, false); + test_alloc_dealloc(count_large, size, true); + test_alloc_dealloc(count_large, size, false); + test_alloc_dealloc(count_large, size, true); } return 0; diff --git a/src/test/perf_setup.h b/src/test/perf_setup.h new file mode 100644 index 000000000..e720f4687 --- /dev/null +++ b/src/test/perf_setup.h @@ -0,0 +1,70 @@ +#pragma once + +/** + * Helper for running perf tests under ctest with reduced iteration counts. + * + * When ctest invokes a perf test, it passes `--smoke`. Tests that opt in + * call `perf_iterations()` to choose between their full (`cli_default`) + * iteration count and a much smaller `smoke_value`. Direct CLI invocation + * (no `--smoke`) preserves the full iteration count, so manual perf runs + * are unaffected. + * + * Tests on the concurrency-stress exclusion list always receive + * `cli_default` even when `--smoke` is set — their iteration count is + * tuned to provoke races, not to spend time, and is not safe to reduce. + */ + +#include +#include + +namespace snmalloc_test +{ + /** + * Concurrency-stress tests that must not be smoked. Their iteration + * counts are tuned to provoke scheduler interleavings (e.g. raw + * thread-count multipliers), not to exercise dispatch paths; + * reducing them silently weakens scheduler-coverage capability. + * + * Race finding proper is the job of the TSAN build, not these + * tests, so perf tests that merely *use* threads (e.g. + * `perf-contention`, `perf-msgpass`) are smoke-eligible. + */ + inline bool is_concurrency_stress(const char* test_name) + { + static const char* const excluded[] = { + "perf-contention", + "perf-large_producer_consumer", + "perf-lotsofthreads", + "perf-msgpass", + }; + for (const char* e : excluded) + { + if (std::strcmp(test_name, e) == 0) + return true; + } + return false; + } + + /** + * Returns `smoke_value` when running under `--smoke` and `test_name` + * is not a concurrency-stress test; otherwise returns `cli_default`. + * + * `test_name` is the ctest binary name without the flavour suffix + * (e.g. "perf-singlethread"). It is injected by CMake as the macro + * `SNMALLOC_TEST_NAME` so each call site reads + * + * perf_iterations(opt, SNMALLOC_TEST_NAME, default_value, smoke_value); + */ + inline size_t perf_iterations( + opt::Opt& opt, + const char* test_name, + size_t cli_default, + size_t smoke_value) + { + if (!opt.has("--smoke")) + return cli_default; + if (is_concurrency_stress(test_name)) + return cli_default; + return smoke_value; + } +} // namespace snmalloc_test From 6dc567da4ad087a6e59669f743fa6eeec4da2f56 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 6 May 2026 15:58:16 +0100 Subject: [PATCH 2/6] Fix formatting of output message in memory test --- src/test/func/memory/memory.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 984f69f26..253628282 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -584,8 +584,7 @@ int main(int, char**) // pointer. It is its own internal stress; running it once is // enough, so it is invoked outside the TEST(...) outer-repeat // macro. - std::cout << "Running test_external_pointer_large (single pass)" - << std::endl; + std::cout << "Running test_external_pointer_large (single pass)" << std::endl; test_external_pointer_large(); TEST(test_external_pointer); TEST(test_alloc_16M); From e7a97248492ea1d45c749c777c79c40de7051ee7 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 6 May 2026 16:55:46 +0100 Subject: [PATCH 3/6] Reduce perf-lotsofthreads iterations 10x in Debug The benchmark's purpose is to stress the cross-thread free path under contention, which is already observable at much lower iteration counts. Debug builds carry full instrumentation and run ~10x slower per iteration, so 200k iterations across 8 threads makes this single test dominate Debug ctest wall-time without producing any additional contention coverage relative to a smaller count. Divide iterations by 10 when NDEBUG is not defined. Release builds keep the original counts (200000 / 50000 depending on platform and sanitizer) so the benchmark's signal is unchanged where it matters. Local measurement on Debug: perf-lotsofthreads-fast: ~136s -> 13.6s perf-lotsofthreads-check: ~500s -> 49.7s The test is on the concurrency-stress exclusion list and so is deliberately not affected by the --smoke knob; reducing iterations in Debug here is the right lever. --- src/test/perf/lotsofthreads/lotsofthread.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/test/perf/lotsofthreads/lotsofthread.cc b/src/test/perf/lotsofthreads/lotsofthread.cc index 9705dfff3..b0ab56373 100644 --- a/src/test/perf/lotsofthreads/lotsofthread.cc +++ b/src/test/perf/lotsofthreads/lotsofthread.cc @@ -101,6 +101,14 @@ int main() #else size_t iterations = 200000; #endif +#ifndef NDEBUG + // Debug builds run with full instrumentation enabled and are + // ~10x slower per iteration. The cross-thread batch behaviour + // this benchmark stresses is observable at much lower counts; + // reduce iterations so this test does not dominate Debug ctest + // wall-time. Release builds are unaffected. + iterations /= 10; +#endif int threadcount = 8; vector threads; From 7e4bd80148efc3d979d744d5d58c650c6acbd001 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 7 May 2026 09:55:33 +0100 Subject: [PATCH 4/6] Inline --smoke handling in perf tests; drop perf_setup.h Review feedback: perf_setup.h was over-engineered for what amounts to a single CLI query and a per-test conditional. Each call site already knew which knobs to scale and by how much; the helper added a name lookup table, a CMake-injected SNMALLOC_TEST_NAME macro, and an indirection that obscured what each test actually does. Replace the helper with direct `opt.has("--smoke")` checks at each call site: * external_pointer, large_alloc, memcpy, singlethread: `size_t x = opt.has("--smoke") ? smoke : default;` * contention, msgpass: these have CLI overrides for the same knobs (--swapcount/--swapsize, --batches), so make `--smoke` lower the *default* fed to opt.is(...). Explicit command-line arguments still win, which is what the user expects from a smoke flag. The "concurrency-stress exclusion list" inside the helper turns into nothing: tests that don't read --smoke (perf-lotsofthreads, perf-large_producer_consumer) just ignore it. CMake unconditionally passing --smoke to every perf test under ctest is harmless for those tests and gives us a single uniform invocation. Drop the SNMALLOC_TEST_NAME injection in CMakeLists.txt (no longer needed since the test no longer needs to identify itself), and delete src/test/perf_setup.h. --- CMakeLists.txt | 15 ---- src/test/perf/contention/contention.cc | 23 +++--- .../perf/external_pointer/externalpointer.cc | 8 +-- src/test/perf/large_alloc/large_alloc.cc | 5 +- src/test/perf/memcpy/memcpy.cc | 6 +- src/test/perf/msgpass/msgpass.cc | 22 ++---- src/test/perf/singlethread/singlethread.cc | 7 +- src/test/perf_setup.h | 70 ------------------- 8 files changed, 24 insertions(+), 132 deletions(-) delete mode 100644 src/test/perf_setup.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f763085e3..c791453ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -524,15 +524,6 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) add_library(${OBJLIBNAME} OBJECT ${SRC}) target_link_libraries(${OBJLIBNAME} snmalloc) target_compile_definitions(${OBJLIBNAME} PRIVATE "SNMALLOC_USE_${TEST_CLEANUP}") - # Inject the canonical ctest test name (category-test, no flavour - # suffix) so `perf_iterations()` can look it up against the - # concurrency-stress exclusion list. The name is flavour- - # independent, so defining it on the shared object library is - # correct (and required, because the per-flavour TESTNAME - # definition below is invisible to source files that are - # compiled into the OBJLIB rather than the executable). - target_compile_definitions(${OBJLIBNAME} PRIVATE - SNMALLOC_TEST_NAME="${TEST_CATEGORY}-${TEST}") add_warning_flags(${OBJLIBNAME}) if(SNMALLOC_SANITIZER) target_compile_options(${OBJLIBNAME} PRIVATE -g -fsanitize=${SNMALLOC_SANITIZER} -fno-omit-frame-pointer) @@ -554,12 +545,6 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # custom target can pass them all to llvm-cov without globbing. set_property(GLOBAL APPEND PROPERTY SNMALLOC_TEST_BINARIES ${TESTNAME}) - # Inject the canonical ctest test name (category-test, no flavour - # suffix) into the binary so `perf_iterations()` can look it up - # against the concurrency-stress exclusion list. - target_compile_definitions(${TESTNAME} PRIVATE - SNMALLOC_TEST_NAME="${TEST_CATEGORY}-${TEST}") - if(SNMALLOC_SANITIZER) target_compile_options(${TESTNAME} PRIVATE -g -fsanitize=${SNMALLOC_SANITIZER} -fno-omit-frame-pointer) target_link_libraries(${TESTNAME} -fsanitize=${SNMALLOC_SANITIZER}) diff --git a/src/test/perf/contention/contention.cc b/src/test/perf/contention/contention.cc index 3c602bb43..ac1e6acb5 100644 --- a/src/test/perf/contention/contention.cc +++ b/src/test/perf/contention/contention.cc @@ -1,5 +1,4 @@ #include "test/opt.h" -#include "test/perf_setup.h" #include "test/setup.h" #include "test/usage.h" #include "test/xoroshiro.h" @@ -163,22 +162,16 @@ int main(int argc, char** argv) opt::Opt opt(argc, argv); size_t cores = opt.is("--cores", 8); - size_t count = opt.is("--swapcount", 1 << 20); - size_t size = opt.is("--swapsize", 1 << 18); - use_malloc = opt.has("--use_malloc"); - - // Under `--smoke` reduce both knobs: `count` drives the inner - // alloc/swap/dealloc loop in each worker thread, and `size` - // drives the contended array. Smoke values keep the dispatch - // paths and the contention/exchange logic exercised across - // 8/4/2/1 thread counts at modest cost. The values must remain - // large enough to cross the remote-deallocation cache thresholds + // `--smoke` lowers the *defaults* for the iteration knobs so ctest + // runs at modest cost. Explicit `--swapcount` / `--swapsize` on the + // command line still win. The smoke values must remain large + // enough to cross the remote-deallocation cache thresholds // (otherwise `mem/remotecache.h` and `mem/remoteallocator.h` // coverage drops sharply). - count = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/count, /*smoke=*/1u << 18); - size = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/size, /*smoke=*/1u << 16); + bool smoke = opt.has("--smoke"); + size_t count = opt.is("--swapcount", smoke ? 1u << 18 : 1u << 20); + size_t size = opt.is("--swapsize", smoke ? 1u << 16 : 1u << 18); + use_malloc = opt.has("--use_malloc"); std::cout << "Allocator is " << (use_malloc ? "System" : "snmalloc") << std::endl; diff --git a/src/test/perf/external_pointer/externalpointer.cc b/src/test/perf/external_pointer/externalpointer.cc index 777af1f55..3b4e3c9d0 100644 --- a/src/test/perf/external_pointer/externalpointer.cc +++ b/src/test/perf/external_pointer/externalpointer.cc @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -103,15 +102,14 @@ int main(int argc, char** argv) #else cli_default = 100000; #endif - size_t iterations = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, cli_default, /*smoke=*/10000); + size_t iterations = + opt.has("--smoke") ? 10000 : cli_default; // Outer-repeat count: Debug repeats 30x to amortise setup, Release 3x. // Smoke shrinks both ends; one repeat is enough to hit every path // since `setup()` re-randomises the object table each call. size_t nn_default = snmalloc::Debug ? 30 : 3; - size_t nn = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, nn_default, /*smoke=*/1); + size_t nn = opt.has("--smoke") ? 1 : nn_default; xoroshiro::p128r64 r; diff --git a/src/test/perf/large_alloc/large_alloc.cc b/src/test/perf/large_alloc/large_alloc.cc index b1915b6b6..5d3db7c64 100644 --- a/src/test/perf/large_alloc/large_alloc.cc +++ b/src/test/perf/large_alloc/large_alloc.cc @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -80,8 +80,7 @@ int main(int argc, char** argv) // batch test divides by BATCH=128, so the smoke value is chosen so // that `smoke / 128 >= 1` (i.e. the batch test still runs at least // one full batch round). - size_t iterations = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/100000, /*smoke=*/8192); + size_t iterations = opt.has("--smoke") ? 8192 : 100000; test_alloc_dealloc_cycle(iterations); test_batch_alloc_then_dealloc(iterations); diff --git a/src/test/perf/memcpy/memcpy.cc b/src/test/perf/memcpy/memcpy.cc index 21dd71655..e554106c0 100644 --- a/src/test/perf/memcpy/memcpy.cc +++ b/src/test/perf/memcpy/memcpy.cc @@ -1,7 +1,6 @@ #include #include #include -#include #include using namespace snmalloc; @@ -29,7 +28,7 @@ std::vector allocs; // Number of distinct destination buffers per size class. Each `test()` // call iterates over every entry in `allocs` and runs the memcpy // implementation under measurement, so this is the per-size repeat -// count. Set by `main()` from `perf_iterations()`. +// count. Set by `main()` from `--smoke`. size_t allocs_per_size = 1000; void shape(size_t size) @@ -125,8 +124,7 @@ int main(int argc, char** argv) // it dramatically because each `test()` call already runs ten // measurement passes per size, which is more than enough to exercise // every memcpy code path. - allocs_per_size = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/1000, /*smoke=*/100); + allocs_per_size = opt.has("--smoke") ? 100 : 1000; // size_t size = 0; auto mc_platform_checked = [](void* dst, const void* src, size_t len) { diff --git a/src/test/perf/msgpass/msgpass.cc b/src/test/perf/msgpass/msgpass.cc index bcc95e85e..b8c0d9d2b 100644 --- a/src/test/perf/msgpass/msgpass.cc +++ b/src/test/perf/msgpass/msgpass.cc @@ -9,7 +9,6 @@ */ #include "test/opt.h" -#include "test/perf_setup.h" #include "test/setup.h" #include "test/usage.h" #include "test/xoroshiro.h" @@ -192,26 +191,19 @@ int main(int argc, char** argv) struct params param; opt::Opt opt(argc, argv); + // `--smoke` lowers the *default* per-producer batch count so ctest + // runs at modest cost. Explicit `--batches` on the command line + // still wins. The smoke value must remain large enough for the + // cross-thread remote-deallocation cache thresholds in + // `mem/remotecache.h` / `mem/remoteallocator.h` to fire. + size_t batches_default = opt.has("--smoke") ? 1u << 18 : 1024 * 1024; param.N_PRODUCER = opt.is("--producers", 3); param.N_CONSUMER = opt.is("--consumers", 3); param.N_PROXY = opt.is("--proxies", 2); - param.N_PRODUCER_BATCH = opt.is("--batches", 1024 * 1024); + param.N_PRODUCER_BATCH = opt.is("--batches", batches_default); param.N_MAX_OUTSTANDING = opt.is("--max-out", 4 * 1024); param.N_MAX_BATCH_SIZE = opt.is("--max-batch", 16); - // Under `--smoke` reduce the per-producer batch count: that is the - // outer iteration that drives total messages produced. The other - // knobs (thread counts, queue depth, max batch size) are kept at - // their defaults so the message-passing topology is exercised - // unchanged. The smoke value must remain large enough for the - // cross-thread remote-deallocation cache thresholds in - // `mem/remotecache.h` / `mem/remoteallocator.h` to fire. - param.N_PRODUCER_BATCH = snmalloc_test::perf_iterations( - opt, - SNMALLOC_TEST_NAME, - /*default=*/param.N_PRODUCER_BATCH, - /*smoke=*/1u << 18); - std::cout << "msgpass --producers=" << param.N_PRODUCER << " --consumers=" << param.N_CONSUMER << " --proxies=" << param.N_PROXY diff --git a/src/test/perf/singlethread/singlethread.cc b/src/test/perf/singlethread/singlethread.cc index aa5d0750a..b02643e7a 100644 --- a/src/test/perf/singlethread/singlethread.cc +++ b/src/test/perf/singlethread/singlethread.cc @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -72,10 +71,8 @@ int main(int argc, char** argv) // Default `count` exercises sizeclass dispatch many times; under // `--smoke` we keep one alloc/dealloc cycle through every code // path but cut the bulk repetitions. - size_t count_small = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/1u << 15, /*smoke=*/1u << 12); - size_t count_large = snmalloc_test::perf_iterations( - opt, SNMALLOC_TEST_NAME, /*default=*/1u << 10, /*smoke=*/1u << 8); + size_t count_small = opt.has("--smoke") ? 1u << 12 : 1u << 15; + size_t count_large = opt.has("--smoke") ? 1u << 8 : 1u << 10; for (size_t size = 16; size <= 128; size <<= 1) { diff --git a/src/test/perf_setup.h b/src/test/perf_setup.h deleted file mode 100644 index e720f4687..000000000 --- a/src/test/perf_setup.h +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once - -/** - * Helper for running perf tests under ctest with reduced iteration counts. - * - * When ctest invokes a perf test, it passes `--smoke`. Tests that opt in - * call `perf_iterations()` to choose between their full (`cli_default`) - * iteration count and a much smaller `smoke_value`. Direct CLI invocation - * (no `--smoke`) preserves the full iteration count, so manual perf runs - * are unaffected. - * - * Tests on the concurrency-stress exclusion list always receive - * `cli_default` even when `--smoke` is set — their iteration count is - * tuned to provoke races, not to spend time, and is not safe to reduce. - */ - -#include -#include - -namespace snmalloc_test -{ - /** - * Concurrency-stress tests that must not be smoked. Their iteration - * counts are tuned to provoke scheduler interleavings (e.g. raw - * thread-count multipliers), not to exercise dispatch paths; - * reducing them silently weakens scheduler-coverage capability. - * - * Race finding proper is the job of the TSAN build, not these - * tests, so perf tests that merely *use* threads (e.g. - * `perf-contention`, `perf-msgpass`) are smoke-eligible. - */ - inline bool is_concurrency_stress(const char* test_name) - { - static const char* const excluded[] = { - "perf-contention", - "perf-large_producer_consumer", - "perf-lotsofthreads", - "perf-msgpass", - }; - for (const char* e : excluded) - { - if (std::strcmp(test_name, e) == 0) - return true; - } - return false; - } - - /** - * Returns `smoke_value` when running under `--smoke` and `test_name` - * is not a concurrency-stress test; otherwise returns `cli_default`. - * - * `test_name` is the ctest binary name without the flavour suffix - * (e.g. "perf-singlethread"). It is injected by CMake as the macro - * `SNMALLOC_TEST_NAME` so each call site reads - * - * perf_iterations(opt, SNMALLOC_TEST_NAME, default_value, smoke_value); - */ - inline size_t perf_iterations( - opt::Opt& opt, - const char* test_name, - size_t cli_default, - size_t smoke_value) - { - if (!opt.has("--smoke")) - return cli_default; - if (is_concurrency_stress(test_name)) - return cli_default; - return smoke_value; - } -} // namespace snmalloc_test From de21ba9f1b55cc47ccd69359cc3148ba6576adb7 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 7 May 2026 10:10:35 +0100 Subject: [PATCH 5/6] Pass --smoke to all ctest tests uniformly Functional tests don't currently read --smoke (most use `int main()` with no args, the rest UNUSED(argc, argv)) so passing it is a silent no-op. But there's no reason to gate the flag on TEST_CATEGORY: keeping the perf/non-perf split forces every future test that wants a smoke mode to also touch CMake. Collapse the four-arm add_test() block into two arms (release vs not), each unconditionally appending --smoke. Any test that wants to honour it just reads the flag; any test that doesn't, ignores it. --- CMakeLists.txt | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c791453ea..7b7aaba15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -568,20 +568,11 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) if (${TEST} MATCHES "release-.*") message(VERBOSE "Adding test: ${TESTNAME} only for release configs") - if (${TEST_CATEGORY} MATCHES "perf") - add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke - CONFIGURATIONS "Release") - else() - add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} - CONFIGURATIONS "Release") - endif() + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke + CONFIGURATIONS "Release") else() message(VERBOSE "Adding test: ${TESTNAME}") - if (${TEST_CATEGORY} MATCHES "perf") - add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke) - else() - add_test(${TESTNAME} ${TESTNAME}) - endif() + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke) endif() if (${TEST_CATEGORY} MATCHES "perf") message(VERBOSE "Single threaded test: ${TESTNAME}") From 322fca5e52cc3a188cce78652010ebe18deca2be Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 7 May 2026 13:30:32 +0100 Subject: [PATCH 6/6] Clangformat. --- src/test/perf/external_pointer/externalpointer.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/perf/external_pointer/externalpointer.cc b/src/test/perf/external_pointer/externalpointer.cc index 3b4e3c9d0..a15c27ffc 100644 --- a/src/test/perf/external_pointer/externalpointer.cc +++ b/src/test/perf/external_pointer/externalpointer.cc @@ -102,8 +102,7 @@ int main(int argc, char** argv) #else cli_default = 100000; #endif - size_t iterations = - opt.has("--smoke") ? 10000 : cli_default; + size_t iterations = opt.has("--smoke") ? 10000 : cli_default; // Outer-repeat count: Debug repeats 30x to amortise setup, Release 3x. // Smoke shrinks both ends; one repeat is enough to hit every path