From 56177937f61cee3782b7c205235a6c74716e37b2 Mon Sep 17 00:00:00 2001 From: Pavel Guzenfeld Date: Sun, 22 Mar 2026 13:50:30 +0200 Subject: [PATCH 1/2] Fix potential deadlock in TimeSource::destroy_clock_sub MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit destroy_clock_sub() held clock_sub_lock_ while calling clock_executor_thread_.join(). If the executor thread's shutdown path contends on clock_sub_lock_, this produces a deadlock. Fix: move the thread into a local variable under the lock, release the lock, then join outside the critical section. The thread is not dangling — it is unconditionally joined before the function returns. Key improvement over the naive move approach: the subscription is kept alive until after join completes, ensuring the executor thread can finish any in-flight callback without accessing a destroyed subscription. Cleanup order: cancel → (release lock) → join → remove_callback_group → (reacquire lock) → reset subscription. Fixes #2962 Signed-off-by: Pavel Guzenfeld --- rclcpp/src/rclcpp/time_source.cpp | 34 +++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/rclcpp/src/rclcpp/time_source.cpp b/rclcpp/src/rclcpp/time_source.cpp index f3c20a8e3d..09a2487dc5 100644 --- a/rclcpp/src/rclcpp/time_source.cpp +++ b/rclcpp/src/rclcpp/time_source.cpp @@ -422,13 +422,35 @@ class TimeSource::NodeState final // Destroy the subscription for the clock topic void destroy_clock_sub() { - std::lock_guard guard(clock_sub_lock_); - if (clock_executor_thread_.joinable()) { - clock_executor_->cancel(); - clock_executor_thread_.join(); - clock_executor_->remove_callback_group(clock_callback_group_); + std::thread thread_to_join; + rclcpp::executors::SingleThreadedExecutor::SharedPtr executor_to_clean; + rclcpp::CallbackGroup::SharedPtr cbg_to_remove; + + { + std::lock_guard guard(clock_sub_lock_); + if (clock_executor_thread_.joinable()) { + clock_executor_->cancel(); + // Move the thread out so we can join without holding clock_sub_lock_. + // Holding the lock during join() can deadlock if the executor thread's + // shutdown path contends on clock_sub_lock_ (see ros2/rclcpp#2962). + // The thread is unconditionally joined below — it is not dangling. + thread_to_join = std::move(clock_executor_thread_); + executor_to_clean = clock_executor_; + cbg_to_remove = clock_callback_group_; + } + } + + // Join outside the lock. The subscription is intentionally kept alive + // until after join so the executor thread can finish any in-flight callback. + if (thread_to_join.joinable()) { + thread_to_join.join(); + executor_to_clean->remove_callback_group(cbg_to_remove); + } + + { + std::lock_guard guard(clock_sub_lock_); + clock_subscription_.reset(); } - clock_subscription_.reset(); } // On set Parameters callback handle From e0274abc469ed912337c507d0a050bf0d414b425 Mon Sep 17 00:00:00 2001 From: Pavel Guzenfeld Date: Sun, 22 Mar 2026 14:26:17 +0200 Subject: [PATCH 2/2] Add stress tests for TimeSource deadlock fix Tests reproduce the conditions from ros2/rclcpp#2962: - Rapid construct/destroy of nodes with use_sim_time=true (50 cycles) - Concurrent construct/destroy from 4 threads - Toggle use_sim_time while spinning - Verify no lingering thread after destruction If the deadlock is present, these tests will hang and be caught by the 60-second timeout. Signed-off-by: Pavel Guzenfeld --- rclcpp/test/rclcpp/CMakeLists.txt | 5 + .../test/rclcpp/test_time_source_deadlock.cpp | 137 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 rclcpp/test/rclcpp/test_time_source_deadlock.cpp diff --git a/rclcpp/test/rclcpp/CMakeLists.txt b/rclcpp/test/rclcpp/CMakeLists.txt index 5c6469ec69..04552c98df 100644 --- a/rclcpp/test/rclcpp/CMakeLists.txt +++ b/rclcpp/test/rclcpp/CMakeLists.txt @@ -438,6 +438,11 @@ ament_add_ros_isolated_gtest(test_time_source test_time_source.cpp if(TARGET test_time_source) target_link_libraries(test_time_source ${PROJECT_NAME} rcl::rcl) endif() +ament_add_ros_isolated_gtest(test_time_source_deadlock test_time_source_deadlock.cpp + TIMEOUT 60) +if(TARGET test_time_source_deadlock) + target_link_libraries(test_time_source_deadlock ${PROJECT_NAME}) +endif() ament_add_ros_isolated_gtest(test_utilities test_utilities.cpp APPEND_LIBRARY_DIRS "${append_library_dirs}") diff --git a/rclcpp/test/rclcpp/test_time_source_deadlock.cpp b/rclcpp/test/rclcpp/test_time_source_deadlock.cpp new file mode 100644 index 0000000000..478d4749b6 --- /dev/null +++ b/rclcpp/test/rclcpp/test_time_source_deadlock.cpp @@ -0,0 +1,137 @@ +// Copyright 2026 Open Source Robotics Foundation, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * Stress tests for TimeSource::destroy_clock_sub deadlock fix. + * + * Exercises the conditions from ros2/rclcpp#2962: rapid node + * construction/destruction with use_sim_time enabled. If the + * deadlock is present, these tests will hang (caught by the test + * timeout). Under ThreadSanitizer, any data race or dangling-thread + * access will be flagged. + */ + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "rclcpp/rclcpp.hpp" + +using namespace std::chrono_literals; + +class TestTimeSourceDeadlock : public ::testing::Test +{ +public: + static void SetUpTestCase() + { + rclcpp::init(0, nullptr); + } + + static void TearDownTestCase() + { + rclcpp::shutdown(); + } +}; + +/// Reproducer for ros2/rclcpp#2962: rapid construction and destruction +/// of a node with use_sim_time=true. The clock executor thread must +/// be joined without deadlock. +TEST_F(TestTimeSourceDeadlock, rapid_construct_destroy_sim_time) +{ + for (int i = 0; i < 50; ++i) { + rclcpp::NodeOptions opts; + opts.parameter_overrides({rclcpp::Parameter("use_sim_time", true)}); + auto node = std::make_shared( + "deadlock_test_" + std::to_string(i), "/test_ns", opts); + + // Immediately destroy — the clock executor thread must be + // joined without deadlock. + node.reset(); + } +} + +/// Create and destroy nodes with use_sim_time from multiple threads +/// to stress the lock/join interaction. +TEST_F(TestTimeSourceDeadlock, concurrent_construct_destroy) +{ + const int num_threads = 4; + const int iterations = 10; + std::vector> futures; + + for (int t = 0; t < num_threads; ++t) { + futures.push_back(std::async(std::launch::async, [t, iterations]() { + for (int i = 0; i < iterations; ++i) { + rclcpp::NodeOptions opts; + opts.parameter_overrides({rclcpp::Parameter("use_sim_time", true)}); + auto node = std::make_shared( + "concurrent_test_" + std::to_string(t) + "_" + std::to_string(i), + "/test_ns", opts); + node.reset(); + } + })); + } + + for (auto & f : futures) { + ASSERT_NO_THROW(f.get()); + } +} + +/// Toggle use_sim_time parameter while spinning — exercises the +/// create_clock_sub / destroy_clock_sub interplay. +TEST_F(TestTimeSourceDeadlock, toggle_use_sim_time_while_spinning) +{ + auto node = std::make_shared("toggle_test", "/test_ns"); + rclcpp::executors::SingleThreadedExecutor executor; + executor.add_node(node); + + // Spin in background + auto spin_future = std::async(std::launch::async, [&executor]() { + executor.spin(); + }); + + // Toggle use_sim_time several times + for (int i = 0; i < 10; ++i) { + node->set_parameter(rclcpp::Parameter("use_sim_time", true)); + std::this_thread::sleep_for(10ms); + node->set_parameter(rclcpp::Parameter("use_sim_time", false)); + std::this_thread::sleep_for(10ms); + } + + executor.cancel(); + spin_future.get(); +} + +/// Verify the clock thread is properly cleaned up after destruction. +/// After destroying a sim-time node, no threads from its clock +/// executor should be lingering. +TEST_F(TestTimeSourceDeadlock, no_lingering_thread_after_destroy) +{ + { + rclcpp::NodeOptions opts; + opts.parameter_overrides({rclcpp::Parameter("use_sim_time", true)}); + auto node = std::make_shared( + "linger_test", "/test_ns", opts); + // Node destroyed here — clock thread must be fully joined + } + + // If the thread were dangling, subsequent operations on the + // same context would be unreliable. Create another node to + // verify the context is clean. + auto node2 = std::make_shared("after_linger", "/test_ns"); + ASSERT_NE(node2, nullptr); +}