Skip to content

Commit 1957cc4

Browse files
committed
feat: Support export and serialization workflows for MD-TRT
1 parent 6f81a66 commit 1957cc4

32 files changed

Lines changed: 2634 additions & 280 deletions

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,12 @@ jobs:
526526
pushd .
527527
cd tests/py
528528
cd dynamo
529-
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
529+
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
530+
distributed/test_nccl_ops.py \
531+
distributed/test_native_nccl.py \
532+
distributed/test_export_save_load.py
533+
torchrun --nproc_per_node=2 distributed/test_native_nccl.py --multirank
534+
torchrun --nproc_per_node=2 distributed/test_export_save_load.py --multirank
530535
popd
531536
532537
concurrency:

core/runtime/TRTEngine.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,15 @@ TRTEngine::TRTEngine(
290290
TRTEngine::~TRTEngine() {
291291
torch::cuda::synchronize(device_info.id);
292292
trt_engine_profiler.reset();
293+
#ifdef ENABLE_TRT_NCCL_COLLECTIVES
294+
// Null out the NCCL communicator before destroying the execution context.
295+
// dist.destroy_process_group() may have already freed the ncclComm_t; if we
296+
// let IExecutionContext::~IExecutionContext() run with a dangling pointer it
297+
// will segfault.
298+
if (nccl_initialized && exec_ctx) {
299+
exec_ctx->setCommunicator(nullptr);
300+
}
301+
#endif
293302
exec_ctx.reset();
294303
cuda_engine.reset();
295304
if (empty_tensor_placeholder) {
@@ -554,6 +563,35 @@ void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationSt
554563

555564
#ifdef ENABLE_TRT_NCCL_COLLECTIVES
556565
bool TRTEngine::bind_nccl_comm() {
566+
// When group_name is empty (e.g. engine loaded from a serialized
567+
// ExportedProgram where the Python TorchTensorRTModule wrapper was
568+
// inlined and set_group_name() was never called), auto-resolve the
569+
// process group from the c10d registry. PyTorch assigns sequential
570+
// numeric names ("0", "1", ...) to process groups; probe until we
571+
// find one with an NCCL backend.
572+
if (this->group_name.empty() && this->is_md) {
573+
// PyTorch assigns sequential numeric names ("0", "1", ...) to process
574+
// groups. In practice most jobs create fewer than 10 groups; we probe
575+
// up to 20 to allow for destroyed-and-recreated groups.
576+
for (int i = 0; i < 20; ++i) {
577+
auto candidate = std::to_string(i);
578+
auto probe = c10d::resolve_process_group(candidate);
579+
if (probe != nullptr &&
580+
probe->getBackendType() == c10d::ProcessGroup::BackendType::NCCL) {
581+
this->group_name = candidate;
582+
LOG_INFO("Auto-resolved distributed group name to '" << candidate << "'");
583+
break;
584+
}
585+
}
586+
if (this->group_name.empty()) {
587+
LOG_WARNING(
588+
"This TRT engine requires NCCL (is_md=true) but no NCCL process group "
589+
"was found in the c10d registry. Ensure dist.init_process_group(backend='nccl') "
590+
"has been called before loading the engine. You can also set the group name "
591+
"manually via: engine.set_group_name(NCCL_GROUP_NAME)");
592+
}
593+
}
594+
557595
// Soft-return when the process group isn't available yet (e.g. at engine
558596
// construction time when the caller hasn't called dist.init_process_group()).
559597
auto pg = c10d::resolve_process_group(this->group_name);

core/runtime/execute_engine.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -330,22 +330,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
330330
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
331331
}
332332

333-
// Distributed setup - set NCCL communicator on TensorRT execution context
334-
#ifdef ENABLE_TRT_NCCL_COLLECTIVES
335-
if (compiled_engine->rank >= 0 && compiled_engine->world_size > 1) {
336-
bool result = compiled_engine->set_nccl_communicator_to_trt_context();
337-
if (!result) {
338-
LOG_ERROR("Failed to set NCCL communicator on TRT context");
339-
LOG_ERROR("This will cause collective operations to fail at runtime");
340-
LOG_ERROR("Make sure to call module.init_nccl_comm() after compilation");
341-
}
342-
} else {
343-
LOG_DEBUG(
344-
"Single-device mode (rank=" << compiled_engine->rank << ", world_size=" << compiled_engine->world_size
345-
<< ") - skipping NCCL setup");
346-
}
347-
#endif
348-
349333
// Block engine stream until results are available on caller stream
350334
at::cuda::CUDAEvent caller_exec_complete;
351335
caller_exec_complete.record(compiled_engine->caller_stream);

core/runtime/register_jit_hooks.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,26 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
126126
})
127127
.def("bind_nccl_comm", [](c10::intrusive_ptr<TRTEngine> self) { self->bind_nccl_comm(); })
128128
.def_readonly("nccl_initialized", &TRTEngine::nccl_initialized)
129+
#else
130+
.def(
131+
"set_group_name",
132+
[](c10::intrusive_ptr<TRTEngine> self, std::string group_name) {
133+
LOG_ERROR(
134+
"This build does not support MultiDevice TensorRT (ENABLE_TRT_NCCL_COLLECTIVES is OFF); set_group_name is a no-op");
135+
})
136+
.def(
137+
"bind_nccl_comm",
138+
[](c10::intrusive_ptr<TRTEngine> self) {
139+
LOG_ERROR(
140+
"This build does not support MultiDevice TensorRT (ENABLE_TRT_NCCL_COLLECTIVES is OFF); bind_nccl_comm is a no-op");
141+
})
142+
.def_property_readonly(
143+
"nccl_initialized",
144+
[](c10::intrusive_ptr<TRTEngine> self) -> bool {
145+
LOG_ERROR(
146+
"This build does not support MultiDevice TensorRT (ENABLE_TRT_NCCL_COLLECTIVES is OFF); nccl_initialized always returns false");
147+
return false;
148+
})
129149
#endif
130150
.def_pickle(
131151
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> { return self->serialize(); },

0 commit comments

Comments
 (0)