Initialize ThreadResources only once per thread (#2199)

umangyadav · web-flow · commit 4644c1918ccf · 2026-01-07T12:26:40.000-05:00
Only initialize once per thread
diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
@@ -292,6 +292,55 @@ struct CompilationResult {
   SmallVector<uint32_t> gridSizes;
 };
 
+// Per-worker compilation state: a private MLIRContext plus the PassManagers
+// and source module built on it. MLIR asserts when the registry is modified
+// during multi-threaded execution, so a context cannot be shared across
+// parallel pass runs. ctx is declared first, hence destroyed last.
+struct ThreadResources {
+  std::unique_ptr<MLIRContext> ctx;
+  std::unique_ptr<PassManager> applicabilityPM;
+  std::unique_ptr<PassManager> compilationPM;
+  OwningOpRef<ModuleOp> sourceModule;
+
+  // Move-only. NOTE(review): defaulted move-assignment replaces ctx first, so
+  // only move-assign onto an object that has not been initialized.
+  ThreadResources() = default;
+  ThreadResources(ThreadResources &&) = default;
+  ThreadResources &operator=(ThreadResources &&) = default;
+  ThreadResources(const ThreadResources &) = delete;
+  ThreadResources &operator=(const ThreadResources &) = delete;
+
+  // Builds the context, both pipelines, and the parsed module for this thread.
+  // Returns false if no module was parsed. May be called again: the previous
+  // module and PassManagers are released before their context is replaced.
+  bool initialize(const std::string &sourceModuleStr,
+                  const rock::KernelOptions &applicabilityOpts,
+                  const rock::KernelOptions &compilationKernOpts,
+                  const rock::BackendOptions &backendOpts) {
+    sourceModule = OwningOpRef<ModuleOp>();
+    compilationPM.reset();
+    applicabilityPM.reset();
+
+    DialectRegistry registry;
+    registerRocMLIRDialects(registry);
+    ctx = std::make_unique<MLIRContext>(registry);
+    ctx->getDiagEngine().registerHandler([](Diagnostic &) {});
+    applicabilityPM = std::make_unique<PassManager>(
+        ctx.get(), PassManager::getAnyOpAnchorName(),
+        PassManager::Nesting::Implicit);
+    compilationPM = std::make_unique<PassManager>(
+        ctx.get(), PassManager::getAnyOpAnchorName(),
+        PassManager::Nesting::Implicit);
+    rock::buildKernelPipeline(*applicabilityPM, applicabilityOpts);
+    rock::buildKernelPipeline(*compilationPM, compilationKernOpts);
+    rock::buildBackendPipeline(*compilationPM, backendOpts);
+    sourceModule = parseSourceString<ModuleOp>(sourceModuleStr, ctx.get());
+    return isValid();
+  }
+
+  bool isValid() const { return sourceModule && *sourceModule; }
+};
+
 static LogicalResult
 measureSmallKernel(unsigned iterations, hipStream_t stream,
                    const std::vector<hipFunction_t> &functions,
@@ -740,51 +789,64 @@ static LogicalResult runTuningLoop(ModuleOp source) {
     // Don't create more threads than configs to compile
     numThreads = std::min(numThreads, static_cast<unsigned>(configs.size()));
 
-    // Serialize source module once (shared by all threads for cloning)
+    // Serialize source module once (shared by all threads for parsing)
     std::string sourceModuleStr;
-    llvm::raw_string_ostream sourceOs(sourceModuleStr);
-    source->print(sourceOs);
-    sourceOs.flush();
+    {
+      llvm::raw_string_ostream sourceOs(sourceModuleStr);
+      source->print(sourceOs);
+    }
+
+    // PHASE 2: Pre-initialize thread resources (contexts, PassManagers, parsed
+    // modules). This avoids the expensive per-config overhead of creating
+    // contexts, parsing modules, and building pipelines.
+    // Note: MLIR's MLIRContext cannot be safely shared across parallel pass
+    // executions, so each thread needs its own context.
+    std::vector<ThreadResources> threadResources(numThreads);
+    std::atomic<bool> initFailed{false};
+
+    {
+      std::vector<std::thread> initThreads;
+      initThreads.reserve(numThreads);
+      for (unsigned i = 0; i < numThreads; ++i) {
+        initThreads.emplace_back([&, i]() {
+          if (!threadResources[i].initialize(sourceModuleStr, applicabilityOpts,
+                                             compilationKernOpts,
+                                             backendOpts)) {
+            initFailed.store(true, std::memory_order_relaxed);
+          }
+        });
+      }
+      for (auto &t : initThreads) {
+        t.join();
+      }
+    }
 
-    // PHASE 2: Parallel compilation phase
+    if (initFailed.load(std::memory_order_relaxed)) {
+      llvm::errs() << "Failed to initialize thread resources\n";
+      return failure();
+    }
+
+    // PHASE 3: Parallel compilation phase using pre-initialized resources
     std::vector<CompilationResult> compilationResults(configs.size());
     std::mutex outputMutex; // For thread-safe console output
     std::atomic<bool> compilationFailed{
         false}; // Flag to signal early termination
 
-    auto compileConfig = [&](size_t idx) -> CompilationResult {
+    // Compile a single config using pre-initialized thread resources
+    auto compileConfig = [&](size_t idx,
+                             ThreadResources &res) -> CompilationResult {
       CompilationResult result;
       result.perfConfig = configs[idx];
-      // Each thread needs its own context and pass managers for thread-safety
-      DialectRegistry threadRegistry;
-      registerRocMLIRDialects(threadRegistry);
-      MLIRContext threadCtx(threadRegistry);
-      threadCtx.getDiagEngine().registerHandler([](Diagnostic &diag) {});
-
-      // Parse the serialized module in this thread's context
-      OwningOpRef<ModuleOp> threadSource =
-          parseSourceString<ModuleOp>(sourceModuleStr, &threadCtx);
-      if (!threadSource)
-        return result;
-
-      // Set up pipelines for this thread
-      PassManager threadApplicability(&threadCtx,
-                                      PassManager::getAnyOpAnchorName(),
-                                      PassManager::Nesting::Implicit);
-      PassManager threadCompilation(&threadCtx,
-                                    PassManager::getAnyOpAnchorName(),
-                                    PassManager::Nesting::Implicit);
 
-      rock::buildKernelPipeline(threadApplicability, applicabilityOpts);
-      rock::buildKernelPipeline(threadCompilation, compilationKernOpts);
-      rock::buildBackendPipeline(threadCompilation, backendOpts);
+      if (!res.isValid())
+        return result;
 
       StringAttr perfConfigAttr =
-          StringAttr::get(&threadCtx, result.perfConfig);
+          StringAttr::get(res.ctx.get(), result.perfConfig);
 
       // Helper to copy IR with perf config set
-      auto copyIRThread = [&](ModuleOp src,
-                              StringAttr attr) -> OwningOpRef<ModuleOp> {
+      auto copyIR = [&](ModuleOp src,
+                        StringAttr attr) -> OwningOpRef<ModuleOp> {
         OwningOpRef<ModuleOp> copy = cast<ModuleOp>(src->clone());
         copy->walk([&attr](rock::RockGemmWrapperInterface op) {
           op->setAttr("perf_config", attr);
@@ -795,16 +857,16 @@ static LogicalResult runTuningLoop(ModuleOp source) {
         return copy;
       };
 
-      if (doesModuleHaveFusions(threadSource.get()) &&
-          !rock::isModuleFusible(threadSource.get(), result.perfConfig)) {
+      if (doesModuleHaveFusions(res.sourceModule.get()) &&
+          !rock::isModuleFusible(res.sourceModule.get(), result.perfConfig)) {
         result.status = CompilationStatus::NotApplicable;
         return result;
       }
 
-      // Applicability check
+      // Applicability check - clone the pre-parsed module
       OwningOpRef<ModuleOp> sourceCopy =
-          copyIRThread(threadSource.get(), perfConfigAttr);
-      if (failed(threadApplicability.run(sourceCopy.get()))) {
+          copyIR(res.sourceModule.get(), perfConfigAttr);
+      if (failed(res.applicabilityPM->run(sourceCopy.get()))) {
         result.status = CompilationStatus::NotApplicable;
         return result;
       }
@@ -823,8 +885,8 @@ static LogicalResult runTuningLoop(ModuleOp source) {
             tunedFunc->getAttrOfType<IntegerAttr>("grid_size").getInt());
       }
 
-      // Compilation
-      if (failed(threadCompilation.run(sourceCopy.get()))) {
+      // Compilation - use pre-built pipeline
+      if (failed(res.compilationPM->run(sourceCopy.get()))) {
         std::lock_guard<std::mutex> lock(outputMutex);
         llvm::errs() << "Backend pipeline failed for config: "
                      << result.perfConfig << "\n";
@@ -860,8 +922,14 @@ static LogicalResult runTuningLoop(ModuleOp source) {
     // load balancing by allowing fast threads to pick up more work.
     {
       std::atomic<size_t> nextIdx{0};
+      std::atomic<unsigned> nextThreadId{0};
 
       auto worker = [&]() {
+        // Each worker gets assigned a unique thread ID for its resources
+        unsigned myThreadId =
+            nextThreadId.fetch_add(1, std::memory_order_relaxed);
+        ThreadResources &myRes = threadResources[myThreadId];
+
         while (true) {
           if (compilationFailed.load(std::memory_order_relaxed))
             break;
@@ -870,7 +938,7 @@ static LogicalResult runTuningLoop(ModuleOp source) {
           if (idx >= configs.size())
             break;
 
-          compilationResults[idx] = compileConfig(idx);
+          compilationResults[idx] = compileConfig(idx, myRes);
         }
       };