issue/632 - adapt to iluvatar core 20

wooway777 · wooway777 · commit 3dfa28f8fdc3 · 2026-01-19T18:57:47.000+08:00
diff --git a/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh b/src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
@@ -14,6 +14,7 @@
 // Posible maximum number of threads per block for CUDA architectures
 // Used for picking correct kernel launch configuration
 #define CUDA_BLOCK_SIZE_4096 4096
+#define CUDA_BLOCK_SIZE_2048 2048
 #define CUDA_BLOCK_SIZE_1024 1024
 #define CUDA_BLOCK_SIZE_512 512
 
diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
@@ -76,18 +76,22 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      const void *x,
                                      void *stream_) const {
     cudaStream_t stream = (cudaStream_t)stream_;
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
-            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
diff --git a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
@@ -117,12 +117,14 @@ infiniStatus_t Descriptor::calculate(
     auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
 
     // launch kernel with different block sizes
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, nhead, dim, y, _info.atype, stride_y_batch, stride_y_nhead, x, stride_x_batch, stride_x_nhead, w, _info.wtype, _info.epsilon, cuda_stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
diff --git a/xmake.lua b/xmake.lua
@@ -114,6 +114,12 @@ option("iluvatar-gpu")
     set_description("Whether to compile implementations for Iluvatar GPU")
 option_end()
 
+option("ivcore-20")
+    set_default(false)
+    set_showmenu(true)
+    set_description("Use ivcore20")
+option_end()
+
 if has_config("iluvatar-gpu") then
     add_defines("ENABLE_ILUVATAR_API")
     includes("xmake/iluvatar.lua")
diff --git a/xmake/iluvatar.lua b/xmake/iluvatar.lua
@@ -44,6 +44,9 @@ target("infiniop-iluvatar")
     set_warnings("all", "error")
     add_cuflags("-Wno-error=unused-private-field")
     add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
+    if has_config("ivcore-20") then
+        add_cuflags("--cuda-gpu-arch=ivcore20", {force = true})
+    end
     add_culdflags("-fPIC")
     add_cxflags("-fPIC")