apache · tlopex · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
@@ -1225,14 +1225,13 @@ def inner(func):
     return inner
 
 
-def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
-    """Mark a test as requiring at least a compute architecture
+def requires_cuda_compute_version_marks(major_version, minor_version=0, exact=False):
+    """Return pytest marks requiring at least a CUDA compute architecture.
 
-    Unit test marked with this decorator will run only if the CUDA
-    compute architecture of the GPU is at least `(major_version,
-    minor_version)`.
+    Tests marked with the returned marks will run only if the CUDA compute
+    architecture of the GPU is at least `(major_version, minor_version)`.
 
-    This also marks the test as requiring a cuda support.
+    This also marks the test as requiring CUDA support.
 
     Parameters
     ----------
@@ -1255,14 +1254,36 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
 
     min_version_str = ".".join(str(v) for v in min_version)
     compute_version_str = ".".join(str(v) for v in compute_version)
-    requires = [
+    return [
         pytest.mark.skipif(
             compute_version < min_version or (exact and compute_version != min_version),
             reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}",
         ),
         *requires_cuda.marks(),
     ]
 
+
+def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
+    """Mark a test as requiring at least a compute architecture
+
+    Unit test marked with this decorator will run only if the CUDA
+    compute architecture of the GPU is at least `(major_version,
+    minor_version)`.
+
+    This also marks the test as requiring a cuda support.
+
+    Parameters
+    ----------
+    major_version: int
+
+        The major version of the (major,minor) version tuple.
+
+    minor_version: int
+
+        The minor version of the (major,minor) version tuple.
+    """
+    requires = requires_cuda_compute_version_marks(major_version, minor_version, exact)
+
     def inner(func):
         return _compose([func], requires)
 

diff --git a/tests/python/tirx/codegen/test_codegen_blackwell.py b/tests/python/tirx/codegen/test_codegen_blackwell.py
@@ -22,6 +22,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> str:
     target = tvm.target.Target("cuda")
@@ -31,7 +33,6 @@ def _get_source(func: tvm.tirx.PrimFunc) -> str:
     return src, mod
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tmem_alloc_dealloc_relinquish():
     N_COLS = 512
     cta_group = 1
@@ -69,7 +70,6 @@ def test_tmem(A: Tx.Buffer((16, 16), "float16")):
         assert f"tcgen05.relinquish_alloc_permit.cta_group::{cta_group}.sync.aligned" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_mbarrier_try_wait_once_codegen():
     # fmt: off
     @Tx.prim_func
@@ -89,7 +89,6 @@ def test_try_wait_once(A: Tx.Buffer((16, 16), "float16")):
         assert "selp.u32" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_fence_before_after_thread_sync():
     # fmt: off
     @Tx.prim_func
@@ -112,7 +111,6 @@ def test_fence(A: Tx.Buffer((16, 16), "float16")):
         assert "tcgen05.fence::before_thread_sync" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_ld_st_roundtrip():
     HEIGHT = 128
     WIDTH = 256
@@ -182,7 +180,6 @@ def test_ld_st(A: Tx.Buffer((HEIGHT, WIDTH), "float32"), B: Tx.Buffer((HEIGHT, W
         np.testing.assert_allclose(A.numpy(), B.numpy())
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_cp_ld_roundtrip():
     dtype = "float32"
     dtype_bits = tvm.DataType(dtype).bits
@@ -271,7 +268,6 @@ def test_cp_ld(A: Tx.Buffer((HEIGHT, WIDTH), dtype, layout=Tx.TileLayout(Tx.S[(H
 
 
 @pytest.mark.parametrize("swizzle", [0, 1, 2, 3])
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_mma_ss_no_tma(swizzle):
     d_type, a_type, b_type = "float32", "float16", "float16"
     M, N, K = 128, 128, 64

diff --git a/tests/python/tirx/codegen/test_codegen_cuda.py b/tests/python/tirx/codegen/test_codegen_cuda.py
@@ -23,6 +23,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.device("cuda")
 
 
@@ -60,6 +62,7 @@ def main(A: Tx.Buffer((4,), "int32")):
     assert "break;" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_cluster_cta_id_codegen_uses_coordinate_sregs():
     @Tx.prim_func
     def main(A: Tx.Buffer((1,), "int32")):
@@ -139,6 +142,7 @@ def main(
     assert "ld.volatile.global.u64" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_megamoe_extracted_intrinsics_codegen():
     @Tx.prim_func
     def main(
@@ -245,6 +249,7 @@ def main(
         assert snippet in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_cp_async_bulk_non_tma_form_codegen():
     @Tx.prim_func
     def main(
@@ -274,6 +279,7 @@ def main(
     assert "unsigned long long cache_policy" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_tensor_map_param_codegen():
     @Tx.prim_func
     def main(A_map: Tx.TensorMap()):
@@ -288,6 +294,7 @@ def main(A_map: Tx.TensorMap()):
     assert "((unsigned long long)(&(A_map)))" in src
 
 
+@tvm.testing.requires_cuda_compute_version(10)
 def test_tma_cache_policy_operand_codegen():
     @Tx.prim_func
     def main(Cache: Tx.Buffer((1,), "uint64")):
@@ -528,6 +535,7 @@ def func(A_ptr: Tx.handle):
 @pytest.mark.parametrize("prefetch_size", [-1, 64, 128, 256])
 @pytest.mark.parametrize("predicate", [-1, Tx.int32(0), Tx.int32(1)])
 @pytest.mark.parametrize("fill_mode", ["", "zero"])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_cp_async(cp_size, cache_hint, prefetch_size, predicate, fill_mode):
     if fill_mode != "" and predicate == -1:
         return
@@ -569,6 +577,7 @@ def main(A: Tx.Buffer((N), "float16")):
 
 @pytest.mark.parametrize("trans", [False, True])
 @pytest.mark.parametrize("num", [1, 2, 4])
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_ptx_ldmatrix(trans, num):
     dtype = ".b16"
 
@@ -642,6 +651,7 @@ def main(A: Tx.Buffer((16, 16), "float16"), B: Tx.Buffer((16, 16), "float16")):
 
 @pytest.mark.parametrize("d_type", ["float16", "float32"])
 @pytest.mark.parametrize("no_c_ptr", [False, True])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_mma_half_m16n8k16(d_type, no_c_ptr):
     shape = "m16n8k16"
     a_type = "float16"
@@ -733,6 +743,7 @@ def L2G(buf_local, buf_global, block_8x8):
 
 @pytest.mark.parametrize("d_type", ["float16", "float32"])
 @pytest.mark.parametrize("no_c_ptr", [False, True])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_mma_half_m16n8k8(d_type, no_c_ptr):
     shape = "m16n8k8"
     a_type = "float16"

diff --git a/tests/python/tirx/codegen/test_codegen_dsmem.py b/tests/python/tirx/codegen/test_codegen_dsmem.py
@@ -21,6 +21,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> str:
     target = tvm.target.Target("cuda")

diff --git a/tests/python/tirx/codegen/test_codegen_hopper.py b/tests/python/tirx/codegen/test_codegen_hopper.py
@@ -25,6 +25,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx import Buffer
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> tuple[str, tvm.IRModule]:
     target = tvm.target.Target("cuda")
@@ -58,7 +60,6 @@ def main(A_ptr: Tx.handle):
 
 
 @pytest.mark.parametrize("inc", [False, True])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_setmaxnreg(inc):
     # fmt: off
     @Tx.prim_func
@@ -79,7 +80,6 @@ def func(A: Tx.Buffer(1)):
 
 
 @pytest.mark.parametrize("trans", [False, True])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_stmatrix_sync_aligned(trans):
     # fmt: off
     @Tx.prim_func
@@ -196,7 +196,6 @@ def main(A: Tx.Buffer((16, 16), "float16")):
     np.testing.assert_allclose(A.numpy(), A_ref)
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_bar_arrive():
     # fmt: off
     @Tx.prim_func
@@ -213,7 +212,6 @@ def func(A: Tx.Buffer(1)):
     assert 'bar.arrive %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_bar_sync():
     # fmt: off
     @Tx.prim_func
@@ -230,7 +228,6 @@ def func(A: Tx.Buffer(1)):
     assert 'bar.sync %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_fence_mbarrier_init_release_clsuter():
     # fmt: off
     @Tx.prim_func
@@ -246,7 +243,6 @@ def func(A: Tx.Buffer(1)):
     assert "fence.mbarrier_init.release.cluster" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_elect_sync():
     # fmt: off
     @Tx.prim_func
@@ -264,7 +260,6 @@ def func(A: Tx.Buffer(1)):
     assert "elect.sync %%rx|%%px, %2;" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("sem,scope", [("sc", "cta"), ("acq_rel", "gpu"), ("sc", "sys")])
 def test_ptx_fence(sem, scope):
     # fmt: off
@@ -281,7 +276,6 @@ def func(A: Tx.Buffer(1)):
     assert f"fence.{sem}.{scope};" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_fence_proxy_async():
     # fmt: off
     @Tx.prim_func
@@ -300,7 +294,6 @@ def func(A: Tx.Buffer(1)):
     assert "fence.proxy.async.shared::cta" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("dtype", ["float16", "float32", "float8_e4m3fn", "float8_e5m2"])
 @pytest.mark.parametrize(
     "inputs",
@@ -385,7 +378,6 @@ def get_np_dtype(dtype):
     assert np.allclose(A.numpy().astype("float32"), B.numpy().astype("float32"))
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize(
     ("shape", "dtype", "encode_args", "error_msg"),
     [
@@ -461,7 +453,6 @@ def test_tensormap_encode_tiled_runtime_validation(shape, dtype, encode_args, er
 
 @pytest.mark.parametrize("swizzle", [1, 2, 3])
 @pytest.mark.parametrize("dtype", ["uint8", "float16", "float32"])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_swizzle(swizzle, dtype):
     def get_ir(swizzle, dtype):
         dtype = tvm.DataType(dtype)
@@ -560,7 +551,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_multicast1(inputs):
     # 1 CTA does the copy, and then multicast to all CTAs in the cluster
     def get_ir(shape, tma_args):
@@ -635,7 +625,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 1, 1, 1, 1, 0, 0, 0, 0]),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_multicast2(inputs):
     # 4 CTAs in the cluster do the copy of separate chunks, and then multicast to all CTAs in the cluster  # noqa: E501
     def get_ir(shape, tma_args):
@@ -726,7 +715,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 4, 1, 1, 1, 0, 0, 0, 0]),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_shared_to_global(inputs):
     def get_ir(shape, tma_args):
         assert shape[0] % 4 == 0
@@ -1092,7 +1080,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle, C_ptr: Tx.handle):
     tvm.testing.assert_allclose(C_tvm.numpy(), C_ref, rtol=1e-3, atol=1e-3)
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_map_shared_rank():
     @Tx.prim_func
     def func(A: Tx.Buffer(1)):

diff --git a/tests/python/tirx/codegen/test_codegen_nvshmem.py b/tests/python/tirx/codegen/test_codegen_nvshmem.py
@@ -29,6 +29,8 @@
 from tvm.runtime import disco as di
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 NUM_WORKERS = 4
 
 

diff --git a/tests/python/tirx/codegen/test_cuda_copy.py b/tests/python/tirx/codegen/test_cuda_copy.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 

diff --git a/tests/python/tirx/codegen/test_cuda_cta_reduce.py b/tests/python/tirx/codegen/test_cuda_cta_reduce.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 

diff --git a/tests/python/tirx/codegen/test_cuda_warp_reduce.py b/tests/python/tirx/codegen/test_cuda_warp_reduce.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 

diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py b/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
@@ -24,6 +24,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout, wg_local_layout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "input",

diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py
@@ -25,6 +25,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "task",