diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 3b78278de120..ab5fd1d0be2f 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1225,14 +1225,13 @@ def inner(func):
     return inner
 
 
-def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
-    """Mark a test as requiring at least a compute architecture
+def requires_cuda_compute_version_marks(major_version, minor_version=0, exact=False):
+    """Return pytest marks requiring at least a CUDA compute architecture.
 
-    Unit test marked with this decorator will run only if the CUDA
-    compute architecture of the GPU is at least `(major_version,
-    minor_version)`.
+    Tests marked with the returned marks will run only if the CUDA compute
+    architecture of the GPU is at least `(major_version, minor_version)`.
 
-    This also marks the test as requiring a cuda support.
+    This also marks the test as requiring CUDA support.
 
     Parameters
     ----------
@@ -1255,7 +1254,7 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
 
     min_version_str = ".".join(str(v) for v in min_version)
     compute_version_str = ".".join(str(v) for v in compute_version)
-    requires = [
+    return [
         pytest.mark.skipif(
             compute_version < min_version or (exact and compute_version != min_version),
             reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}",
@@ -1263,6 +1262,28 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
         *requires_cuda.marks(),
     ]
 
+
+def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
+    """Mark a test as requiring at least a compute architecture
+
+    Unit test marked with this decorator will run only if the CUDA
+    compute architecture of the GPU is at least `(major_version,
+    minor_version)`.
+
+    This also marks the test as requiring a cuda support.
+
+    Parameters
+    ----------
+    major_version: int
+
+        The major version of the (major,minor) version tuple.
+
+    minor_version: int
+
+        The minor version of the (major,minor) version tuple.
+    """
+    requires = requires_cuda_compute_version_marks(major_version, minor_version, exact)
+
     def inner(func):
         return _compose([func], requires)
 
diff --git a/tests/python/tirx/codegen/test_codegen_blackwell.py b/tests/python/tirx/codegen/test_codegen_blackwell.py
index 22d0705c145c..5ba14a33c772 100644
--- a/tests/python/tirx/codegen/test_codegen_blackwell.py
+++ b/tests/python/tirx/codegen/test_codegen_blackwell.py
@@ -22,6 +22,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> str:
     target = tvm.target.Target("cuda")
@@ -31,7 +33,6 @@ def _get_source(func: tvm.tirx.PrimFunc) -> str:
     return src, mod
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tmem_alloc_dealloc_relinquish():
     N_COLS = 512
     cta_group = 1
@@ -69,7 +70,6 @@ def test_tmem(A: Tx.Buffer((16, 16), "float16")):
         assert f"tcgen05.relinquish_alloc_permit.cta_group::{cta_group}.sync.aligned" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_mbarrier_try_wait_once_codegen():
     # fmt: off
     @Tx.prim_func
@@ -89,7 +89,6 @@ def test_try_wait_once(A: Tx.Buffer((16, 16), "float16")):
         assert "selp.u32" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_fence_before_after_thread_sync():
     # fmt: off
     @Tx.prim_func
@@ -112,7 +111,6 @@ def test_fence(A: Tx.Buffer((16, 16), "float16")):
         assert "tcgen05.fence::before_thread_sync" in src
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_ld_st_roundtrip():
     HEIGHT = 128
     WIDTH = 256
@@ -182,7 +180,6 @@ def test_ld_st(A: Tx.Buffer((HEIGHT, WIDTH), "float32"), B: Tx.Buffer((HEIGHT, W
         np.testing.assert_allclose(A.numpy(), B.numpy())
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_cp_ld_roundtrip():
     dtype = "float32"
     dtype_bits = tvm.DataType(dtype).bits
@@ -271,7 +268,6 @@ def test_cp_ld(A: Tx.Buffer((HEIGHT, WIDTH), dtype, layout=Tx.TileLayout(Tx.S[(H
 
 
 @pytest.mark.parametrize("swizzle", [0, 1, 2, 3])
-@tvm.testing.requires_cuda_compute_version(10)
 def test_tcgen05_mma_ss_no_tma(swizzle):
     d_type, a_type, b_type = "float32", "float16", "float16"
     M, N, K = 128, 128, 64
diff --git a/tests/python/tirx/codegen/test_codegen_cuda.py b/tests/python/tirx/codegen/test_codegen_cuda.py
index 826a6e4e5e4a..69fd084700f6 100644
--- a/tests/python/tirx/codegen/test_codegen_cuda.py
+++ b/tests/python/tirx/codegen/test_codegen_cuda.py
@@ -23,6 +23,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.device("cuda")
 
 
@@ -60,6 +62,7 @@ def main(A: Tx.Buffer((4,), "int32")):
     assert "break;" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_cluster_cta_id_codegen_uses_coordinate_sregs():
     @Tx.prim_func
     def main(A: Tx.Buffer((1,), "int32")):
@@ -139,6 +142,7 @@ def main(
     assert "ld.volatile.global.u64" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_megamoe_extracted_intrinsics_codegen():
     @Tx.prim_func
     def main(
@@ -245,6 +249,7 @@ def main(
         assert snippet in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_cp_async_bulk_non_tma_form_codegen():
     @Tx.prim_func
     def main(
@@ -274,6 +279,7 @@ def main(
     assert "unsigned long long cache_policy" in src
 
 
+@tvm.testing.requires_cuda_compute_version(9)
 def test_tensor_map_param_codegen():
     @Tx.prim_func
     def main(A_map: Tx.TensorMap()):
@@ -288,6 +294,7 @@ def main(A_map: Tx.TensorMap()):
     assert "((unsigned long long)(&(A_map)))" in src
 
 
+@tvm.testing.requires_cuda_compute_version(10)
 def test_tma_cache_policy_operand_codegen():
     @Tx.prim_func
     def main(Cache: Tx.Buffer((1,), "uint64")):
@@ -528,6 +535,7 @@ def func(A_ptr: Tx.handle):
 @pytest.mark.parametrize("prefetch_size", [-1, 64, 128, 256])
 @pytest.mark.parametrize("predicate", [-1, Tx.int32(0), Tx.int32(1)])
 @pytest.mark.parametrize("fill_mode", ["", "zero"])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_cp_async(cp_size, cache_hint, prefetch_size, predicate, fill_mode):
     if fill_mode != "" and predicate == -1:
         return
@@ -569,6 +577,7 @@ def main(A: Tx.Buffer((N), "float16")):
 
 @pytest.mark.parametrize("trans", [False, True])
 @pytest.mark.parametrize("num", [1, 2, 4])
+@tvm.testing.requires_cuda_compute_version(7, 5)
 def test_ptx_ldmatrix(trans, num):
     dtype = ".b16"
 
@@ -642,6 +651,7 @@ def main(A: Tx.Buffer((16, 16), "float16"), B: Tx.Buffer((16, 16), "float16")):
 
 @pytest.mark.parametrize("d_type", ["float16", "float32"])
 @pytest.mark.parametrize("no_c_ptr", [False, True])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_mma_half_m16n8k16(d_type, no_c_ptr):
     shape = "m16n8k16"
     a_type = "float16"
@@ -733,6 +743,7 @@ def L2G(buf_local, buf_global, block_8x8):
 
 @pytest.mark.parametrize("d_type", ["float16", "float32"])
 @pytest.mark.parametrize("no_c_ptr", [False, True])
+@tvm.testing.requires_cuda_compute_version(8)
 def test_ptx_mma_half_m16n8k8(d_type, no_c_ptr):
     shape = "m16n8k8"
     a_type = "float16"
diff --git a/tests/python/tirx/codegen/test_codegen_dsmem.py b/tests/python/tirx/codegen/test_codegen_dsmem.py
index 926da724fe50..7ecf28cebe12 100644
--- a/tests/python/tirx/codegen/test_codegen_dsmem.py
+++ b/tests/python/tirx/codegen/test_codegen_dsmem.py
@@ -21,6 +21,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> str:
     target = tvm.target.Target("cuda")
diff --git a/tests/python/tirx/codegen/test_codegen_hopper.py b/tests/python/tirx/codegen/test_codegen_hopper.py
index b7d24a2d2e0d..3059f651c968 100644
--- a/tests/python/tirx/codegen/test_codegen_hopper.py
+++ b/tests/python/tirx/codegen/test_codegen_hopper.py
@@ -25,6 +25,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx import Buffer
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 
 def _get_source(func: tvm.tirx.PrimFunc) -> tuple[str, tvm.IRModule]:
     target = tvm.target.Target("cuda")
@@ -58,7 +60,6 @@ def main(A_ptr: Tx.handle):
 
 
 @pytest.mark.parametrize("inc", [False, True])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_setmaxnreg(inc):
     # fmt: off
     @Tx.prim_func
@@ -79,7 +80,6 @@ def func(A: Tx.Buffer(1)):
 
 
 @pytest.mark.parametrize("trans", [False, True])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_stmatrix_sync_aligned(trans):
     # fmt: off
     @Tx.prim_func
@@ -196,7 +196,6 @@ def main(A: Tx.Buffer((16, 16), "float16")):
     np.testing.assert_allclose(A.numpy(), A_ref)
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_bar_arrive():
     # fmt: off
     @Tx.prim_func
@@ -213,7 +212,6 @@ def func(A: Tx.Buffer(1)):
     assert 'bar.arrive %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_bar_sync():
     # fmt: off
     @Tx.prim_func
@@ -230,7 +228,6 @@ def func(A: Tx.Buffer(1)):
     assert 'bar.sync %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_fence_mbarrier_init_release_clsuter():
     # fmt: off
     @Tx.prim_func
@@ -246,7 +243,6 @@ def func(A: Tx.Buffer(1)):
     assert "fence.mbarrier_init.release.cluster" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_elect_sync():
     # fmt: off
     @Tx.prim_func
@@ -264,7 +260,6 @@ def func(A: Tx.Buffer(1)):
     assert "elect.sync %%rx|%%px, %2;" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("sem,scope", [("sc", "cta"), ("acq_rel", "gpu"), ("sc", "sys")])
 def test_ptx_fence(sem, scope):
     # fmt: off
@@ -281,7 +276,6 @@ def func(A: Tx.Buffer(1)):
     assert f"fence.{sem}.{scope};" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_fence_proxy_async():
     # fmt: off
     @Tx.prim_func
@@ -300,7 +294,6 @@ def func(A: Tx.Buffer(1)):
     assert "fence.proxy.async.shared::cta" in src
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("dtype", ["float16", "float32", "float8_e4m3fn", "float8_e5m2"])
 @pytest.mark.parametrize(
     "inputs",
@@ -385,7 +378,6 @@ def get_np_dtype(dtype):
     assert np.allclose(A.numpy().astype("float32"), B.numpy().astype("float32"))
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize(
     ("shape", "dtype", "encode_args", "error_msg"),
     [
@@ -461,7 +453,6 @@ def test_tensormap_encode_tiled_runtime_validation(shape, dtype, encode_args, er
 
 @pytest.mark.parametrize("swizzle", [1, 2, 3])
 @pytest.mark.parametrize("dtype", ["uint8", "float16", "float32"])
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_swizzle(swizzle, dtype):
     def get_ir(swizzle, dtype):
         dtype = tvm.DataType(dtype)
@@ -560,7 +551,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_multicast1(inputs):
     # 1 CTA does the copy, and then multicast to all CTAs in the cluster
     def get_ir(shape, tma_args):
@@ -635,7 +625,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 1, 1, 1, 1, 0, 0, 0, 0]),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_global_to_shared_multicast2(inputs):
     # 4 CTAs in the cluster do the copy of separate chunks, and then multicast to all CTAs in the cluster  # noqa: E501
     def get_ir(shape, tma_args):
@@ -726,7 +715,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
         ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 4, 1, 1, 1, 0, 0, 0, 0]),
     ],
 )
-@tvm.testing.requires_cuda_compute_version(9)
 def test_cp_async_bulk_tensor_shared_to_global(inputs):
     def get_ir(shape, tma_args):
         assert shape[0] % 4 == 0
@@ -1092,7 +1080,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle, C_ptr: Tx.handle):
     tvm.testing.assert_allclose(C_tvm.numpy(), C_ref, rtol=1e-3, atol=1e-3)
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 def test_ptx_map_shared_rank():
     @Tx.prim_func
     def func(A: Tx.Buffer(1)):
diff --git a/tests/python/tirx/codegen/test_codegen_nvshmem.py b/tests/python/tirx/codegen/test_codegen_nvshmem.py
index 6e48246d53a1..2a8244faaa3a 100644
--- a/tests/python/tirx/codegen/test_codegen_nvshmem.py
+++ b/tests/python/tirx/codegen/test_codegen_nvshmem.py
@@ -29,6 +29,8 @@
 from tvm.runtime import disco as di
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 NUM_WORKERS = 4
 
 
diff --git a/tests/python/tirx/codegen/test_cuda_copy.py b/tests/python/tirx/codegen/test_cuda_copy.py
index 83e7d98040e9..499f53023805 100644
--- a/tests/python/tirx/codegen/test_cuda_copy.py
+++ b/tests/python/tirx/codegen/test_cuda_copy.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 
diff --git a/tests/python/tirx/codegen/test_cuda_cta_reduce.py b/tests/python/tirx/codegen/test_cuda_cta_reduce.py
index bbffc92f4f58..24777500f152 100644
--- a/tests/python/tirx/codegen/test_cuda_cta_reduce.py
+++ b/tests/python/tirx/codegen/test_cuda_cta_reduce.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 
diff --git a/tests/python/tirx/codegen/test_cuda_warp_reduce.py b/tests/python/tirx/codegen/test_cuda_warp_reduce.py
index a1aa7dab2218..fe9decc6cd39 100644
--- a/tests/python/tirx/codegen/test_cuda_warp_reduce.py
+++ b/tests/python/tirx/codegen/test_cuda_warp_reduce.py
@@ -20,8 +20,11 @@
 import pytest
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 DEV = tvm.cuda(0)
 TARGET = tvm.target.Target("cuda")
 
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py b/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
index 368137f63142..1cd150e7a6c8 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
@@ -24,6 +24,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout, wg_local_layout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "input",
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py
index 1690b3b4e487..4c91791fe035 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py
@@ -25,6 +25,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "task",
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py
index 40b0cad87d98..bba33c7f1bef 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py
@@ -38,6 +38,8 @@
 from tvm.tirx.stmt import DeclBuffer, TilePrimitiveCall
 from tvm.tirx.stmt_functor import StmtExprVisitor
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 # ===========================================================================
 # Helpers
 # ===========================================================================
@@ -1045,7 +1047,6 @@ def test_copy_tma_codegen(case):
 # ===========================================================================
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("swizzle_len", [3])
 @pytest.mark.parametrize("dtype", ["float16"])
 def test_copy_tma_symbolic_dimension(dtype, swizzle_len):
@@ -1149,7 +1150,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None:
         np.testing.assert_allclose(B_ref, B.numpy())
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("swizzle_len", [3])
 @pytest.mark.parametrize("dtype", ["float16"])
 def test_copy_tma_3d_with_view(dtype, swizzle_len):
@@ -1261,7 +1261,6 @@ def copy_async(Q_ptr: Tx.handle, B_ptr: Tx.handle) -> None:
 # ===========================================================================
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize(
     "task",
     [
@@ -1447,7 +1446,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None:
             np.testing.assert_allclose(B_ref, B.numpy())
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("dtype", ["float16"])
 def test_copy_tma_gpu_smoke_s2g(dtype):
     """Smoke test: compile and run TMA S2G store on GPU."""
@@ -1513,7 +1511,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None:
         np.testing.assert_allclose(A_np, B.numpy())
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("dtype", ["float16"])
 def test_copy_tma_dynamic_cta_mask(dtype):
     """Regression test for B00004: dynamic cta_mask expression in TMA multicast.
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py
index 6cd6c38dc906..3de044c540f1 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py
@@ -26,6 +26,8 @@
 from tvm.tirx.layout import S, TCol, TileLayout, TLane
 from tvm.tirx.layout import tid_in_wg as axis_tid_in_wg
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)
+
 
 @pytest.mark.parametrize("dtype", ["float16", "float32"])
 @pytest.mark.parametrize("width_32b", [4, 8, 16, 32])
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py
index bf045c5969ce..20ef0f50af32 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py
@@ -38,6 +38,8 @@
 from tvm.tirx.operator.tile_primitive.ops import CopyAsync
 from tvm.tirx.stmt_functor import StmtExprVisitor
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)
+
 
 def _make_dsmem_dispatch_call(shape, dtype, src_layout, dst_layout):
     """Call copy_dsmem_impl directly. Returns impl or raises DispatchFail."""
@@ -121,7 +123,6 @@ def _layout_physical_elements(layout):
     return max_offset + 1
 
 
-@tvm.testing.requires_cuda_compute_version(9)
 @pytest.mark.parametrize("shape,dtype,src_spec,dst_spec,expected", DSMEM_CONFIGS)
 def test_dsmem(shape, dtype, src_spec, dst_spec, expected):
     """Dispatch assertion + GPU correctness for DSMEM copy.
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py
index 0da2c2ef4de6..90af8050a6c9 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py
@@ -24,6 +24,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import ComposeLayout, S, SwizzleLayout, TCol, TileLayout, TLane, tid_in_wg
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 ml_dtypes_dict = {
     "float8_e4m3fn": ml_dtypes.float8_e4m3fn,
     "float8_e5m2": ml_dtypes.float8_e5m2,
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py b/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py
index 78222fc608ec..fb27ad62a427 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py
@@ -27,6 +27,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout, wg_local_layout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 def _get_sm_version():
     target = tvm.target.Target("cuda")
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py b/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py
index 164a903b96a8..7d06c896ab4d 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py
@@ -40,6 +40,8 @@
     mma_shared_layout,
 )
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)
+
 # ---------------------------------------------------------------------------
 # Shared test helpers
 # ---------------------------------------------------------------------------
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py b/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py
index 3cea1eb9d69f..09bcbb4cc999 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py
@@ -24,6 +24,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import S, TileLayout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 ml_dtypes_dict = {
     "float8_e4m3fn": ml_dtypes.float8_e4m3fn,
     "float8_e5m2": ml_dtypes.float8_e5m2,
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py b/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py
index 4f147804fbb8..c219ccc570b6 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py
@@ -22,6 +22,8 @@
 from tvm.script import tirx as Tx
 from tvm.tirx.layout import R, S, TileLayout, laneid, wg_local_layout
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "src_shape, dst_shape, axes, st_src, st_dst, extent_src, extent_dst",
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py b/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py
index 65fa3a37c36f..db659f280a5b 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py
@@ -33,6 +33,8 @@
 from tvm.tirx.layout import R, S, TCol, TileLayout, TLane
 from tvm.tirx.operator.tile_primitive.cuda.tma_utils import SwizzleMode, mma_shared_layout
 
+pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)
+
 T_LAY_BASIC = TileLayout(S[(32, 16) : (1 @ TLane, 1 @ TCol)] + R[4 : 32 @ TLane])
 
 
@@ -236,7 +238,6 @@ def _execute(kernel, A_init, expected):
     )
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 @pytest.mark.parametrize(
     "name,s_full,s_full_shape,s_region",
     [
@@ -293,7 +294,6 @@ def test_single_cp(name, s_full, s_full_shape, s_region):
     _run_2d(s_full, T_LAY_BASIC, s_full_shape, s_region, "uint8", A_np, expected)
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_multi_cp_sw0_4tiles():
     s_full = TileLayout(S[(4, 32, 16) : (512, 16, 1)])
     t_full = TileLayout(S[(4, 32, 16) : (16 @ TCol, 1 @ TLane, 1 @ TCol)] + R[4 : 32 @ TLane])
@@ -302,7 +302,6 @@ def test_multi_cp_sw0_4tiles():
     _run_3d_4tile(s_full, t_full, [4, 32, 16], "uint8", A_np, expected)
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 def test_align_middle_2_to_1_nvfp4_sfb():
     """SFB-style nvfp4 case: TMEM mid canonicalizes to single iter
     (16@TCol + 4@TCol merge), but SMEM mid stays as 2 iters
@@ -424,7 +423,6 @@ def kernel(A_ptr: Tx.handle, B_ptr: Tx.handle):
     _execute(kernel, A_np, expected)
 
 
-@tvm.testing.requires_cuda_compute_version(10)
 @pytest.mark.parametrize(
     "bad",
     [
diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py b/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py
index 13a2f128c78c..8e40c48e69b5 100644
--- a/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py
+++ b/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py
@@ -25,6 +25,8 @@
     cast_layout_supported_for_local as _cast_layout_supported_for_local,
 )
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 @pytest.mark.parametrize(
     "input",
diff --git a/tests/python/tirx/test_bench_utils.py b/tests/python/tirx/test_bench_utils.py
index 75fbaccb7fb9..2b9825cdadf4 100644
--- a/tests/python/tirx/test_bench_utils.py
+++ b/tests/python/tirx/test_bench_utils.py
@@ -161,7 +161,7 @@ def test_compute_groups_small_tensors():
 def test_compute_groups_large_tensors():
     """Inputs >= 3x L2 need only 1 group."""
     # 16384x16384 fp32 = 1GB >> 3*128MB = 384MB
-    input_bytes = tensor_bytes(torch.empty(16384, 16384, dtype=torch.float32))
+    input_bytes = 16384 * 16384 * 4
     n = _compute_group_count(input_bytes, l2_bytes=128 * 1024 * 1024)
     assert n == 1
 
@@ -169,7 +169,7 @@ def test_compute_groups_large_tensors():
 def test_compute_groups_moderate_tensors():
     """Moderate tensors: floor(3*L2 / input) + 1."""
     # 8192x8192 bf16 = 128MB.  floor(384M / 128M) + 1 = 4
-    input_bytes = tensor_bytes(torch.empty(8192, 8192, dtype=torch.bfloat16))
+    input_bytes = 8192 * 8192 * 2
     n = _compute_group_count(input_bytes, l2_bytes=128 * 1024 * 1024)
     assert n == 4
 
diff --git a/tests/python/tirx/test_buffer_print.py b/tests/python/tirx/test_buffer_print.py
index 1049a9d486a5..8503cbe266a7 100644
--- a/tests/python/tirx/test_buffer_print.py
+++ b/tests/python/tirx/test_buffer_print.py
@@ -23,6 +23,8 @@
 import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 def generate_random_data(shape, dtype):
     np.random.seed(0)
diff --git a/tests/python/tirx/test_control_flow.py b/tests/python/tirx/test_control_flow.py
index 2545f795080d..a5bea2c56947 100644
--- a/tests/python/tirx/test_control_flow.py
+++ b/tests/python/tirx/test_control_flow.py
@@ -17,8 +17,11 @@
 import numpy as np
 
 import tvm
+import tvm.testing
 from tvm.script import tirx as Tx
 
+pytestmark = tvm.testing.requires_cuda.marks()
+
 
 def run_test_break_continue(func, shape, expected):
     dev = tvm.cuda(0)
diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh
index ca092d10d657..69e525ee7542 100755
--- a/tests/scripts/task_python_unittest.sh
+++ b/tests/scripts/task_python_unittest.sh
@@ -54,6 +54,7 @@ TEST_FILES=(
   "tirx-analysis"
   "tirx-base"
   "tirx-transform"
+  "tirx"
   "tvmscript"
   "relax"
 )