diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 3b78278de120..ab5fd1d0be2f 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1225,14 +1225,13 @@ def inner(func): return inner -def requires_cuda_compute_version(major_version, minor_version=0, exact=False): - """Mark a test as requiring at least a compute architecture +def requires_cuda_compute_version_marks(major_version, minor_version=0, exact=False): + """Return pytest marks requiring at least a CUDA compute architecture. - Unit test marked with this decorator will run only if the CUDA - compute architecture of the GPU is at least `(major_version, - minor_version)`. + Tests marked with the returned marks will run only if the CUDA compute + architecture of the GPU is at least `(major_version, minor_version)`. - This also marks the test as requiring a cuda support. + This also marks the test as requiring CUDA support. Parameters ---------- @@ -1255,7 +1254,7 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False): min_version_str = ".".join(str(v) for v in min_version) compute_version_str = ".".join(str(v) for v in compute_version) - requires = [ + return [ pytest.mark.skipif( compute_version < min_version or (exact and compute_version != min_version), reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}", @@ -1263,6 +1262,28 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False): *requires_cuda.marks(), ] + +def requires_cuda_compute_version(major_version, minor_version=0, exact=False): + """Mark a test as requiring at least a compute architecture + + Unit test marked with this decorator will run only if the CUDA + compute architecture of the GPU is at least `(major_version, + minor_version)`. + + This also marks the test as requiring a cuda support. + + Parameters + ---------- + major_version: int + + The major version of the (major,minor) version tuple. + + minor_version: int + + The minor version of the (major,minor) version tuple. + """ + requires = requires_cuda_compute_version_marks(major_version, minor_version, exact) + def inner(func): return _compose([func], requires) diff --git a/tests/python/tirx/codegen/test_codegen_blackwell.py b/tests/python/tirx/codegen/test_codegen_blackwell.py index 22d0705c145c..5ba14a33c772 100644 --- a/tests/python/tirx/codegen/test_codegen_blackwell.py +++ b/tests/python/tirx/codegen/test_codegen_blackwell.py @@ -22,6 +22,8 @@ import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda_compute_version_marks(10) + def _get_source(func: tvm.tirx.PrimFunc) -> str: target = tvm.target.Target("cuda") @@ -31,7 +33,6 @@ def _get_source(func: tvm.tirx.PrimFunc) -> str: return src, mod -@tvm.testing.requires_cuda_compute_version(10) def test_tmem_alloc_dealloc_relinquish(): N_COLS = 512 cta_group = 1 @@ -69,7 +70,6 @@ def test_tmem(A: Tx.Buffer((16, 16), "float16")): assert f"tcgen05.relinquish_alloc_permit.cta_group::{cta_group}.sync.aligned" in src -@tvm.testing.requires_cuda_compute_version(10) def test_mbarrier_try_wait_once_codegen(): # fmt: off @Tx.prim_func @@ -89,7 +89,6 @@ def test_try_wait_once(A: Tx.Buffer((16, 16), "float16")): assert "selp.u32" in src -@tvm.testing.requires_cuda_compute_version(10) def test_fence_before_after_thread_sync(): # fmt: off @Tx.prim_func @@ -112,7 +111,6 @@ def test_fence(A: Tx.Buffer((16, 16), "float16")): assert "tcgen05.fence::before_thread_sync" in src -@tvm.testing.requires_cuda_compute_version(10) def test_tcgen05_ld_st_roundtrip(): HEIGHT = 128 WIDTH = 256 @@ -182,7 +180,6 @@ def test_ld_st(A: Tx.Buffer((HEIGHT, WIDTH), "float32"), B: Tx.Buffer((HEIGHT, W np.testing.assert_allclose(A.numpy(), B.numpy()) -@tvm.testing.requires_cuda_compute_version(10) def test_tcgen05_cp_ld_roundtrip(): dtype = "float32" dtype_bits = tvm.DataType(dtype).bits @@ -271,7 +268,6 @@ def test_cp_ld(A: Tx.Buffer((HEIGHT, WIDTH), dtype, layout=Tx.TileLayout(Tx.S[(H @pytest.mark.parametrize("swizzle", [0, 1, 2, 3]) -@tvm.testing.requires_cuda_compute_version(10) def test_tcgen05_mma_ss_no_tma(swizzle): d_type, a_type, b_type = "float32", "float16", "float16" M, N, K = 128, 128, 64 diff --git a/tests/python/tirx/codegen/test_codegen_cuda.py b/tests/python/tirx/codegen/test_codegen_cuda.py index 826a6e4e5e4a..69fd084700f6 100644 --- a/tests/python/tirx/codegen/test_codegen_cuda.py +++ b/tests/python/tirx/codegen/test_codegen_cuda.py @@ -23,6 +23,8 @@ import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + DEV = tvm.device("cuda") @@ -60,6 +62,7 @@ def main(A: Tx.Buffer((4,), "int32")): assert "break;" in src +@tvm.testing.requires_cuda_compute_version(9) def test_cluster_cta_id_codegen_uses_coordinate_sregs(): @Tx.prim_func def main(A: Tx.Buffer((1,), "int32")): @@ -139,6 +142,7 @@ def main( assert "ld.volatile.global.u64" in src +@tvm.testing.requires_cuda_compute_version(9) def test_megamoe_extracted_intrinsics_codegen(): @Tx.prim_func def main( @@ -245,6 +249,7 @@ def main( assert snippet in src +@tvm.testing.requires_cuda_compute_version(9) def test_ptx_cp_async_bulk_non_tma_form_codegen(): @Tx.prim_func def main( @@ -274,6 +279,7 @@ def main( assert "unsigned long long cache_policy" in src +@tvm.testing.requires_cuda_compute_version(9) def test_tensor_map_param_codegen(): @Tx.prim_func def main(A_map: Tx.TensorMap()): @@ -288,6 +294,7 @@ def main(A_map: Tx.TensorMap()): assert "((unsigned long long)(&(A_map)))" in src +@tvm.testing.requires_cuda_compute_version(10) def test_tma_cache_policy_operand_codegen(): @Tx.prim_func def main(Cache: Tx.Buffer((1,), "uint64")): @@ -528,6 +535,7 @@ def func(A_ptr: Tx.handle): @pytest.mark.parametrize("prefetch_size", [-1, 64, 128, 256]) @pytest.mark.parametrize("predicate", [-1, Tx.int32(0), Tx.int32(1)]) @pytest.mark.parametrize("fill_mode", ["", "zero"]) +@tvm.testing.requires_cuda_compute_version(8) def test_ptx_cp_async(cp_size, cache_hint, prefetch_size, predicate, fill_mode): if fill_mode != "" and predicate == -1: return @@ -569,6 +577,7 @@ def main(A: Tx.Buffer((N), "float16")): @pytest.mark.parametrize("trans", [False, True]) @pytest.mark.parametrize("num", [1, 2, 4]) +@tvm.testing.requires_cuda_compute_version(7, 5) def test_ptx_ldmatrix(trans, num): dtype = ".b16" @@ -642,6 +651,7 @@ def main(A: Tx.Buffer((16, 16), "float16"), B: Tx.Buffer((16, 16), "float16")): @pytest.mark.parametrize("d_type", ["float16", "float32"]) @pytest.mark.parametrize("no_c_ptr", [False, True]) +@tvm.testing.requires_cuda_compute_version(8) def test_ptx_mma_half_m16n8k16(d_type, no_c_ptr): shape = "m16n8k16" a_type = "float16" @@ -733,6 +743,7 @@ def L2G(buf_local, buf_global, block_8x8): @pytest.mark.parametrize("d_type", ["float16", "float32"]) @pytest.mark.parametrize("no_c_ptr", [False, True]) +@tvm.testing.requires_cuda_compute_version(8) def test_ptx_mma_half_m16n8k8(d_type, no_c_ptr): shape = "m16n8k8" a_type = "float16" diff --git a/tests/python/tirx/codegen/test_codegen_dsmem.py b/tests/python/tirx/codegen/test_codegen_dsmem.py index 926da724fe50..7ecf28cebe12 100644 --- a/tests/python/tirx/codegen/test_codegen_dsmem.py +++ b/tests/python/tirx/codegen/test_codegen_dsmem.py @@ -21,6 +21,8 @@ import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda_compute_version_marks(9) + def _get_source(func: tvm.tirx.PrimFunc) -> str: target = tvm.target.Target("cuda") diff --git a/tests/python/tirx/codegen/test_codegen_hopper.py b/tests/python/tirx/codegen/test_codegen_hopper.py index b7d24a2d2e0d..3059f651c968 100644 --- a/tests/python/tirx/codegen/test_codegen_hopper.py +++ b/tests/python/tirx/codegen/test_codegen_hopper.py @@ -25,6 +25,8 @@ from tvm.script import tirx as Tx from tvm.tirx import Buffer +pytestmark = tvm.testing.requires_cuda_compute_version_marks(9) + def _get_source(func: tvm.tirx.PrimFunc) -> tuple[str, tvm.IRModule]: target = tvm.target.Target("cuda") @@ -58,7 +60,6 @@ def main(A_ptr: Tx.handle): @pytest.mark.parametrize("inc", [False, True]) -@tvm.testing.requires_cuda_compute_version(9) def test_ptx_setmaxnreg(inc): # fmt: off @Tx.prim_func @@ -79,7 +80,6 @@ def func(A: Tx.Buffer(1)): @pytest.mark.parametrize("trans", [False, True]) -@tvm.testing.requires_cuda_compute_version(9) def test_stmatrix_sync_aligned(trans): # fmt: off @Tx.prim_func @@ -196,7 +196,6 @@ def main(A: Tx.Buffer((16, 16), "float16")): np.testing.assert_allclose(A.numpy(), A_ref) -@tvm.testing.requires_cuda_compute_version(9) def test_bar_arrive(): # fmt: off @Tx.prim_func @@ -213,7 +212,6 @@ def func(A: Tx.Buffer(1)): assert 'bar.arrive %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src -@tvm.testing.requires_cuda_compute_version(9) def test_bar_sync(): # fmt: off @Tx.prim_func @@ -230,7 +228,6 @@ def func(A: Tx.Buffer(1)): assert 'bar.sync %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src -@tvm.testing.requires_cuda_compute_version(9) def test_fence_mbarrier_init_release_clsuter(): # fmt: off @Tx.prim_func @@ -246,7 +243,6 @@ def func(A: Tx.Buffer(1)): assert "fence.mbarrier_init.release.cluster" in src -@tvm.testing.requires_cuda_compute_version(9) def test_ptx_elect_sync(): # fmt: off @Tx.prim_func @@ -264,7 +260,6 @@ def func(A: Tx.Buffer(1)): assert "elect.sync %%rx|%%px, %2;" in src -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("sem,scope", [("sc", "cta"), ("acq_rel", "gpu"), ("sc", "sys")]) def test_ptx_fence(sem, scope): # fmt: off @@ -281,7 +276,6 @@ def func(A: Tx.Buffer(1)): assert f"fence.{sem}.{scope};" in src -@tvm.testing.requires_cuda_compute_version(9) def test_fence_proxy_async(): # fmt: off @Tx.prim_func @@ -300,7 +294,6 @@ def func(A: Tx.Buffer(1)): assert "fence.proxy.async.shared::cta" in src -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("dtype", ["float16", "float32", "float8_e4m3fn", "float8_e5m2"]) @pytest.mark.parametrize( "inputs", @@ -385,7 +378,6 @@ def get_np_dtype(dtype): assert np.allclose(A.numpy().astype("float32"), B.numpy().astype("float32")) -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize( ("shape", "dtype", "encode_args", "error_msg"), [ @@ -461,7 +453,6 @@ def test_tensormap_encode_tiled_runtime_validation(shape, dtype, encode_args, er @pytest.mark.parametrize("swizzle", [1, 2, 3]) @pytest.mark.parametrize("dtype", ["uint8", "float16", "float32"]) -@tvm.testing.requires_cuda_compute_version(9) def test_cp_async_bulk_tensor_global_to_shared_swizzle(swizzle, dtype): def get_ir(swizzle, dtype): dtype = tvm.DataType(dtype) @@ -560,7 +551,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle): ), ], ) -@tvm.testing.requires_cuda_compute_version(9) def test_cp_async_bulk_tensor_global_to_shared_multicast1(inputs): # 1 CTA does the copy, and then multicast to all CTAs in the cluster def get_ir(shape, tma_args): @@ -635,7 +625,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle): ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 1, 1, 1, 1, 0, 0, 0, 0]), ], ) -@tvm.testing.requires_cuda_compute_version(9) def test_cp_async_bulk_tensor_global_to_shared_multicast2(inputs): # 4 CTAs in the cluster do the copy of separate chunks, and then multicast to all CTAs in the cluster # noqa: E501 def get_ir(shape, tma_args): @@ -726,7 +715,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle): ((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 4, 1, 1, 1, 0, 0, 0, 0]), ], ) -@tvm.testing.requires_cuda_compute_version(9) def test_cp_async_bulk_tensor_shared_to_global(inputs): def get_ir(shape, tma_args): assert shape[0] % 4 == 0 @@ -1092,7 +1080,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle, C_ptr: Tx.handle): tvm.testing.assert_allclose(C_tvm.numpy(), C_ref, rtol=1e-3, atol=1e-3) -@tvm.testing.requires_cuda_compute_version(9) def test_ptx_map_shared_rank(): @Tx.prim_func def func(A: Tx.Buffer(1)): diff --git a/tests/python/tirx/codegen/test_codegen_nvshmem.py b/tests/python/tirx/codegen/test_codegen_nvshmem.py index 6e48246d53a1..2a8244faaa3a 100644 --- a/tests/python/tirx/codegen/test_codegen_nvshmem.py +++ b/tests/python/tirx/codegen/test_codegen_nvshmem.py @@ -29,6 +29,8 @@ from tvm.runtime import disco as di from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + NUM_WORKERS = 4 diff --git a/tests/python/tirx/codegen/test_cuda_copy.py b/tests/python/tirx/codegen/test_cuda_copy.py index 83e7d98040e9..499f53023805 100644 --- a/tests/python/tirx/codegen/test_cuda_copy.py +++ b/tests/python/tirx/codegen/test_cuda_copy.py @@ -20,8 +20,11 @@ import pytest import tvm +import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + DEV = tvm.cuda(0) TARGET = tvm.target.Target("cuda") diff --git a/tests/python/tirx/codegen/test_cuda_cta_reduce.py b/tests/python/tirx/codegen/test_cuda_cta_reduce.py index bbffc92f4f58..24777500f152 100644 --- a/tests/python/tirx/codegen/test_cuda_cta_reduce.py +++ b/tests/python/tirx/codegen/test_cuda_cta_reduce.py @@ -20,8 +20,11 @@ import pytest import tvm +import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + DEV = tvm.cuda(0) TARGET = tvm.target.Target("cuda") diff --git a/tests/python/tirx/codegen/test_cuda_warp_reduce.py b/tests/python/tirx/codegen/test_cuda_warp_reduce.py index a1aa7dab2218..fe9decc6cd39 100644 --- a/tests/python/tirx/codegen/test_cuda_warp_reduce.py +++ b/tests/python/tirx/codegen/test_cuda_warp_reduce.py @@ -20,8 +20,11 @@ import pytest import tvm +import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + DEV = tvm.cuda(0) TARGET = tvm.target.Target("cuda") diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py b/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py index 368137f63142..1cd150e7a6c8 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_binary.py @@ -24,6 +24,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import S, TileLayout, wg_local_layout +pytestmark = tvm.testing.requires_cuda.marks() + @pytest.mark.parametrize( "input", diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py index 1690b3b4e487..4c91791fe035 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py @@ -25,6 +25,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import S, TileLayout +pytestmark = tvm.testing.requires_cuda.marks() + @pytest.mark.parametrize( "task", diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py index 40b0cad87d98..bba33c7f1bef 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py @@ -38,6 +38,8 @@ from tvm.tirx.stmt import DeclBuffer, TilePrimitiveCall from tvm.tirx.stmt_functor import StmtExprVisitor +pytestmark = tvm.testing.requires_cuda_compute_version_marks(9) + # =========================================================================== # Helpers # =========================================================================== @@ -1045,7 +1047,6 @@ def test_copy_tma_codegen(case): # =========================================================================== -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("swizzle_len", [3]) @pytest.mark.parametrize("dtype", ["float16"]) def test_copy_tma_symbolic_dimension(dtype, swizzle_len): @@ -1149,7 +1150,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None: np.testing.assert_allclose(B_ref, B.numpy()) -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("swizzle_len", [3]) @pytest.mark.parametrize("dtype", ["float16"]) def test_copy_tma_3d_with_view(dtype, swizzle_len): @@ -1261,7 +1261,6 @@ def copy_async(Q_ptr: Tx.handle, B_ptr: Tx.handle) -> None: # =========================================================================== -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize( "task", [ @@ -1447,7 +1446,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None: np.testing.assert_allclose(B_ref, B.numpy()) -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("dtype", ["float16"]) def test_copy_tma_gpu_smoke_s2g(dtype): """Smoke test: compile and run TMA S2G store on GPU.""" @@ -1513,7 +1511,6 @@ def copy_async(A_ptr: Tx.handle, B_ptr: Tx.handle) -> None: np.testing.assert_allclose(A_np, B.numpy()) -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("dtype", ["float16"]) def test_copy_tma_dynamic_cta_mask(dtype): """Regression test for B00004: dynamic cta_mask expression in TMA multicast. diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py index 6cd6c38dc906..3de044c540f1 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py @@ -26,6 +26,8 @@ from tvm.tirx.layout import S, TCol, TileLayout, TLane from tvm.tirx.layout import tid_in_wg as axis_tid_in_wg +pytestmark = tvm.testing.requires_cuda_compute_version_marks(10) + @pytest.mark.parametrize("dtype", ["float16", "float32"]) @pytest.mark.parametrize("width_32b", [4, 8, 16, 32]) diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py index bf045c5969ce..20ef0f50af32 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py @@ -38,6 +38,8 @@ from tvm.tirx.operator.tile_primitive.ops import CopyAsync from tvm.tirx.stmt_functor import StmtExprVisitor +pytestmark = tvm.testing.requires_cuda_compute_version_marks(9) + def _make_dsmem_dispatch_call(shape, dtype, src_layout, dst_layout): """Call copy_dsmem_impl directly. Returns impl or raises DispatchFail.""" @@ -121,7 +123,6 @@ def _layout_physical_elements(layout): return max_offset + 1 -@tvm.testing.requires_cuda_compute_version(9) @pytest.mark.parametrize("shape,dtype,src_spec,dst_spec,expected", DSMEM_CONFIGS) def test_dsmem(shape, dtype, src_spec, dst_spec, expected): """Dispatch assertion + GPU correctness for DSMEM copy. diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py index 0da2c2ef4de6..90af8050a6c9 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py @@ -24,6 +24,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import ComposeLayout, S, SwizzleLayout, TCol, TileLayout, TLane, tid_in_wg +pytestmark = tvm.testing.requires_cuda.marks() + ml_dtypes_dict = { "float8_e4m3fn": ml_dtypes.float8_e4m3fn, "float8_e5m2": ml_dtypes.float8_e5m2, diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py b/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py index 78222fc608ec..fb27ad62a427 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_fma.py @@ -27,6 +27,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import S, TileLayout, wg_local_layout +pytestmark = tvm.testing.requires_cuda.marks() + def _get_sm_version(): target = tvm.target.Target("cuda") diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py b/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py index 164a903b96a8..7d06c896ab4d 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py @@ -40,6 +40,8 @@ mma_shared_layout, ) +pytestmark = tvm.testing.requires_cuda_compute_version_marks(10) + # --------------------------------------------------------------------------- # Shared test helpers # --------------------------------------------------------------------------- diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py b/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py index 3cea1eb9d69f..09bcbb4cc999 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py @@ -24,6 +24,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import S, TileLayout +pytestmark = tvm.testing.requires_cuda.marks() + ml_dtypes_dict = { "float8_e4m3fn": ml_dtypes.float8_e4m3fn, "float8_e5m2": ml_dtypes.float8_e5m2, diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py b/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py index 4f147804fbb8..c219ccc570b6 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py @@ -22,6 +22,8 @@ from tvm.script import tirx as Tx from tvm.tirx.layout import R, S, TileLayout, laneid, wg_local_layout +pytestmark = tvm.testing.requires_cuda.marks() + @pytest.mark.parametrize( "src_shape, dst_shape, axes, st_src, st_dst, extent_src, extent_dst", diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py b/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py index 65fa3a37c36f..db659f280a5b 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py @@ -33,6 +33,8 @@ from tvm.tirx.layout import R, S, TCol, TileLayout, TLane from tvm.tirx.operator.tile_primitive.cuda.tma_utils import SwizzleMode, mma_shared_layout +pytestmark = tvm.testing.requires_cuda_compute_version_marks(10) + T_LAY_BASIC = TileLayout(S[(32, 16) : (1 @ TLane, 1 @ TCol)] + R[4 : 32 @ TLane]) @@ -236,7 +238,6 @@ def _execute(kernel, A_init, expected): ) -@tvm.testing.requires_cuda_compute_version(10) @pytest.mark.parametrize( "name,s_full,s_full_shape,s_region", [ @@ -293,7 +294,6 @@ def test_single_cp(name, s_full, s_full_shape, s_region): _run_2d(s_full, T_LAY_BASIC, s_full_shape, s_region, "uint8", A_np, expected) -@tvm.testing.requires_cuda_compute_version(10) def test_multi_cp_sw0_4tiles(): s_full = TileLayout(S[(4, 32, 16) : (512, 16, 1)]) t_full = TileLayout(S[(4, 32, 16) : (16 @ TCol, 1 @ TLane, 1 @ TCol)] + R[4 : 32 @ TLane]) @@ -302,7 +302,6 @@ def test_multi_cp_sw0_4tiles(): _run_3d_4tile(s_full, t_full, [4, 32, 16], "uint8", A_np, expected) -@tvm.testing.requires_cuda_compute_version(10) def test_align_middle_2_to_1_nvfp4_sfb(): """SFB-style nvfp4 case: TMEM mid canonicalizes to single iter (16@TCol + 4@TCol merge), but SMEM mid stays as 2 iters @@ -424,7 +423,6 @@ def kernel(A_ptr: Tx.handle, B_ptr: Tx.handle): _execute(kernel, A_np, expected) -@tvm.testing.requires_cuda_compute_version(10) @pytest.mark.parametrize( "bad", [ diff --git a/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py b/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py index 13a2f128c78c..8e40c48e69b5 100644 --- a/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py +++ b/tests/python/tirx/operator/tile_primitive/cuda/test_unary.py @@ -25,6 +25,8 @@ cast_layout_supported_for_local as _cast_layout_supported_for_local, ) +pytestmark = tvm.testing.requires_cuda.marks() + @pytest.mark.parametrize( "input", diff --git a/tests/python/tirx/test_bench_utils.py b/tests/python/tirx/test_bench_utils.py index 75fbaccb7fb9..2b9825cdadf4 100644 --- a/tests/python/tirx/test_bench_utils.py +++ b/tests/python/tirx/test_bench_utils.py @@ -161,7 +161,7 @@ def test_compute_groups_small_tensors(): def test_compute_groups_large_tensors(): """Inputs >= 3x L2 need only 1 group.""" # 16384x16384 fp32 = 1GB >> 3*128MB = 384MB - input_bytes = tensor_bytes(torch.empty(16384, 16384, dtype=torch.float32)) + input_bytes = 16384 * 16384 * 4 n = _compute_group_count(input_bytes, l2_bytes=128 * 1024 * 1024) assert n == 1 @@ -169,7 +169,7 @@ def test_compute_groups_large_tensors(): def test_compute_groups_moderate_tensors(): """Moderate tensors: floor(3*L2 / input) + 1.""" # 8192x8192 bf16 = 128MB. floor(384M / 128M) + 1 = 4 - input_bytes = tensor_bytes(torch.empty(8192, 8192, dtype=torch.bfloat16)) + input_bytes = 8192 * 8192 * 2 n = _compute_group_count(input_bytes, l2_bytes=128 * 1024 * 1024) assert n == 4 diff --git a/tests/python/tirx/test_buffer_print.py b/tests/python/tirx/test_buffer_print.py index 1049a9d486a5..8503cbe266a7 100644 --- a/tests/python/tirx/test_buffer_print.py +++ b/tests/python/tirx/test_buffer_print.py @@ -23,6 +23,8 @@ import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + def generate_random_data(shape, dtype): np.random.seed(0) diff --git a/tests/python/tirx/test_control_flow.py b/tests/python/tirx/test_control_flow.py index 2545f795080d..a5bea2c56947 100644 --- a/tests/python/tirx/test_control_flow.py +++ b/tests/python/tirx/test_control_flow.py @@ -17,8 +17,11 @@ import numpy as np import tvm +import tvm.testing from tvm.script import tirx as Tx +pytestmark = tvm.testing.requires_cuda.marks() + def run_test_break_continue(func, shape, expected): dev = tvm.cuda(0) diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index ca092d10d657..69e525ee7542 100755 --- a/tests/scripts/task_python_unittest.sh +++ b/tests/scripts/task_python_unittest.sh @@ -54,6 +54,7 @@ TEST_FILES=( "tirx-analysis" "tirx-base" "tirx-transform" + "tirx" "tvmscript" "relax" )