Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions python/tvm/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1225,14 +1225,13 @@ def inner(func):
return inner


def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
"""Mark a test as requiring at least a compute architecture
def requires_cuda_compute_version_marks(major_version, minor_version=0, exact=False):
"""Return pytest marks requiring at least a CUDA compute architecture.

Unit test marked with this decorator will run only if the CUDA
compute architecture of the GPU is at least `(major_version,
minor_version)`.
Tests marked with the returned marks will run only if the CUDA compute
architecture of the GPU is at least `(major_version, minor_version)`.

This also marks the test as requiring a cuda support.
This also marks the test as requiring CUDA support.

Parameters
----------
Expand All @@ -1255,14 +1254,36 @@ def requires_cuda_compute_version(major_version, minor_version=0, exact=False):

min_version_str = ".".join(str(v) for v in min_version)
compute_version_str = ".".join(str(v) for v in compute_version)
requires = [
return [
pytest.mark.skipif(
compute_version < min_version or (exact and compute_version != min_version),
reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}",
),
*requires_cuda.marks(),
]


def requires_cuda_compute_version(major_version, minor_version=0, exact=False):
"""Mark a test as requiring at least a compute architecture

Unit test marked with this decorator will run only if the CUDA
compute architecture of the GPU is at least `(major_version,
minor_version)`.

This also marks the test as requiring a cuda support.

Parameters
----------
major_version: int

The major version of the (major,minor) version tuple.

minor_version: int

The minor version of the (major,minor) version tuple.
"""
requires = requires_cuda_compute_version_marks(major_version, minor_version, exact)

def inner(func):
return _compose([func], requires)

Expand Down
8 changes: 2 additions & 6 deletions tests/python/tirx/codegen/test_codegen_blackwell.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda_compute_version_marks(10)


def _get_source(func: tvm.tirx.PrimFunc) -> str:
target = tvm.target.Target("cuda")
Expand All @@ -31,7 +33,6 @@ def _get_source(func: tvm.tirx.PrimFunc) -> str:
return src, mod


@tvm.testing.requires_cuda_compute_version(10)
def test_tmem_alloc_dealloc_relinquish():
N_COLS = 512
cta_group = 1
Expand Down Expand Up @@ -69,7 +70,6 @@ def test_tmem(A: Tx.Buffer((16, 16), "float16")):
assert f"tcgen05.relinquish_alloc_permit.cta_group::{cta_group}.sync.aligned" in src


@tvm.testing.requires_cuda_compute_version(10)
def test_mbarrier_try_wait_once_codegen():
# fmt: off
@Tx.prim_func
Expand All @@ -89,7 +89,6 @@ def test_try_wait_once(A: Tx.Buffer((16, 16), "float16")):
assert "selp.u32" in src


@tvm.testing.requires_cuda_compute_version(10)
def test_fence_before_after_thread_sync():
# fmt: off
@Tx.prim_func
Expand All @@ -112,7 +111,6 @@ def test_fence(A: Tx.Buffer((16, 16), "float16")):
assert "tcgen05.fence::before_thread_sync" in src


@tvm.testing.requires_cuda_compute_version(10)
def test_tcgen05_ld_st_roundtrip():
HEIGHT = 128
WIDTH = 256
Expand Down Expand Up @@ -182,7 +180,6 @@ def test_ld_st(A: Tx.Buffer((HEIGHT, WIDTH), "float32"), B: Tx.Buffer((HEIGHT, W
np.testing.assert_allclose(A.numpy(), B.numpy())


@tvm.testing.requires_cuda_compute_version(10)
def test_tcgen05_cp_ld_roundtrip():
dtype = "float32"
dtype_bits = tvm.DataType(dtype).bits
Expand Down Expand Up @@ -271,7 +268,6 @@ def test_cp_ld(A: Tx.Buffer((HEIGHT, WIDTH), dtype, layout=Tx.TileLayout(Tx.S[(H


@pytest.mark.parametrize("swizzle", [0, 1, 2, 3])
@tvm.testing.requires_cuda_compute_version(10)
def test_tcgen05_mma_ss_no_tma(swizzle):
d_type, a_type, b_type = "float32", "float16", "float16"
M, N, K = 128, 128, 64
Expand Down
11 changes: 11 additions & 0 deletions tests/python/tirx/codegen/test_codegen_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda.marks()

DEV = tvm.device("cuda")


Expand Down Expand Up @@ -60,6 +62,7 @@ def main(A: Tx.Buffer((4,), "int32")):
assert "break;" in src


@tvm.testing.requires_cuda_compute_version(9)
def test_cluster_cta_id_codegen_uses_coordinate_sregs():
@Tx.prim_func
def main(A: Tx.Buffer((1,), "int32")):
Expand Down Expand Up @@ -139,6 +142,7 @@ def main(
assert "ld.volatile.global.u64" in src


@tvm.testing.requires_cuda_compute_version(9)
def test_megamoe_extracted_intrinsics_codegen():
@Tx.prim_func
def main(
Expand Down Expand Up @@ -245,6 +249,7 @@ def main(
assert snippet in src


@tvm.testing.requires_cuda_compute_version(9)
def test_ptx_cp_async_bulk_non_tma_form_codegen():
@Tx.prim_func
def main(
Expand Down Expand Up @@ -274,6 +279,7 @@ def main(
assert "unsigned long long cache_policy" in src


@tvm.testing.requires_cuda_compute_version(9)
def test_tensor_map_param_codegen():
@Tx.prim_func
def main(A_map: Tx.TensorMap()):
Expand All @@ -288,6 +294,7 @@ def main(A_map: Tx.TensorMap()):
assert "((unsigned long long)(&(A_map)))" in src


@tvm.testing.requires_cuda_compute_version(10)
def test_tma_cache_policy_operand_codegen():
@Tx.prim_func
def main(Cache: Tx.Buffer((1,), "uint64")):
Expand Down Expand Up @@ -528,6 +535,7 @@ def func(A_ptr: Tx.handle):
@pytest.mark.parametrize("prefetch_size", [-1, 64, 128, 256])
@pytest.mark.parametrize("predicate", [-1, Tx.int32(0), Tx.int32(1)])
@pytest.mark.parametrize("fill_mode", ["", "zero"])
@tvm.testing.requires_cuda_compute_version(8)
def test_ptx_cp_async(cp_size, cache_hint, prefetch_size, predicate, fill_mode):
if fill_mode != "" and predicate == -1:
return
Expand Down Expand Up @@ -569,6 +577,7 @@ def main(A: Tx.Buffer((N), "float16")):

@pytest.mark.parametrize("trans", [False, True])
@pytest.mark.parametrize("num", [1, 2, 4])
@tvm.testing.requires_cuda_compute_version(7, 5)
Comment thread
tlopex marked this conversation as resolved.
def test_ptx_ldmatrix(trans, num):
dtype = ".b16"

Expand Down Expand Up @@ -642,6 +651,7 @@ def main(A: Tx.Buffer((16, 16), "float16"), B: Tx.Buffer((16, 16), "float16")):

@pytest.mark.parametrize("d_type", ["float16", "float32"])
@pytest.mark.parametrize("no_c_ptr", [False, True])
@tvm.testing.requires_cuda_compute_version(8)
def test_ptx_mma_half_m16n8k16(d_type, no_c_ptr):
shape = "m16n8k16"
a_type = "float16"
Expand Down Expand Up @@ -733,6 +743,7 @@ def L2G(buf_local, buf_global, block_8x8):

@pytest.mark.parametrize("d_type", ["float16", "float32"])
@pytest.mark.parametrize("no_c_ptr", [False, True])
@tvm.testing.requires_cuda_compute_version(8)
def test_ptx_mma_half_m16n8k8(d_type, no_c_ptr):
shape = "m16n8k8"
a_type = "float16"
Expand Down
2 changes: 2 additions & 0 deletions tests/python/tirx/codegen/test_codegen_dsmem.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)


def _get_source(func: tvm.tirx.PrimFunc) -> str:
target = tvm.target.Target("cuda")
Expand Down
17 changes: 2 additions & 15 deletions tests/python/tirx/codegen/test_codegen_hopper.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from tvm.script import tirx as Tx
from tvm.tirx import Buffer

pytestmark = tvm.testing.requires_cuda_compute_version_marks(9)


def _get_source(func: tvm.tirx.PrimFunc) -> tuple[str, tvm.IRModule]:
target = tvm.target.Target("cuda")
Expand Down Expand Up @@ -58,7 +60,6 @@ def main(A_ptr: Tx.handle):


@pytest.mark.parametrize("inc", [False, True])
@tvm.testing.requires_cuda_compute_version(9)
def test_ptx_setmaxnreg(inc):
# fmt: off
@Tx.prim_func
Expand All @@ -79,7 +80,6 @@ def func(A: Tx.Buffer(1)):


@pytest.mark.parametrize("trans", [False, True])
@tvm.testing.requires_cuda_compute_version(9)
def test_stmatrix_sync_aligned(trans):
# fmt: off
@Tx.prim_func
Expand Down Expand Up @@ -196,7 +196,6 @@ def main(A: Tx.Buffer((16, 16), "float16")):
np.testing.assert_allclose(A.numpy(), A_ref)


@tvm.testing.requires_cuda_compute_version(9)
def test_bar_arrive():
# fmt: off
@Tx.prim_func
Expand All @@ -213,7 +212,6 @@ def func(A: Tx.Buffer(1)):
assert 'bar.arrive %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src


@tvm.testing.requires_cuda_compute_version(9)
def test_bar_sync():
# fmt: off
@Tx.prim_func
Expand All @@ -230,7 +228,6 @@ def func(A: Tx.Buffer(1)):
assert 'bar.sync %0, %1;" : : "r"(name_bar_id), "r"(thread_count) : "memory"' in src


@tvm.testing.requires_cuda_compute_version(9)
def test_fence_mbarrier_init_release_clsuter():
# fmt: off
@Tx.prim_func
Expand All @@ -246,7 +243,6 @@ def func(A: Tx.Buffer(1)):
assert "fence.mbarrier_init.release.cluster" in src


@tvm.testing.requires_cuda_compute_version(9)
def test_ptx_elect_sync():
# fmt: off
@Tx.prim_func
Expand All @@ -264,7 +260,6 @@ def func(A: Tx.Buffer(1)):
assert "elect.sync %%rx|%%px, %2;" in src


@tvm.testing.requires_cuda_compute_version(9)
@pytest.mark.parametrize("sem,scope", [("sc", "cta"), ("acq_rel", "gpu"), ("sc", "sys")])
def test_ptx_fence(sem, scope):
# fmt: off
Expand All @@ -281,7 +276,6 @@ def func(A: Tx.Buffer(1)):
assert f"fence.{sem}.{scope};" in src


@tvm.testing.requires_cuda_compute_version(9)
def test_fence_proxy_async():
# fmt: off
@Tx.prim_func
Expand All @@ -300,7 +294,6 @@ def func(A: Tx.Buffer(1)):
assert "fence.proxy.async.shared::cta" in src


@tvm.testing.requires_cuda_compute_version(9)
@pytest.mark.parametrize("dtype", ["float16", "float32", "float8_e4m3fn", "float8_e5m2"])
@pytest.mark.parametrize(
"inputs",
Expand Down Expand Up @@ -385,7 +378,6 @@ def get_np_dtype(dtype):
assert np.allclose(A.numpy().astype("float32"), B.numpy().astype("float32"))


@tvm.testing.requires_cuda_compute_version(9)
@pytest.mark.parametrize(
("shape", "dtype", "encode_args", "error_msg"),
[
Expand Down Expand Up @@ -461,7 +453,6 @@ def test_tensormap_encode_tiled_runtime_validation(shape, dtype, encode_args, er

@pytest.mark.parametrize("swizzle", [1, 2, 3])
@pytest.mark.parametrize("dtype", ["uint8", "float16", "float32"])
@tvm.testing.requires_cuda_compute_version(9)
def test_cp_async_bulk_tensor_global_to_shared_swizzle(swizzle, dtype):
def get_ir(swizzle, dtype):
dtype = tvm.DataType(dtype)
Expand Down Expand Up @@ -560,7 +551,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
),
],
)
@tvm.testing.requires_cuda_compute_version(9)
def test_cp_async_bulk_tensor_global_to_shared_multicast1(inputs):
# 1 CTA does the copy, and then multicast to all CTAs in the cluster
def get_ir(shape, tma_args):
Expand Down Expand Up @@ -635,7 +625,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 1, 1, 1, 1, 0, 0, 0, 0]),
],
)
@tvm.testing.requires_cuda_compute_version(9)
def test_cp_async_bulk_tensor_global_to_shared_multicast2(inputs):
# 4 CTAs in the cluster do the copy of separate chunks, and then multicast to all CTAs in the cluster # noqa: E501
def get_ir(shape, tma_args):
Expand Down Expand Up @@ -726,7 +715,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle):
((16, 16, 4), [16, 16, 4, 64, 64 * 16, 16, 16, 4, 1, 1, 1, 0, 0, 0, 0]),
],
)
@tvm.testing.requires_cuda_compute_version(9)
def test_cp_async_bulk_tensor_shared_to_global(inputs):
def get_ir(shape, tma_args):
assert shape[0] % 4 == 0
Expand Down Expand Up @@ -1092,7 +1080,6 @@ def main(A_ptr: Tx.handle, B_ptr: Tx.handle, C_ptr: Tx.handle):
tvm.testing.assert_allclose(C_tvm.numpy(), C_ref, rtol=1e-3, atol=1e-3)


@tvm.testing.requires_cuda_compute_version(9)
def test_ptx_map_shared_rank():
@Tx.prim_func
def func(A: Tx.Buffer(1)):
Expand Down
2 changes: 2 additions & 0 deletions tests/python/tirx/codegen/test_codegen_nvshmem.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from tvm.runtime import disco as di
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda.marks()

NUM_WORKERS = 4


Expand Down
3 changes: 3 additions & 0 deletions tests/python/tirx/codegen/test_cuda_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
import pytest

import tvm
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda.marks()

DEV = tvm.cuda(0)
TARGET = tvm.target.Target("cuda")

Expand Down
3 changes: 3 additions & 0 deletions tests/python/tirx/codegen/test_cuda_cta_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
import pytest

import tvm
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda.marks()

DEV = tvm.cuda(0)
TARGET = tvm.target.Target("cuda")

Expand Down
3 changes: 3 additions & 0 deletions tests/python/tirx/codegen/test_cuda_warp_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
import pytest

import tvm
import tvm.testing
from tvm.script import tirx as Tx

pytestmark = tvm.testing.requires_cuda.marks()

DEV = tvm.cuda(0)
TARGET = tvm.target.Target("cuda")

Expand Down
2 changes: 2 additions & 0 deletions tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from tvm.script import tirx as Tx
from tvm.tirx.layout import S, TileLayout, wg_local_layout

pytestmark = tvm.testing.requires_cuda.marks()


@pytest.mark.parametrize(
"input",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from tvm.script import tirx as Tx
from tvm.tirx.layout import S, TileLayout

pytestmark = tvm.testing.requires_cuda.marks()


@pytest.mark.parametrize(
"task",
Expand Down
Loading
Loading