From dca346941cf1b85f76c587cbe6525cdba1ad4fff Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 17 Jun 2026 08:53:11 +0200 Subject: [PATCH 1/3] test(bindings): Fix unsupported FS check in cufile This is a follow-up to making the cufile tests use temporary directories, as noted by Leo. I don't think QA is a problem, because the previous xfail was guarded by having `CI` in the environment variables. However, `isSupportedFilesystem` was checking the wrong directory, because the tests now run in the temporary directory. My suspicion is that some additional check is strictly needed (e.g. to check that it isn't just ext4 but also directly mounted on a local NVMe device), but I have not figured out a check for that. --- cuda_bindings/tests/test_cufile.py | 54 +++++++++++++----------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 3a4d9b1c0e..295ac758b8 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -78,15 +78,20 @@ def cufileVersionLessThan(target): return True # Assume old version if any error occurs -@cache -def isSupportedFilesystem(): - """Check if the current filesystem is supported (ext4 or xfs). +@pytest.fixture(scope="session") +def skipIfUnsupportedFilesystem(tmpdir_factory): + """Fixture that skips if the current filesystem is supported (ext4 or xfs). + + The actual requirements are probably both stricter (ext4 was not working on CI previously) + and possibly also less strict. This uses `findmnt` so the kernel's mount table logic owns the decoding of the filesystem type. 
""" - fs_type = subprocess.check_output(["findmnt", "-no", "FSTYPE", "-T", os.getcwd()], text=True).strip() # noqa: S603, S607 + cmd = ["findmnt", "-no", "FSTYPE", "-T", tmpdir_factory.getbasetemp()] + fs_type = subprocess.check_output(cmd, text=True).strip() # noqa S603, S607 logging.info(f"Current filesystem type (findmnt): {fs_type}") - return fs_type in ("ext4", "xfs") + if fs_type not in ("ext4", "xfs"): + pytest.skip("cuFile handle_register requires ext4 or xfs filesystem") @cache @@ -195,8 +200,7 @@ def driver(ctx): cufile.driver_close() -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_handle_register(tmpdir): """Test file handle registration with cuFile.""" # Create test file @@ -385,8 +389,7 @@ def test_buf_register_already_registered(): cuda.cuMemFree(buf_ptr) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_cufile_read_write(tmpdir): """Test cuFile read and write operations.""" # Create test file @@ -469,8 +472,7 @@ def test_cufile_read_write(tmpdir): cuda.cuMemFree(read_buf) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_cufile_read_write_host_memory(tmpdir): """Test cuFile read and write operations using host memory.""" # Create test file @@ -549,8 +551,7 @@ def test_cufile_read_write_host_memory(tmpdir): cuda.cuMemFreeHost(read_buf) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", 
"skipIfUnsupportedFilesystem") def test_cufile_read_write_large(tmpdir): """Test cuFile read and write operations with large data.""" # Create test file @@ -636,8 +637,7 @@ def test_cufile_read_write_large(tmpdir): cuda.cuMemFree(read_buf) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") +@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") def test_cufile_write_async(tmpdir): """Test cuFile asynchronous write operations.""" # Create test file @@ -711,8 +711,7 @@ def test_cufile_write_async(tmpdir): os.close(fd) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") +@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") def test_cufile_read_async(tmpdir): """Test cuFile asynchronous read operations.""" # Create test file @@ -799,8 +798,7 @@ def test_cufile_read_async(tmpdir): os.close(fd) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver") +@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") def test_cufile_async_read_write(tmpdir): """Test cuFile asynchronous read and write operations in sequence.""" # Create test file @@ -910,8 +908,7 @@ def test_cufile_async_read_write(tmpdir): os.close(fd) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_batch_io_basic(tmpdir): """Test basic batch IO operations with multiple read/write operations.""" # Create test file @@ -1106,8 +1103,7 @@ def 
test_batch_io_basic(tmpdir): cuda.cuMemFree(buf) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_batch_io_cancel(tmpdir): """Test batch IO cancellation.""" # Create test file @@ -1183,8 +1179,7 @@ def test_batch_io_cancel(tmpdir): cuda.cuMemFree(buf) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("driver") +@pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") def test_batch_io_large_operations(tmpdir): """Test batch IO with large buffer operations.""" # Create test file @@ -1585,8 +1580,7 @@ def test_stats_start_stop(): @pytest.mark.skipif( cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later" ) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("stats") +@pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") def test_get_stats_l1(tmpdir): """Test cuFile L1 statistics retrieval with file operations.""" @@ -1663,8 +1657,7 @@ def test_get_stats_l1(tmpdir): @pytest.mark.skipif( cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later" ) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("stats") +@pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") def test_get_stats_l2(tmpdir): """Test cuFile L2 statistics retrieval with file operations.""" @@ -1745,8 +1738,7 @@ def test_get_stats_l2(tmpdir): 
@pytest.mark.skipif( cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later" ) -@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem") -@pytest.mark.usefixtures("stats") +@pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") def test_get_stats_l3(tmpdir): """Test cuFile L3 statistics retrieval with file operations.""" From 245198542b61f0b4188eec8225f54a14c5410c6e Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 24 Jun 2026 20:46:21 +0200 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Leo Fang Signed-off-by: Sebastian Berg --- cuda_bindings/tests/test_cufile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 295ac758b8..7f23ff296a 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -80,7 +80,7 @@ def cufileVersionLessThan(target): @pytest.fixture(scope="session") def skipIfUnsupportedFilesystem(tmpdir_factory): - """Fixture that skips if the current filesystem is supported (ext4 or xfs). + """Fixture that skips if the current filesystem is not supported (ext4 or xfs). The actual requirements are probably both stricter (ext4 was not working on CI previously) and possibly also less strict. @@ -88,7 +88,7 @@ def skipIfUnsupportedFilesystem(tmpdir_factory): This uses `findmnt` so the kernel's mount table logic owns the decoding of the filesystem type. 
""" cmd = ["findmnt", "-no", "FSTYPE", "-T", tmpdir_factory.getbasetemp()] - fs_type = subprocess.check_output(cmd, text=True).strip() # noqa S603, S607 + fs_type = subprocess.check_output(cmd, text=True).strip() # noqa: S603 logging.info(f"Current filesystem type (findmnt): {fs_type}") if fs_type not in ("ext4", "xfs"): pytest.skip("cuFile handle_register requires ext4 or xfs filesystem") From dd647ae4647751eb0ad56569ccf3dabd99658ce5 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 25 Jun 2026 10:55:55 +0200 Subject: [PATCH 3/3] Temporary "revert" to see original CI failure Also see if cufile logging can give us more info... Signed-off-by: Sebastian Berg --- cuda_bindings/tests/test_cufile.py | 88 +++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 7f23ff296a..1a3dc3a51f 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -45,7 +45,14 @@ def cufile_env_json(monkeypatch): config_path = os.path.join(test_dir, "cufile.json") assert os.path.isfile(config_path) monkeypatch.setenv("CUFILE_ENV_PATH_JSON", config_path) + monkeypatch.setenv("CUFILE_LOGGING_LEVEL", "TRACE") logging.info(f"Using cuFile config: {config_path}") + yield + cufile_log_path = pathlib.Path.cwd() / "cufile.log" + if cufile_log_path.is_file(): + logging.info(f"cuFile log contents from {cufile_log_path}:\n{cufile_log_path.read_text(errors='replace')}") + else: + logging.info(f"cuFile log does not exist: {cufile_log_path}") @cache @@ -91,7 +98,8 @@ def skipIfUnsupportedFilesystem(tmpdir_factory): fs_type = subprocess.check_output(cmd, text=True).strip() # noqa: S603 logging.info(f"Current filesystem type (findmnt): {fs_type}") if fs_type not in ("ext4", "xfs"): - pytest.skip("cuFile handle_register requires ext4 or xfs filesystem") + # pytest.skip("cuFile handle_register requires ext4 or xfs filesystem") + pass @cache @@ -201,10 +209,10 
@@ def driver(ctx): @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_handle_register(tmpdir): +def test_handle_register(): """Test file handle registration with cuFile.""" # Create test file - file_path = tmpdir / "test_handle_register.bin" + file_path = "test_handle_register.bin" # Create file with POSIX operations fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600) @@ -238,6 +246,8 @@ def test_handle_register(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver") @@ -390,10 +400,10 @@ def test_buf_register_already_registered(): @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_cufile_read_write(tmpdir): +def test_cufile_read_write(): """Test cuFile read and write operations.""" # Create test file - file_path = tmpdir / "test_cufile_rw.bin" + file_path = "test_cufile_rw.bin" # Allocate CUDA memory for write and read write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0) @@ -470,13 +480,15 @@ def test_cufile_read_write(tmpdir): # Free CUDA memory cuda.cuMemFree(write_buf) cuda.cuMemFree(read_buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_cufile_read_write_host_memory(tmpdir): +def test_cufile_read_write_host_memory(): """Test cuFile read and write operations using host memory.""" # Create test file - file_path = tmpdir / "test_cufile_rw_host.bin" + file_path = "test_cufile_rw_host.bin" # Allocate host memory for write and read write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0) @@ -549,13 +561,15 @@ def test_cufile_read_write_host_memory(tmpdir): # Free host memory cuda.cuMemFreeHost(write_buf) cuda.cuMemFreeHost(read_buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_cufile_read_write_large(tmpdir): +def test_cufile_read_write_large(): """Test cuFile read and 
write operations with large data.""" # Create test file - file_path = tmpdir / "test_cufile_rw_large.bin" + file_path = "test_cufile_rw_large.bin" # Allocate large CUDA memory (1MB, aligned to 4096 bytes) write_size = 1024 * 1024 # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0) @@ -635,13 +649,15 @@ def test_cufile_read_write_large(tmpdir): # Free CUDA memory cuda.cuMemFree(write_buf) cuda.cuMemFree(read_buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") -def test_cufile_write_async(tmpdir): +def test_cufile_write_async(): """Test cuFile asynchronous write operations.""" # Create test file - file_path = tmpdir / "test_cufile_write_async.bin" + file_path = "test_cufile_write_async.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -709,13 +725,15 @@ def test_cufile_write_async(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") -def test_cufile_read_async(tmpdir): +def test_cufile_read_async(): """Test cuFile asynchronous read operations.""" # Create test file - file_path = tmpdir / "test_cufile_read_async.bin" + file_path = "test_cufile_read_async.bin" # First create and write test data without O_DIRECT fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600) @@ -796,13 +814,15 @@ def test_cufile_read_async(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("ctx", "cufile_env_json", "driver", "skipIfUnsupportedFilesystem") -def test_cufile_async_read_write(tmpdir): +def test_cufile_async_read_write(): """Test cuFile asynchronous read and write operations in sequence.""" # Create test file - file_path = tmpdir / "test_cufile_async_rw.bin" + file_path = "test_cufile_async_rw.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -906,13 
+926,15 @@ def test_cufile_async_read_write(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_batch_io_basic(tmpdir): +def test_batch_io_basic(): """Test basic batch IO operations with multiple read/write operations.""" # Create test file - file_path = tmpdir / "test_batch_io.bin" + file_path = "test_batch_io.bin" # Allocate CUDA memory for multiple operations buf_size = 65536 # 64KB @@ -1101,13 +1123,15 @@ def test_batch_io_basic(tmpdir): # Free CUDA memory for buf in buffers + read_buffers: cuda.cuMemFree(buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_batch_io_cancel(tmpdir): +def test_batch_io_cancel(): """Test batch IO cancellation.""" # Create test file - file_path = tmpdir / "test_batch_cancel.bin" + file_path = "test_batch_cancel.bin" # Allocate CUDA memory buf_size = 4096 # 4KB, aligned to 4096 bytes @@ -1177,13 +1201,15 @@ def test_batch_io_cancel(tmpdir): # Free CUDA memory for buf in buffers: cuda.cuMemFree(buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.usefixtures("driver", "skipIfUnsupportedFilesystem") -def test_batch_io_large_operations(tmpdir): +def test_batch_io_large_operations(): """Test batch IO with large buffer operations.""" # Create test file - file_path = tmpdir / "test_batch_large.bin" + file_path = "test_batch_large.bin" # Allocate large CUDA memory (1MB, aligned to 4096 bytes) buf_size = 1024 * 1024 # 1MB, aligned to 4096 bytes @@ -1361,6 +1387,8 @@ def test_batch_io_large_operations(tmpdir): # Free CUDA memory for buf in all_buffers: cuda.cuMemFree(buf) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.skipif( @@ -1582,10 +1610,10 @@ def test_stats_start_stop(): ) @pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are 
process-global") -def test_get_stats_l1(tmpdir): +def test_get_stats_l1(): """Test cuFile L1 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = tmpdir / "test_stats_l1.bin" + file_path = "test_stats_l1.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1652,6 +1680,8 @@ def test_get_stats_l1(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.skipif( @@ -1659,10 +1689,10 @@ def test_get_stats_l1(tmpdir): ) @pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") -def test_get_stats_l2(tmpdir): +def test_get_stats_l2(): """Test cuFile L2 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = tmpdir / "test_stats_l2.bin" + file_path = "test_stats_l2.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1733,6 +1763,8 @@ def test_get_stats_l2(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.skipif( @@ -1740,10 +1772,10 @@ def test_get_stats_l2(tmpdir): ) @pytest.mark.usefixtures("stats", "skipIfUnsupportedFilesystem") @pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global") -def test_get_stats_l3(tmpdir): +def test_get_stats_l3(): """Test cuFile L3 statistics retrieval with file operations.""" # Create test file directly with O_DIRECT - file_path = tmpdir / "test_stats_l3.bin" + file_path = "test_stats_l3.bin" fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600) try: @@ -1824,6 +1856,8 @@ def test_get_stats_l3(tmpdir): finally: os.close(fd) + with suppress(OSError): + os.unlink(file_path) @pytest.mark.skipif(