|
18 | 18 |
|
19 | 19 | import pytest |
20 | 20 |
|
| 21 | +from nemo_run.config import RUNDIR_NAME |
21 | 22 | from nemo_run.core.execution.launcher import SlurmTemplate, Torchrun |
22 | 23 | from nemo_run.core.execution.slurm import ( |
| 24 | + SlurmBatchRequest, |
23 | 25 | SlurmExecutor, |
24 | 26 | SlurmJobDetails, |
25 | 27 | SlurmTunnelCallback, |
@@ -403,3 +405,148 @@ def test_merge_mismatch(self): |
403 | 405 | [SlurmExecutor(account="account1"), SlurmExecutor(account="account2")], |
404 | 406 | num_tasks=3, |
405 | 407 | ) |
| 408 | + |
| 409 | + |
class TestSlurmBatchRequestNonContainerMode:
    """Tests for non-container mode support (container_image=None).

    Every test materializes the same single-job ``SlurmBatchRequest`` and only
    varies the executor (with or without a container image), then inspects the
    generated script text for the expected flags and paths.
    """

    @staticmethod
    def _attach_job_context(executor: SlurmExecutor) -> SlurmExecutor:
        """Assign the job/experiment attributes and a mocked tunnel to *executor*.

        Shared by both fixtures so the container and non-container executors
        differ only in their constructor arguments.

        Args:
            executor: The executor to configure in place.

        Returns:
            The same executor instance, for convenient use in a ``return``.
        """
        executor.job_name = "test-job"
        executor.experiment_dir = "/local/experiments"
        executor.job_dir = "/local/experiments/test-job"
        executor.experiment_id = "exp-123"

        # Mock tunnel: spec'd against LocalTunnel so unknown attributes raise,
        # with a remote job_dir distinct from the local experiment paths above.
        tunnel = MagicMock(spec=LocalTunnel)
        tunnel.job_dir = "/remote/experiments/exp-123"
        executor.tunnel = tunnel

        return executor

    @staticmethod
    def _materialize(executor: SlurmExecutor):
        """Build the standard single-job batch request and return its script.

        Args:
            executor: The executor the request should be materialized for.

        Returns:
            Whatever ``SlurmBatchRequest.materialize()`` returns; the tests
            only perform substring membership checks on it.
        """
        request = SlurmBatchRequest(
            launch_cmd=["sbatch", "--parsable"],
            jobs=["test-job"],
            command_groups=[["python train.py"]],
            executor=executor,
            max_retries=0,
            extra_env={},
        )
        return request.materialize()

    @pytest.fixture
    def executor_with_container(self):
        """Create an executor with container image."""
        return self._attach_job_context(
            SlurmExecutor(
                account="test_account",
                partition="gpu",
                nodes=2,
                ntasks_per_node=8,
                container_image="nvcr.io/nvidia/pytorch:24.01-py3",
                container_mounts=["/data:/data"],
            )
        )

    @pytest.fixture
    def executor_without_container(self):
        """Create an executor without container image (non-container mode)."""
        return self._attach_job_context(
            SlurmExecutor(
                account="test_account",
                partition="gpu",
                nodes=2,
                ntasks_per_node=8,
                container_image=None,  # Non-container mode
            )
        )

    def test_materialize_with_container_uses_container_flags(self, executor_with_container):
        """Test that materialize uses container flags when container_image is set."""
        script = self._materialize(executor_with_container)

        # Should contain container flags
        assert "--container-image" in script
        assert "--container-mounts" in script
        assert "--container-workdir" in script
        # Should NOT contain --chdir (used for non-container mode)
        assert "--chdir" not in script
        # Should contain /{RUNDIR_NAME} paths (not substituted)
        assert f"/{RUNDIR_NAME}" in script

    def test_materialize_without_container_uses_chdir(self, executor_without_container):
        """Test that materialize uses --chdir when container_image is None."""
        script = self._materialize(executor_without_container)

        # Should contain --chdir flag for working directory
        assert "--chdir" in script
        # Should NOT contain container flags
        assert "--container-image" not in script
        assert "--container-mounts" not in script
        assert "--container-workdir" not in script

    def test_materialize_without_container_substitutes_rundir_paths(
        self, executor_without_container
    ):
        """Test that /{RUNDIR_NAME} paths are substituted with actual paths in non-container mode."""
        script = self._materialize(executor_without_container)

        # Should NOT contain /{RUNDIR_NAME} paths (should be substituted)
        assert f"/{RUNDIR_NAME}/code" not in script
        # Should contain the actual job directory path
        # (the mocked tunnel's job_dir joined with the job name)
        actual_job_dir = "/remote/experiments/exp-123/test-job"
        assert f"{actual_job_dir}/code" in script

    def test_materialize_with_container_preserves_rundir_paths(self, executor_with_container):
        """Test that /{RUNDIR_NAME} paths are NOT substituted when using container."""
        script = self._materialize(executor_with_container)

        # Should contain /{RUNDIR_NAME} paths (not substituted for container mode)
        assert f"/{RUNDIR_NAME}" in script

    def test_non_container_mode_chdir_points_to_code_directory(self, executor_without_container):
        """Test that --chdir in non-container mode points to the code directory."""
        script = self._materialize(executor_without_container)

        # The --chdir should point to {job_dir}/code
        expected_chdir = "--chdir /remote/experiments/exp-123/test-job/code"
        assert expected_chdir in script
0 commit comments