Skip to content

Commit e09ccf0

Browse files
authored
feat: support container-image None in slurm (#409)
1 parent b1590e8 commit e09ccf0

10 files changed

Lines changed: 391 additions & 10 deletions

File tree

nemo_run/config.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,13 +468,33 @@ def get_name(self):
468468
return os.path.basename(self.path)
469469

470470
def to_command(
471-
self, with_entrypoint: bool = False, filename: Optional[str] = None, is_local: bool = False
471+
self,
472+
with_entrypoint: bool = False,
473+
filename: Optional[str] = None,
474+
is_local: bool = False,
475+
substitute_rundir_path: Optional[str] = None,
472476
) -> list[str]:
477+
"""Convert the script to a command.
478+
479+
Args:
480+
with_entrypoint: If True, prepend the entrypoint to the command.
481+
filename: If provided, write the inline script to this file.
482+
is_local: If True, use the local filename in the command.
483+
substitute_rundir_path: If provided, substitute /{RUNDIR_NAME} paths
484+
with this path in the inline script content. Used for non-container
485+
mode where container paths need to be replaced with actual cluster paths.
486+
"""
473487
if self.inline:
474488
if filename:
475489
os.makedirs(os.path.dirname(filename), exist_ok=True)
490+
inline_content = self.inline
491+
# Substitute /{RUNDIR_NAME} paths if specified (non-container mode)
492+
if substitute_rundir_path is not None:
493+
inline_content = inline_content.replace(
494+
f"/{RUNDIR_NAME}", substitute_rundir_path
495+
)
476496
with open(filename, "w") as f:
477-
f.write("#!/usr/bin/bash\n" + self.inline)
497+
f.write("#!/usr/bin/bash\n" + inline_content)
478498

479499
if is_local:
480500
cmd = [filename]

nemo_run/core/execution/slurm.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -938,7 +938,18 @@ def get_container_flags(
938938
container_image: Optional[str],
939939
container_env: Optional[list[str]] = None,
940940
) -> list[str]:
941-
_container_flags = ["--container-image", container_image] if container_image else []
941+
"""Get srun flags for container or non-container mode.
942+
943+
For non-container mode, returns --chdir flag to set working directory.
944+
For container mode, returns container-related flags (image, mounts, workdir, env).
945+
"""
946+
if container_image is None:
947+
# Non-container mode: use --chdir to set working directory
948+
workdir = os.path.join(src_job_dir, "code")
949+
return ["--chdir", workdir]
950+
951+
# Container mode: set up container mounts and workdir
952+
_container_flags = ["--container-image", container_image]
942953

943954
new_mounts = copy.deepcopy(base_mounts)
944955
for i, mount in enumerate(new_mounts):
@@ -1079,6 +1090,18 @@ def get_container_flags(
10791090
vars_to_fill["fault_tol_job_results_file"] = self.launcher.job_results_file
10801091

10811092
sbatch_script = fill_template("slurm.sh.j2", vars_to_fill)
1093+
1094+
# For non-container mode, substitute /{RUNDIR_NAME} paths with actual job directory
1095+
# Check both top-level container_image and resource_group container images
1096+
has_container = self.executor.container_image is not None
1097+
if self.executor.run_as_group and self.executor.resource_group:
1098+
has_container = has_container or any(
1099+
rg.container_image is not None for rg in self.executor.resource_group
1100+
)
1101+
if not has_container:
1102+
actual_job_dir = os.path.join(slurm_job_dir, job_directory_name)
1103+
sbatch_script = sbatch_script.replace(f"/{RUNDIR_NAME}", actual_job_dir)
1104+
10821105
return sbatch_script
10831106

10841107
def __repr__(self) -> str:

nemo_run/run/torchx_backend/packaging.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import logging
1717
import os
18+
from pathlib import Path
1819
from typing import Iterator, Optional, Type, Union
1920

2021
import fiddle as fdl
@@ -120,9 +121,23 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
120121
log.warning(f"Failed saving yaml configs due to: {e}")
121122

122123
args = fn_or_script.args
124+
125+
# For SlurmExecutor without container, substitute /{RUNDIR_NAME} paths
126+
# with actual cluster paths in inline scripts
127+
substitute_rundir_path = None
128+
if (
129+
isinstance(executor, SlurmExecutor)
130+
and executor.container_image is None
131+
and executor.tunnel is not None
132+
):
133+
substitute_rundir_path = os.path.join(
134+
executor.tunnel.job_dir, Path(executor.job_dir).name
135+
)
136+
123137
role_args = fn_or_script.to_command(
124138
filename=os.path.join(executor.job_dir, SCRIPTS_DIR, f"{name}.sh"),
125139
is_local=True if isinstance(executor, LocalExecutor) else False,
140+
substitute_rundir_path=substitute_rundir_path,
126141
)
127142
m = fn_or_script.path if fn_or_script.m else None
128143
no_python = fn_or_script.entrypoint != "python"

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@ managed = true
101101
prerelease = "allow"
102102
conflicts = [
103103
[
104-
{ group = "docs", name = "colorama" },
105-
{ extra = "skypilot", name = "colorama" },
106-
{ extra = "skypilot-all", name = "colorama" }
104+
{ group = "docs" },
105+
{ extra = "skypilot" },
106+
{ extra = "skypilot-all" }
107107
]
108108
]
109109

test/core/execution/artifacts/dummy_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export ENV_VAR=value
3333

3434
# Command 1
3535

36-
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
36+
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
3737

3838
exitcode=$?
3939

test/core/execution/artifacts/ft_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ echo "$SLURM_JOB_ID ${SLURM_RESTART_COUNT:-0} X" >> "$JOB_RESULTS_FILE"
6262

6363
# Command 1
6464

65-
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-workload_check_interval 10 --ft-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
65+
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-workload_check_interval 10 --ft-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
6666

6767
exitcode=$?
6868

test/core/execution/test_slurm.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818

1919
import pytest
2020

21+
from nemo_run.config import RUNDIR_NAME
2122
from nemo_run.core.execution.launcher import SlurmTemplate, Torchrun
2223
from nemo_run.core.execution.slurm import (
24+
SlurmBatchRequest,
2325
SlurmExecutor,
2426
SlurmJobDetails,
2527
SlurmTunnelCallback,
@@ -403,3 +405,148 @@ def test_merge_mismatch(self):
403405
[SlurmExecutor(account="account1"), SlurmExecutor(account="account2")],
404406
num_tasks=3,
405407
)
408+
409+
410+
class TestSlurmBatchRequestNonContainerMode:
411+
"""Tests for non-container mode support (container_image=None)."""
412+
413+
@pytest.fixture
414+
def executor_with_container(self):
415+
"""Create an executor with container image."""
416+
executor = SlurmExecutor(
417+
account="test_account",
418+
partition="gpu",
419+
nodes=2,
420+
ntasks_per_node=8,
421+
container_image="nvcr.io/nvidia/pytorch:24.01-py3",
422+
container_mounts=["/data:/data"],
423+
)
424+
executor.job_name = "test-job"
425+
executor.experiment_dir = "/local/experiments"
426+
executor.job_dir = "/local/experiments/test-job"
427+
executor.experiment_id = "exp-123"
428+
429+
# Mock tunnel
430+
tunnel = MagicMock(spec=LocalTunnel)
431+
tunnel.job_dir = "/remote/experiments/exp-123"
432+
executor.tunnel = tunnel
433+
434+
return executor
435+
436+
@pytest.fixture
437+
def executor_without_container(self):
438+
"""Create an executor without container image (non-container mode)."""
439+
executor = SlurmExecutor(
440+
account="test_account",
441+
partition="gpu",
442+
nodes=2,
443+
ntasks_per_node=8,
444+
container_image=None, # Non-container mode
445+
)
446+
executor.job_name = "test-job"
447+
executor.experiment_dir = "/local/experiments"
448+
executor.job_dir = "/local/experiments/test-job"
449+
executor.experiment_id = "exp-123"
450+
451+
# Mock tunnel
452+
tunnel = MagicMock(spec=LocalTunnel)
453+
tunnel.job_dir = "/remote/experiments/exp-123"
454+
executor.tunnel = tunnel
455+
456+
return executor
457+
458+
def test_materialize_with_container_uses_container_flags(self, executor_with_container):
459+
"""Test that materialize uses container flags when container_image is set."""
460+
request = SlurmBatchRequest(
461+
launch_cmd=["sbatch", "--parsable"],
462+
jobs=["test-job"],
463+
command_groups=[["python train.py"]],
464+
executor=executor_with_container,
465+
max_retries=0,
466+
extra_env={},
467+
)
468+
469+
script = request.materialize()
470+
471+
# Should contain container flags
472+
assert "--container-image" in script
473+
assert "--container-mounts" in script
474+
assert "--container-workdir" in script
475+
# Should NOT contain --chdir (used for non-container mode)
476+
assert "--chdir" not in script
477+
# Should contain /nemo_run paths (not substituted)
478+
assert f"/{RUNDIR_NAME}" in script
479+
480+
def test_materialize_without_container_uses_chdir(self, executor_without_container):
481+
"""Test that materialize uses --chdir when container_image is None."""
482+
request = SlurmBatchRequest(
483+
launch_cmd=["sbatch", "--parsable"],
484+
jobs=["test-job"],
485+
command_groups=[["python train.py"]],
486+
executor=executor_without_container,
487+
max_retries=0,
488+
extra_env={},
489+
)
490+
491+
script = request.materialize()
492+
493+
# Should contain --chdir flag for working directory
494+
assert "--chdir" in script
495+
# Should NOT contain container flags
496+
assert "--container-image" not in script
497+
assert "--container-mounts" not in script
498+
assert "--container-workdir" not in script
499+
500+
def test_materialize_without_container_substitutes_rundir_paths(
501+
self, executor_without_container
502+
):
503+
"""Test that /{RUNDIR_NAME} paths are substituted with actual paths in non-container mode."""
504+
request = SlurmBatchRequest(
505+
launch_cmd=["sbatch", "--parsable"],
506+
jobs=["test-job"],
507+
command_groups=[["python train.py"]],
508+
executor=executor_without_container,
509+
max_retries=0,
510+
extra_env={},
511+
)
512+
513+
script = request.materialize()
514+
515+
# Should NOT contain /nemo_run paths (should be substituted)
516+
assert f"/{RUNDIR_NAME}/code" not in script
517+
# Should contain the actual job directory path
518+
actual_job_dir = "/remote/experiments/exp-123/test-job"
519+
assert f"{actual_job_dir}/code" in script
520+
521+
def test_materialize_with_container_preserves_rundir_paths(self, executor_with_container):
522+
"""Test that /{RUNDIR_NAME} paths are NOT substituted when using container."""
523+
request = SlurmBatchRequest(
524+
launch_cmd=["sbatch", "--parsable"],
525+
jobs=["test-job"],
526+
command_groups=[["python train.py"]],
527+
executor=executor_with_container,
528+
max_retries=0,
529+
extra_env={},
530+
)
531+
532+
script = request.materialize()
533+
534+
# Should contain /nemo_run paths (not substituted for container mode)
535+
assert f"/{RUNDIR_NAME}" in script
536+
537+
def test_non_container_mode_chdir_points_to_code_directory(self, executor_without_container):
538+
"""Test that --chdir in non-container mode points to the code directory."""
539+
request = SlurmBatchRequest(
540+
launch_cmd=["sbatch", "--parsable"],
541+
jobs=["test-job"],
542+
command_groups=[["python train.py"]],
543+
executor=executor_without_container,
544+
max_retries=0,
545+
extra_env={},
546+
)
547+
548+
script = request.materialize()
549+
550+
# The --chdir should point to {job_dir}/code
551+
expected_chdir = "--chdir /remote/experiments/exp-123/test-job/code"
552+
assert expected_chdir in script

test/core/execution/test_slurm_templates.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def dummy_slurm_request_with_artifact(
5454
account="account",
5555
job_dir="/root/sample_job",
5656
tunnel=LocalTunnel(job_dir="/root"),
57+
container_image="test_image",
5758
)
5859
slurm_config.job_name = "sample_job"
5960
max_retries = 3
@@ -79,6 +80,7 @@ def ft_slurm_request_with_artifact(
7980
account="account",
8081
job_dir="/root/sample_job",
8182
tunnel=LocalTunnel(job_dir="/root/"),
83+
container_image="test_image",
8284
)
8385
slurm_config.job_name = "sample_job"
8486
slurm_config.launcher = FaultTolerance(

0 commit comments

Comments
 (0)