diff --git a/docs/index.md b/docs/index.md index db5ca539..65a7a9a8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,10 +34,10 @@ To install Skypilot with optional features, use one of the following commands: You can also manually install Skypilot from https://skypilot.readthedocs.io/en/latest/getting-started/installation.html -If using DGX Cloud Lepton, use the following command to install the Lepton CLI: +If using DGX Cloud Lepton, install NeMo Run with the Lepton extra: ```bash -pip install leptonai +pip install "nemo_run[lepton]" ``` To authenticate with the DGX Cloud Lepton cluster, navigate to the **Settings > Tokens** page in the DGX Cloud Lepton UI and copy the ``lep login`` command shown on the page and run it in the terminal. diff --git a/nemo_run/__init__.py b/nemo_run/__init__.py index 0d403a54..45586c91 100644 --- a/nemo_run/__init__.py +++ b/nemo_run/__init__.py @@ -26,7 +26,6 @@ from nemo_run.core.execution.docker import DockerExecutor from nemo_run.core.execution.kubeflow import KubeflowExecutor from nemo_run.core.execution.launcher import FaultTolerance, SlurmRay, SlurmTemplate, Torchrun -from nemo_run.core.execution.lepton import LeptonExecutor from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor from nemo_run.core.execution.slurm import SlurmExecutor @@ -40,6 +39,15 @@ from nemo_run.run.experiment import Experiment from nemo_run.run.plugin import ExperimentPlugin as Plugin + +def __getattr__(name: str): + if name == "LeptonExecutor": + from nemo_run.core.execution.lepton import LeptonExecutor + + return LeptonExecutor + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + __all__ = [ "autoconvert", "cli", diff --git a/nemo_run/core/execution/__init__.py b/nemo_run/core/execution/__init__.py index 08e088c8..5937dfc3 100644 --- a/nemo_run/core/execution/__init__.py +++ b/nemo_run/core/execution/__init__.py @@ -14,12 +14,20 @@ # limitations under the License. from nemo_run.core.execution.dgxcloud import DGXCloudExecutor -from nemo_run.core.execution.lepton import LeptonExecutor from nemo_run.core.execution.local import LocalExecutor from nemo_run.core.execution.kubeflow import KubeflowExecutor from nemo_run.core.execution.skypilot import SkypilotExecutor from nemo_run.core.execution.slurm import SlurmExecutor + +def __getattr__(name: str): + if name == "LeptonExecutor": + from nemo_run.core.execution.lepton import LeptonExecutor + + return LeptonExecutor + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + __all__ = [ "LocalExecutor", "SlurmExecutor", diff --git a/nemo_run/core/execution/lepton.py b/nemo_run/core/execution/lepton.py index 3e3bb916..cbe87e60 100644 --- a/nemo_run/core/execution/lepton.py +++ b/nemo_run/core/execution/lepton.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import base64 import logging import os @@ -22,36 +24,80 @@ import time from dataclasses import dataclass, field from datetime import datetime +from enum import Enum from pathlib import Path from typing import Any, List, Optional, Set, Type from invoke.context import Context -from leptonai.api.v2.client import APIClient -from leptonai.api.v1.types.affinity import LeptonResourceAffinity -from leptonai.api.v1.types.common import Metadata, LeptonVisibility -from leptonai.api.v1.types.dedicated_node_group import DedicatedNodeGroup -from leptonai.api.v1.types.deployment import ( - EnvVar, - EnvValue, - LeptonContainer, - Mount, -) -from leptonai.api.v1.types.job import ( - LeptonJob, - LeptonJobState, - LeptonJobUserSpec, - ReservationConfig, -) -from leptonai.api.v1.types.replica import Replica from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor, ExecutorMacros from nemo_run.core.packaging.base import Packager from nemo_run.core.packaging.git import GitArchivePackager +_LEPTON_IMPORT_ERROR: ImportError | None = None +_LEPTON_AVAILABLE = False + +try: + from leptonai.api.v1.types.affinity import LeptonResourceAffinity + from leptonai.api.v1.types.common import LeptonVisibility, Metadata + from leptonai.api.v1.types.dedicated_node_group import DedicatedNodeGroup + from leptonai.api.v1.types.deployment import ( + EnvVar, + EnvValue, + LeptonContainer, + Mount, + ) + from leptonai.api.v1.types.job import ( + LeptonJob, + LeptonJobState, + LeptonJobUserSpec, + ReservationConfig, + ) + from leptonai.api.v1.types.replica import Replica + from leptonai.api.v2.client import APIClient + + _LEPTON_AVAILABLE = True +except ImportError as e: + _LEPTON_IMPORT_ERROR = e + + class LeptonJobState(Enum): + Starting = "Starting" + Running = "Running" + Failed = "Failed" + Completed = "Completed" + Deleting = "Deleting" + Restarting = "Restarting" + Archived = "Archived" + Stopped = "Stopped" + Stopping = "Stopping" + Unknown = "Unknown" + + APIClient = None + DedicatedNodeGroup = None + EnvValue = None + EnvVar = None + LeptonContainer = None + LeptonJob = None + LeptonJobUserSpec = None + LeptonResourceAffinity = None + LeptonVisibility = None + Metadata = None + Mount = None + Replica = None + ReservationConfig = None + logger = logging.getLogger(__name__) +def _require_leptonai() -> None: + if not _LEPTON_AVAILABLE: + raise ImportError( + "leptonai package is required for LeptonExecutor. " + 'Install it with: pip install "nemo_run[lepton]"' + ) from _LEPTON_IMPORT_ERROR + + @dataclass(kw_only=True) class LeptonExecutor(Executor): """ @@ -84,6 +130,9 @@ class LeptonExecutor(Executor): head_resource_shape: Optional[str] = "" # Only used for LeptonRayCluster ray_version: Optional[str] = None # Only used for LeptonRayCluster + def __post_init__(self) -> None: + _require_leptonai() + def stop_job(self, job_id: str): """ Send a stop signal to the requested job @@ -376,6 +425,7 @@ def cancel(self, job_id: str): @classmethod def logs(cls: Type["LeptonExecutor"], app_id: str, fallback_path: Optional[str]): + _require_leptonai() client = APIClient() # Get the first replica from the job which contains the job logs diff --git a/nemo_run/run/ray/cluster.py b/nemo_run/run/ray/cluster.py index a15bdc55..21a6ba06 100644 --- a/nemo_run/run/ray/cluster.py +++ b/nemo_run/run/ray/cluster.py @@ -17,12 +17,21 @@ from typing import Optional, Type from nemo_run.core.execution.base import Executor -from nemo_run.core.execution.lepton import LeptonExecutor from nemo_run.core.execution.slurm import SlurmExecutor from nemo_run.core.frontend.console.api import configure_logging -from nemo_run.run.ray.lepton import LeptonRayCluster from nemo_run.run.ray.slurm import SlurmRayCluster +# Import guard for Lepton dependencies +try: + from nemo_run.core.execution.lepton import LeptonExecutor + from nemo_run.run.ray.lepton import LeptonRayCluster + + _LEPTON_RAY_AVAILABLE = True +except ImportError: + LeptonExecutor = None + LeptonRayCluster = None + _LEPTON_RAY_AVAILABLE = False + # Import guard for Kubernetes dependencies try: from nemo_run.core.execution.kuberay import KubeRayExecutor @@ -45,9 +54,11 @@ def __post_init__(self): configure_logging(level=self.log_level) backend_map: dict[Type[Executor], Type] = { SlurmExecutor: SlurmRayCluster, - LeptonExecutor: LeptonRayCluster, } + if _LEPTON_RAY_AVAILABLE and LeptonExecutor is not None and LeptonRayCluster is not None: + backend_map[LeptonExecutor] = LeptonRayCluster + if _KUBERAY_AVAILABLE and KubeRayExecutor is not None and KubeRayCluster is not None: backend_map[KubeRayExecutor] = KubeRayCluster diff --git a/nemo_run/run/ray/job.py b/nemo_run/run/ray/job.py index 8c608a3f..af2ac9a9 100644 --- a/nemo_run/run/ray/job.py +++ b/nemo_run/run/ray/job.py @@ -17,12 +17,21 @@ from typing import Any, Optional, Type from nemo_run.core.execution.base import Executor -from nemo_run.core.execution.lepton import LeptonExecutor from nemo_run.core.execution.slurm import SlurmExecutor from nemo_run.core.frontend.console.api import configure_logging -from nemo_run.run.ray.lepton import LeptonRayJob from nemo_run.run.ray.slurm import SlurmRayJob +# Import guard for Lepton dependencies +try: + from nemo_run.core.execution.lepton import LeptonExecutor + from nemo_run.run.ray.lepton import LeptonRayJob + + _LEPTON_RAY_AVAILABLE = True +except ImportError: + LeptonExecutor = None + LeptonRayJob = None + _LEPTON_RAY_AVAILABLE = False + # Import guard for Kubernetes dependencies try: from nemo_run.core.execution.kuberay import KubeRayExecutor @@ -49,10 +58,12 @@ class RayJob: def __post_init__(self) -> None: # noqa: D401 – simple implementation configure_logging(level=self.log_level) backend_map: dict[Type[Executor], Type[Any]] = { - LeptonExecutor: LeptonRayJob, SlurmExecutor: SlurmRayJob, } + if _LEPTON_RAY_AVAILABLE and LeptonExecutor is not None and LeptonRayJob is not None: + backend_map[LeptonExecutor] = LeptonRayJob + if _KUBERAY_AVAILABLE and KubeRayExecutor is not None and KubeRayJob is not None: backend_map[KubeRayExecutor] = KubeRayJob @@ -62,7 +73,7 @@ def __post_init__(self) -> None: # noqa: D401 – simple implementation backend_cls = backend_map[self.executor.__class__] self.backend = backend_cls(name=self.name, executor=self.executor) - if isinstance(self.executor, LeptonExecutor): + if LeptonExecutor is not None and isinstance(self.executor, LeptonExecutor): self.backend.cluster_name = self.cluster_name self.backend.cluster_ready_timeout = self.cluster_ready_timeout diff --git a/nemo_run/run/ray/lepton.py b/nemo_run/run/ray/lepton.py index 0639616e..2b89e528 100644 --- a/nemo_run/run/ray/lepton.py +++ b/nemo_run/run/ray/lepton.py @@ -13,33 +13,57 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import asyncio import json import logging import sys import time -import urllib3 import warnings from dataclasses import dataclass -from ray.job_submission import JobSubmissionClient -from rich.pretty import pretty_repr from typing import Any, Optional, TypeAlias -from leptonai.api.v1.types.affinity import LeptonResourceAffinity -from leptonai.api.v1.types.dedicated_node_group import DedicatedNodeGroup -from leptonai.api.v1.types.deployment import EnvVar, EnvValue - -from nemo_run.core.execution.lepton import LeptonExecutor +import urllib3 +from rich.pretty import pretty_repr -from leptonai.api.v2.client import APIClient -from leptonai.api.v1.types.raycluster import ( - LeptonRayCluster as LeptonRayClusterSpec, - LeptonRayClusterUserSpec, - Metadata, - RayHeadGroupSpec, - RayWorkerGroupSpec, -) -from leptonai.cli.raycluster import DEFAULT_RAY_IMAGE +from nemo_run.core.execution.lepton import LeptonExecutor, _require_leptonai + +_RAY_IMPORT_ERROR: ImportError | None = None +_RAY_AVAILABLE = False +try: + from ray.job_submission import JobSubmissionClient + + _RAY_AVAILABLE = True +except ImportError as e: + _RAY_IMPORT_ERROR = e + JobSubmissionClient = None + +try: + from leptonai.api.v1.types.affinity import LeptonResourceAffinity + from leptonai.api.v1.types.dedicated_node_group import DedicatedNodeGroup + from leptonai.api.v1.types.deployment import EnvVar, EnvValue + from leptonai.api.v1.types.raycluster import ( + LeptonRayCluster as LeptonRayClusterSpec, + LeptonRayClusterUserSpec, + Metadata, + RayHeadGroupSpec, + RayWorkerGroupSpec, + ) + from leptonai.api.v2.client import APIClient + from leptonai.cli.raycluster import DEFAULT_RAY_IMAGE +except ImportError: + APIClient = None + DEFAULT_RAY_IMAGE = None + DedicatedNodeGroup = None + EnvValue = None + EnvVar = None + LeptonRayClusterSpec = None + LeptonRayClusterUserSpec = None + LeptonResourceAffinity = None + Metadata = None + RayHeadGroupSpec = None + RayWorkerGroupSpec = None noquote: TypeAlias = str @@ -49,6 +73,19 @@ RAY_NOT_READY_STATE = "Not Ready" +def _require_ray() -> None: + if not _RAY_AVAILABLE: + raise ImportError( + "ray is required for Lepton Ray helpers. " + 'Install it with: pip install "nemo_run[lepton]"' + ) from _RAY_IMPORT_ERROR + + +def _require_lepton_ray() -> None: + _require_leptonai() + _require_ray() + + @dataclass(kw_only=True) class LeptonRayCluster: EXECUTOR_CLS = LeptonExecutor @@ -57,6 +94,7 @@ class LeptonRayCluster: executor: LeptonExecutor def __post_init__(self): + _require_lepton_ray() self.cluster_map: dict[str, str] = {} def _node_group_id(self, client: APIClient) -> DedicatedNodeGroup: @@ -374,6 +412,7 @@ class LeptonRayJob: # Internals # --------------------------------------------------------------------- def __post_init__(self): + _require_lepton_ray() self.submission_id = None def _get_last_submission_id(self) -> Optional[int]: diff --git a/nemo_run/run/torchx_backend/schedulers/lepton.py b/nemo_run/run/torchx_backend/schedulers/lepton.py index 0b012c19..b6e89ca2 100644 --- a/nemo_run/run/torchx_backend/schedulers/lepton.py +++ b/nemo_run/run/torchx_backend/schedulers/lepton.py @@ -24,13 +24,12 @@ import fiddle as fdl import fiddle._src.experimental.dataclasses as fdl_dc -from leptonai.api.v1.types.job import LeptonJobState from torchx.schedulers.api import AppDryRunInfo, DescribeAppResponse, ListAppResponse, Scheduler from torchx.specs import AppDef, AppState, ReplicaStatus, Role, RoleStatus, runopts from nemo_run.config import get_nemorun_home from nemo_run.core.execution.base import Executor -from nemo_run.core.execution.lepton import LeptonExecutor +from nemo_run.core.execution.lepton import LeptonExecutor, LeptonJobState, _require_leptonai from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer from nemo_run.run.torchx_backend.schedulers.api import SchedulerMixin @@ -70,6 +69,7 @@ class LeptonRequest: class LeptonScheduler(SchedulerMixin, Scheduler[dict[str, str]]): # type: ignore def __init__(self, session_name: str) -> None: super().__init__("lepton", session_name) + _require_leptonai() def _run_opts(self) -> runopts: opts = runopts() @@ -86,9 +86,8 @@ def _submit_dryrun( # type: ignore app: AppDef, cfg: Executor, ) -> AppDryRunInfo[LeptonRequest]: - assert isinstance(cfg, LeptonExecutor), ( - f"{cfg.__class__} not supported for Lepton scheduler." - ) + if not isinstance(cfg, LeptonExecutor): + raise AssertionError(f"{cfg.__class__} not supported for Lepton scheduler.") executor = cfg assert len(app.roles) == 1, "Only single-role apps are supported." diff --git a/pyproject.toml b/pyproject.toml index c23f4d9e..6a07885b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ dependencies = [ "jinja2>=3.1.4", "networkx >= 3.3", "omegaconf>=2.3.0", - "leptonai>=0.26.6", "toml", ] readme = "README.md" @@ -50,6 +49,9 @@ skypilot_jobs = "nemo_run.run.torchx_backend.schedulers.skypilot_jobs:create_sch kubeflow = "nemo_run.run.torchx_backend.schedulers.kubeflow:create_scheduler" [project.optional-dependencies] +lepton = [ + "leptonai>=0.26.6", +] skypilot = [ "skypilot[kubernetes]>=0.10.0", ] diff --git a/test/core/execution/test_lepton.py b/test/core/execution/test_lepton.py index 4d725133..7c391a02 100644 --- a/test/core/execution/test_lepton.py +++ b/test/core/execution/test_lepton.py @@ -19,18 +19,21 @@ from unittest.mock import MagicMock, mock_open, patch import pytest -from leptonai.api.v1.types.common import LeptonVisibility, Metadata -from leptonai.api.v1.types.deployment import ( + +pytest.importorskip("leptonai") + +from leptonai.api.v1.types.common import LeptonVisibility, Metadata # noqa: E402 +from leptonai.api.v1.types.deployment import ( # noqa: E402 LeptonContainer, LeptonResourceAffinity, Mount, EnvVar, EnvValue, ) -from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec +from leptonai.api.v1.types.job import LeptonJob, LeptonJobUserSpec # noqa: E402 -from nemo_run.core.execution.lepton import LeptonExecutor, LeptonJobState -from nemo_run.core.packaging.git import GitArchivePackager +from nemo_run.core.execution.lepton import LeptonExecutor, LeptonJobState # noqa: E402 +from nemo_run.core.packaging.git import GitArchivePackager # noqa: E402 class MockLeptonJob: diff --git a/test/core/execution/test_lepton_optional_dependency.py b/test/core/execution/test_lepton_optional_dependency.py new file mode 100644 index 00000000..65eba7b1 --- /dev/null +++ b/test/core/execution/test_lepton_optional_dependency.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import sys +import textwrap +from pathlib import Path + + +REPO_ROOT = Path(__file__).parents[3] + + +def _run_with_blocked_leptonai(code: str) -> subprocess.CompletedProcess[str]: + blocker = """ +import importlib.abc +import sys + + +class BlockLeptonai(importlib.abc.MetaPathFinder): + def find_spec(self, fullname, path=None, target=None): + if fullname == "leptonai" or fullname.startswith("leptonai."): + raise ModuleNotFoundError("No module named 'leptonai'") + return None + + +sys.meta_path.insert(0, BlockLeptonai()) +""" + script = blocker + "\n" + textwrap.dedent(code) + return subprocess.run( + [sys.executable, "-c", script], + cwd=REPO_ROOT, + text=True, + capture_output=True, + check=False, + ) + + +def test_nemo_run_import_without_leptonai() -> None: + result = _run_with_blocked_leptonai( + """ + import sys + + import nemo_run as run + from nemo_run import LeptonExecutor as PublicLeptonExecutor + from nemo_run.core.execution import LeptonExecutor as ExecutionLeptonExecutor + + assert run.LocalExecutor.__name__ == "LocalExecutor" + assert run.LeptonExecutor.__name__ == "LeptonExecutor" + assert PublicLeptonExecutor is run.LeptonExecutor + assert ExecutionLeptonExecutor is run.LeptonExecutor + assert "leptonai" not in sys.modules + + try: + run.LeptonExecutor(container_image="image", nemo_run_dir="/nemo") + except ImportError as e: + assert "nemo_run[lepton]" in str(e) + else: + raise AssertionError("LeptonExecutor should require the lepton extra") + """ + ) + + assert result.returncode == 0, result.stderr + + +def test_scheduler_and_ray_modules_import_without_leptonai() -> None: + result = _run_with_blocked_leptonai( + """ + import sys + + from nemo_run.core.execution.lepton import LeptonExecutor + from nemo_run.run.torchx_backend.schedulers.api import REVERSE_EXECUTOR_MAPPING + import nemo_run.run.ray.cluster + import nemo_run.run.ray.job + import nemo_run.run.ray.lepton + + assert REVERSE_EXECUTOR_MAPPING["lepton"] is LeptonExecutor + assert "leptonai" not in sys.modules + """ + ) + + assert result.returncode == 0, result.stderr diff --git a/test/run/ray/test_lepton.py b/test/run/ray/test_lepton.py index 3728c7ce..50794b46 100644 --- a/test/run/ray/test_lepton.py +++ b/test/run/ray/test_lepton.py @@ -18,9 +18,15 @@ import sys from unittest.mock import MagicMock, patch -from leptonai.api.v1.types.raycluster import LeptonRayClusterState, LeptonRayClusterStatus import pytest +pytest.importorskip("leptonai") + +from leptonai.api.v1.types.raycluster import ( # noqa: E402 + LeptonRayClusterState, + LeptonRayClusterStatus, +) + ######################################################## # Given the LeptonRayCluster and LeptonRayJob tests rely on the *ray* # Python module, we need to create a shim to ensure the official ray diff --git a/uv.lock b/uv.lock index 5bbeda1d..11ed4466 100644 --- a/uv.lock +++ b/uv.lock @@ -4506,7 +4506,6 @@ dependencies = [ { name = "fiddle" }, { name = "inquirerpy" }, { name = "jinja2" }, - { name = "leptonai" }, { name = "networkx" }, { name = "omegaconf" }, { name = "rich" }, @@ -4519,6 +4518,9 @@ dependencies = [ kubeflow = [ { name = "kubernetes" }, ] +lepton = [ + { name = "leptonai" }, +] ray = [ { name = "kubernetes" }, { name = "ray", extra = ["default"] }, @@ -4568,7 +4570,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.4" }, { name = "kubernetes", marker = "extra == 'kubeflow'" }, { name = "kubernetes", marker = "extra == 'ray'" }, - { name = "leptonai", specifier = ">=0.26.6" }, + { name = "leptonai", marker = "extra == 'lepton'", specifier = ">=0.26.6" }, { name = "networkx", specifier = ">=3.3" }, { name = "omegaconf", specifier = ">=2.3.0" }, { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = ">=2.49.2" }, @@ -4579,7 +4581,7 @@ requires-dist = [ { name = "torchx", specifier = ">=0.7.0" }, { name = "typer", specifier = ">=0.12.3" }, ] -provides-extras = ["skypilot", "skypilot-all", "ray", "kubeflow"] +provides-extras = ["lepton", "skypilot", "skypilot-all", "ray", "kubeflow"] [package.metadata.requires-dev] dev = [