From bfeb3e4c24985c64f9aca4c65e40d49195f4f88c Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 16:14:42 +0000 Subject: [PATCH 1/2] [Diag] Revert app_launcher torch-defer (#5633) to reintroduce atfork bug Surgical hunk-level revert of commit a5eb9add4c3 ("Fixes OmniHub startup in Docker tests", #5633) applied only to source/isaaclab/isaaclab/app/app_launcher.py. Removes the defer-torch mechanism so that `import torch` (and transitively `import numpy`) happens in AppLauncher.__init__ BEFORE SimulationApp's fork() through libomni.platforminfo. If the resolved numpy is 2.3.5, its bundled OpenBLAS pthread_atfork handler will SIGSEGV the canary jobs. Unlike the prior whole-file revert, this preserves PR #5449's `--deterministic` CLI flag and RTX-determinism logic, which landed after #5633 and was wiped as collateral damage in the previous attempt. Companion to the relocated diagnostic conftest in the next commit: source/isaaclab/test/conftest.py prints the resolved numpy + OpenBLAS hash so we can confirm which numpy actually landed. Refs PR #5656 (numpy!=2.3.5 exclusion fix being validated). --- source/isaaclab/isaaclab/app/app_launcher.py | 21 ++++++-------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/source/isaaclab/isaaclab/app/app_launcher.py b/source/isaaclab/isaaclab/app/app_launcher.py index 2bdb8a08932d..dbfac672daf7 100644 --- a/source/isaaclab/isaaclab/app/app_launcher.py +++ b/source/isaaclab/isaaclab/app/app_launcher.py @@ -242,7 +242,6 @@ def __init__(self, launcher_args: argparse.Namespace | dict | None = None, **kwa # Exposed to train scripts self.device_id: int # device ID for GPU simulation (defaults to 0) self.device: str # resolved device string (e.g. 
"cuda:0" or "cpu") - self._deferred_cuda_device_id: int | None = None self.local_rank: int # local rank of GPUs in the current node self.global_rank: int # global rank for multi-node training @@ -251,7 +250,6 @@ def __init__(self, launcher_args: argparse.Namespace | dict | None = None, **kwa # Create SimulationApp, passing the resolved self._config to it for initialization self._create_app() - self._set_deferred_cuda_device() # Load IsaacSim extensions self._load_extensions() @@ -1007,26 +1005,19 @@ def _resolve_device_settings(self, launcher_args: dict): launcher_args["physics_gpu"] = self.device_id launcher_args["active_gpu"] = self.device_id - # Defer importing torch until after SimulationApp starts. Importing - # torch can import NumPy/OpenBLAS, whose at-fork handlers can crash - # Kit's platform-info fork during startup. + # Set the current CUDA device early so that physics backends (e.g. Newton/Warp) + # that allocate on the "current" device during initialization get the correct GPU. + # Without this, all ranks may default to cuda:0 for early allocations. if "cuda" in device: - self._deferred_cuda_device_id = self.device_id + import torch + + torch.cuda.set_device(self.device_id) # Store the resolved device string for downstream consumers (e.g. 
sim_launcher) self.device = device logger.info("Using device: %s", device) - def _set_deferred_cuda_device(self) -> None: - """Set the current torch CUDA device after Kit startup.""" - if self._deferred_cuda_device_id is None: - return - - import torch - - torch.cuda.set_device(self._deferred_cuda_device_id) - def _resolve_experience_file(self, launcher_args: dict): """Resolve experience file related settings.""" # Check if input keywords contain an 'experience' file setting From a6332aaf59fb5e75109604466db363b51d140fde Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:51:04 +0000 Subject: [PATCH 2/2] [Diag] Add numpy/OpenBLAS dep-manifest conftest at repo root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prints the resolved numpy version + bundled OpenBLAS .so filename at pytest session start. Located at the repo root so every subprocess pytest spawned by tools/conftest.py discovers and loads it, regardless of which package's tests are running. Repo root has no isaaclab_* subdirectories, so importmode=prepend placing the repo root on sys.path does NOT shadow the real pip-installed packages — unlike source/conftest.py, where source/<pkg>/ (no __init__.py) gets promoted to a namespace package and breaks `from isaaclab_teleop import IsaacTeleopCfg`-style imports. Companion to the previous commit (app_launcher torch-defer revert). --- conftest.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 conftest.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000000..553e668cbb0b --- /dev/null +++ b/conftest.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Dep-manifest diagnostic: prints numpy version + bundled OpenBLAS hash at pytest session start. 
+ +Located at the repo root so every subprocess pytest (driven by +``tools/conftest.py``) discovers and loads it, regardless of which package's +tests are running. The repo root has no ``isaaclab_*`` subdirectories, so +``importmode=prepend`` placing the repo root on ``sys.path`` does NOT shadow +the real pip-installed IsaacLab packages — unlike ``source/conftest.py``, +where ``source/<pkg>/`` (no ``__init__.py``) would be promoted to a namespace +package and break ``from isaaclab_teleop import IsaacTeleopCfg`` style imports. + +Importing numpy here registers its vendored OpenBLAS ``pthread_atfork`` +handler in the same process that later calls ``fork()`` via +``SimulationApp()``. The print output identifies which numpy + OpenBLAS bundle +actually landed in each CI test container. +""" + +import os + +import numpy + +print(f"\n[dep-manifest] numpy {numpy.__version__}", flush=True) +_libs_dir = os.path.join(os.path.dirname(numpy.__file__), os.pardir, "numpy.libs") +if os.path.isdir(_libs_dir): + for _f in sorted(os.listdir(_libs_dir)): + if "openblas" in _f.lower(): + print(f"[dep-manifest] bundled openblas: {_f}", flush=True)