From 9a1b36e36e6debea9ffdd960631fd7d211a3df59 Mon Sep 17 00:00:00 2001 From: Peru S Date: Mon, 2 Feb 2026 12:29:35 -0800 Subject: [PATCH 1/2] Ignore libs/bins --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index f9082380e..b2372b5dc 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,5 @@ pufferlib/ocean/impulse_wars/*-release/ pufferlib/ocean/impulse_wars/debug-*/ pufferlib/ocean/impulse_wars/release-*/ pufferlib/ocean/impulse_wars/benchmark/ +*.a +*.o From 6395468ac58cb3329c181ca9c1cc01613fa59956 Mon Sep 17 00:00:00 2001 From: Peru S Date: Mon, 2 Feb 2026 15:01:46 -0800 Subject: [PATCH 2/2] Make DEBUG=1 builds work. Adds Torch error catching (to DEBUG mode). Looks like this: ``` *Error from libtorch: The size of tensor a (8192) must match the size of tensor b (4096) at non-singleton dimension 0 Exception raised from infer_size_impl at /pytorch/aten/src/ATen/ExpandUtils.cpp:31 (most recent call first): frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x747230129fdd in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libc10.so) frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0xc3 (0x7472300bf561 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libc10.so) frame #2: at::infer_size_dimvector(c10::ArrayRef, c10::ArrayRef) + 0x404 (0x74718fc9edd4 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #3: at::TensorIteratorBase::compute_shape(at::TensorIteratorConfig const&) + 0x110 (0x74718fd438b0 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #4: at::TensorIteratorBase::build(at::TensorIteratorConfig&) + 0x59 (0x74718fd48ce9 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #5: + 0x1e1e2b7 
(0x74719016a2b7 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #6: at::native::copy_(at::Tensor&, at::Tensor const&, bool) + 0x57 (0x74719016c917 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #7: + 0x692dddc (0x747194c79ddc in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #8: + 0x6931b8f (0x747194c7db8f in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #9: at::_ops::copy_::call(at::Tensor&, at::Tensor const&, bool) + 0x18a (0x747190fdaeaa in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #10: + 0xd3ffe (0x747107beeffe in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #11: + 0x101097 (0x747107c1c097 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #12: + 0x10476b (0x747107c1f76b in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #13: + 0x17d012 (0x747107c98012 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #14: + 0x15e093 (0x747107c79093 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #15: + 0x13e98f (0x747107c5998f in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #16: std::function::operator()() const + 0x36 (0x747107c2ec10 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #17: + 0x103668 (0x747107c1e668 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #18: + 0x10589a (0x747107c2089a in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #19: + 0x109023 (0x747107c24023 in 
/home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #20: + 0x199dfb (0x747107cb4dfb in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #21: + 0x180921 (0x747107c9b921 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #22: + 0x163a79 (0x747107c7ea79 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #23: + 0x163da3 (0x747107c7eda3 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #24: + 0xcceae (0x747107be7eae in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #42: + 0x2a1ca (0x747272c721ca in /lib/x86_64-linux-gnu/libc.so.6) frame #43: __libc_start_main + 0x8b (0x747272c7228b in /lib/x86_64-linux-gnu/libc.so.6) Illegal instruction (core dumped) ``` This is particularly important for the individual threads that won't be caught by the Python main thread catch-all. --- pufferlib/extensions/pufferlib.cpp | 50 ++++++++++++++++++++++++++++++ setup.py | 17 ++++++++-- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp index cfd05a1f9..7049db96e 100644 --- a/pufferlib/extensions/pufferlib.cpp +++ b/pufferlib/extensions/pufferlib.cpp @@ -30,6 +30,44 @@ typedef torch::Tensor Tensor; +#if defined(PUFFER_DEBUG) + +inline static void PUFFER_ASSERT_BREAK() +{ +#if defined(_MSC_VER) + // assert (abort) does not break into the debugger in VS 2022 ! It's insane, so we have to use this weird contraption that's cross platform. + __debugbreak(); +#elif defined(__clang__) || defined(__GNUC__) + __builtin_trap(); +#else + /* Fallback method */ + *((volatile int*)0) = 0; /* This will cause a segmentation fault */ +#endif +} +#endif + +// LibTorch throws exceptions on errors, log them correctly in debug mode only. 
+#if PUFFER_DEBUG +#define BEGIN_LIBTORCH_CATCH try { +#else +#define BEGIN_LIBTORCH_CATCH +#endif + +#if PUFFER_DEBUG +#define END_LIBTORCH_CATCH \ + } \ + catch (const c10::Error& e) \ + { \ + std::cerr << "Error from libtorch: " << e.what() << std::endl; \ + PUFFER_ASSERT_BREAK(); \ + throw; \ + } + +#else +#define END_LIBTORCH_CATCH +#endif + + // CUDA kernel wrappers #include "modules.cpp" @@ -428,6 +466,7 @@ void train_forward_call(TrainGraph& graph, PolicyMinGRU* policy_bf16, PolicyMinG // Capture with shared memory pool void capture_graph(at::cuda::CUDAGraph* graph, std::function func, at::cuda::MempoolId_t pool) { + BEGIN_LIBTORCH_CATCH /* Checklist for avoiding diabolical capture bugs: * 1. Don't start separate streams before tracing (i.e. env gpu buffers) * 2. Make sure input/output buffer pointers don't change @@ -455,6 +494,7 @@ void capture_graph(at::cuda::CUDAGraph* graph, std::function func, cudaDeviceSynchronize(); at::cuda::setCurrentCUDAStream(current_stream); + END_LIBTORCH_CATCH } @@ -486,12 +526,15 @@ void compute_advantage(RolloutBuf& rollouts, Tensor& advantages, HypersT& hypers // Thread initialization callback - sets CUDA stream once per thread extern "C" void thread_init_wrapper(void* ctx, int buf) { + BEGIN_LIBTORCH_CATCH PuffeRL* pufferl = (PuffeRL*)ctx; at::cuda::setCurrentCUDAStream(pufferl->torch_streams[buf]); + END_LIBTORCH_CATCH } // Callback for OMP threadmanager - runs policy forward for one (buf, t) step extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { + BEGIN_LIBTORCH_CATCH torch::NoGradGuard no_grad; PuffeRL* pufferl = (PuffeRL*)ctx; HypersT& hypers = pufferl->hypers; @@ -504,9 +547,11 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { fused_rollout_step(*pufferl, t, buf); } profile_end(hypers.profile); + END_LIBTORCH_CATCH } std::unique_ptr create_pufferl_impl(HypersT& hypers, const std::string& env_name, Dict* vec_kwargs, Dict* env_kwargs) { + BEGIN_LIBTORCH_CATCH auto pufferl = 
std::make_unique(); pufferl->hypers = hypers; @@ -673,6 +718,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s static_vec_reset(vec); return pufferl; + END_LIBTORCH_CATCH } std::tuple compute_prio(Tensor& advantages, @@ -707,6 +753,7 @@ void train_select_and_copy(TrainGraph& graph, RolloutBuf& rollouts, } void rollouts_impl(PuffeRL& pufferl) { + BEGIN_LIBTORCH_CATCH torch::NoGradGuard no_grad; HypersT& hypers = pufferl.hypers; @@ -732,10 +779,12 @@ void rollouts_impl(PuffeRL& pufferl) { env_send(pufferl, buf); profile_end(hypers.profile); } + END_LIBTORCH_CATCH } void train_impl(PuffeRL& pufferl) { + BEGIN_LIBTORCH_CATCH // Update to HypersT& p HypersT& hypers = pufferl.hypers; @@ -858,6 +907,7 @@ void train_impl(PuffeRL& pufferl) { */ //double explained_var = (var_y.abs() < 1e-8) ? NAN : (1 - (y_true - y_pred).var() / var_y).item(); cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()); + END_LIBTORCH_CATCH } // Profiler control for nsys --capture-range=cudaProfilerApi diff --git a/setup.py b/setup.py index 9a2c5abd9..dfb95488c 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,9 @@ NO_OCEAN = os.getenv("NO_OCEAN", "0") == "1" NO_TRAIN = os.getenv("NO_TRAIN", "0") == "1" +if DEBUG: + print("*****Building in DEBUG mode*******") + # Build raylib for your platform RAYLIB_URL = 'https://github.com/raysan5/raylib/releases/download/5.5/' RAYLIB_NAME = 'raylib-5.5_macos' if platform.system() == "Darwin" else 'raylib-5.5_linux_amd64' @@ -101,6 +104,12 @@ def download_box2d(platform): extra_compile_args += [ '-O0', '-g', + '-flto=auto', + '-fno-semantic-interposition', + '-fvisibility=hidden', + '-DPUFFER_DEBUG=1', + '-DDEBUG=1', + #'-fsanitize=address,undefined,bounds,pointer-overflow,leak', #'-fno-omit-frame-pointer', ] @@ -315,11 +324,15 @@ def run(self): static_obj = f'pufferlib/extensions/libstatic_{env_name}.o' clang_cmd = [ - 'clang', '-c', '-O2', '-DNDEBUG', + 'clang', '-c', + ('-O0' if DEBUG else '-O2'), + ('-DDEBUG' if DEBUG else '-DNDEBUG'), 
'-I.', '-Ipufferlib/extensions', f'-Ipufferlib/ocean/{env_name}', f'-I./{RAYLIB_NAME}/include', '-I/usr/local/cuda/include', '-DPLATFORM_DESKTOP', - '-fno-semantic-interposition', '-fvisibility=hidden', + *(['-DPUFFER_DEBUG=1'] if DEBUG else []), + '-fno-semantic-interposition', + ('-fvisibility=default' if DEBUG else '-fvisibility=hidden'), '-fPIC', '-fopenmp', env_binding_src, '-o', static_obj ]