From 9a1b36e36e6debea9ffdd960631fd7d211a3df59 Mon Sep 17 00:00:00 2001 From: Peru S Date: Mon, 2 Feb 2026 12:29:35 -0800 Subject: [PATCH 1/2] Ignore libs/bins --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index f9082380e..b2372b5dc 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,5 @@ pufferlib/ocean/impulse_wars/*-release/ pufferlib/ocean/impulse_wars/debug-*/ pufferlib/ocean/impulse_wars/release-*/ pufferlib/ocean/impulse_wars/benchmark/ +*.a +*.o From 6395468ac58cb3329c181ca9c1cc01613fa59956 Mon Sep 17 00:00:00 2001 From: Peru S Date: Mon, 2 Feb 2026 15:01:46 -0800 Subject: [PATCH 2/2] Make DEBUG=1 builds work. Adds Torch error catching (to DEBUG mode). Looks like this: ``` *Error from libtorch: The size of tensor a (8192) must match the size of tensor b (4096) at non-singleton dimension 0 Exception raised from infer_size_impl at /pytorch/aten/src/ATen/ExpandUtils.cpp:31 (most recent call first): frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x747230129fdd in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libc10.so) frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0xc3 (0x7472300bf561 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libc10.so) frame #2: at::infer_size_dimvector(c10::ArrayRef, c10::ArrayRef) + 0x404 (0x74718fc9edd4 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #3: at::TensorIteratorBase::compute_shape(at::TensorIteratorConfig const&) + 0x110 (0x74718fd438b0 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #4: at::TensorIteratorBase::build(at::TensorIteratorConfig&) + 0x59 (0x74718fd48ce9 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #5: + 0x1e1e2b7 
(0x74719016a2b7 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #6: at::native::copy_(at::Tensor&, at::Tensor const&, bool) + 0x57 (0x74719016c917 in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #7: + 0x692dddc (0x747194c79ddc in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #8: + 0x6931b8f (0x747194c7db8f in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #9: at::_ops::copy_::call(at::Tensor&, at::Tensor const&, bool) + 0x18a (0x747190fdaeaa in /home/peru/repo/thirdparty/puffer/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so) frame #10: + 0xd3ffe (0x747107beeffe in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #11: + 0x101097 (0x747107c1c097 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #12: + 0x10476b (0x747107c1f76b in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #13: + 0x17d012 (0x747107c98012 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #14: + 0x15e093 (0x747107c79093 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #15: + 0x13e98f (0x747107c5998f in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #16: std::function::operator()() const + 0x36 (0x747107c2ec10 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #17: + 0x103668 (0x747107c1e668 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #18: + 0x10589a (0x747107c2089a in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #19: + 0x109023 (0x747107c24023 in 
/home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #20: + 0x199dfb (0x747107cb4dfb in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #21: + 0x180921 (0x747107c9b921 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #22: + 0x163a79 (0x747107c7ea79 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #23: + 0x163da3 (0x747107c7eda3 in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #24: + 0xcceae (0x747107be7eae in /home/peru/repo/thirdparty/PufferLib/pufferlib/_C.cpython-313-x86_64-linux-gnu.so) frame #42: + 0x2a1ca (0x747272c721ca in /lib/x86_64-linux-gnu/libc.so.6) frame #43: __libc_start_main + 0x8b (0x747272c7228b in /lib/x86_64-linux-gnu/libc.so.6) Illegal instruction (core dumped) ``` This is particularly important for the individual threads that won't be caught by the Python main thread catch-all. --- pufferlib/extensions/pufferlib.cpp | 50 ++++++++++++++++++++++++++++++ setup.py | 17 ++++++++-- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/pufferlib/extensions/pufferlib.cpp b/pufferlib/extensions/pufferlib.cpp index cfd05a1f9..7049db96e 100644 --- a/pufferlib/extensions/pufferlib.cpp +++ b/pufferlib/extensions/pufferlib.cpp @@ -30,6 +30,44 @@ typedef torch::Tensor Tensor; +#if defined(PUFFER_DEBUG) + +inline static void PUFFER_ASSERT_BREAK() +{ +#if defined(_MSC_VER) + // assert (abort) does not break into the debugger in VS 2022 ! It's insane, so we have to use this weird contraption that's cross platform. + __debugbreak(); +#elif defined(__clang__) || defined(__GNUC__) + __builtin_trap(); +#else + /* Fallback method */ + *((volatile int*)0) = 0; /* This will cause a segmentation fault */ +#endif +} +#endif + +// LibTorch throws exceptions on errors, log them correctly in debug mode only. 
+#if PUFFER_DEBUG +#define BEGIN_LIBTORCH_CATCH try { +#else +#define BEGIN_LIBTORCH_CATCH +#endif + +#if PUFFER_DEBUG +#define END_LIBTORCH_CATCH \ + } \ + catch (const c10::Error& e) \ + { \ + std::cerr << "Error from libtorch: " << e.what() << std::endl; \ + PUFFER_ASSERT_BREAK(); \ + throw; \ + } + +#else +#define END_LIBTORCH_CATCH +#endif + + // CUDA kernel wrappers #include "modules.cpp" @@ -428,6 +466,7 @@ void train_forward_call(TrainGraph& graph, PolicyMinGRU* policy_bf16, PolicyMinG // Capture with shared memory pool void capture_graph(at::cuda::CUDAGraph* graph, std::function func, at::cuda::MempoolId_t pool) { + BEGIN_LIBTORCH_CATCH /* Checklist for avoiding diabolical capture bugs: * 1. Don't start separate streams before tracing (i.e. env gpu buffers) * 2. Make sure input/output buffer pointers don't change @@ -455,6 +494,7 @@ void capture_graph(at::cuda::CUDAGraph* graph, std::function func, cudaDeviceSynchronize(); at::cuda::setCurrentCUDAStream(current_stream); + END_LIBTORCH_CATCH } @@ -486,12 +526,15 @@ void compute_advantage(RolloutBuf& rollouts, Tensor& advantages, HypersT& hypers // Thread initialization callback - sets CUDA stream once per thread extern "C" void thread_init_wrapper(void* ctx, int buf) { + BEGIN_LIBTORCH_CATCH PuffeRL* pufferl = (PuffeRL*)ctx; at::cuda::setCurrentCUDAStream(pufferl->torch_streams[buf]); + END_LIBTORCH_CATCH } // Callback for OMP threadmanager - runs policy forward for one (buf, t) step extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { + BEGIN_LIBTORCH_CATCH torch::NoGradGuard no_grad; PuffeRL* pufferl = (PuffeRL*)ctx; HypersT& hypers = pufferl->hypers; @@ -504,9 +547,11 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { fused_rollout_step(*pufferl, t, buf); } profile_end(hypers.profile); + END_LIBTORCH_CATCH } std::unique_ptr create_pufferl_impl(HypersT& hypers, const std::string& env_name, Dict* vec_kwargs, Dict* env_kwargs) { + BEGIN_LIBTORCH_CATCH auto pufferl = 
std::make_unique(); pufferl->hypers = hypers; @@ -673,6 +718,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, const s static_vec_reset(vec); return pufferl; + END_LIBTORCH_CATCH } std::tuple compute_prio(Tensor& advantages, @@ -707,6 +753,7 @@ void train_select_and_copy(TrainGraph& graph, RolloutBuf& rollouts, } void rollouts_impl(PuffeRL& pufferl) { + BEGIN_LIBTORCH_CATCH torch::NoGradGuard no_grad; HypersT& hypers = pufferl.hypers; @@ -732,10 +779,12 @@ void rollouts_impl(PuffeRL& pufferl) { env_send(pufferl, buf); profile_end(hypers.profile); } + END_LIBTORCH_CATCH } void train_impl(PuffeRL& pufferl) { + BEGIN_LIBTORCH_CATCH // Update to HypersT& p HypersT& hypers = pufferl.hypers; @@ -858,6 +907,7 @@ void train_impl(PuffeRL& pufferl) { */ //double explained_var = (var_y.abs() < 1e-8) ? NAN : (1 - (y_true - y_pred).var() / var_y).item(); cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()); + END_LIBTORCH_CATCH } // Profiler control for nsys --capture-range=cudaProfilerApi diff --git a/setup.py b/setup.py index 9a2c5abd9..dfb95488c 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,9 @@ NO_OCEAN = os.getenv("NO_OCEAN", "0") == "1" NO_TRAIN = os.getenv("NO_TRAIN", "0") == "1" +if DEBUG: + print("*****Building in DEBUG mode*******") + # Build raylib for your platform RAYLIB_URL = 'https://github.com/raysan5/raylib/releases/download/5.5/' RAYLIB_NAME = 'raylib-5.5_macos' if platform.system() == "Darwin" else 'raylib-5.5_linux_amd64' @@ -101,6 +104,12 @@ def download_box2d(platform): extra_compile_args += [ '-O0', '-g', + '-flto=auto', + '-fno-semantic-interposition', + '-fvisibility=hidden', + '-DPUFFER_DEBUG=1', + '-DDEBUG=1', + #'-fsanitize=address,undefined,bounds,pointer-overflow,leak', #'-fno-omit-frame-pointer', ] @@ -315,11 +324,15 @@ def run(self): static_obj = f'pufferlib/extensions/libstatic_{env_name}.o' clang_cmd = [ - 'clang', '-c', '-O2', '-DNDEBUG', + 'clang', '-c', + ('-O0' if DEBUG else '-O2'), + ('-DDEBUG' if DEBUG else '-DNDEBUG'), 
'-I.', '-Ipufferlib/extensions', f'-Ipufferlib/ocean/{env_name}', f'-I./{RAYLIB_NAME}/include', '-I/usr/local/cuda/include', '-DPLATFORM_DESKTOP', - '-fno-semantic-interposition', '-fvisibility=hidden', + *(['-DPUFFER_DEBUG=1'] if DEBUG else []), + '-fno-semantic-interposition', + ('-fvisibility=default' if DEBUG else '-fvisibility=hidden'), '-fPIC', '-fopenmp', env_binding_src, '-o', static_obj ]