Describe the bug
When N_VEHICLES reaches ~157 on a V100 GPU (sharedMemPerBlockOptin = 98304 bytes),
routing.Solve raises an unhandled RuntimeError instead of solving the problem.
Steps/Code to reproduce bug
import numpy as np
import cudf
from cuopt import routing
import cupy as cp
def build_model(n):
    """Build the routing DataModel used to reproduce the failure.

    The model has ``n`` vehicles, ``2 * n`` locations and ``4 * n`` orders.
    Location coordinates are drawn from a fixed-seed generator (seed 42), so
    the same ``n`` always yields the same cost matrix.

    Order layout (by order index):
        [0, n)     -> located at locations 0..n-1, demand +1
        [n, 2n)    -> located at locations n..2n-1, demand -1
        [2n, 3n)   -> located at locations 0..n-1, demand 0
        [3n, 4n)   -> located at locations 0..n-1, demand 0
    Orders ``i`` and ``n + i`` are paired, as are ``2n + i`` and ``3n + i``;
    the latter two are additionally matched to vehicle ``i`` only.

    Args:
        n: Number of vehicles.

    Returns:
        The populated ``routing.DataModel``.
    """
    n_locs = 2 * n
    n_orders = 4 * n
    horizon = 1440

    # Random 2-D points in [0, 100) and their pairwise Euclidean distances.
    rng = np.random.default_rng(42)
    coords = rng.uniform(0, 100, (n_locs, 2)).astype(np.float32)
    diff = coords[:, None, :] - coords[None, :, :]
    squared = (diff ** 2).sum(-1)
    dist = np.sqrt(squared).astype(np.float32)
    np.fill_diagonal(dist, 0.0)

    dm = routing.DataModel(n_locs, n, n_orders)
    # The same matrix serves as both cost and transit time.
    dm.add_cost_matrix(cudf.DataFrame(dist))
    dm.add_transit_time_matrix(cudf.DataFrame(dist))

    pickup_locs = np.arange(0, n, dtype=np.int32)
    delivery_locs = np.arange(n, 2 * n, dtype=np.int32)
    order_locations = np.concatenate(
        [pickup_locs, delivery_locs, pickup_locs, pickup_locs]
    )
    dm.set_order_locations(cudf.Series(order_locations))

    # First argument: order indices [0, n) + [2n, 3n);
    # second argument: their partners [n, 2n) + [3n, 4n).
    pair_first = np.concatenate(
        [
            np.arange(0, n, dtype=np.int32),
            np.arange(2 * n, 3 * n, dtype=np.int32),
        ]
    )
    pair_second = np.concatenate(
        [
            np.arange(n, 2 * n, dtype=np.int32),
            np.arange(3 * n, 4 * n, dtype=np.int32),
        ]
    )
    dm.set_pickup_delivery_pairs(
        cudf.Series(pair_first), cudf.Series(pair_second)
    )

    # Every order shares the window [0, horizon] and has zero service time.
    window_open = np.zeros(n_orders, dtype=np.int32)
    window_close = np.full(n_orders, horizon, dtype=np.int32)
    dm.set_order_time_windows(
        cudf.Series(window_open), cudf.Series(window_close)
    )
    dm.set_order_service_times(
        cudf.Series(np.zeros(n_orders, dtype=np.int32))
    )

    # Demand per order: +1, -1, then 0 for the remaining 2n orders;
    # every vehicle has capacity 2.
    demand = np.concatenate(
        [np.ones(n), -np.ones(n), np.zeros(2 * n)]
    ).astype(np.int32)
    capacity = np.full(n, 2, dtype=np.int32)
    dm.add_capacity_dimension(
        "demand", cudf.Series(demand), cudf.Series(capacity)
    )

    # The same series of locations 0..n-1 is passed for both arguments.
    v_locs = cudf.Series(pickup_locs.copy())
    dm.set_vehicle_locations(v_locs, v_locs)
    dm.set_vehicle_time_windows(
        cudf.Series(np.zeros(n, dtype=np.int32)),
        cudf.Series(np.full(n, horizon, dtype=np.int32)),
    )
    dm.set_drop_return_trips(cudf.Series(np.ones(n, dtype=bool)))
    dm.set_skip_first_trips(cudf.Series(np.ones(n, dtype=bool)))

    # Restrict orders 2n + i and 3n + i to vehicle i.
    for vehicle in range(n):
        for base in (2 * n, 3 * n):
            dm.add_order_vehicle_match(
                base + vehicle, cudf.Series([vehicle], dtype=np.int32)
            )

    return dm
# Report the device name and its sharedMemPerBlockOptin value for device 0.
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"GPU: {props['name'].decode()}, sharedMemPerBlockOptin: {props['sharedMemPerBlockOptin']} B")

# Sweep the vehicle count across the range where the solver starts failing.
# Any exception is reported on the same line so the sweep keeps going.
for n in (140, 150, 155, 156, 157, 160, 170):
    print(f"n_vehicles={n:3d} ", end="", flush=True)
    try:
        model = build_model(n)
        settings = routing.SolverSettings()
        sol = routing.Solve(model, settings)
        print(f"OK (status={sol.get_status()})")
    except Exception as exc:
        print(f"FAIL {type(exc).__name__}: {exc}")
** Terminal output **
GPU : Tesla V100-PCIE-32GB
sharedMemPerBlockOptin: 98304 bytes
n_vehicles=140 OK (status=0)
n_vehicles=150 OK (status=0)
n_vehicles=155 OK (status=0)
n_vehicles=156 OK (status=0)
n_vehicles=157 FAIL RuntimeError: CUDA error encountered at: file=/__w/cuopt/cuopt/cpp/src/routing/local_search/sliding_window.cu line=1086: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
Obtained 25 stack frames
#1 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) +0x91 [0x7fa6851a5fc1]
#2 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::perform_sliding_window(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, cuopt::routing::detail::move_candidates_t<int, float>&) +0xf85 [0x7fa685758df5]
#3 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::run_best_local_search(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, bool, bool, bool) +0x625 [0x7fa6858dc805]
#4 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0>::improve(cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>&, std::array<double, 9ul>, float, bool) +0x124 [0x7fa6858358c4]
#5 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_initial(int, int) +0xe11 [0x7fa6858285d1]
#6 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_from_scratch() +0x3a [0x7fa68582ab4a]
#7 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::ges_solver_t<int, float, (cuopt::routing::request_t)0>::compute_ges_solution(std::__cxx11::basic_string<char, std::char_traits, std::allocator >) +0x160c [0x7fa6858301cc]
#8 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solver_t<int, float>::run_ges_solver<(cuopt::routing::request_t)0>(int) +0x71 [0x7fa685975c41]
#9 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solver_t<int, float>::solve() +0x77 [0x7fa685977f27]
#10 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solve<int, float>(cuopt::routing::data_model_view_t<int, float> const&, cuopt::routing::solver_settings_t<int, float> const&) +0x2f [0x7fa6859745af]
#11 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::cython::call_solve(cuopt::routing::data_model_view_t<int, float>, cuopt::routing::solver_settings_t<int, float>) +0x26 [0x7fa6859c14f6]
#12 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x67716) [0x7fa6744af716]
#13 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x6b96e) [0x7fa6744b396e]
#14 in python3: PyObject_Vectorcall +0x35 [0x549975]
#15 in python3: _PyEval_EvalFrameDefault +0xa89 [0x5d6f09]
#16 in python3: PyEval_EvalCode +0x15b [0x5d543b]
#17 in python3() [0x6085d2]
#18 in python3() [0x6b4d03]
#19 in python3: _PyRun_SimpleFileObject +0x1aa [0x6b4a6a]
#20 in python3: _PyRun_AnyFileObject +0x4f [0x6b489f]
#21 in python3: Py_RunMain +0x3b5 [0x6bc905]
#22 in python3: Py_BytesMain +0x2d [0x6bc3ed]
#23 in /usr/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca) [0x7fa819a5f1ca]
#24 in /usr/lib/x86_64-linux-gnu/libc.so.6: __libc_start_main +0x8b [0x7fa819a5f28b]
#25 in python3: _start +0x25 [0x6576c5]
n_vehicles=160 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal
n_vehicles=170 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal
Expected behavior
The solver completes successfully for all n_vehicles values.
Environment details (please complete the following information):
- Environment location: Bare-metal
- Method of cuOpt install: pip (nvidia-cuopt-cu12==26.02.00)
- GPU: NVIDIA Tesla V100, compute capability 7.0,
sharedMemPerBlockOptin = 98304 bytes
- CUDA: 12.9, Driver: 580.126.20
- OS: Ubuntu 24.04, Python 3.12
Additional context
In my use case (dynamic capacitated PDPTW with order incompatibilities and more), I may have a large number of couriers (up to 8K per problem). It would be great if the current cuOpt limitations were highlighted.
Describe the bug
When `N_VEHICLES` reaches ~157 on a V100 GPU (`sharedMemPerBlockOptin` = 98304 bytes), `routing.Solve` raises an unhandled `RuntimeError` instead of solving the problem.
Steps/Code to reproduce bug
** Terminal output **
GPU : Tesla V100-PCIE-32GB
sharedMemPerBlockOptin: 98304 bytes
n_vehicles=140 OK (status=0)
n_vehicles=150 OK (status=0)
n_vehicles=155 OK (status=0)
n_vehicles=156 OK (status=0)
n_vehicles=157 FAIL RuntimeError: CUDA error encountered at: file=/__w/cuopt/cuopt/cpp/src/routing/local_search/sliding_window.cu line=1086: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
Obtained 25 stack frames
#1 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) +0x91 [0x7fa6851a5fc1]
#2 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::perform_sliding_window(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, cuopt::routing::detail::move_candidates_t<int, float>&) +0xf85 [0x7fa685758df5]
#3 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::run_best_local_search(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, bool, bool, bool) +0x625 [0x7fa6858dc805]
#4 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0>::improve(cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>&, std::array<double, 9ul>, float, bool) +0x124 [0x7fa6858358c4]
#5 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_initial(int, int) +0xe11 [0x7fa6858285d1]
#6 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_from_scratch() +0x3a [0x7fa68582ab4a]
#7 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::ges_solver_t<int, float, (cuopt::routing::request_t)0>::compute_ges_solution(std::__cxx11::basic_string<char, std::char_traits, std::allocator >) +0x160c [0x7fa6858301cc]
#8 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solver_t<int, float>::run_ges_solver<(cuopt::routing::request_t)0>(int) +0x71 [0x7fa685975c41]
#9 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solver_t<int, float>::solve() +0x77 [0x7fa685977f27]
#10 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solve<int, float>(cuopt::routing::data_model_view_t<int, float> const&, cuopt::routing::solver_settings_t<int, float> const&) +0x2f [0x7fa6859745af]
#11 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::cython::call_solve(cuopt::routing::data_model_view_t<int, float>, cuopt::routing::solver_settings_t<int, float>) +0x26 [0x7fa6859c14f6]
#12 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x67716) [0x7fa6744af716]
#13 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x6b96e) [0x7fa6744b396e]
#14 in python3: PyObject_Vectorcall +0x35 [0x549975]
#15 in python3: _PyEval_EvalFrameDefault +0xa89 [0x5d6f09]
#16 in python3: PyEval_EvalCode +0x15b [0x5d543b]
#17 in python3() [0x6085d2]
#18 in python3() [0x6b4d03]
#19 in python3: _PyRun_SimpleFileObject +0x1aa [0x6b4a6a]
#20 in python3: _PyRun_AnyFileObject +0x4f [0x6b489f]
#21 in python3: Py_RunMain +0x3b5 [0x6bc905]
#22 in python3: Py_BytesMain +0x2d [0x6bc3ed]
#23 in /usr/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca) [0x7fa819a5f1ca]
#24 in /usr/lib/x86_64-linux-gnu/libc.so.6: __libc_start_main +0x8b [0x7fa819a5f28b]
#25 in python3: _start +0x25 [0x6576c5]
n_vehicles=160 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal
n_vehicles=170 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal
Expected behavior
The solver completes successfully for all `n_vehicles` values.
Environment details (please complete the following information):
- Method of cuOpt install: pip (nvidia-cuopt-cu12==26.02.00)
- `sharedMemPerBlockOptin` = 98304 bytes
Additional context
In my use case (dynamic capacitated PDPTW with order incompatibilities and more), I may have a large number of couriers (up to 8K per problem). It would be great if the current cuOpt limitations were highlighted.