Skip to content

[BUG] Solver crashes with cudaErrorInvalidValue when N_VEHICLES >= 157 on V100 #1094

@np96

Description

@np96

Describe the bug

When the number of vehicles reaches 157 or more on a V100 GPU (sharedMemPerBlockOptin = 98304 bytes),
routing.Solve raises an unhandled RuntimeError (cudaErrorInvalidValue at 157; cudaErrorInvalidDevice at 160 and 170) instead of solving the problem.

Steps/Code to reproduce bug

import numpy as np
import cudf
from cuopt import routing
import cupy as cp

def build_model(n):
    """Build the routing.DataModel used to reproduce the crash for ``n`` vehicles.

    The model has ``2 * n`` locations and ``4 * n`` orders:

    * Locations are random 2-D points (fixed seed 42, uniform in [0, 100));
      the Euclidean distance matrix is used as both the cost matrix and the
      transit-time matrix.
    * Orders ``0..n-1`` sit at locations ``0..n-1``, orders ``n..2n-1`` at
      locations ``n..2n-1``, and orders ``2n..4n-1`` again at locations
      ``0..n-1`` (twice).
    * Pickup/delivery pairs are ``(i, n + i)`` and ``(2n + i, 3n + i)``.
    * The first pair set carries demand +1 / -1; the second carries 0.
      Every vehicle has capacity 2.
    * Vehicle ``i`` starts and ends at location ``i``; orders ``2n + i`` and
      ``3n + i`` are each matched to vehicle ``i`` only.

    Args:
        n: Number of vehicles.

    Returns:
        The populated ``routing.DataModel``.
    """
    location_count = 2 * n
    order_count = 4 * n
    horizon = 1440

    # Random coordinates -> dense pairwise Euclidean distance matrix (float32).
    generator = np.random.default_rng(42)
    points = generator.uniform(0, 100, (location_count, 2)).astype(np.float32)
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    squared_lengths = (deltas ** 2).sum(-1)
    distances = np.sqrt(squared_lengths).astype(np.float32)
    np.fill_diagonal(distances, 0.0)

    model = routing.DataModel(location_count, n, order_count)
    model.add_cost_matrix(cudf.DataFrame(distances))
    model.add_transit_time_matrix(cudf.DataFrame(distances))

    pickup_sites = np.arange(n, dtype=np.int32)
    delivery_sites = np.arange(n, 2 * n, dtype=np.int32)

    # Order -> location mapping, in four consecutive groups of n orders.
    order_sites = np.concatenate(
        [pickup_sites, delivery_sites, pickup_sites, pickup_sites]
    )
    model.set_order_locations(cudf.Series(order_sites))

    # Pair order i with n + i, and order 2n + i with 3n + i.
    pair_pickups = np.concatenate([
        np.arange(0, n, dtype=np.int32),
        np.arange(2 * n, 3 * n, dtype=np.int32),
    ])
    pair_deliveries = np.concatenate([
        np.arange(n, 2 * n, dtype=np.int32),
        np.arange(3 * n, 4 * n, dtype=np.int32),
    ])
    model.set_pickup_delivery_pairs(
        cudf.Series(pair_pickups), cudf.Series(pair_deliveries)
    )

    # Every order may be served any time within [0, horizon], with no service time.
    order_earliest = np.zeros(order_count, dtype=np.int32)
    order_latest = np.full(order_count, horizon, dtype=np.int32)
    model.set_order_time_windows(
        cudf.Series(order_earliest), cudf.Series(order_latest)
    )
    model.set_order_service_times(
        cudf.Series(np.zeros(order_count, dtype=np.int32))
    )

    # Demand: +1 for orders 0..n-1, -1 for orders n..2n-1, 0 for the rest.
    demand = np.zeros(order_count, dtype=np.int32)
    demand[:n] = 1
    demand[n:2 * n] = -1
    capacities = np.full(n, 2, dtype=np.int32)
    model.add_capacity_dimension(
        "demand", cudf.Series(demand), cudf.Series(capacities)
    )

    # The same Series object is deliberately used for both start and end locations.
    vehicle_sites = cudf.Series(pickup_sites.copy())
    model.set_vehicle_locations(vehicle_sites, vehicle_sites)
    model.set_vehicle_time_windows(
        cudf.Series(np.zeros(n, dtype=np.int32)),
        cudf.Series(np.full(n, horizon, dtype=np.int32)),
    )
    model.set_drop_return_trips(cudf.Series(np.ones(n, dtype=bool)))
    model.set_skip_first_trips(cudf.Series(np.ones(n, dtype=bool)))

    # Match orders 2n + i and 3n + i (in that order) to vehicle i.
    for vehicle_id in range(n):
        for first_order in (2 * n, 3 * n):
            model.add_order_vehicle_match(
                first_order + vehicle_id,
                cudf.Series([vehicle_id], dtype=np.int32),
            )
    return model

# Report the GPU in use and its sharedMemPerBlockOptin device property.
device_props = cp.cuda.runtime.getDeviceProperties(0)
gpu_name = device_props['name'].decode()
smem_optin = device_props['sharedMemPerBlockOptin']
print(f"GPU: {gpu_name}, sharedMemPerBlockOptin: {smem_optin} B")

# Solve one model per fleet size and print OK/FAIL for each.
fleet_sizes = (140, 150, 155, 156, 157, 160, 170)
for fleet_size in fleet_sizes:
    print(f"n_vehicles={fleet_size:3d}  ", end="", flush=True)
    try:
        solution = routing.Solve(build_model(fleet_size), routing.SolverSettings())
        status = solution.get_status()
        print(f"OK   (status={status})")
    except Exception as err:
        # Broad catch is intentional: the sweep must continue past a failing size.
        print(f"FAIL {type(err).__name__}: {err}")

**Terminal output**

GPU : Tesla V100-PCIE-32GB
sharedMemPerBlockOptin: 98304 bytes

n_vehicles=140 OK (status=0)
n_vehicles=150 OK (status=0)
n_vehicles=155 OK (status=0)
n_vehicles=156 OK (status=0)
n_vehicles=157 FAIL RuntimeError: CUDA error encountered at: file=/__w/cuopt/cuopt/cpp/src/routing/local_search/sliding_window.cu line=1086: call='cudaPeekAtLastError()', Reason=cudaErrorInvalidValue:invalid argument
Obtained 25 stack frames
#1 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: raft::cuda_error::cuda_error(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) +0x91 [0x7fa6851a5fc1]
#2 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::perform_sliding_window(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, cuopt::routing::detail::move_candidates_t<int, float>&) +0xf85 [0x7fa685758df5]
#3 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::local_search_t<int, float, (cuopt::routing::request_t)0>::run_best_local_search(cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>&, bool, bool, bool) +0x625 [0x7fa6858dc805]
#4 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0>::improve(cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>&, std::array<double, 9ul>, float, bool) +0x124 [0x7fa6858358c4]
#5 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_initial(int, int) +0xe11 [0x7fa6858285d1]
#6 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solve<cuopt::routing::detail::pool_allocator_t<int, float, cuopt::routing::detail::solution_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float> >, cuopt::routing::detail::adapted_sol_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::problem_t<int, float>, cuopt::routing::detail::adapted_generator_t<int, float, (cuopt::routing::request_t)0>, cuopt::routing::detail::adapted_modifier_t<int, float, (cuopt::routing::request_t)0> >::generate_from_scratch() +0x3a [0x7fa68582ab4a]
#7 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::ges_solver_t<int, float, (cuopt::routing::request_t)0>::compute_ges_solution(std::__cxx11::basic_string<char, std::char_traits, std::allocator >) +0x160c [0x7fa6858301cc]
#8 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solver_t<int, float>::run_ges_solver<(cuopt::routing::request_t)0>(int) +0x71 [0x7fa685975c41]
#9 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::solver_t<int, float>::solve() +0x77 [0x7fa685977f27]
#10 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::routing::assignment_t cuopt::routing::solve<int, float>(cuopt::routing::data_model_view_t<int, float> const&, cuopt::routing::solver_settings_t<int, float> const&) +0x2f [0x7fa6859745af]
#11 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/libcuopt/lib64/libcuopt.so: cuopt::cython::call_solve(cuopt::routing::data_model_view_t<int, float>, cuopt::routing::solver_settings_t<int, float>) +0x26 [0x7fa6859c14f6]
#12 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x67716) [0x7fa6744af716]
#13 in /home/np96/cuopt-test/venv/lib/python3.12/site-packages/cuopt/routing/vehicle_routing_wrapper.cpython-312-x86_64-linux-gnu.so(+0x6b96e) [0x7fa6744b396e]
#14 in python3: PyObject_Vectorcall +0x35 [0x549975]
#15 in python3: _PyEval_EvalFrameDefault +0xa89 [0x5d6f09]
#16 in python3: PyEval_EvalCode +0x15b [0x5d543b]
#17 in python3() [0x6085d2]
#18 in python3() [0x6b4d03]
#19 in python3: _PyRun_SimpleFileObject +0x1aa [0x6b4a6a]
#20 in python3: _PyRun_AnyFileObject +0x4f [0x6b489f]
#21 in python3: Py_RunMain +0x3b5 [0x6bc905]
#22 in python3: Py_BytesMain +0x2d [0x6bc3ed]
#23 in /usr/lib/x86_64-linux-gnu/libc.so.6(+0x2a1ca) [0x7fa819a5f1ca]
#24 in /usr/lib/x86_64-linux-gnu/libc.so.6: __libc_start_main +0x8b [0x7fa819a5f28b]
#25 in python3: _start +0x25 [0x6576c5]

n_vehicles=160 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal
n_vehicles=170 FAIL RuntimeError: fill_n: failed inside CUB: cudaErrorInvalidDevice: invalid device ordinal

Expected behavior
The solver completes successfully for all n_vehicles values.

Environment details (please complete the following information):

  • Environment location: Bare-metal
  • Method of cuOpt install: pip (nvidia-cuopt-cu12==26.02.00)
  • GPU: NVIDIA Tesla V100, compute capability 7.0, sharedMemPerBlockOptin = 98304 bytes
  • CUDA: 12.9, Driver: 580.126.20
  • OS: Ubuntu 24.04, Python 3.12

Additional context

In my use case (dynamic capacitated PDPTW with order incompatibilities and more), I may have a large number of couriers (up to 8K per problem). It would be great if the current cuOpt limitations were documented.

Metadata

Metadata

Assignees

Labels

awaiting response: This expects a response from maintainer or contributor depending on who requested in last comment.
bug: Something isn't working

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions