From d65be83090e3d4fc1f26c34eaa106e9779eda227 Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Fri, 24 Apr 2026 14:01:53 +0000 Subject: [PATCH 01/47] main baseline test --- .../linear_programming/cuopt/run_mip.cpp | 86 ++++++++++++------- 1 file changed, 57 insertions(+), 29 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index e01e533a65..ceac154162 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -23,8 +23,6 @@ #include #include -#include - #include #include #include @@ -85,7 +83,7 @@ void write_to_output_file(const std::string& out_dir, } } -inline auto make_async() { return std::make_shared(); } +inline auto make_async() { return rmm::mr::cuda_async_memory_resource(); } void read_single_solution_from_path(const std::string& path, const std::vector& var_names, @@ -274,7 +272,7 @@ void run_single_file_mp(std::string file_path, { std::cout << "running file " << file_path << " on gpu : " << device << std::endl; auto memory_resource = make_async(); - rmm::mr::set_current_device_resource(memory_resource.get()); + rmm::mr::set_current_device_resource(memory_resource); int sol_found = run_single_file(file_path, device, batch_id, @@ -294,6 +292,36 @@ void run_single_file_mp(std::string file_path, exit(sol_found); } +void bind_process_to_cpu_partition(int gpu_id, int n_gpus) +{ + cpu_set_t parent_mask; + CPU_ZERO(&parent_mask); + + if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { + perror("sched_getaffinity"); + return; + } + + std::vector visible_cpus; + for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { + if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } + } + + int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); + int start = gpu_id * cpus_per_gpu; + int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); + + cpu_set_t child_mask; + 
CPU_ZERO(&child_mask); + + for (int i = start; i < end; ++i) { + CPU_SET(visible_cpus[i], &child_mask); + std::cout << "Binding process to CPU " << visible_cpus[i] << std::endl; + } + + if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { perror("sched_setaffinity"); } +} + void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, std::unordered_map& pid_file_map, std::queue& gpu_queue) @@ -420,12 +448,6 @@ int main(int argc, char* argv[]) if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; - // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); - // if (smt_file.is_open()) { - // int smt_active = 0; - // smt_file >> smt_active; - // if (smt_active) { num_cpu_threads /= 2; } - // } num_cpu_threads = std::max(num_cpu_threads, 1); } @@ -503,6 +525,7 @@ int main(int argc, char* argv[]) } if (sys_pid == 0) { RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); + bind_process_to_cpu_partition(gpu_id, n_gpus); run_single_file_mp(file_name, gpu_id, batch_num, @@ -535,31 +558,36 @@ int main(int argc, char* argv[]) merge_result_files(out_dir, result_file, n_gpus, batch_num); } else { auto memory_resource = make_async(); + auto run_single = [&]() { + run_single_file(path, + 0, + 0, + n_gpus, + out_dir, + initial_solution_file, + heuristics_only, + num_cpu_threads, + write_log_file, + log_to_console, + reliability_branching, + time_limit, + work_limit, + deterministic); + }; if (memory_limit > 0) { auto limiting_adaptor = - rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL); - rmm::mr::set_current_device_resource(&limiting_adaptor); + rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); + rmm::mr::set_current_device_resource(limiting_adaptor); + run_single(); } else if (track_allocations) { - rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(), + rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, /*capture_stacks=*/true); - 
rmm::mr::set_current_device_resource(&tracking_adaptor); + rmm::mr::set_current_device_resource(tracking_adaptor); + run_single(); } else { - rmm::mr::set_current_device_resource(memory_resource.get()); + rmm::mr::set_current_device_resource(memory_resource); + run_single(); } - run_single_file(path, - 0, - 0, - n_gpus, - out_dir, - initial_solution_file, - heuristics_only, - num_cpu_threads, - write_log_file, - log_to_console, - reliability_branching, - time_limit, - work_limit, - deterministic); } return 0; From eb5d5865fa8114fc24f5e50f8d23427f4bea62ce Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Fri, 24 Apr 2026 14:20:28 +0000 Subject: [PATCH 02/47] fix thread count --- benchmarks/linear_programming/cuopt/run_mip.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index ceac154162..18efd83dbe 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -292,14 +292,14 @@ void run_single_file_mp(std::string file_path, exit(sol_found); } -void bind_process_to_cpu_partition(int gpu_id, int n_gpus) +int bind_process_to_cpu_partition(int gpu_id, int n_gpus) { cpu_set_t parent_mask; CPU_ZERO(&parent_mask); if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { perror("sched_getaffinity"); - return; + return -1; } std::vector visible_cpus; @@ -320,6 +320,7 @@ void bind_process_to_cpu_partition(int gpu_id, int n_gpus) } if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { perror("sched_setaffinity"); } + return cpus_per_gpu; } void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, @@ -446,11 +447,6 @@ int main(int argc, char* argv[]) int reliability_branching = program.get("--reliability-branching"); bool deterministic = program.get("--determinism"); - if (num_cpu_threads < 0) { - num_cpu_threads = omp_get_max_threads() / n_gpus; - num_cpu_threads = 
std::max(num_cpu_threads, 1); - } - if (program.is_used("--out-dir")) { out_dir = program.get("--out-dir"); result_file = out_dir + "/final_result.csv"; @@ -525,7 +521,9 @@ int main(int argc, char* argv[]) } if (sys_pid == 0) { RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); - bind_process_to_cpu_partition(gpu_id, n_gpus); + int assigned_cpus = bind_process_to_cpu_partition(gpu_id, n_gpus); + omp_set_num_threads(assigned_cpus); + num_cpu_threads = assigned_cpus; run_single_file_mp(file_name, gpu_id, batch_num, From c03842b1bfc3e9317de4e155c6085b7487462035 Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Mon, 4 May 2026 16:22:16 +0000 Subject: [PATCH 03/47] initial version of odd-cycle cuts --- .../mip/solver_settings.hpp | 1 + cpp/src/branch_and_bound/branch_and_bound.cpp | 2 +- cpp/src/cuts/cuts.cpp | 639 ++++++++++++++++++ cpp/src/cuts/cuts.hpp | 96 ++- .../dual_simplex/simplex_solver_settings.hpp | 2 + cpp/src/mip_heuristics/diversity/lns/rins.cu | 1 + .../diversity/recombiners/sub_mip.cuh | 1 + .../local_search/local_search.cu | 7 +- cpp/src/mip_heuristics/solver.cu | 21 +- cpp/tests/mip/cuts_test.cu | 234 +++++++ 10 files changed, 997 insertions(+), 7 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index ae0187e454..f1f5b79f8b 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -105,6 +105,7 @@ class mip_solver_settings_t { i_t mixed_integer_gomory_cuts = -1; i_t knapsack_cuts = -1; i_t clique_cuts = -1; + i_t zero_half_cuts = -1; i_t implied_bound_cuts = -1; i_t strong_chvatal_gomory_cuts = -1; i_t reduced_cost_strengthening = -1; diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index e69ff7b9a5..aef637d881 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ 
-2055,7 +2055,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_relax_soln_.resize(original_lp_.num_rows, original_lp_.num_cols); - if (settings_.clique_cuts != 0 && clique_table_ == nullptr) { + if ((settings_.clique_cuts != 0 || settings_.zero_half_cuts != 0) && clique_table_ == nullptr) { signal_extend_cliques_.store(false, std::memory_order_release); typename ::cuopt::linear_programming::mip_solver_settings_t::tolerances_t tolerances_for_clique{}; diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 6d7d97ef0a..38532ceb27 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -443,6 +444,365 @@ void extend_clique_vertices(std::vector& clique_vertices, static_cast(clique_vertices.size() - initial_clique_vertices)); } +// Build a zero-half (odd-cycle / odd-wheel) cut from a cycle and optional wheel +// centers. cycle_vertices is a simple odd cycle in the conflict graph using the +// 2*num_vars vertex indexing (var j and complement j+num_vars). wheel_centers +// are extra vertices each adjacent to every vertex in cycle_vertices. The +// resulting cut is stored in the form a^T x >= rhs to match cut_pool_t. 
+template +clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vertices, + const std::vector& wheel_centers, + i_t num_vars, + const std::vector& var_types, + const std::vector& lower_bounds, + const std::vector& upper_bounds, + const std::vector& xstar, + f_t bound_tol, + f_t min_violation, + sparse_vector_t& cut, + f_t& cut_rhs, + f_t* work_estimate, + f_t max_work_estimate) +{ + const size_t cycle_size = cycle_vertices.size(); + if (cycle_size < 5 || (cycle_size % 2) == 0) { return clique_cut_build_status_t::NO_CUT; } + cuopt_assert(num_vars > 0, "Zero-half cut num_vars must be positive"); + cuopt_assert(static_cast(num_vars) <= lower_bounds.size(), + "Zero-half cut lower bounds size mismatch"); + cuopt_assert(static_cast(num_vars) <= xstar.size(), "Zero-half cut xstar size mismatch"); + + const i_t m = static_cast((cycle_size - 1) / 2); + const f_t f_m = static_cast(m); + const f_t total_size = static_cast(cycle_size + wheel_centers.size()); + const f_t estimated_work = 8.0 * total_size + 2.0 * total_size * std::log2(total_size + 1.0); + if (add_work_estimate(estimated_work, work_estimate, max_work_estimate)) { + return clique_cut_build_status_t::NO_CUT; + } + + cut.i.clear(); + cut.x.clear(); + + std::unordered_map coeff_by_var; + std::unordered_set seen_original; + std::unordered_set seen_complement; + coeff_by_var.reserve(cycle_size + wheel_centers.size()); + seen_original.reserve(cycle_size + wheel_centers.size()); + seen_complement.reserve(cycle_size + wheel_centers.size()); + + f_t rhs_acc = -f_m; + + auto accumulate = + [&](const std::vector& verts, f_t weight, bool is_cycle) -> clique_cut_build_status_t { + for (const auto vertex_idx : verts) { + cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Zero-half vertex out of range"); + const i_t var_idx = vertex_idx % num_vars; + const bool complement = vertex_idx >= num_vars; + const f_t lower_bound = lower_bounds[var_idx]; + const f_t upper_bound = upper_bounds[var_idx]; + 
cuopt_assert(var_types[var_idx] != variable_type_t::CONTINUOUS, + "Zero-half cut contains continuous variable"); + cuopt_assert(lower_bound >= -bound_tol, "Zero-half variable lower bound below zero"); + cuopt_assert(upper_bound <= 1 + bound_tol, "Zero-half variable upper bound above one"); + + // is_cycle is currently informational only; both cycle and wheel paths + // share the same accumulation logic + (void)is_cycle; + + if (complement) { + if (seen_original.count(var_idx) > 0) { return clique_cut_build_status_t::NO_CUT; } + seen_complement.insert(var_idx); + coeff_by_var[var_idx] += weight; + rhs_acc += weight; + } else { + if (seen_complement.count(var_idx) > 0) { return clique_cut_build_status_t::NO_CUT; } + seen_original.insert(var_idx); + coeff_by_var[var_idx] -= weight; + } + } + return clique_cut_build_status_t::CUT_ADDED; + }; + + if (accumulate(cycle_vertices, static_cast(1), true) != + clique_cut_build_status_t::CUT_ADDED) { + return clique_cut_build_status_t::NO_CUT; + } + if (m > 0 && !wheel_centers.empty()) { + if (accumulate(wheel_centers, f_m, false) != clique_cut_build_status_t::CUT_ADDED) { + return clique_cut_build_status_t::NO_CUT; + } + } + + const f_t coeff_zero_tol = static_cast(1e-12); + cut.i.reserve(coeff_by_var.size()); + cut.x.reserve(coeff_by_var.size()); + for (const auto& kv : coeff_by_var) { + if (std::abs(kv.second) <= coeff_zero_tol) { continue; } + cut.i.push_back(kv.first); + cut.x.push_back(kv.second); + } + + if (cut.i.empty()) { + CUOPT_LOG_DEBUG("[zero_half] build_zero_half_cut empty support after accumulation"); + return clique_cut_build_status_t::NO_CUT; + } + + cut_rhs = rhs_acc; + cut.sort(); + + const f_t dot = cut.dot(xstar); + const f_t violation = cut_rhs - dot; + CUOPT_LOG_DEBUG( + "[zero_half] build_zero_half_cut nz=%lld rhs=%g dot=%g violation=%g threshold=%g cycle=%lld " + "wheel=%lld", + static_cast(cut.i.size()), + static_cast(cut_rhs), + static_cast(dot), + static_cast(violation), + 
static_cast(min_violation), + static_cast(cycle_size), + static_cast(wheel_centers.size())); + if (violation > min_violation) { return clique_cut_build_status_t::CUT_ADDED; } + return clique_cut_build_status_t::NO_CUT; +} + +// Run Dijkstra over the bipartite auxiliary graph G' built from the fractional +// sub-CG. local_adj is the adjacency in CG (local indices). weights[v] is the +// LP value of vertex v in CG. The auxiliary graph has 2 * num_local vertices, +// with bipartite_idx = local_idx + part * num_local, part in {0, 1}. +// Edge weight in G' is max(0, (1 - weights[u] - weights[v]) / 2). We seek the +// shortest path from `source_local + 0 * num_local` to `source_local + num_local`. +// On success, returns true and fills `path` with the path (sequence of bipartite +// indices) and `total_weight` with its cost. Otherwise returns false. +template +bool dijkstra_odd_cycle(i_t source_local, + const std::vector>& local_adj, + const std::vector& weights, + f_t cutoff, + std::vector& path, + f_t& total_weight, + f_t* work_estimate, + f_t max_work_estimate) +{ + const i_t num_local = static_cast(local_adj.size()); + cuopt_assert(source_local >= 0 && source_local < num_local, + "Zero-half Dijkstra source out of range"); + cuopt_assert(weights.size() == static_cast(num_local), + "Zero-half Dijkstra weights size mismatch"); + + const i_t source_idx = source_local; + const i_t target_idx = source_local + num_local; + const i_t total_idx = 2 * num_local; + const f_t f_inf = std::numeric_limits::infinity(); + + std::vector dist(static_cast(total_idx), f_inf); + std::vector prev(static_cast(total_idx), -1); + dist[source_idx] = 0; + + using node_t = std::pair; + std::priority_queue, std::greater> pq; + pq.emplace(static_cast(0), source_idx); + + while (!pq.empty()) { + auto [d, u] = pq.top(); + pq.pop(); + if (d > dist[u]) { continue; } + if (u == target_idx) { break; } + if (cutoff > 0 && d >= cutoff) { break; } + + const i_t u_local = u % num_local; + const i_t u_part 
= u / num_local; + const i_t v_part = 1 - u_part; + cuopt_assert(u_part == 0 || u_part == 1, "Bipartite part out of range"); + + const auto& neigh = local_adj[u_local]; + if (add_work_estimate(static_cast(neigh.size()) + 4.0, work_estimate, max_work_estimate)) { + return false; + } + for (const auto v_local : neigh) { + cuopt_assert(v_local >= 0 && v_local < num_local, "Zero-half Dijkstra neighbor out of range"); + f_t edge_w = (static_cast(1) - weights[u_local] - weights[v_local]) / 2; + if (edge_w < 0) { edge_w = 0; } + const i_t v = v_local + v_part * num_local; + const f_t nd = d + edge_w; + if (nd < dist[v]) { + dist[v] = nd; + prev[v] = u; + pq.emplace(nd, v); + } + } + } + + if (!std::isfinite(dist[target_idx])) { return false; } + total_weight = dist[target_idx]; + if (cutoff > 0 && total_weight >= cutoff) { return false; } + + path.clear(); + for (i_t cur = target_idx; cur != -1; cur = prev[cur]) { + path.push_back(cur); + if (cur == source_idx) { break; } + } + cuopt_assert(!path.empty(), "Zero-half Dijkstra path empty"); + cuopt_assert(path.back() == source_idx, "Zero-half Dijkstra path missing source"); + std::reverse(path.begin(), path.end()); + // bipartite path from j1 to j2 must have odd number of edges + cuopt_assert((path.size() % 2) == 0, "Zero-half bipartite path must have even node count"); + return true; +} + +// Convert a bipartite-graph path (sequence of bipartite indices) into a simple +// odd cycle expressed as global CG vertex indices in [0, 2*num_vars). Returns +// true and fills `cycle_vertices` if a simple cycle of odd length >= 5 (so > +// triangle) was successfully extracted. 
+template +bool path_to_odd_cycle(const std::vector& bipartite_path, + const std::vector& vertices, + i_t num_local, + i_t num_vars, + std::vector& cycle_vertices, + f_t* work_estimate, + f_t max_work_estimate) +{ + cycle_vertices.clear(); + if (bipartite_path.size() < 4) { return false; } + if (add_work_estimate( + static_cast(bipartite_path.size()) * 2.0, work_estimate, max_work_estimate)) { + return false; + } + + std::vector local_seq; + local_seq.reserve(bipartite_path.size()); + for (const auto bv : bipartite_path) { + local_seq.push_back(bv % num_local); + } + // First and last entry should both correspond to the source CG vertex + cuopt_assert(local_seq.front() == local_seq.back(), "Zero-half cycle path endpoints must match"); + + // Drop the duplicate end so we have a sequence covering each cycle vertex once + local_seq.pop_back(); + if ((local_seq.size() % 2) == 0 || local_seq.size() < 5) { return false; } + + std::unordered_set seen_local; + seen_local.reserve(local_seq.size()); + for (const auto lv : local_seq) { + if (!seen_local.insert(lv).second) { + // Same CG vertex appears twice in the path; reject (degenerate cycle) + return false; + } + } + + cycle_vertices.reserve(local_seq.size()); + std::unordered_set seen_var; + seen_var.reserve(local_seq.size()); + for (const auto lv : local_seq) { + cuopt_assert(lv >= 0 && lv < num_local, "Zero-half local idx out of range"); + const i_t global = vertices[lv]; + cuopt_assert(global >= 0 && global < 2 * num_vars, "Zero-half global vertex out of range"); + const i_t var_idx = global % num_vars; + if (!seen_var.insert(var_idx).second) { + // Variable appears as both x and ¯x in the cycle; reject (degenerate) + return false; + } + cycle_vertices.push_back(global); + } + return cycle_vertices.size() >= 5; +} + +// Greedy lifting: extend an odd cycle by attaching a clique of "wheel center" +// vertices that are adjacent (in CG) to every vertex of the cycle. 
Mirrors the +// behavior of extend_clique_vertices but uses the cycle as the seed. +template +void extend_to_odd_wheel(const std::vector& cycle_vertices, + std::vector& wheel_centers, + detail::clique_table_t& graph, + const std::vector& reduced_costs, + i_t num_vars, + f_t start_time, + f_t time_limit, + f_t* work_estimate, + f_t max_work_estimate) +{ + wheel_centers.clear(); + if (cycle_vertices.empty()) { return; } + if (toc(start_time) >= time_limit) { return; } + + i_t smallest_degree = std::numeric_limits::max(); + i_t smallest_degree_var = -1; + for (auto v : cycle_vertices) { + if (toc(start_time) >= time_limit) { return; } + i_t degree = graph.get_degree_of_var(v); + if (degree < smallest_degree) { + smallest_degree = degree; + smallest_degree_var = v; + } + } + if (smallest_degree_var < 0) { return; } + + auto adj_set = graph.get_adj_set_of_var(smallest_degree_var); + std::unordered_set cycle_members(cycle_vertices.begin(), cycle_vertices.end()); + std::vector candidates; + candidates.reserve(adj_set.size()); + for (const auto candidate : adj_set) { + if (toc(start_time) >= time_limit) { return; } + if (cycle_members.count(candidate) != 0) { continue; } + bool adj_to_all = true; + for (const auto v : cycle_vertices) { + if (candidate == v) { + adj_to_all = false; + break; + } + if (!graph.check_adjacency(candidate, v)) { + adj_to_all = false; + break; + } + } + if (adj_to_all) { candidates.push_back(candidate); } + } + if (candidates.empty()) { return; } + + const f_t candidate_size = static_cast(candidates.size()); + const f_t cycle_size_f = static_cast(cycle_vertices.size()); + const f_t adj_set_cost = 2.0 * static_cast(adj_set.size()); + const f_t sort_cost = + candidate_size > 0.0 ? 
2.0 * candidate_size * std::log2(candidate_size + 1.0) : 0.0; + if (add_work_estimate(adj_set_cost + cycle_size_f * candidate_size + sort_cost, + work_estimate, + max_work_estimate)) { + return; + } + + auto reduced_cost = [&](i_t vertex_idx) -> f_t { + i_t var_idx = vertex_idx % num_vars; + cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), + "Reduced cost index out of range"); + f_t rc = reduced_costs[var_idx]; + if (!std::isfinite(rc)) { rc = 0.0; } + return vertex_idx >= num_vars ? -rc : rc; + }; + + std::sort(candidates.begin(), candidates.end(), [&](i_t a, i_t b) { + return reduced_cost(a) < reduced_cost(b); + }); + + const f_t adj_check_cost = 5.0; + for (const auto candidate : candidates) { + if (toc(start_time) >= time_limit) { return; } + bool adj_to_wheel = true; + i_t checks = 0; + for (const auto w : wheel_centers) { + checks++; + if (!graph.check_adjacency(candidate, w)) { + adj_to_wheel = false; + break; + } + } + if (add_work_estimate( + adj_check_cost * static_cast(checks), work_estimate, max_work_estimate)) { + break; + } + if (adj_to_wheel) { wheel_centers.push_back(candidate); } + } +} + } // namespace template @@ -507,6 +867,81 @@ std::vector> find_maximal_cliques_for_test( return ctx.cliques; } +// This function is only used in tests +std::vector> find_violated_odd_cycles_for_test( + const std::vector>& adjacency_list, + const std::vector& x_values, + double min_violation, + double time_limit) +{ + const size_t n_vertices = adjacency_list.size(); + if (n_vertices == 0) { return {}; } + cuopt_assert(x_values.size() == n_vertices, "x_values size mismatch in odd-cycle test helper"); + + const int num_local = static_cast(n_vertices); + std::vector> adj_local(n_vertices); + for (size_t v = 0; v < n_vertices; ++v) { + adj_local[v].reserve(adjacency_list[v].size()); + for (const auto nbr : adjacency_list[v]) { + cuopt_assert(nbr >= 0 && static_cast(nbr) < n_vertices, + "Neighbor index out of range in odd-cycle test helper"); + 
adj_local[v].push_back(nbr); + } + } + + double work_estimate = 0.0; + const double max_work_estimate = std::numeric_limits::infinity(); + const double start_time = tic(); + const double cutoff = 0.5 - min_violation; + + std::vector> result; + std::vector bipartite_path; + std::vector cycle_local; + std::vector already_used(n_vertices, 0); + + for (int s = 0; s < num_local; ++s) { + if (toc(start_time) >= time_limit) { break; } + if (already_used[s]) { continue; } + + double total_weight = 0; + if (!dijkstra_odd_cycle(s, + adj_local, + x_values, + cutoff, + bipartite_path, + total_weight, + &work_estimate, + max_work_estimate)) { + continue; + } + cycle_local.clear(); + if (bipartite_path.size() < 4) { continue; } + std::vector seq; + seq.reserve(bipartite_path.size()); + for (const auto bv : bipartite_path) { + seq.push_back(bv % num_local); + } + cuopt_assert(seq.front() == seq.back(), "Odd-cycle test helper path endpoints must match"); + seq.pop_back(); + if ((seq.size() % 2) == 0 || seq.size() < 5) { continue; } + bool simple = true; + std::unordered_set seen; + seen.reserve(seq.size()); + for (const auto v : seq) { + if (!seen.insert(v).second) { + simple = false; + break; + } + } + if (!simple) { continue; } + result.push_back(seq); + for (const auto v : seq) { + already_used[v] = 1; + } + } + return result; +} + template void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t& cut) { @@ -1827,6 +2262,20 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, } } + // Generate Zero-half (odd-cycle / odd-wheel) cuts; reuses the clique table built above + if (settings.zero_half_cuts != 0) { + f_t cut_start_time = tic(); + bool feasible = generate_zero_half_cuts(lp, settings, var_types, xstar, zstar, start_time); + if (!feasible) { + settings.log.printf("Zero-half cuts proved infeasible\n"); + return false; + } + f_t cut_generation_time = toc(cut_start_time); + if (cut_generation_time > 1.0) { + settings.log.debug("Zero-half cut generation time 
%.2f seconds\n", cut_generation_time); + } + } + // Generate implied bound cuts if (settings.implied_bound_cuts != 0) { f_t cut_start_time = tic(); @@ -2136,6 +2585,196 @@ bool cut_generation_t::generate_clique_cuts( return true; } +template +bool cut_generation_t::generate_zero_half_cuts( + const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& xstar, + const std::vector& reduced_costs, + f_t start_time) +{ + if (settings.zero_half_cuts == 0) { return true; } + if (toc(start_time) >= settings.time_limit) { return true; } + + const i_t num_vars = user_problem_.num_cols; + CUOPT_LOG_DEBUG("[zero_half] generate_zero_half_cuts start num_vars=%lld elapsed=%g", + static_cast(num_vars), + static_cast(toc(start_time))); + + if (clique_table_ == nullptr && clique_table_future_ != nullptr && + clique_table_future_->valid()) { + if (signal_extend_) { signal_extend_->store(true, std::memory_order_release); } + clique_table_ = clique_table_future_->get(); + clique_table_future_ = nullptr; + } + + if (clique_table_ == nullptr) { + CUOPT_LOG_DEBUG("[zero_half] no clique table available, skipping"); + return true; + } + if (clique_table_->first.empty() && clique_table_->addtl_cliques.empty()) { + CUOPT_LOG_DEBUG("[zero_half] empty clique table, nothing to separate"); + return true; + } + + cuopt_assert(clique_table_->n_variables == num_vars, + "Zero-half clique table variable count mismatch"); + cuopt_assert(static_cast(num_vars) <= xstar.size(), "Zero-half xstar size mismatch"); + cuopt_assert(user_problem_.var_types.size() == static_cast(num_vars), + "Zero-half user problem var_types size mismatch"); + + const f_t min_violation = std::max(settings.primal_tol, static_cast(1e-6)); + const f_t bound_tol = settings.primal_tol; + // shortest path of length >= 0.5 - min_violation cannot yield a violated cut + const f_t cutoff = static_cast(0.5) - min_violation; + f_t work_estimate = 0.0; + const f_t max_work_estimate 
= 1e8; + + std::vector vertices; + std::vector weights; + vertices.reserve(num_vars * 2); + weights.reserve(num_vars * 2); + + for (i_t j = 0; j < num_vars; ++j) { + if (user_problem_.var_types[j] == variable_type_t::CONTINUOUS) { continue; } + const f_t lower_bound = user_problem_.lower[j]; + const f_t upper_bound = user_problem_.upper[j]; + if (lower_bound < -bound_tol || upper_bound > 1 + bound_tol) { continue; } + const f_t xj = xstar[j]; + if (std::abs(xj - std::round(xj)) <= settings.integer_tol) { continue; } + vertices.push_back(j); + weights.push_back(xj); + vertices.push_back(j + num_vars); + weights.push_back(1.0 - xj); + } + work_estimate += 4.0 * static_cast(num_vars) + 2.0 * static_cast(vertices.size()); + if (work_estimate > max_work_estimate) { return true; } + if (vertices.empty()) { + CUOPT_LOG_DEBUG("[zero_half] no fractional binary vertices"); + return true; + } + + const i_t num_local = static_cast(vertices.size()); + CUOPT_LOG_DEBUG("[zero_half] fractional sub-CG vertices=%lld", static_cast(num_local)); + + std::vector vertex_to_local(2 * num_vars, -1); + std::vector in_subgraph(2 * num_vars, 0); + for (i_t idx = 0; idx < num_local; ++idx) { + const i_t vertex_idx = vertices[idx]; + vertex_to_local[vertex_idx] = idx; + in_subgraph[vertex_idx] = 1; + } + work_estimate += 3.0 * static_cast(num_local); + if (work_estimate > max_work_estimate) { return true; } + + std::vector> adj_local(num_local); + for (i_t idx = 0; idx < num_local; ++idx) { + if (toc(start_time) >= settings.time_limit) { return true; } + const i_t vertex_idx = vertices[idx]; + auto adj_set = clique_table_->get_adj_set_of_var(vertex_idx); + auto& adj = adj_local[idx]; + adj.reserve(adj_set.size()); + for (const auto neighbor : adj_set) { + cuopt_assert(neighbor >= 0 && neighbor < 2 * num_vars, "Zero-half neighbor out of range"); + if (!in_subgraph[neighbor]) { continue; } + const i_t local_neighbor = vertex_to_local[neighbor]; + cuopt_assert(local_neighbor >= 0, "Zero-half local 
neighbor out of range"); + adj.push_back(local_neighbor); + } + work_estimate += static_cast(adj_set.size()); + } + if (work_estimate > max_work_estimate) { return true; } + + sparse_vector_t cut(lp.num_cols, 0); + f_t cut_rhs = 0.0; + std::vector bipartite_path; + std::vector cycle_vertices; + std::vector wheel_centers; + + i_t cycles_found = 0; + i_t cuts_added = 0; + i_t added_per_var = 0; + std::vector already_used(num_local, 0); + + for (i_t s = 0; s < num_local; ++s) { + if (toc(start_time) >= settings.time_limit) { break; } + if (work_estimate > max_work_estimate) { break; } + if (already_used[s]) { continue; } + + f_t total_weight = 0; + if (!dijkstra_odd_cycle(s, + adj_local, + weights, + cutoff, + bipartite_path, + total_weight, + &work_estimate, + max_work_estimate)) { + continue; + } + if (!path_to_odd_cycle(bipartite_path, + vertices, + num_local, + num_vars, + cycle_vertices, + &work_estimate, + max_work_estimate)) { + continue; + } + cycles_found++; + + extend_to_odd_wheel(cycle_vertices, + wheel_centers, + *clique_table_, + reduced_costs, + num_vars, + start_time, + settings.time_limit, + &work_estimate, + max_work_estimate); + + const auto build_status = build_zero_half_cut(cycle_vertices, + wheel_centers, + num_vars, + var_types, + user_problem_.lower, + user_problem_.upper, + xstar, + bound_tol, + min_violation, + cut, + cut_rhs, + &work_estimate, + max_work_estimate); + if (work_estimate > max_work_estimate) { break; } + if (build_status == clique_cut_build_status_t::INFEASIBLE) { + CUOPT_LOG_DEBUG("[zero_half] infeasible cycle detected"); + return false; + } + if (build_status == clique_cut_build_status_t::CUT_ADDED) { + inequality_t cut_inequality; + cut_inequality.vector = cut; + cut_inequality.rhs = cut_rhs; + cut_pool_.add_cut(cut_type_t::ZERO_HALF, cut_inequality); + cuts_added++; + added_per_var++; + // mark all CG vertices that participated so we do not re-derive the same + // cycle from a different source vertex + for (const auto v : 
cycle_vertices) { + const i_t lv = vertex_to_local[v]; + if (lv >= 0) { already_used[lv] = 1; } + } + } + } + + CUOPT_LOG_DEBUG("[zero_half] generate_zero_half_cuts done cycles=%lld cuts=%lld work=%g", + static_cast(cycles_found), + static_cast(cuts_added), + static_cast(work_estimate)); + return true; +} + template void cut_generation_t::generate_mir_cuts( const lp_problem_t& lp, diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 2da9760e27..e3846c2c65 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,8 @@ enum cut_type_t : int8_t { CHVATAL_GOMORY = 3, CLIQUE = 4, IMPLIED_BOUND = 5, - MAX_CUT_TYPE = 6 + ZERO_HALF = 6, + MAX_CUT_TYPE = 7 }; template @@ -178,7 +180,8 @@ struct cut_info_t { "Knapsack ", "Strong CG ", "Clique ", - "Implied Bounds"}; + "Implied Bounds", + "Zero-Half "}; std::array num_cuts = {0}; }; @@ -269,6 +272,16 @@ std::vector> find_maximal_cliques_for_test( int max_calls, double time_limit); +// Test-only helper to run the production odd-cycle separator used by zero-half cuts. +// adjacency_list must contain local vertex indices in [0, n_vertices). x_values gives +// the LP value for each vertex. Returns simple odd cycles whose induced edge weight +// sum is < 0.5 - min_violation. +std::vector> find_violated_odd_cycles_for_test( + const std::vector>& adjacency_list, + const std::vector& x_values, + double min_violation, + double time_limit); + template class cut_pool_t { public: @@ -279,6 +292,8 @@ class cut_pool_t { rhs_storage_(0), cut_age_(0), cut_type_(0), + cut_inv_norm_(0), + cut_max_abs_coef_(0), scored_cuts_(0) { } @@ -288,8 +303,24 @@ class cut_pool_t { // cut'*xstart < rhs void add_cut(cut_type_t cut_type, const inequality_t& cut); + // Backward-compatible scoring entry-point. Falls back to the legacy + // geometric-distance / nnz-penalty score when bounds are not provided. 
void score_cuts(std::vector& x_relax); + // HiGHS-like active-support scoring with adaptive threshold, adaptive + // parallelism rejection, and violation-based aging. Selected + // rows remain in the pool so they can be reconsidered if later removed + // from the LP and violated again. + void score_cuts(const std::vector& x_relax, + const std::vector& lower, + const std::vector& upper, + f_t feastol); + void score_cuts(const std::vector& x_relax, + const std::vector& lower, + const std::vector& upper, + const std::vector& var_types, + f_t feastol); + // We return the cuts in the form best_cuts*x <= best_rhs i_t get_best_cuts(csr_matrix_t& best_cuts, std::vector& best_rhs, @@ -301,22 +332,65 @@ class cut_pool_t { i_t pool_size() const { return cut_storage_.m; } + // Number of nonzeros in the cut at row `row` of the cut pool. + i_t cut_nz(i_t row) const { return cut_storage_.row_length(row); } + void print_cutpool_types() { print_cut_types("In cut pool", cut_type_, settings_); } void check_for_duplicate_cuts(); + // Configuration knobs for the HiGHS-like cut pool. Defaults match the + // recommended starting values from the cut-selection design note. + void set_pool_age_limit(i_t v) { pool_age_limit_ = v; } + void set_pool_soft_limit(i_t v) { pool_soft_limit_ = v; } + void set_max_parallelism(f_t v) { max_parallelism_ = v; } + private: f_t cut_distance(i_t row, const std::vector& x, f_t& cut_violation, f_t& cut_norm); f_t cut_density(i_t row); f_t cut_orthogonality(i_t i, i_t j); + // HiGHS-like active-support score for a single pool row. Returns true if + // the cut is currently violated (violation > feastol). Falls back to the + // full row norm if no variable is "active" (rare for a violated cut). + bool compute_active_support_score(i_t row, + const std::vector& x, + const std::vector& lower, + const std::vector& upper, + const std::vector& var_types, + f_t feastol, + f_t& violation, + f_t& score) const; + + // Parallelism in [-1, 1] using stored 1/||a||_2. 
We use the absolute + // value because cuts are stored in a fixed sign convention (a^T x >= rhs) + // but two equivalent cuts may differ by a global sign. + f_t parallelism_abs(i_t i, i_t j) const; + + // Compact: drop pool rows for which keep_row[r] == 0. + void compact_pool(const std::vector& keep_row); + uint64_t support_hash(const i_t* indices, i_t size) const; + void rebuild_support_hash_buckets(); + static bool is_integral_type(variable_type_t t) + { + return t == variable_type_t::INTEGER || t == variable_type_t::BINARY; + } + i_t original_vars_; const simplex_solver_settings_t& settings_; csr_matrix_t cut_storage_; std::vector rhs_storage_; + // Age convention: + // age >= 0 : cut is in the pool, available for selection. Newly added + // cuts start at max(0, pool_age_limit_ - 5). Each separation + // round, non-violated cuts have age++ and are deleted once + // age >= effective_age_limit; violated cuts reset to age = 0. std::vector cut_age_; std::vector cut_type_; + // 1 / sqrt(sum a_j^2). 0.0 means the cut is degenerate / removed. 
+ std::vector cut_inv_norm_; + std::vector cut_max_abs_coef_; i_t scored_cuts_; std::vector cut_distances_; @@ -325,6 +399,16 @@ class cut_pool_t { std::vector cut_scores_; std::vector best_cuts_; const f_t min_cut_distance_{1e-4}; + + // HiGHS-like cut-pool configuration + i_t pool_age_limit_{30}; + i_t pool_soft_limit_{10000}; + f_t max_parallelism_{0.1}; + f_t min_score_factor_{0.9}; + f_t best_observed_score_{0.0}; + f_t integer_support_weight_{0.1}; + f_t full_support_penalty_{0.01}; + std::unordered_map> support_hash_buckets_; }; template @@ -481,6 +565,14 @@ class cut_generation_t { const std::vector& reduced_costs, f_t start_time); + // Generate zero-half (odd-cycle / odd-wheel) cuts from the conflict graph + bool generate_zero_half_cuts(const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& xstar, + const std::vector& reduced_costs, + f_t start_time); + // Generate implied bounds cuts from probing implications void generate_implied_bound_cuts(const lp_problem_t& lp, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..3674cd67d1 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -102,6 +102,7 @@ struct simplex_solver_settings_t { knapsack_cuts(-1), implied_bound_cuts(-1), clique_cuts(-1), + zero_half_cuts(-1), strong_chvatal_gomory_cuts(-1), reduced_cost_strengthening(-1), cut_change_threshold(1e-3), @@ -186,6 +187,7 @@ struct simplex_solver_settings_t { i_t knapsack_cuts; // -1 automatic, 0 to disable, >0 to enable knapsack cuts i_t implied_bound_cuts; // -1 automatic, 0 to disable, >0 to enable implied bound cuts i_t clique_cuts; // -1 automatic, 0 to disable, >0 to enable clique cuts + i_t zero_half_cuts; // -1 automatic, 0 to disable, >0 to enable zero-half cuts i_t strong_chvatal_gomory_cuts; // -1 
automatic, 0 to disable, >0 to enable strong Chvatal Gomory // cuts i_t reduced_cost_strengthening; // -1 automatic, 0 to disable, >0 to enable reduced cost diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c4331343de..7eadb0a33f 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -267,6 +267,7 @@ void rins_t::run_rins() branch_and_bound_settings.reliability_branching = 0; branch_and_bound_settings.max_cut_passes = 0; branch_and_bound_settings.clique_cuts = 0; + branch_and_bound_settings.zero_half_cuts = 0; branch_and_bound_settings.sub_mip = 1; branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; branch_and_bound_settings.log.log = false; diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh index 5a637aae8e..7df5d8dd0f 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh @@ -110,6 +110,7 @@ class sub_mip_recombiner_t : public recombiner_t { branch_and_bound_settings.reliability_branching = 0; branch_and_bound_settings.max_cut_passes = 0; branch_and_bound_settings.clique_cuts = 0; + branch_and_bound_settings.zero_half_cuts = 0; branch_and_bound_settings.sub_mip = 1; branch_and_bound_settings.strong_branching_simplex_iteration_limit = 200; branch_and_bound_settings.solution_callback = [this](std::vector& solution, diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index b96b48a413..b00c4a2196 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -125,12 +125,13 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); 
scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = - [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + [&population, problem_ptr = context.problem_ptr]( + f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); if (obj < local_search_best_obj) { CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", - context.problem_ptr->get_user_obj_from_solver_obj(obj), - context.problem_ptr->get_user_obj_from_solver_obj( + problem_ptr->get_user_obj_from_solver_obj(obj), + problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? population.best_feasible().get_objective() : std::numeric_limits::max())); local_search_best_obj = obj; diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index ce6b602fba..4003c11437 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,7 @@ solution_t mip_solver_t::run_solver() branch_and_bound_settings.knapsack_cuts = context.settings.knapsack_cuts; branch_and_bound_settings.implied_bound_cuts = context.settings.implied_bound_cuts; branch_and_bound_settings.clique_cuts = context.settings.clique_cuts; + branch_and_bound_settings.zero_half_cuts = context.settings.zero_half_cuts; branch_and_bound_settings.strong_chvatal_gomory_cuts = context.settings.strong_chvatal_gomory_cuts; branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; @@ -386,7 +388,14 @@ solution_t mip_solver_t::run_solver() std::placeholders::_2); } - // Create the branch and bound object + // Create the branch and bound object. + // + // Clique-table lifecycle: presolve no longer builds an initial clique + // table, so context.problem_ptr->clique_table is expected to be null + // here. 
B&B's async build (kicked off inside branch_and_bound_t::solve) + // produces the table and, via the publish callback installed below, + // atomically stores it into context.problem_ptr->clique_table so + // heuristic ensure_clique_data() can observe it on its next iteration. branch_and_bound = std::make_unique>( branch_and_bound_problem, branch_and_bound_settings, @@ -395,6 +404,16 @@ solution_t mip_solver_t::run_solver() context.problem_ptr->clique_table); context.branch_and_bound_ptr = branch_and_bound.get(); + // Publish the async-built clique_table onto context.problem_ptr so + // heuristics pick it up via the atomic snapshot accessor. + // { + // auto* pb = context.problem_ptr; + // branch_and_bound->set_clique_publish_callback( + // [pb](std::shared_ptr> ct) { + // pb->publish_clique_table(std::move(ct)); + // }); + // } + // Convert the best external upper bound from user-space to B&B's internal objective space. // context.problem_ptr is the post-trivial-presolve problem, whose get_solver_obj_from_user_obj // produces values in the same space as B&B node lower bounds. diff --git a/cpp/tests/mip/cuts_test.cu b/cpp/tests/mip/cuts_test.cu index 1348d7e7e4..7eec8847c9 100644 --- a/cpp/tests/mip/cuts_test.cu +++ b/cpp/tests/mip/cuts_test.cu @@ -73,6 +73,37 @@ mps_parser::mps_data_model_t create_pairwise_triangle_set_packing_p return problem; } +mps_parser::mps_data_model_t create_pairwise_pentagon_set_packing_problem() +{ + // Maximize x0 + x1 + x2 + x3 + x4 via minimizing the negation. + // Pairwise conflicts forming an odd 5-cycle in the conflict graph: + // x0 + x1 <= 1 + // x1 + x2 <= 1 + // x2 + x3 <= 1 + // x3 + x4 <= 1 + // x4 + x0 <= 1 + // LP optimum is sum=2.5; valid IP/zero-half cut: x0+x1+x2+x3+x4 <= 2. 
+ mps_parser::mps_data_model_t problem; + std::vector offsets = {0, 2, 4, 6, 8, 10}; + std::vector indices = {0, 1, 1, 2, 2, 3, 3, 4, 4, 0}; + std::vector coefficients = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + problem.set_csr_constraint_matrix(coefficients, indices, offsets); + std::vector lower_bounds(5, -std::numeric_limits::infinity()); + std::vector upper_bounds(5, 1.0); + problem.set_constraint_lower_bounds(lower_bounds); + problem.set_constraint_upper_bounds(upper_bounds); + std::vector var_lower_bounds(5, 0.0); + std::vector var_upper_bounds(5, 1.0); + problem.set_variable_lower_bounds(var_lower_bounds); + problem.set_variable_upper_bounds(var_upper_bounds); + std::vector objective_coefficients(5, -1.0); + problem.set_objective_coefficients(objective_coefficients); + std::vector variable_types(5, 'I'); + problem.set_variable_types(variable_types); + problem.set_maximize(false); + return problem; +} + mps_parser::mps_data_model_t create_pairwise_triangle_with_isolated_variable_problem() { // Same triangle conflicts as create_pairwise_triangle_set_packing_problem(), @@ -379,6 +410,18 @@ std::string format_phase2_panic_dump(const mps_parser::mps_data_model_t& settings) { settings.clique_cuts = 1; + settings.zero_half_cuts = 0; + settings.max_cut_passes = 10; + settings.mixed_integer_gomory_cuts = 0; + settings.knapsack_cuts = 0; + settings.mir_cuts = 0; + settings.strong_chvatal_gomory_cuts = 0; +} + +void disable_non_zero_half_cuts(mip_solver_settings_t& settings) +{ + settings.clique_cuts = 1; + settings.zero_half_cuts = 1; settings.max_cut_passes = 10; settings.mixed_integer_gomory_cuts = 0; settings.knapsack_cuts = 0; @@ -390,6 +433,7 @@ void disable_all_cuts(mip_solver_settings_t& settings) { settings.max_cut_passes = 0; settings.clique_cuts = 0; + settings.zero_half_cuts = 0; settings.mixed_integer_gomory_cuts = 0; settings.knapsack_cuts = 0; settings.mir_cuts = 0; @@ -1404,4 +1448,194 @@ TEST(cuts, 
clique_neos8_phase4_lp_infeasibility_binary_search) EXPECT_EQ(first_infeasible.value(), injected_index); } +// ---- Zero-half cut tests -------------------------------------------------- + +namespace { + +std::vector> canonicalize_cycles(std::vector> cycles) +{ + for (auto& cycle : cycles) { + if (cycle.empty()) { continue; } + auto min_it = std::min_element(cycle.begin(), cycle.end()); + std::rotate(cycle.begin(), min_it, cycle.end()); + if (cycle.size() >= 3 && cycle[1] > cycle.back()) { + std::reverse(cycle.begin() + 1, cycle.end()); + } + } + std::sort(cycles.begin(), cycles.end()); + cycles.erase(std::unique(cycles.begin(), cycles.end()), cycles.end()); + return cycles; +} + +} // namespace + +TEST(cuts, zero_half_unit_separator_simple_pentagon) +{ + // 5-cycle: 0-1-2-3-4-0. All vertices fractional at 0.5. + std::vector> adj = { + {1, 4}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 0}, + }; + std::vector x_values(5, 0.5); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + ASSERT_FALSE(cycles.empty()); + cycles = canonicalize_cycles(std::move(cycles)); + std::vector expected{0, 1, 2, 3, 4}; + bool found = false; + for (const auto& cycle : cycles) { + if (cycle.size() == 5) { + auto sorted = cycle; + std::sort(sorted.begin(), sorted.end()); + if (sorted == expected) { + found = true; + break; + } + } + } + EXPECT_TRUE(found); +} + +TEST(cuts, zero_half_unit_separator_no_cycle_for_4_cycle) +{ + // Even cycle: 0-1-2-3-0 + std::vector> adj = { + {1, 3}, + {0, 2}, + {1, 3}, + {2, 0}, + }; + std::vector x_values(4, 0.5); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + EXPECT_TRUE(cycles.empty()); +} + +TEST(cuts, zero_half_unit_separator_skips_triangle) +{ + // Triangle 0-1-2-0 ; size-3 cycles must be left to the clique separator. 
+ std::vector> adj = { + {1, 2}, + {0, 2}, + {0, 1}, + }; + std::vector x_values(3, 0.5); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + for (const auto& cycle : cycles) { + EXPECT_GE(cycle.size(), 5u); + } +} + +TEST(cuts, zero_half_unit_separator_no_cycle_when_integer_solution) +{ + // 5-cycle but x_values are integer feasible: (1, 0, 1, 0, 0) -- no violation. + std::vector> adj = { + {1, 4}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 0}, + }; + std::vector x_values = {1.0, 0.0, 1.0, 0.0, 0.0}; + // x_v interpreted as conflict-graph vertex weight (here just x_j directly). + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + EXPECT_TRUE(cycles.empty()); +} + +TEST(cuts, zero_half_unit_separator_disjoint_pentagons) +{ + // Two disjoint 5-cycles share no vertices: {0..4} and {5..9}. + std::vector> adj = { + {1, 4}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 0}, + {6, 9}, + {5, 7}, + {6, 8}, + {7, 9}, + {8, 5}, + }; + std::vector x_values(10, 0.5); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + ASSERT_GE(cycles.size(), 2u); + cycles = canonicalize_cycles(std::move(cycles)); + bool found_left = false; + bool found_right = false; + for (const auto& cycle : cycles) { + if (cycle.size() != 5) { continue; } + auto sorted = cycle; + std::sort(sorted.begin(), sorted.end()); + if (sorted == std::vector{0, 1, 2, 3, 4}) { found_left = true; } + if (sorted == std::vector{5, 6, 7, 8, 9}) { found_right = true; } + } + EXPECT_TRUE(found_left); + EXPECT_TRUE(found_right); +} + +TEST(cuts, zero_half_end_to_end_pentagon_tightens_lp_relaxation) +{ + const raft::handle_t handle{}; + auto mip_problem = create_pairwise_pentagon_set_packing_problem(); + + // First solve the LP relaxation (no cuts) to confirm the baseline value 2.5. 
+ auto lp_relaxation = mip_problem; + std::vector all_continuous(lp_relaxation.get_n_variables(), 'C'); + lp_relaxation.set_variable_types(all_continuous); + + pdlp_solver_settings_t lp_settings{}; + lp_settings.time_limit = 10.0; + lp_settings.presolver = presolver_t::None; + lp_settings.set_optimality_tolerance(1e-8); + auto lp_solution = solve_lp(&handle, lp_relaxation, lp_settings); + ASSERT_EQ(lp_solution.get_termination_status(), pdlp_termination_status_t::Optimal); + const double lp_obj_no_cuts = lp_solution.get_objective_value(); + EXPECT_NEAR(lp_obj_no_cuts, -2.5, kCliqueTestTol); + + // Optimal IP value is 2 (independent set of size 2), so the LP gap is 0.5. + mip_solver_settings_t settings; + settings.time_limit = 10.0; + settings.presolver = presolver_t::None; + disable_non_zero_half_cuts(settings); + + auto mip_solution = solve_mip(&handle, mip_problem, settings); + ASSERT_EQ(mip_solution.get_termination_status(), mip_termination_status_t::Optimal); + EXPECT_NEAR(mip_solution.get_objective_value(), -2.0, kCliqueTestTol); +} + +TEST(cuts, zero_half_unit_separator_seven_cycle_violated_below_half) +{ + // 7-cycle: 0-1-2-3-4-5-6-0, all weights 0.4. Each edge weight = (1-0.4-0.4)/2 = 0.1 + // total path weight from j1 to j2 of length 7 = 0.7 — not below 0.5, so no cut. + // Make weights slightly higher: 0.45 → edge weight = 0.05, total = 7*0.05 = 0.35 < 0.5. 
+ std::vector> adj = { + {1, 6}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 5}, + {4, 6}, + {5, 0}, + }; + std::vector x_values(7, 0.45); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + ASSERT_FALSE(cycles.empty()); + bool found_seven = false; + for (const auto& cycle : cycles) { + if (cycle.size() == 7) { + found_seven = true; + break; + } + } + EXPECT_TRUE(found_seven); +} + } // namespace cuopt::linear_programming::test From e7bf32ca4b7c19703136acfd32bb6b7c5f42ba40 Mon Sep 17 00:00:00 2001 From: akifcorduk Date: Thu, 7 May 2026 14:10:32 +0000 Subject: [PATCH 04/47] with gap computation --- .../cuopt/miplib2017_optima.hpp | 471 ++++++++++++++++++ .../linear_programming/cuopt/run_mip.cpp | 196 +++++++- .../mip/solver_settings.hpp | 13 + cpp/src/branch_and_bound/branch_and_bound.cpp | 48 ++ .../dual_simplex/simplex_solver_settings.hpp | 13 + cpp/src/mip_heuristics/solver.cu | 3 + 6 files changed, 734 insertions(+), 10 deletions(-) create mode 100644 benchmarks/linear_programming/cuopt/miplib2017_optima.hpp diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp new file mode 100644 index 0000000000..201346c656 --- /dev/null +++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp @@ -0,0 +1,471 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +// MIPLIB2017 best-known objective ("optimum") lookup for the MIP +// benchmark runner. Self-contained: no env vars, no external CSV. +// +// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 +// instances). Of those, 232 have a known optimum and live in +// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible +// so the printer can label them clearly instead of returning "no opt". 
+// +// Lookup uses the basename without directory and stripped of +// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So +// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" +// all hit the same entry. +// +// Returns std::optional<double>: nullopt means "instance is in our +// benchmark set but infeasible" *or* "we don't have an entry for it". +// is_known_infeasible() distinguishes the two. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt_bench { + +// Strip directory prefix and any .mps/.lp suffix (with optional .gz), +// then lower-case. Designed to match how MPS instance files are named +// across MIPLIB downloads (case- and extension-insensitive). +inline std::string normalize_instance_name(const std::string& raw) +{ + std::string s = raw; + const auto slash = s.find_last_of("/\\"); + if (slash != std::string::npos) { s = s.substr(slash + 1); } + auto endswith = [&](const std::string& suf) { + if (s.size() < suf.size()) { return false; } + for (size_t i = 0; i < suf.size(); ++i) { + if (std::tolower(static_cast(s[s.size() - suf.size() + i])) != + std::tolower(static_cast(suf[i]))) { + return false; + } + } + return true; + }; + for (const char* suf : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) { + if (endswith(suf)) { + s.resize(s.size() - std::char_traits::length(suf)); + break; + } + } + for (char& c : s) { + c = static_cast(std::tolower(static_cast(c))); + } + return s; +} + +// MIPLIB2017 benchmark-set best-known objectives (n=233, incl. unproven supportcase22). Source: +// https://miplib.zib.de "The Benchmark Set". Values are stored in the +// double precision they were published at; unit tests should compare +// with a tolerance of ~|opt|*1e-9 rather than exact equality. 
+inline const std::unordered_map& kBenchmarkOptima() +{ + static const std::unordered_map kOptima = { + {"30n20b8", 302}, + {"50v-10", 3311.1799841000002}, + {"academictimetablesmall", 0}, + {"air05", 26374}, + {"app1-1", -3}, + {"app1-2", -41}, + {"assign1-5-8", 211.99999999999801}, + {"atlanta-ip", 90.009878614000002}, + {"b1c1s1", 24544.25}, + {"bab2", -357544.31150000001}, + {"bab6", -284248.23070000007}, + {"beasleyc3", 753.9999999999128}, + {"binkar10_1", 6742.1998835000004}, + {"blp-ar98", 6205.2147103999996}, + {"blp-ic98", 4491.4475839500001}, + {"bnatt400", 1}, + {"bppc4-08", 53}, + {"brazil3", 24}, + {"buildingenergy", 33283.853236000003}, + {"cbs-cta", 0}, + {"chromaticindex1024-7", 4}, + {"chromaticindex512-7", 4}, + {"cmflsp50-24-8-8", 55789389.886}, + {"cms750_4", 252}, + {"co-100", 2639942.0600000001}, + {"cod105", -12}, + {"comp07-2idx", 6}, + {"comp21-2idx", 74}, + {"cost266-uue", 25148940.55999998}, + {"cryptanalysiskb128n5obj16", 0}, + {"csched007", 350.99999999999551}, + {"csched008", 173}, + {"cvs16r128-89", -97}, + {"dano3_3", 576.34463302999995}, + {"dano3_5", 576.9249159565619}, + {"decomp2", -160}, + {"drayage-100-23", 103333.87407000001}, + {"drayage-25-23", 101282.647018}, + {"dws008-01", 37412.604587945083}, + {"eil33-2", 934.007915999999}, + {"eila101-2", 880.92010799999991}, + {"enlight_hard", 37}, + {"ex10", 100}, + {"ex9", 81}, + {"exp-1-500-5-5", 65887}, + {"fast0507", 174}, + {"fastxgemm-n2r6s0t2", 230}, + {"fhnw-binpack4-48", 0}, + {"fiball", 138}, + {"gen-ip002", -4783.7333920000001}, + {"gen-ip054", 6840.9656417899996}, + {"germanrr", 47095869.648999996}, + {"gfd-schedulen180f7d50m30k18", 1}, + {"glass-sc", 23}, + {"glass4", 1200012599.972384}, + {"gmu-35-40", -2406733.3687999998}, + {"gmu-35-50", -2607958.3300000001}, + {"graph20-20-1rand", -9}, + {"graphdraw-domain", 19685.999975500381}, + {"h80x6320d", 6382.0990482459993}, + {"highschool1-aigio", 0}, + {"hypothyroid-k1", -2851}, + {"ic97_potential", 3941.9999309022501}, + 
{"icir97_tension", 6375}, + {"irish-electricity", 3723497.5913959998}, + {"irp", 12159.492835396981}, + {"istanbul-no-cutoff", 204.08170701}, + {"k1mushroom", -3288}, + {"lectsched-5-obj", 24}, + {"leo1", 404227536.16000003}, + {"leo2", 404077441.12}, + {"lotsize", 1480195}, + {"mad", 0.026800000000000001}, + {"map10", -495}, + {"map16715-04", -111}, + {"markshare2", 1}, + {"markshare_4_0", 1}, + {"mas74", 11801.185719999999}, + {"mas76", 40005.053989999993}, + {"mc11", 11688.99999999966}, + {"mcsched", 211913}, + {"mik-250-20-75-4", -52301}, + {"milo-v12-6-r2-40-1", 326481.14282799}, + {"momentum1", 109143.4935}, + {"mushroom-best", 0.055333761199999998}, + {"mzzv11", -21718}, + {"mzzv42z", -20540}, + {"n2seq36q", 52200}, + {"n3div36", 130800}, + {"n5-3", 8104.9999999939992}, + {"neos-1122047", 161}, + {"neos-1171448", -309}, + {"neos-1171737", -195}, + {"neos-1354092", 46}, + {"neos-1445765", -17783}, + {"neos-1456979", 176}, + {"neos-1582420", 90.999999999999957}, + {"neos-2657525-crna", 1.810748}, + {"neos-2746589-doon", 2008.1999999999989}, + {"neos-2978193-inde", -2.3880616899999998}, + {"neos-2987310-joes", -607702988.29999995}, + {"neos-3004026-krka", 0}, + {"neos-3024952-loue", 26756}, + {"neos-3046615-murg", 1600}, + {"neos-3083819-nubu", 6307996}, + {"neos-3216931-puriri", 71320}, + {"neos-3381206-awhea", 453}, + {"neos-3402294-bobin", 0.067249999999999491}, + {"neos-3555904-turama", -34.700000000000003}, + {"neos-3627168-kasai", 988585.61999999976}, + {"neos-3656078-kumeu", -13172.200000000001}, + {"neos-3754480-nidda", 12941.73838561778}, + {"neos-4300652-rahue", 2.1415999999999999}, + {"neos-4338804-snowy", 1471}, + {"neos-4387871-tavua", 33.384729927000002}, + {"neos-4413714-turia", 45.370167019999798}, + {"neos-4532248-waihi", 61.599999999999987}, + {"neos-4647030-tutaki", 27265.705999999958}, + {"neos-4722843-widden", 25009.662227000001}, + {"neos-4738912-atrato", 283627956.59500003}, + {"neos-4763324-toguru", 1613.0388458499999}, + 
{"neos-4954672-berkel", 2612710}, + {"neos-5049753-cuanza", 561.99999716889999}, + {"neos-5052403-cygnet", 182}, + {"neos-5093327-huahum", 6259.9999971258949}, + {"neos-5104907-jarama", 935}, + {"neos-5107597-kakapo", 3644.9999999995198}, + {"neos-5114902-kasavu", 655}, + {"neos-5188808-nattai", 0.110283622999984}, + {"neos-5195221-niemur", 0.0038354325999999999}, + {"neos-631710", 203}, + {"neos-662469", 184379.99999999991}, + {"neos-787933", 30}, + {"neos-827175", 112.00152}, + {"neos-848589", 2351.40309999697}, + {"neos-860300", 3200.9999999999982}, + {"neos-873061", 113.6562385063}, + {"neos-911970", 54.759999999999998}, + {"neos-933966", 318}, + {"neos-950242", 4}, + {"neos-957323", -237.75668150000001}, + {"neos-960392", -238}, + {"neos17", 0.1500025774}, + {"neos5", 15}, + {"neos8", -3719}, + {"net12", 214}, + {"netdiversion", 242}, + {"nexp-150-20-8-5", 231}, + {"ns1116954", 0}, + {"ns1208400", 2}, + {"ns1644855", -1524.3333333333301}, + {"ns1760995", -549.21438505000003}, + {"ns1830653", 20622}, + {"ns1952667", 0}, + {"nu25-pr12", 53904.999999999993}, + {"nursesched-medium-hint03", 115}, + {"nursesched-sprint02", 57.999999999999993}, + {"nw04", 16862}, + {"opm2-z10-s4", -33269}, + {"p200x1188c", 15078}, + {"peg-solitaire-a3", 1}, + {"pg", -8674.3426071199992}, + {"pg5_34", -14339.353450000001}, + {"physiciansched3-3", 2623271.3266670001}, + {"physiciansched6-2", 49324}, + {"piperout-08", 125054.9999999999}, + {"piperout-27", 8123.9999999999727}, + {"pk1", 11}, + {"proteindesign121hz512p9", 1473}, + {"proteindesign122trx11p8", 1747}, + {"qap10", 339.99999999838712}, + {"radiationm18-12-05", 17566}, + {"radiationm40-10-02", 155328}, + {"rail01", -70.569964299999995}, + {"rail02", -200.44990770000001}, + {"rail507", 174}, + {"ran14x18-disj-8", 3712}, + {"rd-rplusc-21", 165395.275295}, + {"reblock115", -36800603.233199999}, + {"rmatr100-p10", 423}, + {"rmatr200-p5", 4521}, + {"roci-4-11", -6020203}, + {"rocii-5-11", -6.6755047315380001}, + {"rococob10-011000", 
19449}, + {"rocococ10-001000", 11460}, + {"roi2alpha3n4", -63.208495030000002}, + {"roi5alpha10n8", -52.322274350999997}, + {"roll3000", 12889.999991999999}, + {"s100", -0.16972352705829999}, + {"s250r10", -0.17178048342319999}, + {"satellites2-40", -19}, + {"satellites2-60-fs", -19.000000000099998}, + {"savsched1", 3217.6999999999998}, + {"sct2", -230.9891623}, + {"seymour", 423}, + {"seymour1", 410.76370138999999}, + {"sing326", 7753674.8537600003}, + {"sing44", 8128831.1771999998}, + {"snp-02-004-104", 586803238.65672886}, + {"sorrell3", -16}, + {"sp150x300d", 69}, + {"sp97ar", 660705645.75899994}, + {"sp98ar", 529740623.19999999}, + {"splice1k1", -394}, + {"square41", 15}, + {"square47", 15.9999999997877}, + {"supportcase10", 7}, + {"supportcase12", -7559.5330538170001}, + {"supportcase18", 48}, + {"supportcase19", 12677205.999920519}, + {"supportcase22", 110}, // best-known marked "*" in MIPLIB2017 (not proven optimal) + {"supportcase26", 1745.1238129999999}, + {"supportcase33", -345}, + {"supportcase40", 24256.3122898}, + {"supportcase42", 7.7586307222700004}, + {"supportcase6", 51906.477370000001}, + {"supportcase7", -1132.2231770000001}, + {"swath1", 379.07129574999999}, + {"swath3", 397.76134365000001}, + {"tbfp-network", 24.163194440000002}, + {"thor50dday", 40417}, + {"timtab1", 764771.99999977998}, + {"tr12-30", 130595.9999999999}, + {"traininstance2", 71820}, + {"traininstance6", 28290}, + {"trento1", 5189487}, + {"triptim1", 22.868099999999899}, + {"uccase12", 11507.4050616}, + {"uccase9", 10993.131409}, + {"uct-subprob", 314}, + {"unitcal_7", 19635558.243999999}, + {"var-smallemery-m6j6", -149.37501}, + {"wachplan", -8}, + }; + return kOptima; +} + +// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7). +// Solver should return Infeasible status; we use this set to label +// the printer line with status_extra=KnownInfeasible so a downstream +// "did the run agree with MIPLIB?" check can be a single grep. 
+inline const std::unordered_set& kBenchmarkInfeasible() +{ + static const std::unordered_set kInfeas = { + "bnatt500", + "cryptanalysiskb128n5obj14", + "fhnw-binpack4-4", + "neos-2075418-temuka", + "neos-3402454-bohle", + "neos-3988577-wolgan", + "neos859080", + }; + return kInfeas; +} + +inline std::optional lookup_miplib_optimum(const std::string& filename) +{ + const auto& m = kBenchmarkOptima(); + const auto it = m.find(normalize_instance_name(filename)); + if (it == m.end()) { return std::nullopt; } + return it->second; +} + +inline bool is_known_infeasible(const std::string& filename) +{ + return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; +} + +// Single grep-friendly per-instance line. Emits to stdout via printf +// so the output survives unconditionally regardless of the project's +// settings_.log routing (NFS-backed log files, gated debug levels) +// and is trivially cross-compared between cuts-config branches. +// +// "Gap closed" is reported relative to the *root LP after cuts*, not +// relative to the final dual bound at the end of solve. The standard +// MIP cutting-plane definition is: +// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) +// / (opt - root_lp_no_cuts) +// On a minimization-form problem all three differences are >= 0 and +// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the +// formula also holds verbatim for maximization (numerator and +// denominator flip sign together). NaN is emitted when either root +// bound was not published (e.g. B&B never entered the cut loop). 
+// +// Other field semantics (signed for minimization): +// abs_root_dual_gap = opt - root_lp_with_cuts +// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) +// abs_primal_gap = primal - opt +// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) +// +// The line still also reports `final_dual` (solver's bound at the end +// of solve) so the new metric and the previous one can be compared +// without re-running. +// +// "TBD" is emitted when the optimum is unknown so downstream parsers +// can join lines on (instance, field) without dropping rows. "NaN" is +// emitted for root_lp_* when the value is unavailable. +template +inline void print_miplib_gap_stat(const std::string& filename, + const Solution& solution, + double solve_time_seconds, + const std::string& termination_status, + double root_lp_no_cuts, + double root_lp_with_cuts) +{ + const std::string norm = normalize_instance_name(filename); + const auto opt = lookup_miplib_optimum(filename); + const double primal = solution.get_objective_value(); + const double final_dual = solution.get_solution_bound(); + const double mip_gap = solution.get_mip_gap(); + const bool primal_finite = std::isfinite(primal); + const bool root0_finite = std::isfinite(root_lp_no_cuts); + const bool root1_finite = std::isfinite(root_lp_with_cuts); + constexpr double NaN = std::numeric_limits::quiet_NaN(); + + if (is_known_infeasible(filename)) { + std::printf( + "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " + "abs_primal_gap=NA rel_primal_gap_pct=NA " + "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + norm.c_str(), + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + mip_gap, + solve_time_seconds, + termination_status.c_str()); + } else if (opt.has_value()) { + const double o = *opt; + const double denom = std::max(std::abs(o), 1.0); + + const double 
abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; + const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; + + // Classical gap-closed-by-cuts. Left as NaN when either root bound + // is missing; pinned to 100 when the LP relaxation already proves + // optimality (denominator = opt - root_lp_no_cuts ~= 0). A bound that + // moved the wrong way is reported as-is (value outside [0, 100]). + double gap_closed_pct = NaN; + if (root0_finite && root1_finite) { + const double total_gap = o - root_lp_no_cuts; + if (std::abs(total_gap) > 1e-12 * denom) { + gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; + } else { + // LP relaxation already (numerically) optimal -> 100% closed + // by definition. Avoid /0 noise. + gap_closed_pct = 100.0; + } + } + + const double abs_pgap = primal_finite ? (primal - o) : NaN; + const double rel_pgap_pct = primal_finite ? 100.0 * abs_pgap / denom : NaN; + + std::printf( + "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " + "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " + "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + norm.c_str(), + o, + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + abs_root_dgap, + rel_root_dgap_pct, + gap_closed_pct, + abs_pgap, + rel_pgap_pct, + mip_gap, + solve_time_seconds, + termination_status.c_str()); + } else { + std::printf( + "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " + "abs_primal_gap=TBD rel_primal_gap_pct=TBD " + "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + norm.c_str(), + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + mip_gap, + solve_time_seconds, + termination_status.c_str()); + } + std::fflush(stdout); +} + +} // namespace 
cuopt_bench diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index 18efd83dbe..366cd28201 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include "initial_solution_reader.hpp" #include "mip_test_instances.hpp" +#include "miplib2017_optima.hpp" #include #include @@ -23,15 +24,20 @@ #include #include +#include #include #include #include #include #include +#include #include +#include #include +#include #include #include +#include #include #include #include @@ -238,6 +244,42 @@ int run_single_file(std::string file_path, } else { CUOPT_LOG_INFO("%s: no solution found", base_filename.c_str()); } + + // Per-instance "gap closed to optimum" stat. Emits a single + // grep-friendly "MIPLIBGapStat ..." line via printf so cross-branch + // comparison is just `grep '^MIPLIBGapStat' branchA.log` then diff. + // Optima are looked up from the in-source MIPLIB2017 benchmark-set + // table (miplib2017_optima.hpp); unknown instances emit "opt=TBD" + // and infeasibility-flagged instances emit "opt=Infeasible". 
+ { + const double _gap_seconds = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_run_solver) + .count() / + 1000.0; + std::string _status_str; + switch (solution.get_termination_status()) { + case cuopt::linear_programming::mip_termination_status_t::Optimal: + _status_str = "Optimal"; + break; + case cuopt::linear_programming::mip_termination_status_t::FeasibleFound: + _status_str = "FeasibleFound"; + break; + case cuopt::linear_programming::mip_termination_status_t::TimeLimit: + _status_str = "TimeLimit"; + break; + case cuopt::linear_programming::mip_termination_status_t::Infeasible: + _status_str = "Infeasible"; + break; + default: _status_str = "Other"; break; + } + cuopt_bench::print_miplib_gap_stat(base_filename, + solution, + _gap_seconds, + _status_str, + benchmark_info.root_lp_no_cuts, + benchmark_info.root_lp_with_cuts); + } + std::stringstream ss; int decimal_places = 2; double mip_gap = solution.get_mip_gap(); @@ -292,11 +334,84 @@ void run_single_file_mp(std::string file_path, exit(sol_found); } +// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the +// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it +// requires no extra dependencies (NVML / hwloc). +static std::vector get_gpu_numa_nodes(int n_gpus) +{ + std::vector nodes(static_cast(std::max(0, n_gpus)), -1); + for (int i = 0; i < n_gpus; ++i) { + char pci_id[32] = {0}; + if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } + for (char* c = pci_id; *c; ++c) { + *c = static_cast(std::tolower(static_cast(*c))); + } + std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); + if (!f) { continue; } + int node = -1; + if (!(f >> node)) { continue; }  // unparsable file: keep -1 (a failed >> would store 0) + nodes[i] = node; + } + return nodes; +} + +// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. +// Returns empty on any read or parse failure. 
+static std::vector read_numa_cpulist(int numa_node) +{ + std::vector cpus; + if (numa_node < 0) { return cpus; } + std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + + "/cpulist"); + if (!f) { return cpus; } + std::string line; + if (!std::getline(f, line)) { return cpus; } + size_t pos = 0; + while (pos < line.size()) { + const size_t comma = line.find(',', pos); + const size_t end = (comma == std::string::npos) ? line.size() : comma; + const std::string range = line.substr(pos, end - pos); + if (!range.empty()) { + try { + const size_t dash = range.find('-'); + const int lo = std::stoi(range.substr(0, dash)); + const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); + for (int c = lo; c <= hi; ++c) { + cpus.push_back(c); + } + } catch (...) { + return std::vector{}; + } + } + if (comma == std::string::npos) { break; } + pos = comma + 1; + } + std::sort(cpus.begin(), cpus.end()); + return cpus; +} + +// Bind the current process to a fair partition of the inherited CPU mask, +// preferring CPUs on the same NUMA node as the GPU. Returns the actual +// number of CPUs the child was pinned to, or -1 if the partition could not +// be applied (caller must then choose a fallback). +// +// Algorithm: +// 1. Read inherited (parent) affinity mask -> visible_cpus. +// 2. Look up each GPU's NUMA node via PCI BDF. +// 3. If this GPU's NUMA node is known and has visible CPUs, partition +// that NUMA node's CPUs among the GPUs that landed on the same node +// (siblings, ordered by gpu_id). +// 4. Otherwise fall back to a contiguous global partition of visible_cpus. +// +// Every path that reaches the sched_setaffinity call emits a single stdout +// line per child summarising the partition (NUMA-local vs contiguous-fallback), +// so the parent's log isn't interleaved per-CPU across n_gpus children. 
int bind_process_to_cpu_partition(int gpu_id, int n_gpus) { + if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } + cpu_set_t parent_mask; CPU_ZERO(&parent_mask); - if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { perror("sched_getaffinity"); return -1; @@ -306,21 +421,68 @@ int bind_process_to_cpu_partition(int gpu_id, int n_gpus) for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } } + if (visible_cpus.empty()) { return -1; } + std::sort(visible_cpus.begin(), visible_cpus.end()); - int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); - int start = gpu_id * cpus_per_gpu; - int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); + std::vector chosen_cpus; + bool numa_aware = false; + + const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); + const int my_numa = gpu_numa_nodes[gpu_id]; + if (my_numa >= 0) { + std::vector siblings; + for (int i = 0; i < n_gpus; ++i) { + if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } + } + std::vector numa_cpus = read_numa_cpulist(my_numa); + if (!numa_cpus.empty() && !siblings.empty()) { + std::vector local_visible; + std::set_intersection(visible_cpus.begin(), + visible_cpus.end(), + numa_cpus.begin(), + numa_cpus.end(), + std::back_inserter(local_visible)); + if (!local_visible.empty()) { + const int siblings_count = static_cast(siblings.size()); + const int my_idx = + static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); + const int local_per_gpu = + std::max(1, static_cast(local_visible.size()) / siblings_count); + const int s = my_idx * local_per_gpu; + const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); + if (s < e) { + chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); + numa_aware = true; + } + } + } + } + + if (!numa_aware) { + const int cpus_per_gpu = std::max(1, 
static_cast(visible_cpus.size()) / n_gpus); + const int start = gpu_id * cpus_per_gpu; + if (start >= static_cast(visible_cpus.size())) { return -1; } + const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); + chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); + } cpu_set_t child_mask; CPU_ZERO(&child_mask); - - for (int i = start; i < end; ++i) { - CPU_SET(visible_cpus[i], &child_mask); - std::cout << "Binding process to CPU " << visible_cpus[i] << std::endl; + std::ostringstream oss; + oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" + << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") + << "):"; + for (int c : chosen_cpus) { + CPU_SET(c, &child_mask); + oss << ' ' << c; } + std::cout << oss.str() << std::endl; - if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { perror("sched_setaffinity"); } - return cpus_per_gpu; + if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { + perror("sched_setaffinity"); + return -1; + } + return static_cast(chosen_cpus.size()); } void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, @@ -447,6 +609,11 @@ int main(int argc, char* argv[]) int reliability_branching = program.get("--reliability-branching"); bool deterministic = program.get("--determinism"); + if (run_dir && program.is_used("--num-cpu-threads")) { + std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " + "thread count is set per process from the bound CPU partition.\n"; + } + if (program.is_used("--out-dir")) { out_dir = program.get("--out-dir"); result_file = out_dir + "/final_result.csv"; @@ -522,6 +689,15 @@ int main(int argc, char* argv[]) if (sys_pid == 0) { RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); int assigned_cpus = bind_process_to_cpu_partition(gpu_id, n_gpus); + if (assigned_cpus <= 0) { + assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); + std::cerr << "[gpu " << gpu_id << "] CPU 
pin failed; falling back to " + << assigned_cpus << " threads\n"; + } + // Directory-run mode owns the thread count: --num-cpu-threads is + // intentionally ignored here so per-process thread budgets match + // the bound CPU partition. The single-run path below still + // honours --num-cpu-threads. omp_set_num_threads(assigned_cpus); num_cpu_threads = assigned_cpus; run_single_file_mp(file_name, diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index ae0187e454..29f3d34525 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -26,6 +26,19 @@ struct benchmark_info_t { double last_improvement_of_best_feasible = 0; double last_improvement_after_recombination = 0; double objective_of_initial_population = std::numeric_limits::max(); + // LP relaxation objective at the root node, BEFORE any cuts have been + // added. quiet_NaN() means "B&B did not run cut passes / value was + // never written" — distinguishes it from a legitimate 0.0. + double root_lp_no_cuts = std::numeric_limits::quiet_NaN(); + // LP relaxation objective at the root node, AFTER the full cut loop + // (final pass result). The dual gap "by cuts at the root" is then + // gap_after_cuts = opt - root_lp_with_cuts (in B&B's solver + // objective sense) + // and the classical "gap closed by cuts" metric is + // gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) + // / (opt - root_lp_no_cuts). + // quiet_NaN() means "B&B did not finish the cut loop / value not written". 
+ double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); }; // Forward declare solver_settings_t for friend class diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 13313a46e2..520b4e0e99 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -10,6 +10,8 @@ #include #include +#include // benchmark_info_t + #include #include @@ -2191,6 +2193,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut cut_info_t cut_info; if (num_fractional == 0) { + // LP relaxation already integer-feasible — solved at the root with + // no cuts. Publish both bounds equal to the root LP value so the + // gap-closed-by-cuts line still has a finite, meaningful entry + // (the printer reports 100% closed when total integrality gap ~= 0). + if (settings_.benchmark_info_ptr != nullptr) { + const double v = static_cast(compute_user_objective(original_lp_, root_objective_)); + settings_.benchmark_info_ptr->root_lp_no_cuts = v; + settings_.benchmark_info_ptr->root_lp_with_cuts = v; + } set_solution_at_root(solution, cut_info); finish_clique_thread(); return mip_status_t::OPTIMAL; @@ -2228,10 +2239,26 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut f_t last_objective = root_objective_; f_t root_relax_objective = root_objective_; + // Publish the no-cuts root LP value once. The with-cuts companion is + // published below after the cut loop terminates. Both go to the + // benchmark_info_t so callers (run_mip.cpp) can compute + // gap-closed-by-cuts without instrumenting the cut loop directly. 
+ if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_no_cuts = + static_cast(compute_user_objective(original_lp_, root_relax_objective)); + } + f_t cut_generation_start_time = tic(); i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { + // LP relaxation is already integer-feasible — solved at the root + // by the cuts added so far (possibly zero). Publish the with-cuts + // value so the gap-closed line still has a non-NaN dual bound. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } set_solution_at_root(solution, cut_info); return mip_status_t::OPTIMAL; } else { @@ -2439,6 +2466,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + // Publish after every successful post-cut LP resolve so any + // early-exit path below (NUMERICAL, TIME_LIMIT, gap-tolerance + // exit) still leaves benchmark_info->root_lp_with_cuts pointing + // at the most recent valid LP-with-cuts objective. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } + f_t remove_cuts_start_time = tic(); mutex_original_lp_.lock(); remove_cuts(original_lp_, @@ -2497,6 +2533,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } } + // Cut loop terminated (max_cut_passes hit, num_fractional==0 break, + // negligible-objective-change break, or time-limit break). Publish + // the post-cuts root LP value so benchmark drivers can compute + // gap-closed-by-cuts. 
We use compute_user_objective to flip the sign + // back into user space when the LP was dualized, matching the + // convention used for root_lp_no_cuts above and for the per-pass + // "Bound" column in the search log. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } + print_cut_info(settings_, cut_info); f_t cut_generation_time = toc(cut_generation_start_time); if (cut_info.has_cuts()) { diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..76d80e62b4 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -18,6 +18,13 @@ #include #include +namespace cuopt::linear_programming { +// Forward-declared so simplex code can hold a pointer to the high-level +// benchmark_info_t (defined in cuopt/linear_programming/mip/solver_settings.hpp) +// without pulling that header into every dual_simplex compilation unit. +struct benchmark_info_t; +} // namespace cuopt::linear_programming + namespace cuopt::linear_programming::dual_simplex { template @@ -222,6 +229,12 @@ struct simplex_solver_settings_t { mutable logger_t log; std::atomic* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should // continue, 1 if solver should halt + // Optional non-owning pointer to the run-level benchmark_info_t (defined + // in mip/solver_settings.hpp). Used by branch_and_bound to publish the + // root LP objectives (before & after the cut loop) so benchmark drivers + // can compute gap-closed-by-cuts without having to instrument the + // solver internals. Production builds leave this as nullptr. 
+ cuopt::linear_programming::benchmark_info_t* benchmark_info_ptr = nullptr; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index ce6b602fba..83ecf2e315 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -341,6 +341,9 @@ solution_t mip_solver_t::run_solver() context.settings.strong_chvatal_gomory_cuts; branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; + // Forward the run-level benchmark_info_t so B&B can publish root LP + // bounds (before / after cuts) for gap-closed-by-cuts measurement. + branch_and_bound_settings.benchmark_info_ptr = context.settings.benchmark_info_ptr; branch_and_bound_settings.mip_batch_pdlp_strong_branching = context.settings.mip_batch_pdlp_strong_branching; branch_and_bound_settings.mip_batch_pdlp_reliability_branching = From 5335b659947ab8c1e54953d4e9970d511abc99e9 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 8 May 2026 16:37:30 +0200 Subject: [PATCH 05/47] measure main branch --- cpp/src/branch_and_bound/branch_and_bound.cpp | 44 ++++++++++++++----- cpp/src/mip_heuristics/solver.cu | 39 +++++++++------- 2 files changed, 54 insertions(+), 29 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 520b4e0e99..de81673896 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2367,18 +2367,13 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut return mip_status_t::NUMERICAL; } - if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) { - mutex_upper_.lock(); - last_upper_bound = upper_bound_.load(); - std::vector lower_bounds; - std::vector upper_bounds; - find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, 
upper_bounds); - mutex_upper_.unlock(); - mutex_original_lp_.lock(); - original_lp_.lower = lower_bounds; - original_lp_.upper = upper_bounds; - mutex_original_lp_.unlock(); - } + // In-cut-pass reduced-cost strengthening is disabled on this + // branch: the branch exists only to produce a deterministic + // gap-closed-by-cuts baseline, and primal-driven bound + // tightening makes the per-pass cut yield depend on the timing + // of heuristic-found incumbents (non-deterministic across + // reruns). + // Original block intentionally left out. // Try to do bound strengthening std::vector bounds_changed(original_lp_.num_cols, true); @@ -2556,6 +2551,30 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut original_lp_.A.col_start[original_lp_.A.n]); } + // Stop here. The cut loop has finished, the post-cut root LP value + // has been published to benchmark_info_t (just above), and the + // cut-info summary has been printed. This branch exists only to + // measure gap-closed-by-cuts, so we return before strong branching + // / B&B exploration. The early-exit point matches the cut_scoring + // branch so MIPLIBGapStat numbers from both branches line up + // exactly. + settings_.log.printf( + "CutBench: cut generation complete (max_passes=%d, pool=%d), " + "exiting before strong branching / B&B exploration\n", + static_cast(settings_.max_cut_passes), + static_cast(cut_pool_size)); + finish_clique_thread(); + solver_status_ = mip_status_t::TIME_LIMIT; + set_final_solution(solution, root_objective_); + return solver_status_; + + // The B&B exploration that normally follows cut generation is + // intentionally dead-coded out on this branch. Kept under #if 0 so + // the original control-flow stays visible to anyone diffing against + // upstream main, and so reverting the branch back to a normal + // solver only requires deleting the early-return above and the + // matching #if 0 / #endif markers. 
+#if 0 set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); pc_.resize(original_lp_.num_cols); @@ -2714,6 +2733,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } set_final_solution(solution, lower_bound); return solver_status_; +#endif // dead-coded B&B exploration; see #if 0 marker above } // ============================================================================ diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 83ecf2e315..3a2ae1e02a 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -416,7 +416,14 @@ solution_t mip_solver_t::run_solver() // Set the primal heuristics -> branch and bound callback if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { - branch_and_bound->set_concurrent_lp_root_solve(true); + // Force single-threaded dual simplex at root so the root LP + // value (and therefore the cut-pass starting point) is + // deterministic across reruns. The concurrent racer would + // otherwise pick PDLP or DS as the winner non-deterministically + // and the post-cut gap-closed metric would drift. This branch + // is for gap measurement only, so we make it deterministic by + // default rather than gating on an env var. + branch_and_bound->set_concurrent_lp_root_solve(false); context.problem_ptr->branch_and_bound_callback = std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, @@ -463,32 +470,30 @@ solution_t mip_solver_t::run_solver() std::ref(branch_and_bound_solution)); } - // Start the primal heuristics + // The diversity-manager primal heuristics and the post-BB + // feasibility checks are dead-coded out on this branch. The only + // thing we want from a run on main_baselin is the per-instance + // post-cut gap, which dual_simplex BB has already published into + // benchmark_info_t before returning. 
Skipping `dm.run_solver()` + // (which would otherwise consume the full time budget after BB + // exits early) and the feasibility checks that depend on a real + // incumbent makes the run exit quickly with a default-constructed + // empty solution. This mirrors the early-return at the top of this + // function used when the solve hits the time limit before B&B even + // starts, and matches the cut_scoring branch so timing comparisons + // are valid. context.diversity_manager_ptr = &dm; - auto sol = dm.run_solver(); + solution_t sol(*context.problem_ptr); if (run_bb) { - // Wait for the branch and bound to finish auto bb_status = branch_and_bound_status_future.get(); + static_cast(bb_status); if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { context.stats.set_solution_bound( context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound)); } - if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); } context.stats.num_nodes = branch_and_bound_solution.nodes_explored; context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations; } - sol.compute_feasibility(); - - rmm::device_scalar is_feasible(sol.handle_ptr->get_stream()); - sol.test_variable_bounds(true, is_feasible.data()); - // test_variable_bounds clears is_feasible if the test is failed - if (!is_feasible.value(sol.handle_ptr->get_stream())) { - CUOPT_LOG_ERROR( - "Solution is not feasible due to variable bounds, returning infeasible solution!"); - context.stats.total_solve_time = timer_.elapsed_time(); - context.problem_ptr->post_process_solution(sol); - return sol; - } context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); return sol; From 0b04683b0100dfaeefa14ac420b6bab4b713a211 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 12 May 2026 20:55:50 +0200 Subject: [PATCH 06/47] test clique changes --- .../cuopt/miplib2017_optima.hpp | 23 +- 
.../linear_programming/cuopt/run_mip.cpp | 3 +- .../mip/solver_settings.hpp | 8 + cpp/src/branch_and_bound/branch_and_bound.cpp | 21 +- cpp/src/cuts/cuts.cpp | 420 +++++++++- cpp/src/cuts/cuts.hpp | 139 +++- .../presolve/conflict_graph/clique_table.cu | 760 +++++------------- .../presolve/conflict_graph/clique_table.cuh | 137 +++- cpp/tests/mip/cuts_test.cu | 4 +- 9 files changed, 890 insertions(+), 625 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp index 201346c656..7f6826a5ce 100644 --- a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp @@ -371,12 +371,14 @@ inline bool is_known_infeasible(const std::string& filename) // can join lines on (instance, field) without dropping rows. "NaN" is // emitted for root_lp_* when the value is unavailable. template -inline void print_miplib_gap_stat(const std::string& filename, - const Solution& solution, - double solve_time_seconds, - const std::string& termination_status, - double root_lp_no_cuts, - double root_lp_with_cuts) +inline void print_miplib_gap_stat( + const std::string& filename, + const Solution& solution, + double solve_time_seconds, + const std::string& termination_status, + double root_lp_no_cuts, + double root_lp_with_cuts, + double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) { const std::string norm = normalize_instance_name(filename); const auto opt = lookup_miplib_optimum(filename); @@ -394,7 +396,7 @@ inline void print_miplib_gap_stat(const std::string& filename, "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " "abs_primal_gap=NA rel_primal_gap_pct=NA " - "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", norm.c_str(), primal, final_dual, @@ -402,6 +404,7 @@ inline void 
print_miplib_gap_stat(const std::string& filename, root_lp_with_cuts, mip_gap, solve_time_seconds, + cut_gen_time_sec, termination_status.c_str()); } else if (opt.has_value()) { const double o = *opt; @@ -434,7 +437,7 @@ inline void print_miplib_gap_stat(const std::string& filename, "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " - "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", norm.c_str(), o, primal, @@ -448,6 +451,7 @@ inline void print_miplib_gap_stat(const std::string& filename, rel_pgap_pct, mip_gap, solve_time_seconds, + cut_gen_time_sec, termination_status.c_str()); } else { std::printf( @@ -455,7 +459,7 @@ inline void print_miplib_gap_stat(const std::string& filename, "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " "abs_primal_gap=TBD rel_primal_gap_pct=TBD " - "mip_gap_reported=%.6g time_s=%.3f status=%s\n", + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", norm.c_str(), primal, final_dual, @@ -463,6 +467,7 @@ inline void print_miplib_gap_stat(const std::string& filename, root_lp_with_cuts, mip_gap, solve_time_seconds, + cut_gen_time_sec, termination_status.c_str()); } std::fflush(stdout); diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index 366cd28201..8a386e6b0d 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -277,7 +277,8 @@ int run_single_file(std::string file_path, _gap_seconds, _status_str, benchmark_info.root_lp_no_cuts, - benchmark_info.root_lp_with_cuts); + benchmark_info.root_lp_with_cuts, + benchmark_info.cut_generation_time_sec); } std::stringstream ss; diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp 
b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 29f3d34525..506f9221dc 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -39,6 +39,14 @@ struct benchmark_info_t { // / (opt - root_lp_no_cuts). // quiet_NaN() means "B&B did not finish the cut loop / value not written". double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); + + // Wall-clock time spent inside the root-node cut generation loop + // (sum of generate_cuts + score_cuts + check_for_duplicate_cuts + + // get_best_cuts + add_cuts + post-cut LP resolves), in seconds. + // Published by branch_and_bound.cpp::solve() at the same point that + // root_lp_with_cuts is finalised. quiet_NaN() means "cut loop did + // not run / value never written". + double cut_generation_time_sec = std::numeric_limits::quiet_NaN(); }; // Forward declare solver_settings_t for friend class diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index de81673896..5aefa6463d 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2081,7 +2081,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut cuopt::timer_t timer(std::numeric_limits::infinity()); std::shared_ptr> table; detail::find_initial_cliques( - problem_copy, tolerances_for_clique, &table, timer, false, signal_ptr); + problem_copy, tolerances_for_clique, &table, timer, signal_ptr); return table; }); } @@ -2218,6 +2218,12 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } cut_pool_t cut_pool(original_lp_.num_cols, settings_); + // Apply CUOPT_CONFIG_ID sweep override (5 configs; see cuts.cpp). + // Mutates `cut_pool` knobs only (clique cousin filter on/off, Jaccard + // tau, integer-support size tilt). No-op when CUOPT_CONFIG_ID is unset + // / out of range. 
The deterministic measurement path (no concurrent + // root LP, no in-cut-pass RCS, exit-after-cuts) is unconditional. + apply_cut_sweep_config(cut_pool, settings_); cut_generation_t cut_generation(cut_pool, original_lp_, settings_, @@ -2542,6 +2548,14 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut print_cut_info(settings_, cut_info); f_t cut_generation_time = toc(cut_generation_start_time); + // Publish the cut generation wall time so MIPLIBGapStat / run_mip can + // emit it alongside gap_closed_pct. Always set when the cut loop ran, + // even if no cuts were added (the time still measures real work in + // generate_cuts + score_cuts + dedup + LP resolves). + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->cut_generation_time_sec = + static_cast(cut_generation_time); + } if (cut_info.has_cuts()) { settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); settings_.log.printf("Cut pool size : %d\n", cut_pool_size); @@ -2559,10 +2573,11 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut // branch so MIPLIBGapStat numbers from both branches line up // exactly. 
settings_.log.printf( - "CutBench: cut generation complete (max_passes=%d, pool=%d), " + "CutBench: cut generation complete (max_passes=%d, pool=%d, time=%.3fs), " "exiting before strong branching / B&B exploration\n", static_cast(settings_.max_cut_passes), - static_cast(cut_pool_size)); + static_cast(cut_pool_size), + static_cast(cut_generation_time)); finish_clique_thread(); solver_status_ = mip_status_t::TIME_LIMIT; set_final_solution(solution, root_objective_); diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 6d7d97ef0a..5fe52ddc0f 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -14,7 +14,9 @@ #include #include +#include #include +#include #include #include @@ -80,13 +82,46 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic cut.i.clear(); cut.x.clear(); - i_t num_complements = 0; + + // P0-3 (1): two-pass complement-pair detection. The baseline returned + // NO_CUT on the first variable that appeared both as itself and as its + // complement; that hides how many such conflicts a candidate clique has + // and makes it impossible to attribute infeasibility events to specific + // clique generators. Pre-pass collects every original/complement + // occurrence per variable, counts the actual complement pairs, and only + // then decides. Accept/reject behavior matches baseline (a complement + // pair still aborts cut construction); only the diagnostics change. 
std::unordered_set seen_original; std::unordered_set seen_complement; seen_original.reserve(clique_vertices.size()); seen_complement.reserve(clique_vertices.size()); for (const auto vertex_idx : clique_vertices) { cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Clique vertex out of range"); + const i_t var_idx = vertex_idx % num_vars; + const bool complement = vertex_idx >= num_vars; + if (complement) { + seen_complement.insert(var_idx); + } else { + seen_original.insert(var_idx); + } + } + i_t complement_pairs = 0; + for (const auto var_idx : seen_original) { + if (seen_complement.count(var_idx) > 0) { complement_pairs++; } + } + if (complement_pairs > 0) { + CLIQUE_CUTS_DEBUG("build_clique_cut infeasible: %lld complement-pairs", + static_cast(complement_pairs)); + return clique_cut_build_status_t::NO_CUT; + } + + // Second pass: emit cut coefficients. We already know there are no + // complement-pair conflicts so the lookups against seen_original / + // seen_complement that the baseline performed are now redundant. + i_t num_complements = 0; + const bool has_original = !seen_original.empty(); + const bool has_complement = !seen_complement.empty(); + for (const auto vertex_idx : clique_vertices) { const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; const f_t lower_bound = lower_bounds[var_idx]; @@ -96,32 +131,17 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic "Clique contains continuous variable"); cuopt_assert(lower_bound >= -bound_tol, "Clique variable lower bound below zero"); cuopt_assert(upper_bound <= 1 + bound_tol, "Clique variable upper bound above one"); + static_cast(lower_bound); + static_cast(upper_bound); - // we store the cut in the form of >= 1, for easy violation check with dot product - // that's why compelements have 1 as coeff and normal vars have -1 + // Cut is stored in form sum_j a_j x_j >= rhs for direct dot-product + // violation checks. 
Complemented literals (1 - x_j) contribute +1*x_j + // to the inequality and originals contribute -1*x_j. if (complement) { - if (seen_original.count(var_idx) > 0) { - // FIXME: this is temporary, fix all the vars of all other vars in the clique - return clique_cut_build_status_t::NO_CUT; - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible var=%lld appears as variable and complement", - static_cast(var_idx)); - return clique_cut_build_status_t::INFEASIBLE; - } - cuopt_assert(seen_complement.count(var_idx) == 0, "Duplicate complement in clique"); - seen_complement.insert(var_idx); num_complements++; cut.i.push_back(var_idx); cut.x.push_back(1.0); } else { - if (seen_complement.count(var_idx) > 0) { - // FIXME: this is temporary, fix all the vars of all other vars in the clique - return clique_cut_build_status_t::NO_CUT; - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible var=%lld appears as variable and complement", - static_cast(var_idx)); - return clique_cut_build_status_t::INFEASIBLE; - } - cuopt_assert(seen_original.count(var_idx) == 0, "Duplicate variable in clique"); - seen_original.insert(var_idx); cut.i.push_back(var_idx); cut.x.push_back(-1.0); } @@ -135,27 +155,36 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic cut_rhs = static_cast(num_complements - 1); cut.sort(); + // P0-3 (4): has_pair distinguishes pure (all originals OR all + // complements) from mixed cliques in the accepted-cut log line so + // post-mortem analysis can attribute gap closure to one variant or + // the other. + const int has_pair = (has_original && has_complement) ? 
1 : 0; const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; if (violation > min_violation) { CLIQUE_CUTS_DEBUG( - "build_clique_cut accepted nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld", + "build_clique_cut accepted nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " + "has_pair=%d", static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements)); + static_cast(num_complements), + has_pair); return clique_cut_build_status_t::CUT_ADDED; } CLIQUE_CUTS_DEBUG( - "build_clique_cut rejected nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld", + "build_clique_cut rejected nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " + "has_pair=%d", static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements)); + static_cast(num_complements), + has_pair); return clique_cut_build_status_t::NO_CUT; } @@ -388,11 +417,20 @@ void extend_clique_vertices(std::vector& clique_vertices, const f_t candidate_size = static_cast(candidates.size()); const f_t sort_work = candidate_size > 0.0 ? 2.0 * candidate_size * std::log2(candidate_size + 1.0) : 0.0; - const f_t adj_set_build_cost = 2.0 * static_cast(adj_set.size()); - const f_t adj_check_cost = 5.0; - const f_t estimated_preloop_work = 2.0 * initial_clique_size + adj_set_build_cost + - 3.0 * static_cast(adj_set.size()) + sort_work + - 2.0 * candidate_size; + const f_t adj_set_build_cost = 2.0 * static_cast(adj_set.size()); + // P0-3 (2): account for the addtl_cliques scan that + // clique_table_t::check_adjacency performs on every adjacency probe. + // Baseline ignored this, so on instances with many addtl_clique entries + // the extension loop dominated cut-generation wall time without being + // attributed to clique cuts. 
avg_slice_size of var_clique_addtl is a + // robust proxy for the per-call addtl scan cost. + const f_t addtl_cliques_scan_cost = + 1.0 + static_cast(graph.var_clique_addtl.avg_slice_size()); + const f_t adj_check_cost = 5.0 + addtl_cliques_scan_cost; + const f_t estimated_preloop_work = + 2.0 * initial_clique_size + adj_set_build_cost + 3.0 * static_cast(adj_set.size()) + + sort_work + 2.0 * candidate_size + addtl_cliques_scan_cost * initial_clique_size + + addtl_cliques_scan_cost; if (add_work_estimate(estimated_preloop_work, work_estimate, max_work_estimate)) { CLIQUE_CUTS_DEBUG("extend_clique_vertices skip work_limit work=%g limit=%g", work_estimate == nullptr ? -1.0 : static_cast(*work_estimate), @@ -428,6 +466,8 @@ void extend_clique_vertices(std::vector& clique_vertices, break; } } + // Each check_adjacency now charges its own addtl_cliques_scan_cost + // term so the per-iteration budget reflects the addtl scan cost. if (add_work_estimate( adj_check_cost * static_cast(checks), work_estimate, max_work_estimate)) { break; @@ -507,8 +547,66 @@ std::vector> find_maximal_cliques_for_test( return ctx.cliques; } +namespace { + +// 64-bit integer mixer (SplitMix64). Used as the building block for the +// cousin filter's per-slot independent hash family. +inline uint64_t splitmix64_mix(uint64_t x) +{ + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; + x = x ^ (x >> 31); + return x; +} + +inline uint64_t hash64_with_seed(uint64_t value, uint64_t seed) +{ + return splitmix64_mix(value ^ (seed * 0xbf58476d1ce4e5b9ULL + 0x9e3779b97f4a7c15ULL)); +} + +} // namespace + +template +void cut_pool_t::compute_clique_minhash_sketch(const inequality_t& cut, + std::vector& sketch) const +{ + // Min-hash over the cut's column-support set. 
With clique_cousin_minhash_k_ + // independent random orderings of the variable index space, the expected + // number of agreements between two sketches is k * Jaccard(supp_a, supp_b), + // so sketch comparison estimates Jaccard in O(k) regardless of support + // sizes. + const i_t k = clique_cousin_minhash_k_; + sketch.assign(k, std::numeric_limits::max()); + const i_t nz = cut.size(); + for (i_t p = 0; p < nz; p++) { + const uint64_t j = static_cast(cut.index(p)); + for (i_t s = 0; s < k; s++) { + const uint64_t h = hash64_with_seed(j, static_cast(s)); + if (h < sketch[s]) { sketch[s] = h; } + } + } +} + +template +void cut_pool_t::rebuild_clique_cousin_buckets() +{ + // Buckets index CLIQUE rows by the first sketch hash. Compaction + // routines (check_for_duplicate_cuts) shift row indices, so they call + // this after the parallel sketch vector has been remapped to make + // sure bucket entries point to the post-compaction rows. + clique_cousin_buckets_.clear(); + const i_t m = static_cast(clique_support_minhash_.size()); + for (i_t i = 0; i < m; i++) { + if (clique_support_minhash_[i].empty()) { continue; } + const uint64_t key = clique_support_minhash_[i][0]; + clique_cousin_buckets_[key].push_back(i); + } +} + template -void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t& cut) +void cut_pool_t::add_cut(cut_type_t cut_type, + const inequality_t& cut, + f_t cut_score) { // TODO: Add fast duplicate check and only add if the cut is not already in the pool @@ -527,10 +625,126 @@ void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t= clique_cousin_jaccard_tau_ + // with an existing pool entry, keep the higher-scoring representative. 
+ std::vector new_sketch; + i_t cousin_replace_row = -1; + bool cousin_invariant_path = false; + // Apply the size-tilt boost to the caller's score so larger cliques win + // ties on cousin replacement (more variables covered = more constraint + // strength; a proxy for "integer support" since clique vars are 0-1). + f_t effective_score = cut_score; + if (effective_score >= static_cast(0.0) && + clique_cousin_size_weight_ > static_cast(0.0) && cut_type == cut_type_t::CLIQUE) { + const f_t sz = static_cast(cut_squeezed.size()); + const f_t mult = + static_cast(1.0) + + clique_cousin_size_weight_ * static_cast(std::log2(1.0 + static_cast(sz))); + effective_score *= mult; + } + if (cut_type == cut_type_t::CLIQUE) { clique_inserts_++; } + if (cut_type == cut_type_t::CLIQUE && clique_cousin_filter_enable_ && + clique_cousin_minhash_k_ > 0) { + cousin_invariant_path = true; + compute_clique_minhash_sketch(cut_squeezed, new_sketch); + const uint64_t bucket_key = new_sketch[0]; + auto bucket_it = clique_cousin_buckets_.find(bucket_key); + if (bucket_it != clique_cousin_buckets_.end()) { + const i_t pool_size = cut_storage_.m; + const i_t k = clique_cousin_minhash_k_; + // Walk the bucket, computing min-hash agreement with each peer. + // Bucket sizes are O(distinct max-clique families per round) so + // this loop is short on every realistic instance even when the + // pool is large. + auto& bucket_rows = bucket_it->second; + for (size_t b = 0; b < bucket_rows.size(); b++) { + const i_t row = bucket_rows[b]; + if (row < 0 || row >= pool_size) { continue; } + if (static_cast(clique_support_minhash_[row].size()) != k) { continue; } + i_t agree = 0; + for (i_t s = 0; s < k; s++) { + if (clique_support_minhash_[row][s] == new_sketch[s]) { agree++; } + } + const f_t jaccard_est = static_cast(agree) / static_cast(k); + if (jaccard_est < clique_cousin_jaccard_tau_) { continue; } + // Cousin found. Compare scores; keep the better representative. 
+ const f_t existing_score = clique_cousin_score_[row]; + if (effective_score < static_cast(0.0)) { + // Caller did not supply a score — be conservative and drop the + // new cut; the existing entry stays as the bucket invariant + // winner ("first-write-wins" policy). + cousin_drops_++; + return; + } + if (effective_score <= existing_score) { + // Existing representative is at least as good; drop the new cut. + cousin_drops_++; + return; + } + // New cut beats the existing representative. We "soft-replace": + // clear the loser's sketch so future cousins don't anchor against + // it (restoring the bucket invariant for new inserts), and + // reroute the bucket entry to the new row below. The loser stays + // in cut_storage_ for now and will be filtered by the standard + // orthogonality scan in score_cuts() — main_baselin has no + // mid-pass eviction primitive, and adding one would invalidate + // the per-pass cut_pool_size accounting. + cousin_replace_row = row; + // Replace at most one peer per insert; a transitive cousin of + // the loser at the same bucket is filtered next time. Matches + // the SCIP / Mops "pairwise" family invariant. + break; + } + } + } + cut_storage_.append_row(cut_squeezed.vector); rhs_storage_.push_back(cut_squeezed.rhs); cut_type_.push_back(cut_type); cut_age_.push_back(0); + + // Keep the cousin-filter side tables sized like cut_storage_ regardless + // of cut type. Non-CLIQUE rows carry an empty sketch and a zero score; + // they are skipped by rebuild_clique_cousin_buckets(). + const i_t new_row = cut_storage_.m - 1; + clique_support_minhash_.resize(cut_storage_.m); + clique_cousin_score_.resize(cut_storage_.m, static_cast(0.0)); + if (cousin_invariant_path) { + clique_support_minhash_[new_row] = std::move(new_sketch); + clique_cousin_score_[new_row] = effective_score; + if (cousin_replace_row >= 0) { + // Reroute the bucket entry from the loser's row to the new row. + // Other peers in the same bucket (if any) keep their entries. 
+ const uint64_t bucket_key = clique_support_minhash_[new_row][0]; + auto& rows = clique_cousin_buckets_[bucket_key]; + bool replaced = false; + for (auto& r : rows) { + if (r == cousin_replace_row) { + r = new_row; + replaced = true; + break; + } + } + if (!replaced) { rows.push_back(new_row); } + // Clear the loser's sketch so it's a no-op on subsequent inserts + // and rebuild_clique_cousin_buckets() ignores it. Loser's row + // lingers in cut_storage_ until score_cuts compacts via dedup or + // filters via orthogonality. + clique_support_minhash_[cousin_replace_row].clear(); + clique_cousin_score_[cousin_replace_row] = static_cast(0.0); + cousin_replaces_++; + } else { + const uint64_t bucket_key = clique_support_minhash_[new_row][0]; + clique_cousin_buckets_[bucket_key].push_back(new_row); + } + } } template @@ -705,17 +919,35 @@ void cut_pool_t::check_for_duplicate_cuts() cut_storage_.remove_rows(cuts_to_remove, new_cut_storage); cut_storage_ = new_cut_storage; i_t write = 0; + // Cousin-filter parallel arrays: only meaningful when populated, but + // size them up before the loop so the row remap is uniform. + const bool cousin_active = !clique_support_minhash_.empty(); + if (cousin_active) { + clique_support_minhash_.resize(m); + clique_cousin_score_.resize(m, static_cast(0.0)); + } for (i_t i = 0; i < m; i++) { if (cuts_to_remove[i] == 0) { rhs_storage_[write] = rhs_storage_[i]; cut_type_[write] = cut_type_[i]; cut_age_[write] = cut_age_[i]; + if (cousin_active) { + clique_support_minhash_[write] = std::move(clique_support_minhash_[i]); + clique_cousin_score_[write] = clique_cousin_score_[i]; + } write++; } } rhs_storage_.resize(write); cut_type_.resize(write); cut_age_.resize(write); + if (cousin_active) { + clique_support_minhash_.resize(write); + clique_cousin_score_.resize(write); + // Row indices changed — bucket entries point to old row IDs and + // would corrupt the next at-insert filter. Rebuild from scratch. 
+ rebuild_clique_cousin_buckets(); + } } } @@ -2108,7 +2340,11 @@ bool cut_generation_t::generate_clique_cuts( inequality_t cut_inequality; cut_inequality.vector = cut; cut_inequality.rhs = cut_rhs; - cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality); + // Pass the LP violation as the cousin-filter score so add_cut can + // pick the stronger representative on a Jaccard collision (>= tau). + // build_clique_cut has already verified violation > min_violation. + const f_t cut_violation = cut_rhs - cut.dot(xstar); + cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality, cut_violation); #if DEBUG_CLIQUE_CUTS added_cuts++; CLIQUE_CUTS_DEBUG("generate_clique_cuts added cut nz=%lld rhs=%g clique_size=%lld", @@ -4760,6 +4996,123 @@ void verify_cuts_against_saved_solution(const csr_matrix_t& cuts, } } +// Cut-pool sweep configuration dispatch (declared in cuts.hpp). +// +// Driven by the same CUOPT_CONFIG_ID / CUOPT_MAX_CONFIG env vars the +// diversity manager uses (see diversity_manager.cu). One integer +// selects one of kCutSweepNumConfigs hard-coded cut-pool configurations. +// Caller side is just: +// CUOPT_MAX_CONFIG=5 CUOPT_CONFIG_ID=$id $RUN_MIP ... +// +// CUOPT_MAX_CONFIG is the caller's expected upper bound; when set +// we additionally range-check CUOPT_CONFIG_ID against it. +// CUOPT_CONFIG_ID unset / unparsable -> baseline + warning (parsed as -1). +// CUOPT_CONFIG_ID < 0 or >= valid range -> baseline + warning. +// +// Banner printf is gated to a single emission per process so B&B +// restarts (which re-construct cut_pool_t) don't spam the log. 
+template +void apply_cut_sweep_config(cut_pool_t& cut_pool, + const simplex_solver_settings_t& settings) +{ + static std::atomic banner_emitted{false}; + + const char* env_config_id_raw = std::getenv("CUOPT_CONFIG_ID"); + int config_id = -1; + if (env_config_id_raw != nullptr && env_config_id_raw[0] != '\0') { + try { + config_id = std::stoi(env_config_id_raw); + } catch (const std::exception&) { + config_id = -1; + } + } + + int max_config = kCutSweepNumConfigs; + const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG"); + if (env_max_config != nullptr && env_max_config[0] != '\0') { + try { + max_config = std::stoi(env_max_config); + } catch (const std::exception&) { + max_config = kCutSweepNumConfigs; + } + } + + if (config_id < 0 || config_id >= max_config || config_id >= kCutSweepNumConfigs) { + if (!banner_emitted.load()) { + settings.log.printf( + "CutPoolConfig WARN config_id=%d out of range [0,%d), falling back to baseline\n", + config_id, + std::min(max_config, kCutSweepNumConfigs)); + } + config_id = 0; + } + + // Defaults match cut_pool_t's initializers: cousin filter OFF, tau=0.85, + // k=8, size_weight=0.0. Each case below documents what it tweaks. + switch (config_id) { + case 0: + // 00_baseline_no_cousin: clique algorithmic changes only (8f2cf00a). + // Cousin filter disabled — isolates the impact of the + // build_clique_cut two-pass refactor and the addtl_cliques_scan_cost + // work-accounting. + cut_pool.set_clique_cousin_filter_enable(false); + break; + case 1: + // 01_cousin_default: P2-4 cousin filter on with the cut_scoring branch + // defaults (tau=0.85, k=8, no size tilt). Score is the caller-supplied + // violation; ties prefer the earlier insert. 
+ cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 2: + // 02_cousin_strict: lower Jaccard threshold (0.85 -> 0.70). Calls + // more cliques "cousins" so we drop / replace more aggressively. + cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.70)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 3: + // 03_cousin_loose: higher Jaccard threshold (0.85 -> 0.95). Closer + // to no-filter behavior. Gap should match config 0 if the cousin + // filter is mostly absorbing redundancy that the orthogonality + // scan would catch anyway. + cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.95)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 4: + // 04_cousin_size_tilt: cousin filter on at default tau=0.85, but the + // score used for cousin replacement is multiplied by + // (1 + 0.5 * log2(1 + clique_size)) + // so larger cliques win on ties / near-ties. For clique cuts every + // variable is binary, so clique size is the integer-support count + // — this is the "clique integer support" knob the user requested. 
+ cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.5)); + break; + default: break; // unreachable due to range check above + } + + if (!banner_emitted.exchange(true)) { + settings.log.printf( + "CutPoolConfig id=%d name=%s clique_cousin_enable=%d clique_cousin_tau=%g " + "clique_cousin_k=%d clique_cousin_size_weight=%g\n", + config_id, + cut_sweep_config_name(config_id), + static_cast(cut_pool.clique_cousin_filter_enable() ? 1 : 0), + static_cast(cut_pool.clique_cousin_jaccard_tau()), + static_cast(cut_pool.clique_cousin_minhash_k()), + static_cast(cut_pool.clique_cousin_size_weight())); + std::fflush(stdout); + } +} + #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class cut_pool_t; template class cut_generation_t; @@ -4768,6 +5121,9 @@ template class tableau_equality_t; template class complemented_mixed_integer_rounding_cut_t; template class variable_bounds_t; +template void apply_cut_sweep_config( + cut_pool_t& cut_pool, const simplex_solver_settings_t& settings); + template int add_cuts(const simplex_solver_settings_t& settings, const csr_matrix_t& cuts, const std::vector& cut_rhs, diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 2da9760e27..632708dfd0 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -284,9 +285,17 @@ class cut_pool_t { } // Add a cut in the form: cut'*x >= rhs. - // We expect that the cut is violated by the current relaxation xstar - // cut'*xstart < rhs - void add_cut(cut_type_t cut_type, const inequality_t& cut); + // We expect that the cut is violated by the current relaxation xstar. + // + // cut_score is an optional caller-supplied quality score used by the + // P2-4 clique cousin filter (only consulted for cut_type == CLIQUE + // when the cousin filter is enabled). 
Pass a non-negative value to + // enable score-aware cousin replacement; the default (-1.0) reverts + // to "first-write-wins" cousin policy. Other cut types ignore this + // parameter. + void add_cut(cut_type_t cut_type, + const inequality_t& cut, + f_t cut_score = static_cast(-1.0)); void score_cuts(std::vector& x_relax); @@ -305,11 +314,63 @@ class cut_pool_t { void check_for_duplicate_cuts(); + // ----- P2-4 clique cousin filter knobs / counters ----------------------- + // + // The clique cut family (Bron-Kerbosch + extension) emits cousin + // cliques whose support sets agree in |k-1| of |k| vertices. The + // selection-stage orthogonality scan catches them but only after the + // full insert + dedup + score cost has been paid. The cousin filter + // intercepts at insert: we min-hash the cut's column-support set, + // bucket on the first sketch hash, and when an existing pool entry + // collides with estimated Jaccard >= jaccard_tau we keep the + // higher-scoring representative (or, if no score was supplied, the + // earlier-inserted one). + // + // Defaults: jaccard_tau=0.85, k=8, enable=false. Cousin filter is OFF + // by default so cut_pool_t behavior matches main_baselin (5335b659) + // unless apply_cut_sweep_config() explicitly turns it on. The numeric + // defaults (tau=0.85, k=8) match the cut_scoring branch's "final + // version" so config 1 here lines up with the P2-4 baseline measured + // there. 
+ void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } + void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } + void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } + void set_clique_cousin_size_weight(f_t v) { clique_cousin_size_weight_ = v; } + + bool clique_cousin_filter_enable() const { return clique_cousin_filter_enable_; } + f_t clique_cousin_jaccard_tau() const { return clique_cousin_jaccard_tau_; } + i_t clique_cousin_minhash_k() const { return clique_cousin_minhash_k_; } + f_t clique_cousin_size_weight() const { return clique_cousin_size_weight_; } + + // Per-pool tally for log lines (instance-level diagnostic). All three + // counters are reset by reset_cousin_stats() and incremented inside + // add_cut() / cousin replacement. + i_t cousin_drops() const { return cousin_drops_; } + i_t cousin_replaces() const { return cousin_replaces_; } + i_t clique_inserts() const { return clique_inserts_; } + void reset_cousin_stats() + { + cousin_drops_ = 0; + cousin_replaces_ = 0; + clique_inserts_ = 0; + } + private: f_t cut_distance(i_t row, const std::vector& x, f_t& cut_violation, f_t& cut_norm); f_t cut_density(i_t row); f_t cut_orthogonality(i_t i, i_t j); + // Cousin filter helpers. compute_clique_minhash_sketch() fills + // `sketch` (length = clique_cousin_minhash_k_) with k independent + // min-hashes over the cut's column-support set. Two sketches agree + // on slot s with probability Jaccard(supp_a, supp_b), so element-wise + // agreement count divided by k estimates the Jaccard similarity. + void compute_clique_minhash_sketch(const inequality_t& cut, + std::vector& sketch) const; + // Rebuilds clique_cousin_buckets_ from clique_support_minhash_ after + // any compaction that remaps row indices (e.g. dedup). 
+ void rebuild_clique_cousin_buckets(); + i_t original_vars_; const simplex_solver_settings_t& settings_; @@ -325,8 +386,80 @@ class cut_pool_t { std::vector cut_scores_; std::vector best_cuts_; const f_t min_cut_distance_{1e-4}; + + // P2-4 cousin filter state. clique_support_minhash_ is sized in + // lock-step with cut_storage_; non-CLIQUE rows carry an empty + // sketch and are skipped by rebuild_clique_cousin_buckets() and the + // cousin loop in add_cut. clique_cousin_score_ holds the + // caller-supplied score (raw violation, or violation * size-tilt) so + // we can decide which representative to keep when two cliques + // collide. clique_cousin_buckets_ maps the first sketch hash to the + // list of pool rows whose sketches start with that hash. + std::vector> clique_support_minhash_; + std::vector clique_cousin_score_; + std::unordered_map> clique_cousin_buckets_; + f_t clique_cousin_jaccard_tau_{static_cast(0.85)}; + i_t clique_cousin_minhash_k_{8}; + bool clique_cousin_filter_enable_{false}; + // When > 0, the cousin filter's "score" used to pick a winner is + // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). + // This biases cousin replacement toward larger cliques (more variables + // covered, larger integer support). 0 disables the tilt. + f_t clique_cousin_size_weight_{static_cast(0.0)}; + + // Diagnostic counters reset at the start of each cut pass via + // reset_cousin_stats(). + i_t cousin_drops_{0}; + i_t cousin_replaces_{0}; + i_t clique_inserts_{0}; }; +// --------------------------------------------------------------------------- +// Cut-pool sweep configuration dispatch. +// +// Selected by the CUOPT_CONFIG_ID environment variable; range-checked +// against CUOPT_MAX_CONFIG (caller-asserted upper bound). One env-var +// dispatch covers the entire clique cut family because the only knobs +// we vary on this branch live on cut_pool_t (cousin filter on/off, +// Jaccard tau, integer-support size tilt). 
The deterministic +// measurement path (no concurrent root LP, no in-cut-pass RCS, exit +// after the cut loop) is unconditional and lives in branch_and_bound. +// +// Keep kCutSweepNumConfigs in sync with the switch table in +// apply_cut_sweep_config() (see cuts.cpp) and with cut_sweep_config_name() +// below. +// +// Layout: +// 0 baseline_no_cousin clique cut algorithmic changes only +// (cousin filter off; isolates 8f2cf00a impact) +// 1 cousin_default cousin filter on, tau=0.85, k=8, score=violation +// (the cut_scoring final-version P2-4 baseline) +// 2 cousin_strict cousin filter on, tau=0.70 (more aggressive +// cousin removal — favors quantity reduction) +// 3 cousin_loose cousin filter on, tau=0.95 (closer to no-filter +// extreme — selection-stage absorbs cousins) +// 4 cousin_size_tilt cousin filter on, tau=0.85, score = violation * +// (1 + 0.5 * log2(1 + clique_size)) — picks the +// larger clique on cousin replacement (integer +// support proxy, since clique vars are 0-1) +constexpr int kCutSweepNumConfigs = 5; + +inline const char* cut_sweep_config_name(int config_id) +{ + switch (config_id) { + case 0: return "00_baseline_no_cousin"; + case 1: return "01_cousin_default"; + case 2: return "02_cousin_strict"; + case 3: return "03_cousin_loose"; + case 4: return "04_cousin_size_tilt"; + default: return "unknown"; + } +} + +template +void apply_cut_sweep_config(cut_pool_t& cut_pool, + const simplex_solver_settings_t& settings); + template class knapsack_generation_t { public: diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu index 82462c11ce..deca5a46c3 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu @@ -20,6 +20,7 @@ #include "clique_table.cuh" #include +#include #include #include #include @@ -100,7 +101,6 @@ template void make_coeff_positive_knapsack_constraint( 
const dual_simplex::user_problem_t& problem, std::vector>& knapsack_constraints, - std::unordered_set& set_packing_constraints, typename mip_solver_settings_t::tolerances_t tolerances) { for (i_t i = 0; i < (i_t)knapsack_constraints.size(); i++) { @@ -125,7 +125,6 @@ void make_coeff_positive_knapsack_constraint( } knapsack_constraint.is_set_packing = all_coeff_are_equal; if (!all_coeff_are_equal) { knapsack_constraint.is_set_partitioning = false; } - if (knapsack_constraint.is_set_packing) { set_packing_constraints.insert(i); } cuopt_assert(knapsack_constraint.rhs >= 0, "RHS must be non-negative"); } } @@ -185,9 +184,8 @@ void fill_knapsack_constraints(const dual_simplex::user_problem_t& pro } // equality part else { - // For equality rows, partitioning status should not depend on raw rhs scale here. - // The exact set-packing/partitioning check is finalized later in - // make_coeff_positive_knapsack_constraint after coefficient normalization. + // Final partitioning check is done after coefficient normalization in + // make_coeff_positive_knapsack_constraint. bool is_set_partitioning = true; bool ranged_constraint = ranged_constraint_counter < problem.num_range_rows && problem.range_rows[ranged_constraint_counter] == i; @@ -203,8 +201,7 @@ void fill_knapsack_constraints(const dual_simplex::user_problem_t& pro } // greater than part: convert it to less than knapsack_constraint_t knapsack_constraint2; - // Mark synthetic rows from equality splitting with negative ids so they never alias real row - // indices (including rows appended later by clique extension). + // Negative ids prevent aliasing with real row indices. 
knapsack_constraint2.cstr_idx = -(added_constraints + 1); added_constraints++; knapsack_constraint2.rhs = -problem.rhs[i]; @@ -228,62 +225,51 @@ void remove_small_cliques(clique_table_t& clique_table, cuopt::timer_t i_t num_removed_first = 0; i_t num_removed_addtl = 0; std::vector to_delete(clique_table.first.size(), false); - // if a clique is small, we remove it from the cliques and add it to adjlist + std::vector> small_edges; + + // Demote sub-threshold first-cliques into pairwise edges. for (size_t clique_idx = 0; clique_idx < clique_table.first.size(); clique_idx++) { if (timer.check_time_limit()) { return; } const auto& clique = clique_table.first[clique_idx]; - if (clique.size() <= (size_t)clique_table.min_clique_size) { + if (clique.size() < (size_t)clique_table.min_clique_size) { for (size_t i = 0; i < clique.size(); i++) { for (size_t j = 0; j < clique.size(); j++) { if (i == j) { continue; } - clique_table.adj_list_small_cliques[clique[i]].insert(clique[j]); + small_edges.emplace_back(clique[i], clique[j]); } } num_removed_first++; to_delete[clique_idx] = true; } } + std::vector addtl_to_delete(clique_table.addtl_cliques.size(), false); for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); addtl_c++) { const auto& addtl_clique = clique_table.addtl_cliques[addtl_c]; const auto base_clique_idx = static_cast(addtl_clique.clique_idx); cuopt_assert(base_clique_idx < to_delete.size(), "Additional clique points to invalid base clique index"); - // Remove additional cliques whose base clique is scheduled for deletion. - if (to_delete[base_clique_idx]) { - // Materialize conflicts represented by: - // addtl_clique.vertex_idx + first[base_clique_idx][start_pos_on_clique:] - // before deleting both the additional and base clique entries. 
- for (size_t i = addtl_clique.start_pos_on_clique; - i < clique_table.first[base_clique_idx].size(); - i++) { - clique_table.adj_list_small_cliques[clique_table.first[base_clique_idx][i]].insert( - addtl_clique.vertex_idx); - clique_table.adj_list_small_cliques[addtl_clique.vertex_idx].insert( - clique_table.first[base_clique_idx][i]); - } - clique_table.addtl_cliques.erase(clique_table.addtl_cliques.begin() + addtl_c); - addtl_c--; - num_removed_addtl++; - continue; - } - i_t size_of_clique = + const bool drop_because_base = to_delete[base_clique_idx]; + const i_t extended_size = clique_table.first[base_clique_idx].size() - addtl_clique.start_pos_on_clique + 1; - if (size_of_clique < clique_table.min_clique_size) { - // the items from first clique are already added to the adjlist - // only add the items that are coming from the new var in the additional clique - for (size_t i = addtl_clique.start_pos_on_clique; - i < clique_table.first[base_clique_idx].size(); - i++) { - // insert conflicts both way - clique_table.adj_list_small_cliques[clique_table.first[base_clique_idx][i]].insert( - addtl_clique.vertex_idx); - clique_table.adj_list_small_cliques[addtl_clique.vertex_idx].insert( - clique_table.first[base_clique_idx][i]); - } - clique_table.addtl_cliques.erase(clique_table.addtl_cliques.begin() + addtl_c); - addtl_c--; - num_removed_addtl++; + const bool drop_because_small = extended_size < clique_table.min_clique_size; + if (!drop_because_base && !drop_because_small) { continue; } + + for (size_t i = addtl_clique.start_pos_on_clique; + i < clique_table.first[base_clique_idx].size(); + i++) { + const i_t base_member = clique_table.first[base_clique_idx][i]; + small_edges.emplace_back(base_member, addtl_clique.vertex_idx); + small_edges.emplace_back(addtl_clique.vertex_idx, base_member); } + addtl_to_delete[addtl_c] = true; + num_removed_addtl++; + } + { + size_t old_addtl_idx = 0; + auto addtl_it = std::remove_if(clique_table.addtl_cliques.begin(), + 
clique_table.addtl_cliques.end(), + [&](const auto&) { return addtl_to_delete[old_addtl_idx++]; }); + clique_table.addtl_cliques.erase(addtl_it, clique_table.addtl_cliques.end()); } CUOPT_LOG_DEBUG("Number of removed cliques from first: %d, additional: %d", num_removed_first, @@ -312,40 +298,46 @@ void remove_small_cliques(clique_table_t& clique_table, cuopt::timer_t (size_t)clique_table.min_clique_size, "A small clique remained after removing small cliques"); } - // Clique removals/edge materialization can change degrees; force recompute on next query. + clique_table.small_clique_adj.finalize_from_unsorted_pairs(2 * clique_table.n_variables, + small_edges); + // Force degree recompute after structural changes. std::fill(clique_table.var_degrees.begin(), clique_table.var_degrees.end(), -1); } template -std::unordered_set clique_table_t::get_adj_set_of_var(i_t var_idx) +std::unordered_set clique_table_t::get_adj_set_of_var(i_t var_idx) const { std::unordered_set adj_set; - for (const auto& clique_idx : var_clique_map_first[var_idx]) { - adj_set.insert(first[clique_idx].begin(), first[clique_idx].end()); - } - for (const auto& addtl_clique_idx : var_clique_map_addtl[var_idx]) { - adj_set.insert(addtl_cliques[addtl_clique_idx].vertex_idx); - adj_set.insert(first[addtl_cliques[addtl_clique_idx].clique_idx].begin() + - addtl_cliques[addtl_clique_idx].start_pos_on_clique, - first[addtl_cliques[addtl_clique_idx].clique_idx].end()); - } - // Reverse lookup for additional cliques using position map: - // if var_idx is in first[clique_idx][start_pos_on_clique:], it is adjacent to vertex_idx. 
- for (const auto& addtl : addtl_cliques) { - if (addtl.vertex_idx == var_idx) { continue; } - if (static_cast(addtl.clique_idx) < first_var_positions.size()) { - const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { - adj_set.insert(addtl.vertex_idx); - } + // First-clique edges: every member of each first-clique containing var_idx. + for (const i_t* it = var_clique_first.slice_begin(var_idx); + it != var_clique_first.slice_end(var_idx); + ++it) { + const auto& c = first[*it]; + adj_set.insert(c.begin(), c.end()); + } + + // Addtl-clique edges. + for (const i_t* it = var_clique_addtl.slice_begin(var_idx); + it != var_clique_addtl.slice_end(var_idx); + ++it) { + const auto& a = addtl_cliques[*it]; + if (a.vertex_idx == var_idx) { + // var_idx is the extension vertex; new neighbors are the base suffix. + const auto& base = first[a.clique_idx]; + adj_set.insert(base.begin() + a.start_pos_on_clique, base.end()); + } else { + // var_idx is a base member; only new edge is to the extension vertex. + adj_set.insert(a.vertex_idx); } } - for (const auto& adj_vertex : adj_list_small_cliques[var_idx]) { - adj_set.insert(adj_vertex); + for (const i_t* it = small_clique_adj.slice_begin(var_idx); + it != small_clique_adj.slice_end(var_idx); + ++it) { + adj_set.insert(*it); } + // Add the complement of var_idx to the adjacency set i_t complement_idx = (var_idx >= n_variables) ? 
(var_idx - n_variables) : (var_idx + n_variables); adj_set.insert(complement_idx); @@ -362,99 +354,58 @@ i_t clique_table_t::get_degree_of_var(i_t var_idx) } template -bool clique_table_t::check_adjacency(i_t var_idx1, i_t var_idx2) +bool clique_table_t::check_adjacency(i_t var_idx1, i_t var_idx2) const { if (var_idx1 == var_idx2) { return false; } if (var_idx1 % n_variables == var_idx2 % n_variables) { return true; } - { - auto it = adj_list_small_cliques.find(var_idx1); - if (it != adj_list_small_cliques.end() && it->second.count(var_idx2) > 0) { return true; } - } + // small_clique_adj is symmetric, so probe either direction. + if (small_clique_adj.slice_contains(var_idx1, var_idx2)) { return true; } - // Iterate whichever variable belongs to fewer first-cliques + // Probe through the var with the smaller var_clique_first slice. { i_t probe_var = var_idx1; i_t target_var = var_idx2; - if (var_clique_map_first[var_idx1].size() > var_clique_map_first[var_idx2].size()) { + if (var_clique_first.slice_size(var_idx1) > var_clique_first.slice_size(var_idx2)) { probe_var = var_idx2; target_var = var_idx1; } - for (const auto& clique_idx : var_clique_map_first[probe_var]) { - if (first_var_positions[clique_idx].count(target_var) > 0) { return true; } + for (const i_t* it = var_clique_first.slice_begin(probe_var); + it != var_clique_first.slice_end(probe_var); + ++it) { + if (first_var_positions[*it].count(target_var) > 0) { return true; } } } - for (const auto& addtl_idx : var_clique_map_addtl[var_idx1]) { - const auto& addtl = addtl_cliques[addtl_idx]; + for (const i_t* it = var_clique_addtl.slice_begin(var_idx1); + it != var_clique_addtl.slice_end(var_idx1); + ++it) { + const auto& addtl = addtl_cliques[*it]; const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx2); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { return true; } + auto pos_it = pos_map.find(var_idx2); + if (pos_it != pos_map.end() && 
pos_it->second >= addtl.start_pos_on_clique) { return true; } } - - for (const auto& addtl_idx : var_clique_map_addtl[var_idx2]) { - const auto& addtl = addtl_cliques[addtl_idx]; + for (const i_t* it = var_clique_addtl.slice_begin(var_idx2); + it != var_clique_addtl.slice_end(var_idx2); + ++it) { + const auto& addtl = addtl_cliques[*it]; const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx1); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { return true; } + auto pos_it = pos_map.find(var_idx1); + if (pos_it != pos_map.end() && pos_it->second >= addtl.start_pos_on_clique) { return true; } } return false; } -// this function should only be called within extend clique -// if this is called outside extend clique, csr matrix should be converted into csc and copied into -// problem because the problem is partly modified -template -void insert_clique_into_problem(const std::vector& clique, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - f_t coeff_scale) -{ - // convert vertices into original vars - f_t rhs_offset = 0.; - std::vector new_vars; - std::vector new_coeffs; - for (size_t i = 0; i < clique.size(); i++) { - f_t coeff = coeff_scale; - i_t var_idx = clique[i]; - if (var_idx >= problem.num_cols) { - coeff = -coeff_scale; - var_idx = var_idx - problem.num_cols; - rhs_offset += coeff_scale; - } - new_vars.push_back(var_idx); - new_coeffs.push_back(coeff); - } - // coeff_scale * (1 - x) = coeff_scale - coeff_scale * x - // Move constants to the right, so rhs must decrease by rhs_offset. 
- f_t rhs = coeff_scale - rhs_offset; - // insert the new clique into the problem as a new constraint - dual_simplex::sparse_vector_t new_row(A.n, new_vars.size()); - new_row.i = std::move(new_vars); - new_row.x = std::move(new_coeffs); - A.append_row(new_row); - problem.row_sense.push_back('L'); - problem.rhs.push_back(rhs); - problem.row_names.push_back("Clique" + std::to_string(problem.row_names.size())); -} - +// Returns true on success; `work_out` accumulates scan/hash ops as a +// near-uniform wall-time proxy. template bool extend_clique(const std::vector& clique, clique_table_t& clique_table, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - f_t coeff_scale, - bool modify_problem, - i_t min_extension_gain, - i_t remaining_rows_budget, - i_t remaining_nnz_budget, - i_t& inserted_row_nnz) + double& work_out) { - inserted_row_nnz = 0; i_t smallest_degree = std::numeric_limits::max(); i_t smallest_degree_var = -1; - // find smallest degree vertex in the current set packing constraint for (size_t idx = 0; idx < clique.size(); idx++) { i_t var_idx = clique[idx]; i_t degree = clique_table.get_degree_of_var(var_idx); @@ -463,108 +414,58 @@ bool extend_clique(const std::vector& clique, smallest_degree_var = var_idx; } } - std::vector extension_candidates; + work_out += static_cast(clique.size()); + auto smallest_degree_adj_set = clique_table.get_adj_set_of_var(smallest_degree_var); + const double D = static_cast(smallest_degree_adj_set.size()); + work_out += D; + std::unordered_set clique_members(clique.begin(), clique.end()); + work_out += static_cast(clique.size()); + + std::vector extension_candidates; + extension_candidates.reserve(smallest_degree_adj_set.size()); for (const auto& candidate : smallest_degree_adj_set) { if (clique_members.find(candidate) == clique_members.end()) { extension_candidates.push_back(candidate); } } + work_out += D; + std::sort(extension_candidates.begin(), extension_candidates.end(), [&](i_t a, i_t b) { return 
clique_table.get_degree_of_var(a) > clique_table.get_degree_of_var(b); }); - auto new_clique = clique; - i_t n_of_complement_conflicts = 0; - i_t complement_conflict_var = -1; + const double C = static_cast(extension_candidates.size()); + if (C > 1.0) { work_out += C * std::log2(C); } + + auto new_clique = clique; for (size_t idx = 0; idx < extension_candidates.size(); idx++) { - i_t var_idx = extension_candidates[idx]; - bool add = true; - bool complement_conflict = false; - i_t complement_conflict_idx = -1; + i_t var_idx = extension_candidates[idx]; + bool add = true; for (size_t i = 0; i < new_clique.size(); i++) { - if (var_idx % clique_table.n_variables == new_clique[i] % clique_table.n_variables) { - complement_conflict = true; - complement_conflict_idx = var_idx % clique_table.n_variables; - } - // check if the tested variable conflicts with all vars in the new clique + work_out += 1.0; if (!clique_table.check_adjacency(var_idx, new_clique[i])) { add = false; break; } } - if (add) { - new_clique.push_back(var_idx); - if (complement_conflict) { - n_of_complement_conflicts++; - complement_conflict_var = complement_conflict_idx; - } - } + if (add) { new_clique.push_back(var_idx); } } - // if we found a larger cliqe, insert it into the formulation + if (new_clique.size() > clique.size()) { - if (n_of_complement_conflicts > 0) { - CUOPT_LOG_DEBUG("Found %d complement conflicts on var %d", - n_of_complement_conflicts, - complement_conflict_var); - cuopt_assert(n_of_complement_conflicts == 1, "There can only be one complement conflict"); - // Keep the discovered extension in the clique table for downstream dominance checks. 
- clique_table.first.push_back(new_clique); - for (const auto& var_idx : new_clique) { - clique_table.var_degrees[var_idx] = -1; - } - if (modify_problem) { - // fix all other variables other than complementing var - for (size_t i = 0; i < new_clique.size(); i++) { - if (new_clique[i] % clique_table.n_variables != complement_conflict_var) { - CUOPT_LOG_DEBUG("Fixing variable %d", new_clique[i]); - if (new_clique[i] >= problem.num_cols) { - cuopt_assert(problem.lower[new_clique[i] - problem.num_cols] != 0 || - problem.upper[new_clique[i] - problem.num_cols] != 0, - "Variable is fixed to other side"); - problem.lower[new_clique[i] - problem.num_cols] = 1; - problem.upper[new_clique[i] - problem.num_cols] = 1; - } else { - cuopt_assert(problem.lower[new_clique[i]] != 1 || problem.upper[new_clique[i]] != 1, - "Variable is fixed to other side"); - problem.lower[new_clique[i]] = 0; - problem.upper[new_clique[i]] = 0; - } - } - } - } - return true; - } else { - // Keep the discovered extension in the clique table even when row insertion is skipped by - // row/nnz budgets. - clique_table.first.push_back(new_clique); - for (const auto& var_idx : new_clique) { - clique_table.var_degrees[var_idx] = -1; - } + clique_table.first.push_back(new_clique); + for (const auto& var_idx : new_clique) { + clique_table.var_degrees[var_idx] = -1; + } + work_out += static_cast(new_clique.size()); #if DEBUG_KNAPSACK_CONSTRAINTS - CUOPT_LOG_DEBUG("Extended clique: %lu from %lu", new_clique.size(), clique.size()); + CUOPT_LOG_DEBUG("Extended clique: %lu from %lu", new_clique.size(), clique.size()); #endif - i_t extension_gain = static_cast(new_clique.size() - clique.size()); - if (extension_gain < min_extension_gain) { return true; } - if (remaining_rows_budget <= 0 || - remaining_nnz_budget < static_cast(new_clique.size())) { - return true; - } - // Row insertion is now deferred until dominance is confirmed against model rows. 
- // This keeps extension and replacement sequential: detect dominance first, then replace. - inserted_row_nnz = 0; - } + return true; } - return new_clique.size() > clique.size(); + return false; } -template -struct clique_sig_t { - i_t knapsack_idx; - i_t size; - long long signature; -}; - template struct extension_candidate_t { i_t knapsack_idx; @@ -572,19 +473,6 @@ struct extension_candidate_t { i_t clique_size; }; -template -bool compare_clique_sig(const clique_sig_t& a, const clique_sig_t& b) -{ - if (a.signature != b.signature) { return a.signature < b.signature; } - return a.size < b.size; -} - -template -bool compare_signature_value(long long value, const clique_sig_t& a) -{ - return value < a.signature; -} - template bool compare_extension_candidate(const extension_candidate_t& a, const extension_candidate_t& b) @@ -594,265 +482,28 @@ bool compare_extension_candidate(const extension_candidate_t& a, return a.knapsack_idx < b.knapsack_idx; } -template -bool is_sorted_subset(const std::vector& a, const std::vector& b) -{ - size_t i = 0; - size_t j = 0; - while (i < a.size() && j < b.size()) { - if (a[i] == b[j]) { - i++; - j++; - } else if (a[i] > b[j]) { - j++; - } else { - return false; - } - } - return i == a.size(); -} - -template -void fix_difference(const std::vector& superset, - const std::vector& subset, - dual_simplex::user_problem_t& problem) -{ - cuopt_assert(std::is_sorted(subset.begin(), subset.end()), - "subset vector passed to fix_difference is not sorted"); - for (auto var_idx : superset) { - if (std::binary_search(subset.begin(), subset.end(), var_idx)) { continue; } - if (var_idx >= problem.num_cols) { - i_t orig_idx = var_idx - problem.num_cols; - CUOPT_LOG_DEBUG("Fixing variable %d", orig_idx); - cuopt_assert(problem.lower[orig_idx] != 0 || problem.upper[orig_idx] != 0, - "Variable is fixed to other side"); - problem.lower[orig_idx] = 1; - problem.upper[orig_idx] = 1; - } else { - CUOPT_LOG_DEBUG("Fixing variable %d", var_idx); - 
cuopt_assert(problem.lower[var_idx] != 1 || problem.upper[var_idx] != 1, - "Variable is fixed to other side"); - problem.lower[var_idx] = 0; - problem.upper[var_idx] = 0; - } - } -} - -template -void remove_marked_elements(std::vector& vec, const std::vector& removal_marker) -{ - size_t write_idx = 0; - for (size_t i = 0; i < vec.size(); i++) { - if (!removal_marker[i]) { - if (write_idx != i) { vec[write_idx] = std::move(vec[i]); } - write_idx++; - } - } - vec.resize(write_idx); -} - -template -void remove_dominated_cliques_in_problem_for_single_extended_clique( - const std::vector& curr_clique, - f_t coeff_scale, - i_t remaining_rows_budget, - i_t remaining_nnz_budget, - i_t& inserted_row_nnz, - const std::vector>& sp_sigs, - const std::vector>& cstr_vars, - const std::vector>& knapsack_constraints, - std::vector& original_to_current_row_idx, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - cuopt::timer_t& timer) -{ - inserted_row_nnz = 0; - if (curr_clique.empty() || sp_sigs.empty()) { return; } - std::vector curr_clique_vars(curr_clique.begin(), curr_clique.end()); - std::sort(curr_clique_vars.begin(), curr_clique_vars.end()); - curr_clique_vars.erase(std::unique(curr_clique_vars.begin(), curr_clique_vars.end()), - curr_clique_vars.end()); - long long signature = 0; - for (auto v : curr_clique_vars) { - signature += static_cast(v); - } - constexpr size_t dominance_window = 20000; - auto end_it = - std::upper_bound(sp_sigs.begin(), sp_sigs.end(), signature, compare_signature_value); - size_t end = static_cast(std::distance(sp_sigs.begin(), end_it)); - size_t start = (end > dominance_window) ? 
(end - dominance_window) : 0; - std::vector rows_to_remove; - bool covering_clique_implied_by_partitioning = false; - for (size_t idx = end; idx > start; idx--) { - if (timer.check_time_limit()) { break; } - const auto& sp = sp_sigs[idx - 1]; - const auto& vars_sp = cstr_vars[sp.knapsack_idx]; - if (vars_sp.size() > curr_clique_vars.size()) { continue; } - cuopt_assert(std::is_sorted(vars_sp.begin(), vars_sp.end()), - "vars_sp vector passed to is_sorted_subset is not sorted"); - if (!is_sorted_subset(vars_sp, curr_clique_vars)) { continue; } - if (knapsack_constraints[sp.knapsack_idx].is_set_partitioning) { - if (vars_sp.size() != curr_clique_vars.size()) { - fix_difference(curr_clique_vars, vars_sp, problem); - covering_clique_implied_by_partitioning = true; - } - continue; - } - i_t original_row_idx = knapsack_constraints[sp.knapsack_idx].cstr_idx; - if (original_row_idx < 0) { continue; } - cuopt_assert(original_row_idx < static_cast(original_to_current_row_idx.size()), - "Invalid original row index in knapsack constraint"); - i_t current_row_idx = original_to_current_row_idx[original_row_idx]; - if (current_row_idx < 0) { continue; } - cuopt_assert(current_row_idx < static_cast(problem.row_sense.size()), - "Invalid current row index in row mapping"); - rows_to_remove.push_back(current_row_idx); - } - if (rows_to_remove.empty()) { return; } - std::sort(rows_to_remove.begin(), rows_to_remove.end()); - rows_to_remove.erase(std::unique(rows_to_remove.begin(), rows_to_remove.end()), - rows_to_remove.end()); - if (!covering_clique_implied_by_partitioning) { - if (remaining_rows_budget <= 0 || - remaining_nnz_budget < static_cast(curr_clique_vars.size())) { - return; - } - insert_clique_into_problem(curr_clique_vars, problem, A, coeff_scale); - inserted_row_nnz = static_cast(curr_clique_vars.size()); - } - std::vector removal_marker(problem.row_sense.size(), 0); - for (auto row_idx : rows_to_remove) { - cuopt_assert(row_idx >= 0 && row_idx < 
static_cast(removal_marker.size()), - "Invalid dominated row index"); - CUOPT_LOG_DEBUG("Removing dominated row %d", row_idx); - removal_marker[row_idx] = true; - } - dual_simplex::csr_matrix_t A_removed(0, 0, 0); - A.remove_rows(removal_marker, A_removed); - A = std::move(A_removed); - problem.num_rows = A.m; - remove_marked_elements(problem.row_sense, removal_marker); - remove_marked_elements(problem.rhs, removal_marker); - remove_marked_elements(problem.row_names, removal_marker); - cuopt_assert(problem.rhs.size() == problem.row_sense.size(), "rhs and row sense size mismatch"); - cuopt_assert(problem.row_names.size() == problem.rhs.size(), "row names and rhs size mismatch"); - cuopt_assert(problem.num_rows == static_cast(problem.rhs.size()), - "matrix and num rows mismatch after removal"); - if (!problem.range_rows.empty()) { - std::vector old_to_new_indices; - old_to_new_indices.reserve(removal_marker.size()); - i_t new_idx = 0; - for (size_t i = 0; i < removal_marker.size(); ++i) { - if (!removal_marker[i]) { - old_to_new_indices.push_back(new_idx++); - } else { - old_to_new_indices.push_back(-1); - } - } - std::vector new_range_rows; - std::vector new_range_values; - for (size_t i = 0; i < problem.range_rows.size(); ++i) { - i_t old_row = problem.range_rows[i]; - cuopt_assert(old_row >= 0 && old_row < static_cast(removal_marker.size()), - "Invalid row index in range_rows"); - if (!removal_marker[old_row]) { - i_t new_row = old_to_new_indices[old_row]; - cuopt_assert(new_row != -1, "Invalid new row index for ranged row renumbering"); - new_range_rows.push_back(new_row); - new_range_values.push_back(problem.range_value[i]); - } - } - problem.range_rows = std::move(new_range_rows); - problem.range_value = std::move(new_range_values); - } - problem.num_range_rows = static_cast(problem.range_rows.size()); - std::vector removed_prefix(removal_marker.size() + 1, 0); - for (size_t row_idx = 0; row_idx < removal_marker.size(); row_idx++) { - removed_prefix[row_idx + 
1] = - removed_prefix[row_idx] + static_cast(removal_marker[row_idx]); - } - for (i_t row_idx = 0; row_idx < static_cast(original_to_current_row_idx.size()); row_idx++) { - i_t current_row_idx = original_to_current_row_idx[row_idx]; - if (current_row_idx < 0) { continue; } - cuopt_assert(current_row_idx < static_cast(removal_marker.size()), - "Row index map is out of bounds"); - if (removal_marker[current_row_idx]) { - original_to_current_row_idx[row_idx] = -1; - } else { - original_to_current_row_idx[row_idx] = current_row_idx - removed_prefix[current_row_idx]; - } - } -} - -// Also known as clique merging. Infer larger clique constraints which allows inclusion of vars from -// other constraints. This only extends the original cliques in the formulation for now. -// TODO: consider a heuristic on how much of the cliques derived from knapsacks to include here +// Extends set-packing cliques. Soft floor: min_work; hard ceiling: max_work +// or `timer`. signal_extend only honored after min_work. template i_t extend_cliques(const std::vector>& knapsack_constraints, - const std::unordered_set& set_packing_constraints, clique_table_t& clique_table, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - bool modify_problem, cuopt::timer_t& timer, double* work_estimate_out, - double max_work_estimate) + double min_work, + double max_work, + std::atomic* signal_extend) { - constexpr i_t min_extension_gain = 2; - constexpr i_t extension_yield_window = 64; - constexpr i_t min_successes_per_window = 1; + constexpr i_t min_extension_gain = 2; double local_work = 0.0; double& work = work_estimate_out ? 
*work_estimate_out : local_work; - i_t base_rows = A.m; - i_t base_nnz = A.row_start[A.m]; - i_t max_added_rows = std::max(8, base_rows / 50); - i_t max_added_nnz = std::max(8 * clique_table.max_clique_size_for_extension, base_nnz / 50); - - i_t added_rows = 0; - i_t added_nnz = 0; - i_t window_attempts = 0; - i_t window_successes = 0; - - CUOPT_LOG_DEBUG("Clique extension heuristics: min_gain=%d row_budget=%d nnz_budget=%d", - min_extension_gain, - max_added_rows, - max_added_nnz); - std::vector> cstr_vars(knapsack_constraints.size()); - std::vector> sp_sigs; - sp_sigs.reserve(set_packing_constraints.size()); - for (const auto knapsack_idx : set_packing_constraints) { - cuopt_assert(knapsack_idx >= 0 && knapsack_idx < static_cast(knapsack_constraints.size()), - "Invalid set packing constraint index"); - const auto& vars = knapsack_constraints[knapsack_idx].entries; - cstr_vars[knapsack_idx].reserve(vars.size()); - for (const auto& entry : vars) { - cstr_vars[knapsack_idx].push_back(entry.col); - } - std::sort(cstr_vars[knapsack_idx].begin(), cstr_vars[knapsack_idx].end()); - cstr_vars[knapsack_idx].erase( - std::unique(cstr_vars[knapsack_idx].begin(), cstr_vars[knapsack_idx].end()), - cstr_vars[knapsack_idx].end()); - long long signature = 0; - for (auto v : cstr_vars[knapsack_idx]) { - signature += static_cast(v); - } - sp_sigs.push_back({knapsack_idx, static_cast(cstr_vars[knapsack_idx].size()), signature}); - work += cstr_vars[knapsack_idx].size(); - } - if (work > max_work_estimate) { return 0; } - std::sort(sp_sigs.begin(), sp_sigs.end(), compare_clique_sig); - std::vector original_to_current_row_idx(problem.row_sense.size(), -1); - for (i_t row_idx = 0; row_idx < static_cast(original_to_current_row_idx.size()); row_idx++) { - original_to_current_row_idx[row_idx] = row_idx; - } std::vector> extension_worklist; extension_worklist.reserve(knapsack_constraints.size()); for (i_t knapsack_idx = 0; knapsack_idx < static_cast(knapsack_constraints.size()); 
knapsack_idx++) { if (timer.check_time_limit()) { break; } - if (work > max_work_estimate) { break; } + if (work >= max_work) { break; } const auto& knapsack_constraint = knapsack_constraints[knapsack_idx]; if (!knapsack_constraint.is_set_packing) { continue; } i_t clique_size = static_cast(knapsack_constraint.entries.size()); @@ -864,99 +515,93 @@ i_t extend_cliques(const std::vector>& knapsack_ i_t estimated_gain = std::max(0, smallest_degree - (clique_size - 1)); if (estimated_gain < min_extension_gain) { continue; } extension_worklist.push_back({knapsack_idx, estimated_gain, clique_size}); - work += knapsack_constraint.entries.size(); + work += static_cast(knapsack_constraint.entries.size()); } std::stable_sort( extension_worklist.begin(), extension_worklist.end(), compare_extension_candidate); + if (!extension_worklist.empty()) { + work += static_cast(extension_worklist.size()) * + std::log2(static_cast(extension_worklist.size())); + } CUOPT_LOG_DEBUG("Clique extension candidates after scoring: %zu", extension_worklist.size()); i_t n_extended_cliques = 0; for (const auto& candidate : extension_worklist) { if (timer.check_time_limit()) { break; } - if (work > max_work_estimate) { break; } - if (added_rows >= max_added_rows || added_nnz >= max_added_nnz) { - CUOPT_LOG_DEBUG( - "Stopping clique extension: budget reached (rows=%d nnz=%d)", added_rows, added_nnz); - break; + if (work >= min_work) { + if (work >= max_work) { break; } + if (signal_extend && signal_extend->load(std::memory_order_acquire)) { + CUOPT_LOG_DEBUG("Stopping clique extension: cut-pass signal received (work=%.0f)", work); + break; + } } - window_attempts++; const auto& knapsack_constraint = knapsack_constraints[candidate.knapsack_idx]; std::vector clique; + clique.reserve(knapsack_constraint.entries.size()); for (const auto& entry : knapsack_constraint.entries) { clique.push_back(entry.col); } - i_t inserted_row_nnz = 0; - f_t coeff_scale = knapsack_constraint.entries[0].val; - bool 
extended_clique = extend_clique(clique, - clique_table, - problem, - A, - coeff_scale, - modify_problem, - min_extension_gain, - max_added_rows - added_rows, - max_added_nnz - added_nnz, - inserted_row_nnz); - work += clique.size() * clique.size(); - if (extended_clique) { - n_extended_cliques++; - i_t replacement_row_nnz = 0; - if (modify_problem) { - remove_dominated_cliques_in_problem_for_single_extended_clique(clique_table.first.back(), - coeff_scale, - max_added_rows - added_rows, - max_added_nnz - added_nnz, - replacement_row_nnz, - sp_sigs, - cstr_vars, - knapsack_constraints, - original_to_current_row_idx, - problem, - A, - timer); - } - if (replacement_row_nnz > 0) { - window_successes++; - added_rows++; - added_nnz += replacement_row_nnz; - } - } - if (window_attempts >= extension_yield_window) { - if (window_successes < min_successes_per_window) { - CUOPT_LOG_DEBUG( - "Stopping clique extension: low yield (%d/%d)", window_successes, window_attempts); - break; - } - window_attempts = 0; - window_successes = 0; - } + if (extend_clique(clique, clique_table, work)) { n_extended_cliques++; } } - if (modify_problem) { - // copy modified matrix back to problem - A.to_compressed_col(problem.A); - } - CUOPT_LOG_DEBUG("Number of extended cliques: %d", n_extended_cliques); + CUOPT_LOG_DEBUG("Number of extended cliques: %d (work=%.0f)", n_extended_cliques, work); return n_extended_cliques; } template void fill_var_clique_maps(clique_table_t& clique_table) { - clique_table.first_var_positions.resize(clique_table.first.size()); + const i_t n_vertices = 2 * clique_table.n_variables; + + // first_var_positions: per-clique hash map (cliques small ⇒ hash beats binary search). 
+ clique_table.first_var_positions.assign(clique_table.first.size(), {}); + + std::vector> first_pairs; + size_t total_first_members = 0; + for (const auto& c : clique_table.first) { + total_first_members += c.size(); + } + first_pairs.reserve(total_first_members); + for (size_t clique_idx = 0; clique_idx < clique_table.first.size(); clique_idx++) { const auto& clique = clique_table.first[clique_idx]; auto& pos_map = clique_table.first_var_positions[clique_idx]; pos_map.reserve(clique.size()); for (size_t idx = 0; idx < clique.size(); idx++) { - i_t var_idx = clique[idx]; - clique_table.var_clique_map_first[var_idx].insert(clique_idx); + const i_t var_idx = clique[idx]; + first_pairs.emplace_back(var_idx, static_cast(clique_idx)); pos_map[var_idx] = static_cast(idx); } } - for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); addtl_c++) { - const auto& addtl_clique = clique_table.addtl_cliques[addtl_c]; - clique_table.var_clique_map_addtl[addtl_clique.vertex_idx].insert(addtl_c); + clique_table.var_clique_first.finalize_from_unsorted_pairs(n_vertices, first_pairs); + + std::vector> addtl_pairs; + for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); ++addtl_c) { + const auto& a = clique_table.addtl_cliques[addtl_c]; + addtl_pairs.emplace_back(a.vertex_idx, static_cast(addtl_c)); + const auto& base = clique_table.first[a.clique_idx]; + for (i_t pos = a.start_pos_on_clique; pos < static_cast(base.size()); ++pos) { + addtl_pairs.emplace_back(base[pos], static_cast(addtl_c)); + } + } + clique_table.var_clique_addtl.finalize_from_unsorted_pairs(n_vertices, addtl_pairs); +} + +template +void clique_table_t::set_small_clique_adj_for_test( + const std::unordered_map>& edges) +{ + std::vector> pairs; + size_t total = 0; + for (const auto& kv : edges) { + total += kv.second.size(); + } + pairs.reserve(total); + for (const auto& kv : edges) { + for (const auto& v : kv.second) { + pairs.emplace_back(kv.first, v); + } } + 
small_clique_adj.finalize_from_unsorted_pairs(2 * n_variables, pairs); } template @@ -972,12 +617,10 @@ void build_clique_table(const dual_simplex::user_problem_t& problem, cuopt_assert(problem.var_types.size() == static_cast(problem.num_cols), "Problem variable types size mismatch"); std::vector> knapsack_constraints; - std::unordered_set set_packing_constraints; dual_simplex::csr_matrix_t A(problem.num_rows, problem.num_cols, 0); problem.A.to_compressed_row(A); fill_knapsack_constraints(problem, knapsack_constraints, A); - make_coeff_positive_knapsack_constraint( - problem, knapsack_constraints, set_packing_constraints, tolerances); + make_coeff_positive_knapsack_constraint(problem, knapsack_constraints, tolerances); sort_csr_by_constraint_coefficients(knapsack_constraints); clique_table.tolerances = tolerances; for (const auto& knapsack_constraint : knapsack_constraints) { @@ -1035,7 +678,6 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, typename mip_solver_settings_t::tolerances_t tolerances, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, - bool modify_problem, std::atomic* signal_extend) { cuopt::timer_t stage_timer(std::numeric_limits::infinity()); @@ -1050,15 +692,13 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, double t_remove = 0.; #endif std::vector> knapsack_constraints; - std::unordered_set set_packing_constraints; dual_simplex::csr_matrix_t A(problem.num_rows, problem.num_cols, 0); problem.A.to_compressed_row(A); fill_knapsack_constraints(problem, knapsack_constraints, A); #ifdef DEBUG_CLIQUE_TABLE t_fill = stage_timer.elapsed_time(); #endif - make_coeff_positive_knapsack_constraint( - problem, knapsack_constraints, set_packing_constraints, tolerances); + make_coeff_positive_knapsack_constraint(problem, knapsack_constraints, tolerances); #ifdef DEBUG_CLIQUE_TABLE t_coeff = stage_timer.elapsed_time(); #endif @@ -1083,9 +723,9 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, double 
time_limit_for_additional_cliques = timer.remaining_time() / 2; cuopt::timer_t additional_cliques_timer(time_limit_for_additional_cliques); double find_work_estimate = 0.0; + // Always build base cliques in full; signal_extend only gates the extension phase. for (const auto& knapsack_constraint : knapsack_constraints) { if (timer.check_time_limit()) { break; } - if (signal_extend && signal_extend->load(std::memory_order_acquire)) { break; } find_cliques_from_constraint(knapsack_constraint, *clique_table_ptr, additional_cliques_timer); find_work_estimate += knapsack_constraint.entries.size(); } @@ -1105,17 +745,15 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, t_maps = stage_timer.elapsed_time(); #endif if (clique_table_out != nullptr) { *clique_table_out = std::move(clique_table_shared); } - double extend_work = 0.0; - constexpr double max_extend_work = 2e9; - i_t n_extended_cliques = extend_cliques(knapsack_constraints, - set_packing_constraints, + double extend_work = 0.0; + i_t n_extended_cliques = extend_cliques(knapsack_constraints, *clique_table_ptr, - problem, - A, - modify_problem, timer, &extend_work, - max_extend_work); + clique_config.min_extend_work, + clique_config.max_extend_work, + signal_extend); + if (n_extended_cliques > 0) { fill_var_clique_maps(*clique_table_ptr); } #ifdef DEBUG_CLIQUE_TABLE t_extend = stage_timer.elapsed_time(); CUOPT_LOG_DEBUG( @@ -1134,21 +772,21 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, #endif } -#define INSTANTIATE(F_TYPE) \ - template void find_initial_cliques( \ - dual_simplex::user_problem_t & problem, \ - typename mip_solver_settings_t::tolerances_t tolerances, \ - std::shared_ptr> * clique_table_out, \ - cuopt::timer_t & timer, \ - bool modify_problem, \ - std::atomic* signal_extend); \ - template void build_clique_table( \ - const dual_simplex::user_problem_t& problem, \ - clique_table_t& clique_table, \ - typename mip_solver_settings_t::tolerances_t tolerances, \ - bool 
remove_small_cliques_flag, \ - bool fill_var_clique_maps_flag, \ - cuopt::timer_t& timer); \ +#define INSTANTIATE(F_TYPE) \ + template void find_initial_cliques( \ + dual_simplex::user_problem_t & problem, \ + typename mip_solver_settings_t::tolerances_t tolerances, \ + std::shared_ptr> * clique_table_out, \ + cuopt::timer_t & timer, \ + std::atomic * signal_extend); \ + template void build_clique_table( \ + const dual_simplex::user_problem_t& problem, \ + clique_table_t& clique_table, \ + typename mip_solver_settings_t::tolerances_t tolerances, \ + bool remove_small_cliques_flag, \ + bool fill_var_clique_maps_flag, \ + cuopt::timer_t& timer); \ + template void fill_var_clique_maps(clique_table_t & clique_table); \ template class clique_table_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh index 944241b4f0..10ad4e3942 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include namespace cuopt::linear_programming::detail { @@ -33,6 +35,10 @@ namespace cuopt::linear_programming::detail { struct clique_config_t { int min_clique_size = 512; int max_clique_size_for_extension = 128; + // extend_cliques work budget; one unit ≈ one hash/scan op in extend_clique. + // Soft floor before honoring cut-gen signal; hard ceiling. + double min_extend_work = 1e7; + double max_extend_work = 2e9; }; template @@ -59,37 +65,138 @@ struct addtl_clique_t { i_t start_pos_on_clique; }; +// CSR per-vertex map: for v in [0, n_vertices), `indices[offsets[v] .. +// offsets[v+1])` is a sorted slice. Build protocol: callers push (src, value) +// pairs and call `finalize_from_unsorted_pairs`. 
+template +struct csr_var_map_t { + std::vector offsets; // size: n_vertices + 1; offsets[v] is the start in `indices` + std::vector indices; // sorted within each [offsets[v], offsets[v+1]) slice + + void clear_and_resize(i_t n_vertices) + { + offsets.assign(n_vertices + 1, 0); + indices.clear(); + } + i_t n_keys() const { return offsets.empty() ? 0 : static_cast(offsets.size() - 1); } + i_t slice_size(i_t v) const { return offsets[v + 1] - offsets[v]; } + const i_t* slice_begin(i_t v) const { return indices.data() + offsets[v]; } + const i_t* slice_end(i_t v) const { return indices.data() + offsets[v + 1]; } + // O(1) summary used by cut/extension cost-budget heuristics. + double avg_slice_size() const + { + const i_t k = n_keys(); + return k > 0 ? static_cast(indices.size()) / static_cast(k) : 0.0; + } + bool slice_contains(i_t v, i_t value) const + { + const i_t* b = slice_begin(v); + const i_t* e = slice_end(v); + return std::binary_search(b, e, value); + } + + // Build CSR from unsorted (src, value) pairs. Each output slice is sorted + // and deduplicated. Caller must keep p.first in [0, n_vertices). + void finalize_from_unsorted_pairs(i_t n_vertices, std::vector>& pairs) + { + offsets.assign(n_vertices + 1, 0); + for (const auto& p : pairs) { + offsets[p.first + 1]++; + } + for (i_t v = 1; v <= n_vertices; ++v) { + offsets[v] += offsets[v - 1]; + } + indices.assign(static_cast(offsets.back()), i_t{0}); + std::vector head(n_vertices, 0); + for (const auto& p : pairs) { + indices[offsets[p.first] + head[p.first]++] = p.second; + } + for (i_t v = 0; v < n_vertices; ++v) { + auto* b = indices.data() + offsets[v]; + auto* e = indices.data() + offsets[v] + head[v]; + std::sort(b, e); + auto* new_end = std::unique(b, e); + head[v] = static_cast(new_end - b); + } + // Compact away dedupe holes. 
+ std::vector new_offsets(n_vertices + 1, 0); + for (i_t v = 0; v < n_vertices; ++v) { + new_offsets[v + 1] = new_offsets[v] + head[v]; + } + if (new_offsets.back() != offsets.back()) { + std::vector new_indices(static_cast(new_offsets.back())); + for (i_t v = 0; v < n_vertices; ++v) { + std::copy(indices.data() + offsets[v], + indices.data() + offsets[v] + head[v], + new_indices.data() + new_offsets[v]); + } + offsets = std::move(new_offsets); + indices = std::move(new_indices); + } else { + offsets = std::move(new_offsets); + } + } +}; + template struct clique_table_t { clique_table_t(i_t n_vertices, i_t min_clique_size_, i_t max_clique_size_for_extension_) : min_clique_size(min_clique_size_), max_clique_size_for_extension(max_clique_size_for_extension_), - var_clique_map_first(n_vertices), - var_clique_map_addtl(n_vertices), - adj_list_small_cliques(n_vertices), var_degrees(n_vertices, -1), n_variables(n_vertices / 2) { + var_clique_first.clear_and_resize(n_vertices); + var_clique_addtl.clear_and_resize(n_vertices); + small_clique_adj.clear_and_resize(n_vertices); } - std::unordered_set get_adj_set_of_var(i_t var_idx); + // Copy disabled; move provided so tests can return by value. + // Move-assign omitted because of const members. 
+ clique_table_t(const clique_table_t&) = delete; + clique_table_t& operator=(const clique_table_t&) = delete; + + clique_table_t(clique_table_t&& other) noexcept + : first(std::move(other.first)), + addtl_cliques(std::move(other.addtl_cliques)), + var_clique_first(std::move(other.var_clique_first)), + var_clique_addtl(std::move(other.var_clique_addtl)), + first_var_positions(std::move(other.first_var_positions)), + small_clique_adj(std::move(other.small_clique_adj)), + var_degrees(std::move(other.var_degrees)), + n_variables(other.n_variables), + min_clique_size(other.min_clique_size), + max_clique_size_for_extension(other.max_clique_size_for_extension), + tolerances(other.tolerances) + { + } + + clique_table_t& operator=(clique_table_t&&) = delete; + + std::unordered_set get_adj_set_of_var(i_t var_idx) const; i_t get_degree_of_var(i_t var_idx); - bool check_adjacency(i_t var_idx1, i_t var_idx2); + bool check_adjacency(i_t var_idx1, i_t var_idx2) const; + bool empty() const + { + return first.empty() && addtl_cliques.empty() && small_clique_adj.indices.empty(); + } + + void set_small_clique_adj_for_test(const std::unordered_map>& edges); // keeps the large cliques in each constraint std::vector> first; // keeps the additional cliques std::vector> addtl_cliques; - // TODO figure out the performance of lookup for the following: unordered_set vs vector - // keeps the indices of original(first) cliques that contain variable x - std::vector> var_clique_map_first; - // keeps the indices of additional cliques that contain variable x - std::vector> var_clique_map_addtl; + // var_idx → indices of `first` cliques that contain var_idx (CSR). + csr_var_map_t var_clique_first; + // var_idx → indices of `addtl_cliques` containing var_idx (as the extension + // vertex or as a base-suffix member). 
+ csr_var_map_t var_clique_addtl; // var_idx -> position mapping for each first clique, enabling O(1) membership/position checks std::vector> first_var_positions; - // adjacency list to keep small cliques, this basically keeps the vars share a small clique - // constraint - std::unordered_map> adj_list_small_cliques; + // var_idx → pairwise edges from cliques demoted by remove_small_cliques. + // Symmetric: edge (u, v) appears in both u's and v's slices. + csr_var_map_t small_clique_adj; // degrees of each vertex std::vector var_degrees; // number of variables in the original problem @@ -104,7 +211,6 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, typename mip_solver_settings_t::tolerances_t tolerances, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, - bool modify_problem, std::atomic* signal_extend = nullptr); template @@ -115,6 +221,9 @@ void build_clique_table(const dual_simplex::user_problem_t& problem, bool fill_var_clique_maps, cuopt::timer_t& timer); +template +void fill_var_clique_maps(clique_table_t& clique_table); + } // namespace cuopt::linear_programming::detail // Possible application to rounding procedure, keeping it as reference diff --git a/cpp/tests/mip/cuts_test.cu b/cpp/tests/mip/cuts_test.cu index 1348d7e7e4..4f733194f2 100644 --- a/cpp/tests/mip/cuts_test.cu +++ b/cpp/tests/mip/cuts_test.cu @@ -1067,7 +1067,7 @@ TEST(cuts, clique_phase1_remove_small_cliques_preserves_addtl_conflicts) EXPECT_TRUE(clique_table.first.empty()); EXPECT_TRUE(clique_table.addtl_cliques.empty()); - // Conflicts must remain materialized in adj_list_small_cliques after removals. + // Conflicts must remain materialized in small_clique_adj after removals. 
EXPECT_TRUE(clique_table.check_adjacency(1, 3)); EXPECT_TRUE(clique_table.check_adjacency(3, 1)); EXPECT_TRUE(clique_table.check_adjacency(2, 3)); @@ -1246,7 +1246,7 @@ TEST(cuts, clique_neos8_phase1_addtl_suffix_conflicts_materialized) TEST(cuts, clique_neos8_phase1_symmetry_and_degree_cache_consistency) { auto& clique_table = get_neos8_clique_table_cached(); - const int n_vertices = static_cast(clique_table.var_clique_map_first.size()); + const int n_vertices = static_cast(clique_table.var_clique_first.n_keys()); ASSERT_GT(n_vertices, 0); const int sample_size = std::min(n_vertices, 24); From 72bb299cb1e98800f56145eb18c5b1da938b8552 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 13 May 2026 15:03:49 +0200 Subject: [PATCH 07/47] merge clique changes --- cpp/src/branch_and_bound/branch_and_bound.cpp | 2 +- cpp/src/cuts/cuts.cpp | 420 +++++++++- cpp/src/cuts/cuts.hpp | 138 +++- .../presolve/conflict_graph/clique_table.cu | 760 +++++------------- .../presolve/conflict_graph/clique_table.cuh | 137 +++- cpp/tests/mip/cuts_test.cu | 4 +- 6 files changed, 848 insertions(+), 613 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index aef637d881..c10ec9edbf 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2075,7 +2075,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut cuopt::timer_t timer(std::numeric_limits::infinity()); std::shared_ptr> table; detail::find_initial_cliques( - problem_copy, tolerances_for_clique, &table, timer, false, signal_ptr); + problem_copy, tolerances_for_clique, &table, timer, signal_ptr); return table; }); } diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 38532ceb27..8eab7778b5 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -15,7 +15,9 @@ #include #include +#include #include +#include #include #include @@ -81,13 +83,46 @@ clique_cut_build_status_t 
build_clique_cut(const std::vector& clique_vertic cut.i.clear(); cut.x.clear(); - i_t num_complements = 0; + + // P0-3 (1): two-pass complement-pair detection. The baseline returned + // NO_CUT on the first variable that appeared both as itself and as its + // complement; that hides how many such conflicts a candidate clique has + // and makes it impossible to attribute infeasibility events to specific + // clique generators. Pre-pass collects every original/complement + // occurrence per variable, counts the actual complement pairs, and only + // then decides. Accept/reject behavior matches baseline (a complement + // pair still aborts cut construction); only the diagnostics change. std::unordered_set seen_original; std::unordered_set seen_complement; seen_original.reserve(clique_vertices.size()); seen_complement.reserve(clique_vertices.size()); for (const auto vertex_idx : clique_vertices) { cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Clique vertex out of range"); + const i_t var_idx = vertex_idx % num_vars; + const bool complement = vertex_idx >= num_vars; + if (complement) { + seen_complement.insert(var_idx); + } else { + seen_original.insert(var_idx); + } + } + i_t complement_pairs = 0; + for (const auto var_idx : seen_original) { + if (seen_complement.count(var_idx) > 0) { complement_pairs++; } + } + if (complement_pairs > 0) { + CLIQUE_CUTS_DEBUG("build_clique_cut infeasible: %lld complement-pairs", + static_cast(complement_pairs)); + return clique_cut_build_status_t::NO_CUT; + } + + // Second pass: emit cut coefficients. We already know there are no + // complement-pair conflicts so the lookups against seen_original / + // seen_complement that the baseline performed are now redundant. 
+ i_t num_complements = 0; + const bool has_original = !seen_original.empty(); + const bool has_complement = !seen_complement.empty(); + for (const auto vertex_idx : clique_vertices) { const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; const f_t lower_bound = lower_bounds[var_idx]; @@ -97,32 +132,17 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic "Clique contains continuous variable"); cuopt_assert(lower_bound >= -bound_tol, "Clique variable lower bound below zero"); cuopt_assert(upper_bound <= 1 + bound_tol, "Clique variable upper bound above one"); + static_cast(lower_bound); + static_cast(upper_bound); - // we store the cut in the form of >= 1, for easy violation check with dot product - // that's why compelements have 1 as coeff and normal vars have -1 + // Cut is stored in form sum_j a_j x_j >= rhs for direct dot-product + // violation checks. Complemented literals (1 - x_j) contribute +1*x_j + // to the inequality and originals contribute -1*x_j. 
if (complement) { - if (seen_original.count(var_idx) > 0) { - // FIXME: this is temporary, fix all the vars of all other vars in the clique - return clique_cut_build_status_t::NO_CUT; - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible var=%lld appears as variable and complement", - static_cast(var_idx)); - return clique_cut_build_status_t::INFEASIBLE; - } - cuopt_assert(seen_complement.count(var_idx) == 0, "Duplicate complement in clique"); - seen_complement.insert(var_idx); num_complements++; cut.i.push_back(var_idx); cut.x.push_back(1.0); } else { - if (seen_complement.count(var_idx) > 0) { - // FIXME: this is temporary, fix all the vars of all other vars in the clique - return clique_cut_build_status_t::NO_CUT; - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible var=%lld appears as variable and complement", - static_cast(var_idx)); - return clique_cut_build_status_t::INFEASIBLE; - } - cuopt_assert(seen_original.count(var_idx) == 0, "Duplicate variable in clique"); - seen_original.insert(var_idx); cut.i.push_back(var_idx); cut.x.push_back(-1.0); } @@ -136,27 +156,36 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic cut_rhs = static_cast(num_complements - 1); cut.sort(); + // P0-3 (4): has_pair distinguishes pure (all originals OR all + // complements) from mixed cliques in the accepted-cut log line so + // post-mortem analysis can attribute gap closure to one variant or + // the other. + const int has_pair = (has_original && has_complement) ? 
1 : 0; const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; if (violation > min_violation) { CLIQUE_CUTS_DEBUG( - "build_clique_cut accepted nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld", + "build_clique_cut accepted nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " + "has_pair=%d", static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements)); + static_cast(num_complements), + has_pair); return clique_cut_build_status_t::CUT_ADDED; } CLIQUE_CUTS_DEBUG( - "build_clique_cut rejected nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld", + "build_clique_cut rejected nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " + "has_pair=%d", static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements)); + static_cast(num_complements), + has_pair); return clique_cut_build_status_t::NO_CUT; } @@ -389,11 +418,20 @@ void extend_clique_vertices(std::vector& clique_vertices, const f_t candidate_size = static_cast(candidates.size()); const f_t sort_work = candidate_size > 0.0 ? 2.0 * candidate_size * std::log2(candidate_size + 1.0) : 0.0; - const f_t adj_set_build_cost = 2.0 * static_cast(adj_set.size()); - const f_t adj_check_cost = 5.0; - const f_t estimated_preloop_work = 2.0 * initial_clique_size + adj_set_build_cost + - 3.0 * static_cast(adj_set.size()) + sort_work + - 2.0 * candidate_size; + const f_t adj_set_build_cost = 2.0 * static_cast(adj_set.size()); + // P0-3 (2): account for the addtl_cliques scan that + // clique_table_t::check_adjacency performs on every adjacency probe. + // Baseline ignored this, so on instances with many addtl_clique entries + // the extension loop dominated cut-generation wall time without being + // attributed to clique cuts. 
avg_slice_size of var_clique_addtl is a + // robust proxy for the per-call addtl scan cost. + const f_t addtl_cliques_scan_cost = + 1.0 + static_cast(graph.var_clique_addtl.avg_slice_size()); + const f_t adj_check_cost = 5.0 + addtl_cliques_scan_cost; + const f_t estimated_preloop_work = + 2.0 * initial_clique_size + adj_set_build_cost + 3.0 * static_cast(adj_set.size()) + + sort_work + 2.0 * candidate_size + addtl_cliques_scan_cost * initial_clique_size + + addtl_cliques_scan_cost; if (add_work_estimate(estimated_preloop_work, work_estimate, max_work_estimate)) { CLIQUE_CUTS_DEBUG("extend_clique_vertices skip work_limit work=%g limit=%g", work_estimate == nullptr ? -1.0 : static_cast(*work_estimate), @@ -429,6 +467,8 @@ void extend_clique_vertices(std::vector& clique_vertices, break; } } + // Each check_adjacency now charges its own addtl_cliques_scan_cost + // term so the per-iteration budget reflects the addtl scan cost. if (add_work_estimate( adj_check_cost * static_cast(checks), work_estimate, max_work_estimate)) { break; @@ -942,8 +982,66 @@ std::vector> find_violated_odd_cycles_for_test( return result; } +namespace { + +// 64-bit integer mixer (SplitMix64). Used as the building block for the +// cousin filter's per-slot independent hash family. +inline uint64_t splitmix64_mix(uint64_t x) +{ + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; + x = x ^ (x >> 31); + return x; +} + +inline uint64_t hash64_with_seed(uint64_t value, uint64_t seed) +{ + return splitmix64_mix(value ^ (seed * 0xbf58476d1ce4e5b9ULL + 0x9e3779b97f4a7c15ULL)); +} + +} // namespace + +template +void cut_pool_t::compute_clique_minhash_sketch(const inequality_t& cut, + std::vector& sketch) const +{ + // Min-hash over the cut's column-support set. 
With clique_cousin_minhash_k_ + // independent random orderings of the variable index space, the expected + // number of agreements between two sketches is k * Jaccard(supp_a, supp_b), + // so sketch comparison estimates Jaccard in O(k) regardless of support + // sizes. + const i_t k = clique_cousin_minhash_k_; + sketch.assign(k, std::numeric_limits::max()); + const i_t nz = cut.size(); + for (i_t p = 0; p < nz; p++) { + const uint64_t j = static_cast(cut.index(p)); + for (i_t s = 0; s < k; s++) { + const uint64_t h = hash64_with_seed(j, static_cast(s)); + if (h < sketch[s]) { sketch[s] = h; } + } + } +} + +template +void cut_pool_t::rebuild_clique_cousin_buckets() +{ + // Buckets index CLIQUE rows by the first sketch hash. Compaction + // routines (check_for_duplicate_cuts) shift row indices, so they call + // this after the parallel sketch vector has been remapped to make + // sure bucket entries point to the post-compaction rows. + clique_cousin_buckets_.clear(); + const i_t m = static_cast(clique_support_minhash_.size()); + for (i_t i = 0; i < m; i++) { + if (clique_support_minhash_[i].empty()) { continue; } + const uint64_t key = clique_support_minhash_[i][0]; + clique_cousin_buckets_[key].push_back(i); + } +} + template -void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t& cut) +void cut_pool_t::add_cut(cut_type_t cut_type, + const inequality_t& cut, + f_t cut_score) { // TODO: Add fast duplicate check and only add if the cut is not already in the pool @@ -962,10 +1060,126 @@ void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t= clique_cousin_jaccard_tau_ + // with an existing pool entry, keep the higher-scoring representative. 
+ std::vector new_sketch; + i_t cousin_replace_row = -1; + bool cousin_invariant_path = false; + // Apply the size-tilt boost to the caller's score so larger cliques win + // ties on cousin replacement (more variables covered = more constraint + // strength; a proxy for "integer support" since clique vars are 0-1). + f_t effective_score = cut_score; + if (effective_score >= static_cast(0.0) && + clique_cousin_size_weight_ > static_cast(0.0) && cut_type == cut_type_t::CLIQUE) { + const f_t sz = static_cast(cut_squeezed.size()); + const f_t mult = + static_cast(1.0) + + clique_cousin_size_weight_ * static_cast(std::log2(1.0 + static_cast(sz))); + effective_score *= mult; + } + if (cut_type == cut_type_t::CLIQUE) { clique_inserts_++; } + if (cut_type == cut_type_t::CLIQUE && clique_cousin_filter_enable_ && + clique_cousin_minhash_k_ > 0) { + cousin_invariant_path = true; + compute_clique_minhash_sketch(cut_squeezed, new_sketch); + const uint64_t bucket_key = new_sketch[0]; + auto bucket_it = clique_cousin_buckets_.find(bucket_key); + if (bucket_it != clique_cousin_buckets_.end()) { + const i_t pool_size = cut_storage_.m; + const i_t k = clique_cousin_minhash_k_; + // Walk the bucket, computing min-hash agreement with each peer. + // Bucket sizes are O(distinct max-clique families per round) so + // this loop is short on every realistic instance even when the + // pool is large. + auto& bucket_rows = bucket_it->second; + for (size_t b = 0; b < bucket_rows.size(); b++) { + const i_t row = bucket_rows[b]; + if (row < 0 || row >= pool_size) { continue; } + if (static_cast(clique_support_minhash_[row].size()) != k) { continue; } + i_t agree = 0; + for (i_t s = 0; s < k; s++) { + if (clique_support_minhash_[row][s] == new_sketch[s]) { agree++; } + } + const f_t jaccard_est = static_cast(agree) / static_cast(k); + if (jaccard_est < clique_cousin_jaccard_tau_) { continue; } + // Cousin found. Compare scores; keep the better representative. 
+ const f_t existing_score = clique_cousin_score_[row]; + if (effective_score < static_cast(0.0)) { + // Caller did not supply a score — be conservative and drop the + // new cut; the existing entry stays as the bucket invariant + // winner ("first-write-wins" policy). + cousin_drops_++; + return; + } + if (effective_score <= existing_score) { + // Existing representative is at least as good; drop the new cut. + cousin_drops_++; + return; + } + // New cut beats the existing representative. We "soft-replace": + // clear the loser's sketch so future cousins don't anchor against + // it (restoring the bucket invariant for new inserts), and + // reroute the bucket entry to the new row below. The loser stays + // in cut_storage_ for now and will be filtered by the standard + // orthogonality scan in score_cuts() — main_baseline has no + // mid-pass eviction primitive, and adding one would invalidate + // the per-pass cut_pool_size accounting. + cousin_replace_row = row; + // Replace at most one peer per insert; a transitive cousin of + // the loser at the same bucket is filtered next time. Matches + // the SCIP / Mops "pairwise" family invariant. + break; + } + } + } + cut_storage_.append_row(cut_squeezed.vector); rhs_storage_.push_back(cut_squeezed.rhs); cut_type_.push_back(cut_type); cut_age_.push_back(0); + + // Keep the cousin-filter side tables sized like cut_storage_ regardless + // of cut type. Non-CLIQUE rows carry an empty sketch and a zero score; + // they are skipped by rebuild_clique_cousin_buckets(). + const i_t new_row = cut_storage_.m - 1; + clique_support_minhash_.resize(cut_storage_.m); + clique_cousin_score_.resize(cut_storage_.m, static_cast(0.0)); + if (cousin_invariant_path) { + clique_support_minhash_[new_row] = std::move(new_sketch); + clique_cousin_score_[new_row] = effective_score; + if (cousin_replace_row >= 0) { + // Reroute the bucket entry from the loser's row to the new row. + // Other peers in the same bucket (if any) keep their entries. 
+ const uint64_t bucket_key = clique_support_minhash_[new_row][0]; + auto& rows = clique_cousin_buckets_[bucket_key]; + bool replaced = false; + for (auto& r : rows) { + if (r == cousin_replace_row) { + r = new_row; + replaced = true; + break; + } + } + if (!replaced) { rows.push_back(new_row); } + // Clear the loser's sketch so it's a no-op on subsequent inserts + // and rebuild_clique_cousin_buckets() ignores it. Loser's row + // lingers in cut_storage_ until score_cuts compacts via dedup or + // filters via orthogonality. + clique_support_minhash_[cousin_replace_row].clear(); + clique_cousin_score_[cousin_replace_row] = static_cast(0.0); + cousin_replaces_++; + } else { + const uint64_t bucket_key = clique_support_minhash_[new_row][0]; + clique_cousin_buckets_[bucket_key].push_back(new_row); + } + } } template @@ -1140,17 +1354,35 @@ void cut_pool_t::check_for_duplicate_cuts() cut_storage_.remove_rows(cuts_to_remove, new_cut_storage); cut_storage_ = new_cut_storage; i_t write = 0; + // Cousin-filter parallel arrays: only meaningful when populated, but + // size them up before the loop so the row remap is uniform. + const bool cousin_active = !clique_support_minhash_.empty(); + if (cousin_active) { + clique_support_minhash_.resize(m); + clique_cousin_score_.resize(m, static_cast(0.0)); + } for (i_t i = 0; i < m; i++) { if (cuts_to_remove[i] == 0) { rhs_storage_[write] = rhs_storage_[i]; cut_type_[write] = cut_type_[i]; cut_age_[write] = cut_age_[i]; + if (cousin_active && write != i) { // self-move would empty the sketch + clique_support_minhash_[write] = std::move(clique_support_minhash_[i]); + clique_cousin_score_[write] = clique_cousin_score_[i]; + } write++; } } rhs_storage_.resize(write); cut_type_.resize(write); cut_age_.resize(write); + if (cousin_active) { + clique_support_minhash_.resize(write); + clique_cousin_score_.resize(write); + // Row indices changed — bucket entries point to old row IDs and + // would corrupt the next at-insert filter. Rebuild from scratch.
+ rebuild_clique_cousin_buckets(); + } } } @@ -2557,7 +2789,11 @@ bool cut_generation_t::generate_clique_cuts( inequality_t cut_inequality; cut_inequality.vector = cut; cut_inequality.rhs = cut_rhs; - cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality); + // Pass the LP violation as the cousin-filter score so add_cut can + // pick the stronger representative on a Jaccard collision (>= tau). + // build_clique_cut has already verified violation > min_violation. + const f_t cut_violation = cut_rhs - cut.dot(xstar); + cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality, cut_violation); #if DEBUG_CLIQUE_CUTS added_cuts++; CLIQUE_CUTS_DEBUG("generate_clique_cuts added cut nz=%lld rhs=%g clique_size=%lld", @@ -5399,6 +5635,123 @@ void verify_cuts_against_saved_solution(const csr_matrix_t& cuts, } } +// Cut-pool sweep configuration dispatch (declared in cuts.hpp). +// +// Driven by the same CUOPT_CONFIG_ID / CUOPT_MAX_CONFIG env vars the +// diversity manager uses (see diversity_manager.cu). One integer +// selects one of kCutSweepNumConfigs hard-coded cut-pool configurations. +// Caller side is just: +// CUOPT_MAX_CONFIG=5 CUOPT_CONFIG_ID=$id $RUN_MIP ... +// +// CUOPT_MAX_CONFIG is the caller's expected upper bound; when set +// we additionally range-check CUOPT_CONFIG_ID against it. +// CUOPT_CONFIG_ID unset / unparsable -> baseline (config 0). +// CUOPT_CONFIG_ID < 0 or >= valid range -> baseline + warning. +// +// Banner printf is gated to a single emission per process so B&B +// restarts (which re-construct cut_pool_t) don't spam the log.
+template +void apply_cut_sweep_config(cut_pool_t& cut_pool, + const simplex_solver_settings_t& settings) +{ + static std::atomic banner_emitted{false}; + + const char* env_config_id_raw = std::getenv("CUOPT_CONFIG_ID"); + int config_id = 0; // unset / unparsable -> baseline (config 0), no warning + if (env_config_id_raw != nullptr && env_config_id_raw[0] != '\0') { + try { + config_id = std::stoi(env_config_id_raw); + } catch (const std::exception&) { + config_id = 0; + } + } + + int max_config = kCutSweepNumConfigs; + const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG"); + if (env_max_config != nullptr && env_max_config[0] != '\0') { + try { + max_config = std::stoi(env_max_config); + } catch (const std::exception&) { + max_config = kCutSweepNumConfigs; + } + } + + if (config_id < 0 || config_id >= max_config || config_id >= kCutSweepNumConfigs) { + if (!banner_emitted.load()) { + settings.log.printf( + "CutPoolConfig WARN config_id=%d out of range [0,%d), falling back to baseline\n", + config_id, + std::min(max_config, kCutSweepNumConfigs)); + } + config_id = 0; + } + + // Defaults match cut_pool_t's initializers: cousin filter OFF, tau=0.85, + // k=8, size_weight=0.0. Each case below documents what it tweaks. + switch (config_id) { + case 0: + // 00_baseline_no_cousin: clique algorithmic changes only (8f2cf00a). + // Cousin filter disabled — isolates the impact of the + // build_clique_cut two-pass refactor and the addtl_cliques_scan_cost + // work-accounting. + cut_pool.set_clique_cousin_filter_enable(false); + break; + case 1: + // 01_cousin_default: P2-4 cousin filter on with the cut_scoring branch + // defaults (tau=0.85, k=8, no size tilt). Score is the caller-supplied + // violation; ties prefer the earlier insert.
+ cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 2: + // 02_cousin_strict: tighter Jaccard threshold (0.85 -> 0.70). Calls + // more cliques "cousins" so we drop / replace more aggressively. + cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.70)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 3: + // 03_cousin_loose: looser Jaccard threshold (0.85 -> 0.95). Closer + // to no-filter behavior. Gap should match config 0 if the cousin + // filter is mostly absorbing redundancy that the orthogonality + // scan would catch anyway. + cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.95)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); + break; + case 4: + // 04_cousin_size_tilt: cousin filter on at default tau=0.85, but the + // score used for cousin replacement is multiplied by + // (1 + 0.5 * log2(1 + clique_size)) + // so larger cliques win on ties / near-ties. For clique cuts every + // variable is binary, so clique size is the integer-support count + // — this is the "clique integer support" knob the user requested.
+ cut_pool.set_clique_cousin_filter_enable(true); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); + cut_pool.set_clique_cousin_minhash_k(8); + cut_pool.set_clique_cousin_size_weight(static_cast(0.5)); + break; + default: break; // unreachable due to range check above + } + + if (!banner_emitted.exchange(true)) { + settings.log.printf( + "CutPoolConfig id=%d name=%s clique_cousin_enable=%d clique_cousin_tau=%g " + "clique_cousin_k=%d clique_cousin_size_weight=%g\n", + config_id, + cut_sweep_config_name(config_id), + static_cast(cut_pool.clique_cousin_filter_enable() ? 1 : 0), + static_cast(cut_pool.clique_cousin_jaccard_tau()), + static_cast(cut_pool.clique_cousin_minhash_k()), + static_cast(cut_pool.clique_cousin_size_weight())); + std::fflush(stdout); + } +} + #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class cut_pool_t; template class cut_generation_t; @@ -5407,6 +5760,9 @@ template class tableau_equality_t; template class complemented_mixed_integer_rounding_cut_t; template class variable_bounds_t; +template void apply_cut_sweep_config( + cut_pool_t& cut_pool, const simplex_solver_settings_t& settings); + template int add_cuts(const simplex_solver_settings_t& settings, const csr_matrix_t& cuts, const std::vector& cut_rhs, diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index e3846c2c65..29aba8bbb4 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -299,9 +299,17 @@ class cut_pool_t { } // Add a cut in the form: cut'*x >= rhs. - // We expect that the cut is violated by the current relaxation xstar - // cut'*xstart < rhs - void add_cut(cut_type_t cut_type, const inequality_t& cut); + // We expect that the cut is violated by the current relaxation xstar. + // + // cut_score is an optional caller-supplied quality score used by the + // P2-4 clique cousin filter (only consulted for cut_type == CLIQUE + // when the cousin filter is enabled).
Pass a non-negative value to + // enable score-aware cousin replacement; the default (-1.0) reverts + // to "first-write-wins" cousin policy. Other cut types ignore this + // parameter. + void add_cut(cut_type_t cut_type, + const inequality_t& cut, + f_t cut_score = static_cast(-1.0)); // Backward-compatible scoring entry-point. Falls back to the legacy // geometric-distance / nnz-penalty score when bounds are not provided. @@ -345,6 +353,47 @@ class cut_pool_t { void set_pool_soft_limit(i_t v) { pool_soft_limit_ = v; } void set_max_parallelism(f_t v) { max_parallelism_ = v; } + // ----- P2-4 clique cousin filter knobs / counters ----------------------- + // + // The clique cut family (Bron-Kerbosch + extension) emits cousin + // cliques whose support sets agree in |k-1| of |k| vertices. The + // selection-stage orthogonality scan catches them but only after the + // full insert + dedup + score cost has been paid. The cousin filter + // intercepts at insert: we min-hash the cut's column-support set, + // bucket on the first sketch hash, and when an existing pool entry + // collides with estimated Jaccard >= jaccard_tau we keep the + // higher-scoring representative (or, if no score was supplied, the + // earlier-inserted one). + // + // Defaults: jaccard_tau=0.85, k=8, enable=false. Cousin filter is OFF + // by default so cut_pool_t behavior matches main_baselin (5335b659) + // unless apply_cut_sweep_config() explicitly turns it on. The numeric + // defaults (tau=0.85, k=8) match the cut_scoring branch's "final + // version" so config 1 here lines up with the P2-4 baseline measured + // there. 
+ void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } + void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } + void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } + void set_clique_cousin_size_weight(f_t v) { clique_cousin_size_weight_ = v; } + + bool clique_cousin_filter_enable() const { return clique_cousin_filter_enable_; } + f_t clique_cousin_jaccard_tau() const { return clique_cousin_jaccard_tau_; } + i_t clique_cousin_minhash_k() const { return clique_cousin_minhash_k_; } + f_t clique_cousin_size_weight() const { return clique_cousin_size_weight_; } + + // Per-pool tally for log lines (instance-level diagnostic). All three + // counters are reset by reset_cousin_stats() and incremented inside + // add_cut() / cousin replacement. + i_t cousin_drops() const { return cousin_drops_; } + i_t cousin_replaces() const { return cousin_replaces_; } + i_t clique_inserts() const { return clique_inserts_; } + void reset_cousin_stats() + { + cousin_drops_ = 0; + cousin_replaces_ = 0; + clique_inserts_ = 0; + } + private: f_t cut_distance(i_t row, const std::vector& x, f_t& cut_violation, f_t& cut_norm); f_t cut_density(i_t row); @@ -376,6 +425,17 @@ class cut_pool_t { return t == variable_type_t::INTEGER || t == variable_type_t::BINARY; } + // Cousin filter helpers. compute_clique_minhash_sketch() fills + // `sketch` (length = clique_cousin_minhash_k_) with k independent + // min-hashes over the cut's column-support set. Two sketches agree + // on slot s with probability Jaccard(supp_a, supp_b), so element-wise + // agreement count divided by k estimates the Jaccard similarity. + void compute_clique_minhash_sketch(const inequality_t& cut, + std::vector& sketch) const; + // Rebuilds clique_cousin_buckets_ from clique_support_minhash_ after + // any compaction that remaps row indices (e.g. dedup). 
+ void rebuild_clique_cousin_buckets(); + i_t original_vars_; const simplex_solver_settings_t& settings_; @@ -409,8 +469,80 @@ class cut_pool_t { f_t integer_support_weight_{0.1}; f_t full_support_penalty_{0.01}; std::unordered_map> support_hash_buckets_; + + // P2-4 cousin filter state. clique_support_minhash_ is sized in + // lock-step with cut_storage_; non-CLIQUE rows carry an empty + // sketch and are skipped by rebuild_clique_cousin_buckets() and the + // cousin loop in add_cut. clique_cousin_score_ holds the + // caller-supplied score (raw violation, or violation * size-tilt) so + // we can decide which representative to keep when two cliques + // collide. clique_cousin_buckets_ maps the first sketch hash to the + // list of pool rows whose sketches start with that hash. + std::vector> clique_support_minhash_; + std::vector clique_cousin_score_; + std::unordered_map> clique_cousin_buckets_; + f_t clique_cousin_jaccard_tau_{static_cast(0.85)}; + i_t clique_cousin_minhash_k_{8}; + bool clique_cousin_filter_enable_{false}; + // When > 0, the cousin filter's "score" used to pick a winner is + // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). + // This biases cousin replacement toward larger cliques (more variables + // covered, larger integer support). 0 disables the tilt. + f_t clique_cousin_size_weight_{static_cast(0.0)}; + + // Diagnostic counters reset at the start of each cut pass via + // reset_cousin_stats(). + i_t cousin_drops_{0}; + i_t cousin_replaces_{0}; + i_t clique_inserts_{0}; }; +// --------------------------------------------------------------------------- +// Cut-pool sweep configuration dispatch. +// +// Selected by the CUOPT_CONFIG_ID environment variable; range-checked +// against CUOPT_MAX_CONFIG (caller-asserted upper bound). 
One env-var +// dispatch covers the entire clique cut family because the only knobs +// we vary on this branch live on cut_pool_t (cousin filter on/off, +// Jaccard tau, integer-support size tilt). The deterministic +// measurement path (no concurrent root LP, no in-cut-pass RCS, exit +// after the cut loop) is unconditional and lives in branch_and_bound. +// +// Keep kCutSweepNumConfigs in sync with the switch table in +// apply_cut_sweep_config() (see cuts.cpp) and with cut_sweep_config_name() +// below. +// +// Layout: +// 0 baseline_no_cousin clique cut algorithmic changes only +// (cousin filter off; isolates 8f2cf00a impact) +// 1 cousin_default cousin filter on, tau=0.85, k=8, score=violation +// (the cut_scoring final-version P2-4 baseline) +// 2 cousin_strict cousin filter on, tau=0.70 (more aggressive +// cousin removal — favors quantity reduction) +// 3 cousin_loose cousin filter on, tau=0.95 (closer to no-filter +// extreme — selection-stage absorbs cousins) +// 4 cousin_size_tilt cousin filter on, tau=0.85, score = violation * +// (1 + 0.5 * log2(1 + clique_size)) — picks the +// larger clique on cousin replacement (integer +// support proxy, since clique vars are 0-1) +constexpr int kCutSweepNumConfigs = 5; + +inline const char* cut_sweep_config_name(int config_id) +{ + switch (config_id) { + case 0: return "00_baseline_no_cousin"; + case 1: return "01_cousin_default"; + case 2: return "02_cousin_strict"; + case 3: return "03_cousin_loose"; + case 4: return "04_cousin_size_tilt"; + default: return "unknown"; + } +} + +template +void apply_cut_sweep_config(cut_pool_t& cut_pool, + const simplex_solver_settings_t& settings); + template class knapsack_generation_t { public: diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu index 82462c11ce..deca5a46c3 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu +++ 
b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu @@ -20,6 +20,7 @@ #include "clique_table.cuh" #include +#include #include #include #include @@ -100,7 +101,6 @@ template void make_coeff_positive_knapsack_constraint( const dual_simplex::user_problem_t& problem, std::vector>& knapsack_constraints, - std::unordered_set& set_packing_constraints, typename mip_solver_settings_t::tolerances_t tolerances) { for (i_t i = 0; i < (i_t)knapsack_constraints.size(); i++) { @@ -125,7 +125,6 @@ void make_coeff_positive_knapsack_constraint( } knapsack_constraint.is_set_packing = all_coeff_are_equal; if (!all_coeff_are_equal) { knapsack_constraint.is_set_partitioning = false; } - if (knapsack_constraint.is_set_packing) { set_packing_constraints.insert(i); } cuopt_assert(knapsack_constraint.rhs >= 0, "RHS must be non-negative"); } } @@ -185,9 +184,8 @@ void fill_knapsack_constraints(const dual_simplex::user_problem_t& pro } // equality part else { - // For equality rows, partitioning status should not depend on raw rhs scale here. - // The exact set-packing/partitioning check is finalized later in - // make_coeff_positive_knapsack_constraint after coefficient normalization. + // Final partitioning check is done after coefficient normalization in + // make_coeff_positive_knapsack_constraint. bool is_set_partitioning = true; bool ranged_constraint = ranged_constraint_counter < problem.num_range_rows && problem.range_rows[ranged_constraint_counter] == i; @@ -203,8 +201,7 @@ void fill_knapsack_constraints(const dual_simplex::user_problem_t& pro } // greater than part: convert it to less than knapsack_constraint_t knapsack_constraint2; - // Mark synthetic rows from equality splitting with negative ids so they never alias real row - // indices (including rows appended later by clique extension). + // Negative ids prevent aliasing with real row indices. 
knapsack_constraint2.cstr_idx = -(added_constraints + 1); added_constraints++; knapsack_constraint2.rhs = -problem.rhs[i]; @@ -228,62 +225,51 @@ void remove_small_cliques(clique_table_t& clique_table, cuopt::timer_t i_t num_removed_first = 0; i_t num_removed_addtl = 0; std::vector to_delete(clique_table.first.size(), false); - // if a clique is small, we remove it from the cliques and add it to adjlist + std::vector> small_edges; + + // Demote sub-threshold first-cliques into pairwise edges. for (size_t clique_idx = 0; clique_idx < clique_table.first.size(); clique_idx++) { if (timer.check_time_limit()) { return; } const auto& clique = clique_table.first[clique_idx]; - if (clique.size() <= (size_t)clique_table.min_clique_size) { + if (clique.size() < (size_t)clique_table.min_clique_size) { for (size_t i = 0; i < clique.size(); i++) { for (size_t j = 0; j < clique.size(); j++) { if (i == j) { continue; } - clique_table.adj_list_small_cliques[clique[i]].insert(clique[j]); + small_edges.emplace_back(clique[i], clique[j]); } } num_removed_first++; to_delete[clique_idx] = true; } } + std::vector addtl_to_delete(clique_table.addtl_cliques.size(), false); for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); addtl_c++) { const auto& addtl_clique = clique_table.addtl_cliques[addtl_c]; const auto base_clique_idx = static_cast(addtl_clique.clique_idx); cuopt_assert(base_clique_idx < to_delete.size(), "Additional clique points to invalid base clique index"); - // Remove additional cliques whose base clique is scheduled for deletion. - if (to_delete[base_clique_idx]) { - // Materialize conflicts represented by: - // addtl_clique.vertex_idx + first[base_clique_idx][start_pos_on_clique:] - // before deleting both the additional and base clique entries. 
- for (size_t i = addtl_clique.start_pos_on_clique; - i < clique_table.first[base_clique_idx].size(); - i++) { - clique_table.adj_list_small_cliques[clique_table.first[base_clique_idx][i]].insert( - addtl_clique.vertex_idx); - clique_table.adj_list_small_cliques[addtl_clique.vertex_idx].insert( - clique_table.first[base_clique_idx][i]); - } - clique_table.addtl_cliques.erase(clique_table.addtl_cliques.begin() + addtl_c); - addtl_c--; - num_removed_addtl++; - continue; - } - i_t size_of_clique = + const bool drop_because_base = to_delete[base_clique_idx]; + const i_t extended_size = clique_table.first[base_clique_idx].size() - addtl_clique.start_pos_on_clique + 1; - if (size_of_clique < clique_table.min_clique_size) { - // the items from first clique are already added to the adjlist - // only add the items that are coming from the new var in the additional clique - for (size_t i = addtl_clique.start_pos_on_clique; - i < clique_table.first[base_clique_idx].size(); - i++) { - // insert conflicts both way - clique_table.adj_list_small_cliques[clique_table.first[base_clique_idx][i]].insert( - addtl_clique.vertex_idx); - clique_table.adj_list_small_cliques[addtl_clique.vertex_idx].insert( - clique_table.first[base_clique_idx][i]); - } - clique_table.addtl_cliques.erase(clique_table.addtl_cliques.begin() + addtl_c); - addtl_c--; - num_removed_addtl++; + const bool drop_because_small = extended_size < clique_table.min_clique_size; + if (!drop_because_base && !drop_because_small) { continue; } + + for (size_t i = addtl_clique.start_pos_on_clique; + i < clique_table.first[base_clique_idx].size(); + i++) { + const i_t base_member = clique_table.first[base_clique_idx][i]; + small_edges.emplace_back(base_member, addtl_clique.vertex_idx); + small_edges.emplace_back(addtl_clique.vertex_idx, base_member); } + addtl_to_delete[addtl_c] = true; + num_removed_addtl++; + } + { + size_t old_addtl_idx = 0; + auto addtl_it = std::remove_if(clique_table.addtl_cliques.begin(), + 
clique_table.addtl_cliques.end(), + [&](const auto&) { return addtl_to_delete[old_addtl_idx++]; }); + clique_table.addtl_cliques.erase(addtl_it, clique_table.addtl_cliques.end()); } CUOPT_LOG_DEBUG("Number of removed cliques from first: %d, additional: %d", num_removed_first, @@ -312,40 +298,46 @@ void remove_small_cliques(clique_table_t& clique_table, cuopt::timer_t (size_t)clique_table.min_clique_size, "A small clique remained after removing small cliques"); } - // Clique removals/edge materialization can change degrees; force recompute on next query. + clique_table.small_clique_adj.finalize_from_unsorted_pairs(2 * clique_table.n_variables, + small_edges); + // Force degree recompute after structural changes. std::fill(clique_table.var_degrees.begin(), clique_table.var_degrees.end(), -1); } template -std::unordered_set clique_table_t::get_adj_set_of_var(i_t var_idx) +std::unordered_set clique_table_t::get_adj_set_of_var(i_t var_idx) const { std::unordered_set adj_set; - for (const auto& clique_idx : var_clique_map_first[var_idx]) { - adj_set.insert(first[clique_idx].begin(), first[clique_idx].end()); - } - for (const auto& addtl_clique_idx : var_clique_map_addtl[var_idx]) { - adj_set.insert(addtl_cliques[addtl_clique_idx].vertex_idx); - adj_set.insert(first[addtl_cliques[addtl_clique_idx].clique_idx].begin() + - addtl_cliques[addtl_clique_idx].start_pos_on_clique, - first[addtl_cliques[addtl_clique_idx].clique_idx].end()); - } - // Reverse lookup for additional cliques using position map: - // if var_idx is in first[clique_idx][start_pos_on_clique:], it is adjacent to vertex_idx. 
- for (const auto& addtl : addtl_cliques) { - if (addtl.vertex_idx == var_idx) { continue; } - if (static_cast(addtl.clique_idx) < first_var_positions.size()) { - const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { - adj_set.insert(addtl.vertex_idx); - } + // First-clique edges: every member of each first-clique containing var_idx. + for (const i_t* it = var_clique_first.slice_begin(var_idx); + it != var_clique_first.slice_end(var_idx); + ++it) { + const auto& c = first[*it]; + adj_set.insert(c.begin(), c.end()); + } + + // Addtl-clique edges. + for (const i_t* it = var_clique_addtl.slice_begin(var_idx); + it != var_clique_addtl.slice_end(var_idx); + ++it) { + const auto& a = addtl_cliques[*it]; + if (a.vertex_idx == var_idx) { + // var_idx is the extension vertex; new neighbors are the base suffix. + const auto& base = first[a.clique_idx]; + adj_set.insert(base.begin() + a.start_pos_on_clique, base.end()); + } else { + // var_idx is a base member; only new edge is to the extension vertex. + adj_set.insert(a.vertex_idx); } } - for (const auto& adj_vertex : adj_list_small_cliques[var_idx]) { - adj_set.insert(adj_vertex); + for (const i_t* it = small_clique_adj.slice_begin(var_idx); + it != small_clique_adj.slice_end(var_idx); + ++it) { + adj_set.insert(*it); } + // Add the complement of var_idx to the adjacency set i_t complement_idx = (var_idx >= n_variables) ? 
(var_idx - n_variables) : (var_idx + n_variables); adj_set.insert(complement_idx); @@ -362,99 +354,58 @@ i_t clique_table_t::get_degree_of_var(i_t var_idx) } template -bool clique_table_t::check_adjacency(i_t var_idx1, i_t var_idx2) +bool clique_table_t::check_adjacency(i_t var_idx1, i_t var_idx2) const { if (var_idx1 == var_idx2) { return false; } if (var_idx1 % n_variables == var_idx2 % n_variables) { return true; } - { - auto it = adj_list_small_cliques.find(var_idx1); - if (it != adj_list_small_cliques.end() && it->second.count(var_idx2) > 0) { return true; } - } + // small_clique_adj is symmetric, so probe either direction. + if (small_clique_adj.slice_contains(var_idx1, var_idx2)) { return true; } - // Iterate whichever variable belongs to fewer first-cliques + // Probe through the var with the smaller var_clique_first slice. { i_t probe_var = var_idx1; i_t target_var = var_idx2; - if (var_clique_map_first[var_idx1].size() > var_clique_map_first[var_idx2].size()) { + if (var_clique_first.slice_size(var_idx1) > var_clique_first.slice_size(var_idx2)) { probe_var = var_idx2; target_var = var_idx1; } - for (const auto& clique_idx : var_clique_map_first[probe_var]) { - if (first_var_positions[clique_idx].count(target_var) > 0) { return true; } + for (const i_t* it = var_clique_first.slice_begin(probe_var); + it != var_clique_first.slice_end(probe_var); + ++it) { + if (first_var_positions[*it].count(target_var) > 0) { return true; } } } - for (const auto& addtl_idx : var_clique_map_addtl[var_idx1]) { - const auto& addtl = addtl_cliques[addtl_idx]; + for (const i_t* it = var_clique_addtl.slice_begin(var_idx1); + it != var_clique_addtl.slice_end(var_idx1); + ++it) { + const auto& addtl = addtl_cliques[*it]; const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx2); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { return true; } + auto pos_it = pos_map.find(var_idx2); + if (pos_it != pos_map.end() && 
pos_it->second >= addtl.start_pos_on_clique) { return true; } } - - for (const auto& addtl_idx : var_clique_map_addtl[var_idx2]) { - const auto& addtl = addtl_cliques[addtl_idx]; + for (const i_t* it = var_clique_addtl.slice_begin(var_idx2); + it != var_clique_addtl.slice_end(var_idx2); + ++it) { + const auto& addtl = addtl_cliques[*it]; const auto& pos_map = first_var_positions[addtl.clique_idx]; - auto it = pos_map.find(var_idx1); - if (it != pos_map.end() && it->second >= addtl.start_pos_on_clique) { return true; } + auto pos_it = pos_map.find(var_idx1); + if (pos_it != pos_map.end() && pos_it->second >= addtl.start_pos_on_clique) { return true; } } return false; } -// this function should only be called within extend clique -// if this is called outside extend clique, csr matrix should be converted into csc and copied into -// problem because the problem is partly modified -template -void insert_clique_into_problem(const std::vector& clique, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - f_t coeff_scale) -{ - // convert vertices into original vars - f_t rhs_offset = 0.; - std::vector new_vars; - std::vector new_coeffs; - for (size_t i = 0; i < clique.size(); i++) { - f_t coeff = coeff_scale; - i_t var_idx = clique[i]; - if (var_idx >= problem.num_cols) { - coeff = -coeff_scale; - var_idx = var_idx - problem.num_cols; - rhs_offset += coeff_scale; - } - new_vars.push_back(var_idx); - new_coeffs.push_back(coeff); - } - // coeff_scale * (1 - x) = coeff_scale - coeff_scale * x - // Move constants to the right, so rhs must decrease by rhs_offset. 
- f_t rhs = coeff_scale - rhs_offset; - // insert the new clique into the problem as a new constraint - dual_simplex::sparse_vector_t new_row(A.n, new_vars.size()); - new_row.i = std::move(new_vars); - new_row.x = std::move(new_coeffs); - A.append_row(new_row); - problem.row_sense.push_back('L'); - problem.rhs.push_back(rhs); - problem.row_names.push_back("Clique" + std::to_string(problem.row_names.size())); -} - +// Returns true on success; `work_out` accumulates scan/hash ops as a +// near-uniform wall-time proxy. template bool extend_clique(const std::vector& clique, clique_table_t& clique_table, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - f_t coeff_scale, - bool modify_problem, - i_t min_extension_gain, - i_t remaining_rows_budget, - i_t remaining_nnz_budget, - i_t& inserted_row_nnz) + double& work_out) { - inserted_row_nnz = 0; i_t smallest_degree = std::numeric_limits::max(); i_t smallest_degree_var = -1; - // find smallest degree vertex in the current set packing constraint for (size_t idx = 0; idx < clique.size(); idx++) { i_t var_idx = clique[idx]; i_t degree = clique_table.get_degree_of_var(var_idx); @@ -463,108 +414,58 @@ bool extend_clique(const std::vector& clique, smallest_degree_var = var_idx; } } - std::vector extension_candidates; + work_out += static_cast(clique.size()); + auto smallest_degree_adj_set = clique_table.get_adj_set_of_var(smallest_degree_var); + const double D = static_cast(smallest_degree_adj_set.size()); + work_out += D; + std::unordered_set clique_members(clique.begin(), clique.end()); + work_out += static_cast(clique.size()); + + std::vector extension_candidates; + extension_candidates.reserve(smallest_degree_adj_set.size()); for (const auto& candidate : smallest_degree_adj_set) { if (clique_members.find(candidate) == clique_members.end()) { extension_candidates.push_back(candidate); } } + work_out += D; + std::sort(extension_candidates.begin(), extension_candidates.end(), [&](i_t a, i_t b) { return 
clique_table.get_degree_of_var(a) > clique_table.get_degree_of_var(b); }); - auto new_clique = clique; - i_t n_of_complement_conflicts = 0; - i_t complement_conflict_var = -1; + const double C = static_cast(extension_candidates.size()); + if (C > 1.0) { work_out += C * std::log2(C); } + + auto new_clique = clique; for (size_t idx = 0; idx < extension_candidates.size(); idx++) { - i_t var_idx = extension_candidates[idx]; - bool add = true; - bool complement_conflict = false; - i_t complement_conflict_idx = -1; + i_t var_idx = extension_candidates[idx]; + bool add = true; for (size_t i = 0; i < new_clique.size(); i++) { - if (var_idx % clique_table.n_variables == new_clique[i] % clique_table.n_variables) { - complement_conflict = true; - complement_conflict_idx = var_idx % clique_table.n_variables; - } - // check if the tested variable conflicts with all vars in the new clique + work_out += 1.0; if (!clique_table.check_adjacency(var_idx, new_clique[i])) { add = false; break; } } - if (add) { - new_clique.push_back(var_idx); - if (complement_conflict) { - n_of_complement_conflicts++; - complement_conflict_var = complement_conflict_idx; - } - } + if (add) { new_clique.push_back(var_idx); } } - // if we found a larger cliqe, insert it into the formulation + if (new_clique.size() > clique.size()) { - if (n_of_complement_conflicts > 0) { - CUOPT_LOG_DEBUG("Found %d complement conflicts on var %d", - n_of_complement_conflicts, - complement_conflict_var); - cuopt_assert(n_of_complement_conflicts == 1, "There can only be one complement conflict"); - // Keep the discovered extension in the clique table for downstream dominance checks. 
- clique_table.first.push_back(new_clique); - for (const auto& var_idx : new_clique) { - clique_table.var_degrees[var_idx] = -1; - } - if (modify_problem) { - // fix all other variables other than complementing var - for (size_t i = 0; i < new_clique.size(); i++) { - if (new_clique[i] % clique_table.n_variables != complement_conflict_var) { - CUOPT_LOG_DEBUG("Fixing variable %d", new_clique[i]); - if (new_clique[i] >= problem.num_cols) { - cuopt_assert(problem.lower[new_clique[i] - problem.num_cols] != 0 || - problem.upper[new_clique[i] - problem.num_cols] != 0, - "Variable is fixed to other side"); - problem.lower[new_clique[i] - problem.num_cols] = 1; - problem.upper[new_clique[i] - problem.num_cols] = 1; - } else { - cuopt_assert(problem.lower[new_clique[i]] != 1 || problem.upper[new_clique[i]] != 1, - "Variable is fixed to other side"); - problem.lower[new_clique[i]] = 0; - problem.upper[new_clique[i]] = 0; - } - } - } - } - return true; - } else { - // Keep the discovered extension in the clique table even when row insertion is skipped by - // row/nnz budgets. - clique_table.first.push_back(new_clique); - for (const auto& var_idx : new_clique) { - clique_table.var_degrees[var_idx] = -1; - } + clique_table.first.push_back(new_clique); + for (const auto& var_idx : new_clique) { + clique_table.var_degrees[var_idx] = -1; + } + work_out += static_cast(new_clique.size()); #if DEBUG_KNAPSACK_CONSTRAINTS - CUOPT_LOG_DEBUG("Extended clique: %lu from %lu", new_clique.size(), clique.size()); + CUOPT_LOG_DEBUG("Extended clique: %lu from %lu", new_clique.size(), clique.size()); #endif - i_t extension_gain = static_cast(new_clique.size() - clique.size()); - if (extension_gain < min_extension_gain) { return true; } - if (remaining_rows_budget <= 0 || - remaining_nnz_budget < static_cast(new_clique.size())) { - return true; - } - // Row insertion is now deferred until dominance is confirmed against model rows. 
- // This keeps extension and replacement sequential: detect dominance first, then replace. - inserted_row_nnz = 0; - } + return true; } - return new_clique.size() > clique.size(); + return false; } -template -struct clique_sig_t { - i_t knapsack_idx; - i_t size; - long long signature; -}; - template struct extension_candidate_t { i_t knapsack_idx; @@ -572,19 +473,6 @@ struct extension_candidate_t { i_t clique_size; }; -template -bool compare_clique_sig(const clique_sig_t& a, const clique_sig_t& b) -{ - if (a.signature != b.signature) { return a.signature < b.signature; } - return a.size < b.size; -} - -template -bool compare_signature_value(long long value, const clique_sig_t& a) -{ - return value < a.signature; -} - template bool compare_extension_candidate(const extension_candidate_t& a, const extension_candidate_t& b) @@ -594,265 +482,28 @@ bool compare_extension_candidate(const extension_candidate_t& a, return a.knapsack_idx < b.knapsack_idx; } -template -bool is_sorted_subset(const std::vector& a, const std::vector& b) -{ - size_t i = 0; - size_t j = 0; - while (i < a.size() && j < b.size()) { - if (a[i] == b[j]) { - i++; - j++; - } else if (a[i] > b[j]) { - j++; - } else { - return false; - } - } - return i == a.size(); -} - -template -void fix_difference(const std::vector& superset, - const std::vector& subset, - dual_simplex::user_problem_t& problem) -{ - cuopt_assert(std::is_sorted(subset.begin(), subset.end()), - "subset vector passed to fix_difference is not sorted"); - for (auto var_idx : superset) { - if (std::binary_search(subset.begin(), subset.end(), var_idx)) { continue; } - if (var_idx >= problem.num_cols) { - i_t orig_idx = var_idx - problem.num_cols; - CUOPT_LOG_DEBUG("Fixing variable %d", orig_idx); - cuopt_assert(problem.lower[orig_idx] != 0 || problem.upper[orig_idx] != 0, - "Variable is fixed to other side"); - problem.lower[orig_idx] = 1; - problem.upper[orig_idx] = 1; - } else { - CUOPT_LOG_DEBUG("Fixing variable %d", var_idx); - 
cuopt_assert(problem.lower[var_idx] != 1 || problem.upper[var_idx] != 1, - "Variable is fixed to other side"); - problem.lower[var_idx] = 0; - problem.upper[var_idx] = 0; - } - } -} - -template -void remove_marked_elements(std::vector& vec, const std::vector& removal_marker) -{ - size_t write_idx = 0; - for (size_t i = 0; i < vec.size(); i++) { - if (!removal_marker[i]) { - if (write_idx != i) { vec[write_idx] = std::move(vec[i]); } - write_idx++; - } - } - vec.resize(write_idx); -} - -template -void remove_dominated_cliques_in_problem_for_single_extended_clique( - const std::vector& curr_clique, - f_t coeff_scale, - i_t remaining_rows_budget, - i_t remaining_nnz_budget, - i_t& inserted_row_nnz, - const std::vector>& sp_sigs, - const std::vector>& cstr_vars, - const std::vector>& knapsack_constraints, - std::vector& original_to_current_row_idx, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - cuopt::timer_t& timer) -{ - inserted_row_nnz = 0; - if (curr_clique.empty() || sp_sigs.empty()) { return; } - std::vector curr_clique_vars(curr_clique.begin(), curr_clique.end()); - std::sort(curr_clique_vars.begin(), curr_clique_vars.end()); - curr_clique_vars.erase(std::unique(curr_clique_vars.begin(), curr_clique_vars.end()), - curr_clique_vars.end()); - long long signature = 0; - for (auto v : curr_clique_vars) { - signature += static_cast(v); - } - constexpr size_t dominance_window = 20000; - auto end_it = - std::upper_bound(sp_sigs.begin(), sp_sigs.end(), signature, compare_signature_value); - size_t end = static_cast(std::distance(sp_sigs.begin(), end_it)); - size_t start = (end > dominance_window) ? 
(end - dominance_window) : 0; - std::vector rows_to_remove; - bool covering_clique_implied_by_partitioning = false; - for (size_t idx = end; idx > start; idx--) { - if (timer.check_time_limit()) { break; } - const auto& sp = sp_sigs[idx - 1]; - const auto& vars_sp = cstr_vars[sp.knapsack_idx]; - if (vars_sp.size() > curr_clique_vars.size()) { continue; } - cuopt_assert(std::is_sorted(vars_sp.begin(), vars_sp.end()), - "vars_sp vector passed to is_sorted_subset is not sorted"); - if (!is_sorted_subset(vars_sp, curr_clique_vars)) { continue; } - if (knapsack_constraints[sp.knapsack_idx].is_set_partitioning) { - if (vars_sp.size() != curr_clique_vars.size()) { - fix_difference(curr_clique_vars, vars_sp, problem); - covering_clique_implied_by_partitioning = true; - } - continue; - } - i_t original_row_idx = knapsack_constraints[sp.knapsack_idx].cstr_idx; - if (original_row_idx < 0) { continue; } - cuopt_assert(original_row_idx < static_cast(original_to_current_row_idx.size()), - "Invalid original row index in knapsack constraint"); - i_t current_row_idx = original_to_current_row_idx[original_row_idx]; - if (current_row_idx < 0) { continue; } - cuopt_assert(current_row_idx < static_cast(problem.row_sense.size()), - "Invalid current row index in row mapping"); - rows_to_remove.push_back(current_row_idx); - } - if (rows_to_remove.empty()) { return; } - std::sort(rows_to_remove.begin(), rows_to_remove.end()); - rows_to_remove.erase(std::unique(rows_to_remove.begin(), rows_to_remove.end()), - rows_to_remove.end()); - if (!covering_clique_implied_by_partitioning) { - if (remaining_rows_budget <= 0 || - remaining_nnz_budget < static_cast(curr_clique_vars.size())) { - return; - } - insert_clique_into_problem(curr_clique_vars, problem, A, coeff_scale); - inserted_row_nnz = static_cast(curr_clique_vars.size()); - } - std::vector removal_marker(problem.row_sense.size(), 0); - for (auto row_idx : rows_to_remove) { - cuopt_assert(row_idx >= 0 && row_idx < 
static_cast(removal_marker.size()), - "Invalid dominated row index"); - CUOPT_LOG_DEBUG("Removing dominated row %d", row_idx); - removal_marker[row_idx] = true; - } - dual_simplex::csr_matrix_t A_removed(0, 0, 0); - A.remove_rows(removal_marker, A_removed); - A = std::move(A_removed); - problem.num_rows = A.m; - remove_marked_elements(problem.row_sense, removal_marker); - remove_marked_elements(problem.rhs, removal_marker); - remove_marked_elements(problem.row_names, removal_marker); - cuopt_assert(problem.rhs.size() == problem.row_sense.size(), "rhs and row sense size mismatch"); - cuopt_assert(problem.row_names.size() == problem.rhs.size(), "row names and rhs size mismatch"); - cuopt_assert(problem.num_rows == static_cast(problem.rhs.size()), - "matrix and num rows mismatch after removal"); - if (!problem.range_rows.empty()) { - std::vector old_to_new_indices; - old_to_new_indices.reserve(removal_marker.size()); - i_t new_idx = 0; - for (size_t i = 0; i < removal_marker.size(); ++i) { - if (!removal_marker[i]) { - old_to_new_indices.push_back(new_idx++); - } else { - old_to_new_indices.push_back(-1); - } - } - std::vector new_range_rows; - std::vector new_range_values; - for (size_t i = 0; i < problem.range_rows.size(); ++i) { - i_t old_row = problem.range_rows[i]; - cuopt_assert(old_row >= 0 && old_row < static_cast(removal_marker.size()), - "Invalid row index in range_rows"); - if (!removal_marker[old_row]) { - i_t new_row = old_to_new_indices[old_row]; - cuopt_assert(new_row != -1, "Invalid new row index for ranged row renumbering"); - new_range_rows.push_back(new_row); - new_range_values.push_back(problem.range_value[i]); - } - } - problem.range_rows = std::move(new_range_rows); - problem.range_value = std::move(new_range_values); - } - problem.num_range_rows = static_cast(problem.range_rows.size()); - std::vector removed_prefix(removal_marker.size() + 1, 0); - for (size_t row_idx = 0; row_idx < removal_marker.size(); row_idx++) { - removed_prefix[row_idx + 
1] = - removed_prefix[row_idx] + static_cast(removal_marker[row_idx]); - } - for (i_t row_idx = 0; row_idx < static_cast(original_to_current_row_idx.size()); row_idx++) { - i_t current_row_idx = original_to_current_row_idx[row_idx]; - if (current_row_idx < 0) { continue; } - cuopt_assert(current_row_idx < static_cast(removal_marker.size()), - "Row index map is out of bounds"); - if (removal_marker[current_row_idx]) { - original_to_current_row_idx[row_idx] = -1; - } else { - original_to_current_row_idx[row_idx] = current_row_idx - removed_prefix[current_row_idx]; - } - } -} - -// Also known as clique merging. Infer larger clique constraints which allows inclusion of vars from -// other constraints. This only extends the original cliques in the formulation for now. -// TODO: consider a heuristic on how much of the cliques derived from knapsacks to include here +// Extends set-packing cliques. Soft floor: min_work; hard ceiling: max_work +// or `timer`. signal_extend only honored after min_work. template i_t extend_cliques(const std::vector>& knapsack_constraints, - const std::unordered_set& set_packing_constraints, clique_table_t& clique_table, - dual_simplex::user_problem_t& problem, - dual_simplex::csr_matrix_t& A, - bool modify_problem, cuopt::timer_t& timer, double* work_estimate_out, - double max_work_estimate) + double min_work, + double max_work, + std::atomic* signal_extend) { - constexpr i_t min_extension_gain = 2; - constexpr i_t extension_yield_window = 64; - constexpr i_t min_successes_per_window = 1; + constexpr i_t min_extension_gain = 2; double local_work = 0.0; double& work = work_estimate_out ? 
*work_estimate_out : local_work; - i_t base_rows = A.m; - i_t base_nnz = A.row_start[A.m]; - i_t max_added_rows = std::max(8, base_rows / 50); - i_t max_added_nnz = std::max(8 * clique_table.max_clique_size_for_extension, base_nnz / 50); - - i_t added_rows = 0; - i_t added_nnz = 0; - i_t window_attempts = 0; - i_t window_successes = 0; - - CUOPT_LOG_DEBUG("Clique extension heuristics: min_gain=%d row_budget=%d nnz_budget=%d", - min_extension_gain, - max_added_rows, - max_added_nnz); - std::vector> cstr_vars(knapsack_constraints.size()); - std::vector> sp_sigs; - sp_sigs.reserve(set_packing_constraints.size()); - for (const auto knapsack_idx : set_packing_constraints) { - cuopt_assert(knapsack_idx >= 0 && knapsack_idx < static_cast(knapsack_constraints.size()), - "Invalid set packing constraint index"); - const auto& vars = knapsack_constraints[knapsack_idx].entries; - cstr_vars[knapsack_idx].reserve(vars.size()); - for (const auto& entry : vars) { - cstr_vars[knapsack_idx].push_back(entry.col); - } - std::sort(cstr_vars[knapsack_idx].begin(), cstr_vars[knapsack_idx].end()); - cstr_vars[knapsack_idx].erase( - std::unique(cstr_vars[knapsack_idx].begin(), cstr_vars[knapsack_idx].end()), - cstr_vars[knapsack_idx].end()); - long long signature = 0; - for (auto v : cstr_vars[knapsack_idx]) { - signature += static_cast(v); - } - sp_sigs.push_back({knapsack_idx, static_cast(cstr_vars[knapsack_idx].size()), signature}); - work += cstr_vars[knapsack_idx].size(); - } - if (work > max_work_estimate) { return 0; } - std::sort(sp_sigs.begin(), sp_sigs.end(), compare_clique_sig); - std::vector original_to_current_row_idx(problem.row_sense.size(), -1); - for (i_t row_idx = 0; row_idx < static_cast(original_to_current_row_idx.size()); row_idx++) { - original_to_current_row_idx[row_idx] = row_idx; - } std::vector> extension_worklist; extension_worklist.reserve(knapsack_constraints.size()); for (i_t knapsack_idx = 0; knapsack_idx < static_cast(knapsack_constraints.size()); 
knapsack_idx++) { if (timer.check_time_limit()) { break; } - if (work > max_work_estimate) { break; } + if (work >= max_work) { break; } const auto& knapsack_constraint = knapsack_constraints[knapsack_idx]; if (!knapsack_constraint.is_set_packing) { continue; } i_t clique_size = static_cast(knapsack_constraint.entries.size()); @@ -864,99 +515,93 @@ i_t extend_cliques(const std::vector>& knapsack_ i_t estimated_gain = std::max(0, smallest_degree - (clique_size - 1)); if (estimated_gain < min_extension_gain) { continue; } extension_worklist.push_back({knapsack_idx, estimated_gain, clique_size}); - work += knapsack_constraint.entries.size(); + work += static_cast(knapsack_constraint.entries.size()); } std::stable_sort( extension_worklist.begin(), extension_worklist.end(), compare_extension_candidate); + if (!extension_worklist.empty()) { + work += static_cast(extension_worklist.size()) * + std::log2(static_cast(extension_worklist.size())); + } CUOPT_LOG_DEBUG("Clique extension candidates after scoring: %zu", extension_worklist.size()); i_t n_extended_cliques = 0; for (const auto& candidate : extension_worklist) { if (timer.check_time_limit()) { break; } - if (work > max_work_estimate) { break; } - if (added_rows >= max_added_rows || added_nnz >= max_added_nnz) { - CUOPT_LOG_DEBUG( - "Stopping clique extension: budget reached (rows=%d nnz=%d)", added_rows, added_nnz); - break; + if (work >= min_work) { + if (work >= max_work) { break; } + if (signal_extend && signal_extend->load(std::memory_order_acquire)) { + CUOPT_LOG_DEBUG("Stopping clique extension: cut-pass signal received (work=%.0f)", work); + break; + } } - window_attempts++; const auto& knapsack_constraint = knapsack_constraints[candidate.knapsack_idx]; std::vector clique; + clique.reserve(knapsack_constraint.entries.size()); for (const auto& entry : knapsack_constraint.entries) { clique.push_back(entry.col); } - i_t inserted_row_nnz = 0; - f_t coeff_scale = knapsack_constraint.entries[0].val; - bool 
extended_clique = extend_clique(clique, - clique_table, - problem, - A, - coeff_scale, - modify_problem, - min_extension_gain, - max_added_rows - added_rows, - max_added_nnz - added_nnz, - inserted_row_nnz); - work += clique.size() * clique.size(); - if (extended_clique) { - n_extended_cliques++; - i_t replacement_row_nnz = 0; - if (modify_problem) { - remove_dominated_cliques_in_problem_for_single_extended_clique(clique_table.first.back(), - coeff_scale, - max_added_rows - added_rows, - max_added_nnz - added_nnz, - replacement_row_nnz, - sp_sigs, - cstr_vars, - knapsack_constraints, - original_to_current_row_idx, - problem, - A, - timer); - } - if (replacement_row_nnz > 0) { - window_successes++; - added_rows++; - added_nnz += replacement_row_nnz; - } - } - if (window_attempts >= extension_yield_window) { - if (window_successes < min_successes_per_window) { - CUOPT_LOG_DEBUG( - "Stopping clique extension: low yield (%d/%d)", window_successes, window_attempts); - break; - } - window_attempts = 0; - window_successes = 0; - } + if (extend_clique(clique, clique_table, work)) { n_extended_cliques++; } } - if (modify_problem) { - // copy modified matrix back to problem - A.to_compressed_col(problem.A); - } - CUOPT_LOG_DEBUG("Number of extended cliques: %d", n_extended_cliques); + CUOPT_LOG_DEBUG("Number of extended cliques: %d (work=%.0f)", n_extended_cliques, work); return n_extended_cliques; } template void fill_var_clique_maps(clique_table_t& clique_table) { - clique_table.first_var_positions.resize(clique_table.first.size()); + const i_t n_vertices = 2 * clique_table.n_variables; + + // first_var_positions: per-clique hash map (cliques small ⇒ hash beats binary search). 
+ clique_table.first_var_positions.assign(clique_table.first.size(), {}); + + std::vector> first_pairs; + size_t total_first_members = 0; + for (const auto& c : clique_table.first) { + total_first_members += c.size(); + } + first_pairs.reserve(total_first_members); + for (size_t clique_idx = 0; clique_idx < clique_table.first.size(); clique_idx++) { const auto& clique = clique_table.first[clique_idx]; auto& pos_map = clique_table.first_var_positions[clique_idx]; pos_map.reserve(clique.size()); for (size_t idx = 0; idx < clique.size(); idx++) { - i_t var_idx = clique[idx]; - clique_table.var_clique_map_first[var_idx].insert(clique_idx); + const i_t var_idx = clique[idx]; + first_pairs.emplace_back(var_idx, static_cast(clique_idx)); pos_map[var_idx] = static_cast(idx); } } - for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); addtl_c++) { - const auto& addtl_clique = clique_table.addtl_cliques[addtl_c]; - clique_table.var_clique_map_addtl[addtl_clique.vertex_idx].insert(addtl_c); + clique_table.var_clique_first.finalize_from_unsorted_pairs(n_vertices, first_pairs); + + std::vector> addtl_pairs; + for (size_t addtl_c = 0; addtl_c < clique_table.addtl_cliques.size(); ++addtl_c) { + const auto& a = clique_table.addtl_cliques[addtl_c]; + addtl_pairs.emplace_back(a.vertex_idx, static_cast(addtl_c)); + const auto& base = clique_table.first[a.clique_idx]; + for (i_t pos = a.start_pos_on_clique; pos < static_cast(base.size()); ++pos) { + addtl_pairs.emplace_back(base[pos], static_cast(addtl_c)); + } + } + clique_table.var_clique_addtl.finalize_from_unsorted_pairs(n_vertices, addtl_pairs); +} + +template +void clique_table_t::set_small_clique_adj_for_test( + const std::unordered_map>& edges) +{ + std::vector> pairs; + size_t total = 0; + for (const auto& kv : edges) { + total += kv.second.size(); + } + pairs.reserve(total); + for (const auto& kv : edges) { + for (const auto& v : kv.second) { + pairs.emplace_back(kv.first, v); + } } + 
small_clique_adj.finalize_from_unsorted_pairs(2 * n_variables, pairs); } template @@ -972,12 +617,10 @@ void build_clique_table(const dual_simplex::user_problem_t& problem, cuopt_assert(problem.var_types.size() == static_cast(problem.num_cols), "Problem variable types size mismatch"); std::vector> knapsack_constraints; - std::unordered_set set_packing_constraints; dual_simplex::csr_matrix_t A(problem.num_rows, problem.num_cols, 0); problem.A.to_compressed_row(A); fill_knapsack_constraints(problem, knapsack_constraints, A); - make_coeff_positive_knapsack_constraint( - problem, knapsack_constraints, set_packing_constraints, tolerances); + make_coeff_positive_knapsack_constraint(problem, knapsack_constraints, tolerances); sort_csr_by_constraint_coefficients(knapsack_constraints); clique_table.tolerances = tolerances; for (const auto& knapsack_constraint : knapsack_constraints) { @@ -1035,7 +678,6 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, typename mip_solver_settings_t::tolerances_t tolerances, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, - bool modify_problem, std::atomic* signal_extend) { cuopt::timer_t stage_timer(std::numeric_limits::infinity()); @@ -1050,15 +692,13 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, double t_remove = 0.; #endif std::vector> knapsack_constraints; - std::unordered_set set_packing_constraints; dual_simplex::csr_matrix_t A(problem.num_rows, problem.num_cols, 0); problem.A.to_compressed_row(A); fill_knapsack_constraints(problem, knapsack_constraints, A); #ifdef DEBUG_CLIQUE_TABLE t_fill = stage_timer.elapsed_time(); #endif - make_coeff_positive_knapsack_constraint( - problem, knapsack_constraints, set_packing_constraints, tolerances); + make_coeff_positive_knapsack_constraint(problem, knapsack_constraints, tolerances); #ifdef DEBUG_CLIQUE_TABLE t_coeff = stage_timer.elapsed_time(); #endif @@ -1083,9 +723,9 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, double 
time_limit_for_additional_cliques = timer.remaining_time() / 2; cuopt::timer_t additional_cliques_timer(time_limit_for_additional_cliques); double find_work_estimate = 0.0; + // Always build base cliques in full; signal_extend only gates the extension phase. for (const auto& knapsack_constraint : knapsack_constraints) { if (timer.check_time_limit()) { break; } - if (signal_extend && signal_extend->load(std::memory_order_acquire)) { break; } find_cliques_from_constraint(knapsack_constraint, *clique_table_ptr, additional_cliques_timer); find_work_estimate += knapsack_constraint.entries.size(); } @@ -1105,17 +745,15 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, t_maps = stage_timer.elapsed_time(); #endif if (clique_table_out != nullptr) { *clique_table_out = std::move(clique_table_shared); } - double extend_work = 0.0; - constexpr double max_extend_work = 2e9; - i_t n_extended_cliques = extend_cliques(knapsack_constraints, - set_packing_constraints, + double extend_work = 0.0; + i_t n_extended_cliques = extend_cliques(knapsack_constraints, *clique_table_ptr, - problem, - A, - modify_problem, timer, &extend_work, - max_extend_work); + clique_config.min_extend_work, + clique_config.max_extend_work, + signal_extend); + if (n_extended_cliques > 0) { fill_var_clique_maps(*clique_table_ptr); } #ifdef DEBUG_CLIQUE_TABLE t_extend = stage_timer.elapsed_time(); CUOPT_LOG_DEBUG( @@ -1134,21 +772,21 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, #endif } -#define INSTANTIATE(F_TYPE) \ - template void find_initial_cliques( \ - dual_simplex::user_problem_t & problem, \ - typename mip_solver_settings_t::tolerances_t tolerances, \ - std::shared_ptr> * clique_table_out, \ - cuopt::timer_t & timer, \ - bool modify_problem, \ - std::atomic* signal_extend); \ - template void build_clique_table( \ - const dual_simplex::user_problem_t& problem, \ - clique_table_t& clique_table, \ - typename mip_solver_settings_t::tolerances_t tolerances, \ - bool 
remove_small_cliques_flag, \ - bool fill_var_clique_maps_flag, \ - cuopt::timer_t& timer); \ +#define INSTANTIATE(F_TYPE) \ + template void find_initial_cliques( \ + dual_simplex::user_problem_t & problem, \ + typename mip_solver_settings_t::tolerances_t tolerances, \ + std::shared_ptr> * clique_table_out, \ + cuopt::timer_t & timer, \ + std::atomic * signal_extend); \ + template void build_clique_table( \ + const dual_simplex::user_problem_t& problem, \ + clique_table_t& clique_table, \ + typename mip_solver_settings_t::tolerances_t tolerances, \ + bool remove_small_cliques_flag, \ + bool fill_var_clique_maps_flag, \ + cuopt::timer_t& timer); \ + template void fill_var_clique_maps(clique_table_t & clique_table); \ template class clique_table_t; #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh index 944241b4f0..10ad4e3942 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include namespace cuopt::linear_programming::detail { @@ -33,6 +35,10 @@ namespace cuopt::linear_programming::detail { struct clique_config_t { int min_clique_size = 512; int max_clique_size_for_extension = 128; + // extend_cliques work budget; one unit ≈ one hash/scan op in extend_clique. + // Soft floor before honoring cut-gen signal; hard ceiling. + double min_extend_work = 1e7; + double max_extend_work = 2e9; }; template @@ -59,37 +65,138 @@ struct addtl_clique_t { i_t start_pos_on_clique; }; +// CSR per-vertex map: for v in [0, n_vertices), `indices[offsets[v] .. +// offsets[v+1])` is a sorted slice. Build protocol: callers push (src, value) +// pairs and call `finalize_from_unsorted_pairs`. 
+template +struct csr_var_map_t { + std::vector offsets; // size: n_vertices + 1; offsets[v] is the start in `indices` + std::vector indices; // sorted within each [offsets[v], offsets[v+1]) slice + + void clear_and_resize(i_t n_vertices) + { + offsets.assign(n_vertices + 1, 0); + indices.clear(); + } + i_t n_keys() const { return offsets.empty() ? 0 : static_cast(offsets.size() - 1); } + i_t slice_size(i_t v) const { return offsets[v + 1] - offsets[v]; } + const i_t* slice_begin(i_t v) const { return indices.data() + offsets[v]; } + const i_t* slice_end(i_t v) const { return indices.data() + offsets[v + 1]; } + // O(1) summary used by cut/extension cost-budget heuristics. + double avg_slice_size() const + { + const i_t k = n_keys(); + return k > 0 ? static_cast(indices.size()) / static_cast(k) : 0.0; + } + bool slice_contains(i_t v, i_t value) const + { + const i_t* b = slice_begin(v); + const i_t* e = slice_end(v); + return std::binary_search(b, e, value); + } + + // Build CSR from unsorted (src, value) pairs. Each output slice is sorted + // and deduplicated. Caller must keep p.first in [0, n_vertices). + void finalize_from_unsorted_pairs(i_t n_vertices, std::vector>& pairs) + { + offsets.assign(n_vertices + 1, 0); + for (const auto& p : pairs) { + offsets[p.first + 1]++; + } + for (i_t v = 1; v <= n_vertices; ++v) { + offsets[v] += offsets[v - 1]; + } + indices.assign(static_cast(offsets.back()), i_t{0}); + std::vector head(n_vertices, 0); + for (const auto& p : pairs) { + indices[offsets[p.first] + head[p.first]++] = p.second; + } + for (i_t v = 0; v < n_vertices; ++v) { + auto* b = indices.data() + offsets[v]; + auto* e = indices.data() + offsets[v] + head[v]; + std::sort(b, e); + auto* new_end = std::unique(b, e); + head[v] = static_cast(new_end - b); + } + // Compact away dedupe holes. 
+ std::vector new_offsets(n_vertices + 1, 0); + for (i_t v = 0; v < n_vertices; ++v) { + new_offsets[v + 1] = new_offsets[v] + head[v]; + } + if (new_offsets.back() != offsets.back()) { + std::vector new_indices(static_cast(new_offsets.back())); + for (i_t v = 0; v < n_vertices; ++v) { + std::copy(indices.data() + offsets[v], + indices.data() + offsets[v] + head[v], + new_indices.data() + new_offsets[v]); + } + offsets = std::move(new_offsets); + indices = std::move(new_indices); + } else { + offsets = std::move(new_offsets); + } + } +}; + template struct clique_table_t { clique_table_t(i_t n_vertices, i_t min_clique_size_, i_t max_clique_size_for_extension_) : min_clique_size(min_clique_size_), max_clique_size_for_extension(max_clique_size_for_extension_), - var_clique_map_first(n_vertices), - var_clique_map_addtl(n_vertices), - adj_list_small_cliques(n_vertices), var_degrees(n_vertices, -1), n_variables(n_vertices / 2) { + var_clique_first.clear_and_resize(n_vertices); + var_clique_addtl.clear_and_resize(n_vertices); + small_clique_adj.clear_and_resize(n_vertices); } - std::unordered_set get_adj_set_of_var(i_t var_idx); + // Copy disabled; move provided so tests can return by value. + // Move-assign omitted because of const members. 
+ clique_table_t(const clique_table_t&) = delete; + clique_table_t& operator=(const clique_table_t&) = delete; + + clique_table_t(clique_table_t&& other) noexcept + : first(std::move(other.first)), + addtl_cliques(std::move(other.addtl_cliques)), + var_clique_first(std::move(other.var_clique_first)), + var_clique_addtl(std::move(other.var_clique_addtl)), + first_var_positions(std::move(other.first_var_positions)), + small_clique_adj(std::move(other.small_clique_adj)), + var_degrees(std::move(other.var_degrees)), + n_variables(other.n_variables), + min_clique_size(other.min_clique_size), + max_clique_size_for_extension(other.max_clique_size_for_extension), + tolerances(other.tolerances) + { + } + + clique_table_t& operator=(clique_table_t&&) = delete; + + std::unordered_set get_adj_set_of_var(i_t var_idx) const; i_t get_degree_of_var(i_t var_idx); - bool check_adjacency(i_t var_idx1, i_t var_idx2); + bool check_adjacency(i_t var_idx1, i_t var_idx2) const; + bool empty() const + { + return first.empty() && addtl_cliques.empty() && small_clique_adj.indices.empty(); + } + + void set_small_clique_adj_for_test(const std::unordered_map>& edges); // keeps the large cliques in each constraint std::vector> first; // keeps the additional cliques std::vector> addtl_cliques; - // TODO figure out the performance of lookup for the following: unordered_set vs vector - // keeps the indices of original(first) cliques that contain variable x - std::vector> var_clique_map_first; - // keeps the indices of additional cliques that contain variable x - std::vector> var_clique_map_addtl; + // var_idx → indices of `first` cliques that contain var_idx (CSR). + csr_var_map_t var_clique_first; + // var_idx → indices of `addtl_cliques` containing var_idx (as the extension + // vertex or as a base-suffix member). 
+ csr_var_map_t var_clique_addtl; // var_idx -> position mapping for each first clique, enabling O(1) membership/position checks std::vector> first_var_positions; - // adjacency list to keep small cliques, this basically keeps the vars share a small clique - // constraint - std::unordered_map> adj_list_small_cliques; + // var_idx → pairwise edges from cliques demoted by remove_small_cliques. + // Symmetric: edge (u, v) appears in both u's and v's slices. + csr_var_map_t small_clique_adj; // degrees of each vertex std::vector var_degrees; // number of variables in the original problem @@ -104,7 +211,6 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, typename mip_solver_settings_t::tolerances_t tolerances, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, - bool modify_problem, std::atomic* signal_extend = nullptr); template @@ -115,6 +221,9 @@ void build_clique_table(const dual_simplex::user_problem_t& problem, bool fill_var_clique_maps, cuopt::timer_t& timer); +template +void fill_var_clique_maps(clique_table_t& clique_table); + } // namespace cuopt::linear_programming::detail // Possible application to rounding procedure, keeping it as reference diff --git a/cpp/tests/mip/cuts_test.cu b/cpp/tests/mip/cuts_test.cu index 7eec8847c9..9e56e32361 100644 --- a/cpp/tests/mip/cuts_test.cu +++ b/cpp/tests/mip/cuts_test.cu @@ -1111,7 +1111,7 @@ TEST(cuts, clique_phase1_remove_small_cliques_preserves_addtl_conflicts) EXPECT_TRUE(clique_table.first.empty()); EXPECT_TRUE(clique_table.addtl_cliques.empty()); - // Conflicts must remain materialized in adj_list_small_cliques after removals. + // Conflicts must remain materialized in small_clique_adj after removals. 
EXPECT_TRUE(clique_table.check_adjacency(1, 3)); EXPECT_TRUE(clique_table.check_adjacency(3, 1)); EXPECT_TRUE(clique_table.check_adjacency(2, 3)); @@ -1290,7 +1290,7 @@ TEST(cuts, clique_neos8_phase1_addtl_suffix_conflicts_materialized) TEST(cuts, clique_neos8_phase1_symmetry_and_degree_cache_consistency) { auto& clique_table = get_neos8_clique_table_cached(); - const int n_vertices = static_cast(clique_table.var_clique_map_first.size()); + const int n_vertices = static_cast(clique_table.var_clique_first.n_keys()); ASSERT_GT(n_vertices, 0); const int sample_size = std::min(n_vertices, 24); From bc5006f66f3b913ff41202f1d7d265dd4c8eadfa Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 13 May 2026 17:16:09 +0200 Subject: [PATCH 08/47] remove deterministic guards --- cpp/src/branch_and_bound/branch_and_bound.cpp | 51 +++++-------------- cpp/src/cuts/cuts.hpp | 16 +++--- cpp/src/mip_heuristics/solver.cu | 39 +++++++------- 3 files changed, 37 insertions(+), 69 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 5aefa6463d..f760e1d47c 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2218,12 +2218,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } cut_pool_t cut_pool(original_lp_.num_cols, settings_); - // Apply CUOPT_CONFIG_ID sweep override (5 configs; see cuts.cpp). - // Mutates `cut_pool` knobs only (clique cousin filter on/off, Jaccard - // tau, integer-support size tilt). No-op when CUOPT_CONFIG_ID is unset - // / out of range. The deterministic measurement path (no concurrent - // root LP, no in-cut-pass RCS, exit-after-cuts) is unconditional. 
- apply_cut_sweep_config(cut_pool, settings_); cut_generation_t cut_generation(cut_pool, original_lp_, settings_, @@ -2373,13 +2367,18 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut return mip_status_t::NUMERICAL; } - // In-cut-pass reduced-cost strengthening is disabled on this - // branch: the branch exists only to produce a deterministic - // gap-closed-by-cuts baseline, and primal-driven bound - // tightening makes the per-pass cut yield depend on the timing - // of heuristic-found incumbents (non-deterministic across - // reruns). - // Original block intentionally left out. + if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) { + mutex_upper_.lock(); + last_upper_bound = upper_bound_.load(); + std::vector lower_bounds; + std::vector upper_bounds; + find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds); + mutex_upper_.unlock(); + mutex_original_lp_.lock(); + original_lp_.lower = lower_bounds; + original_lp_.upper = upper_bounds; + mutex_original_lp_.unlock(); + } // Try to do bound strengthening std::vector bounds_changed(original_lp_.num_cols, true); @@ -2565,31 +2564,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut original_lp_.A.col_start[original_lp_.A.n]); } - // Stop here. The cut loop has finished, the post-cut root LP value - // has been published to benchmark_info_t (just above), and the - // cut-info summary has been printed. This branch exists only to - // measure gap-closed-by-cuts, so we return before strong branching - // / B&B exploration. The early-exit point matches the cut_scoring - // branch so MIPLIBGapStat numbers from both branches line up - // exactly. 
- settings_.log.printf( - "CutBench: cut generation complete (max_passes=%d, pool=%d, time=%.3fs), " - "exiting before strong branching / B&B exploration\n", - static_cast(settings_.max_cut_passes), - static_cast(cut_pool_size), - static_cast(cut_generation_time)); - finish_clique_thread(); - solver_status_ = mip_status_t::TIME_LIMIT; - set_final_solution(solution, root_objective_); - return solver_status_; - - // The B&B exploration that normally follows cut generation is - // intentionally dead-coded out on this branch. Kept under #if 0 so - // the original control-flow stays visible to anyone diffing against - // upstream main, and so reverting the branch back to a normal - // solver only requires deleting the early-return above and the - // matching #if 0 / #endif markers. -#if 0 set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); pc_.resize(original_lp_.num_cols); @@ -2748,7 +2722,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } set_final_solution(solution, lower_bound); return solver_status_; -#endif // dead-coded B&B exploration; see #if 0 marker above } // ============================================================================ diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 632708dfd0..ebd1d320e9 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -326,12 +326,12 @@ class cut_pool_t { // higher-scoring representative (or, if no score was supplied, the // earlier-inserted one). // - // Defaults: jaccard_tau=0.85, k=8, enable=false. Cousin filter is OFF - // by default so cut_pool_t behavior matches main_baselin (5335b659) - // unless apply_cut_sweep_config() explicitly turns it on. The numeric - // defaults (tau=0.85, k=8) match the cut_scoring branch's "final - // version" so config 1 here lines up with the P2-4 baseline measured - // there. + // Defaults: jaccard_tau=0.95, k=8, enable=true, size_weight=0.0. 
+ // These match "config 3 / cousin_loose" from the clique-sweep on + // commit 0b04683b — the configuration that won the gap-closed-pct + // comparison and was promoted to be the production default for the + // clique cut family. Callers can still override at runtime via + // set_clique_cousin_* if they want to experiment. void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } @@ -398,9 +398,9 @@ class cut_pool_t { std::vector> clique_support_minhash_; std::vector clique_cousin_score_; std::unordered_map> clique_cousin_buckets_; - f_t clique_cousin_jaccard_tau_{static_cast(0.85)}; + f_t clique_cousin_jaccard_tau_{static_cast(0.95)}; i_t clique_cousin_minhash_k_{8}; - bool clique_cousin_filter_enable_{false}; + bool clique_cousin_filter_enable_{true}; // When > 0, the cousin filter's "score" used to pick a winner is // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). // This biases cousin replacement toward larger cliques (more variables diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 3a2ae1e02a..83ecf2e315 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -416,14 +416,7 @@ solution_t mip_solver_t::run_solver() // Set the primal heuristics -> branch and bound callback if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { - // Force single-threaded dual simplex at root so the root LP - // value (and therefore the cut-pass starting point) is - // deterministic across reruns. The concurrent racer would - // otherwise pick PDLP or DS as the winner non-deterministically - // and the post-cut gap-closed metric would drift. This branch - // is for gap measurement only, so we make it deterministic by - // default rather than gating on an env var. 
- branch_and_bound->set_concurrent_lp_root_solve(false); + branch_and_bound->set_concurrent_lp_root_solve(true); context.problem_ptr->branch_and_bound_callback = std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, @@ -470,30 +463,32 @@ solution_t mip_solver_t::run_solver() std::ref(branch_and_bound_solution)); } - // The diversity-manager primal heuristics and the post-BB - // feasibility checks are dead-coded out on this branch. The only - // thing we want from a run on main_baselin is the per-instance - // post-cut gap, which dual_simplex BB has already published into - // benchmark_info_t before returning. Skipping `dm.run_solver()` - // (which would otherwise consume the full time budget after BB - // exits early) and the feasibility checks that depend on a real - // incumbent makes the run exit quickly with a default-constructed - // empty solution. This mirrors the early-return at the top of this - // function used when the solve hits the time limit before B&B even - // starts, and matches the cut_scoring branch so timing comparisons - // are valid. 
+ // Start the primal heuristics context.diversity_manager_ptr = &dm; - solution_t sol(*context.problem_ptr); + auto sol = dm.run_solver(); if (run_bb) { + // Wait for the branch and bound to finish auto bb_status = branch_and_bound_status_future.get(); - static_cast(bb_status); if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { context.stats.set_solution_bound( context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound)); } + if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); } context.stats.num_nodes = branch_and_bound_solution.nodes_explored; context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations; } + sol.compute_feasibility(); + + rmm::device_scalar is_feasible(sol.handle_ptr->get_stream()); + sol.test_variable_bounds(true, is_feasible.data()); + // test_variable_bounds clears is_feasible if the test is failed + if (!is_feasible.value(sol.handle_ptr->get_stream())) { + CUOPT_LOG_ERROR( + "Solution is not feasible due to variable bounds, returning infeasible solution!"); + context.stats.total_solve_time = timer_.elapsed_time(); + context.problem_ptr->post_process_solution(sol); + return sol; + } context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); return sol; From 3f42c823f1c7a16671805e9a6d5cc41d223e5250 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 15 May 2026 14:44:53 +0200 Subject: [PATCH 09/47] clique fixes and common subgraph usage --- cpp/src/cuts/cuts.cpp | 607 +++++++++++++++++++++++++++++------------- cpp/src/cuts/cuts.hpp | 67 +++++ 2 files changed, 488 insertions(+), 186 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 8eab7778b5..6994a6151d 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -29,8 +29,9 @@ namespace cuopt::linear_programming::dual_simplex { namespace { -#define DEBUG_CLIQUE_CUTS 0 -#define CHECK_WORKSPACE 0 
+#define DEBUG_CLIQUE_CUTS 0 +#define DEBUG_ZERO_HALF_CUTS 1 +#define CHECK_WORKSPACE 0 enum class clique_cut_build_status_t : int8_t { NO_CUT = 0, CUT_ADDED = 1, INFEASIBLE = 2 }; @@ -47,6 +48,22 @@ enum class clique_cut_build_status_t : int8_t { NO_CUT = 0, CUT_ADDED = 1, INFEA } while (0) #endif +// Crash-tolerant logger: writes to stderr and flushes immediately so the +// last log line is visible even if the process aborts/terminates right after. +#if DEBUG_ZERO_HALF_CUTS +#define ZERO_HALF_DEBUG(...) \ + do { \ + std::fprintf(stderr, "[zero_half] "); \ + std::fprintf(stderr, __VA_ARGS__); \ + std::fprintf(stderr, "\n"); \ + std::fflush(stderr); \ + } while (0) +#else +#define ZERO_HALF_DEBUG(...) \ + do { \ + } while (0) +#endif + template clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertices, i_t num_vars, @@ -505,7 +522,20 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert f_t max_work_estimate) { const size_t cycle_size = cycle_vertices.size(); - if (cycle_size < 5 || (cycle_size % 2) == 0) { return clique_cut_build_status_t::NO_CUT; } + ZERO_HALF_DEBUG( + "build_zero_half_cut enter cycle_size=%zu wheel_centers=%zu num_vars=%lld var_types.size=%zu " + "lower.size=%zu upper.size=%zu xstar.size=%zu", + cycle_size, + wheel_centers.size(), + static_cast(num_vars), + var_types.size(), + lower_bounds.size(), + upper_bounds.size(), + xstar.size()); + if (cycle_size < 5 || (cycle_size % 2) == 0) { + ZERO_HALF_DEBUG("build_zero_half_cut reject cycle_size=%zu", cycle_size); + return clique_cut_build_status_t::NO_CUT; + } cuopt_assert(num_vars > 0, "Zero-half cut num_vars must be positive"); cuopt_assert(static_cast(num_vars) <= lower_bounds.size(), "Zero-half cut lower bounds size mismatch"); @@ -516,6 +546,7 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert const f_t total_size = static_cast(cycle_size + wheel_centers.size()); const f_t estimated_work = 8.0 * total_size + 2.0 * 
total_size * std::log2(total_size + 1.0); if (add_work_estimate(estimated_work, work_estimate, max_work_estimate)) { + ZERO_HALF_DEBUG("build_zero_half_cut work_limit hit"); return clique_cut_build_status_t::NO_CUT; } @@ -533,10 +564,27 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert auto accumulate = [&](const std::vector& verts, f_t weight, bool is_cycle) -> clique_cut_build_status_t { + ZERO_HALF_DEBUG("build_zero_half_cut accumulate verts.size=%zu weight=%g is_cycle=%d", + verts.size(), + static_cast(weight), + static_cast(is_cycle)); for (const auto vertex_idx : verts) { + ZERO_HALF_DEBUG(" acc vertex_idx=%lld (range [0, %lld))", + static_cast(vertex_idx), + static_cast(2 * num_vars)); + if (vertex_idx < 0 || vertex_idx >= 2 * num_vars) { + ZERO_HALF_DEBUG(" acc OUT_OF_RANGE vertex_idx=%lld", static_cast(vertex_idx)); + return clique_cut_build_status_t::NO_CUT; + } cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Zero-half vertex out of range"); const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; + if (var_idx < 0 || static_cast(var_idx) >= lower_bounds.size() || + static_cast(var_idx) >= upper_bounds.size() || + static_cast(var_idx) >= var_types.size()) { + ZERO_HALF_DEBUG(" acc var_idx OUT_OF_RANGE var_idx=%lld", static_cast(var_idx)); + return clique_cut_build_status_t::NO_CUT; + } const f_t lower_bound = lower_bounds[var_idx]; const f_t upper_bound = upper_bounds[var_idx]; cuopt_assert(var_types[var_idx] != variable_type_t::CONTINUOUS, @@ -564,10 +612,12 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert if (accumulate(cycle_vertices, static_cast(1), true) != clique_cut_build_status_t::CUT_ADDED) { + ZERO_HALF_DEBUG("build_zero_half_cut cycle accumulate failed"); return clique_cut_build_status_t::NO_CUT; } if (m > 0 && !wheel_centers.empty()) { if (accumulate(wheel_centers, f_m, false) != clique_cut_build_status_t::CUT_ADDED) { + 
ZERO_HALF_DEBUG("build_zero_half_cut wheel accumulate failed"); return clique_cut_build_status_t::NO_CUT; } } @@ -582,18 +632,20 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert } if (cut.i.empty()) { - CUOPT_LOG_DEBUG("[zero_half] build_zero_half_cut empty support after accumulation"); + ZERO_HALF_DEBUG("build_zero_half_cut empty support after accumulation"); return clique_cut_build_status_t::NO_CUT; } cut_rhs = rhs_acc; + ZERO_HALF_DEBUG( + "build_zero_half_cut pre-sort nz=%zu rhs=%g", cut.i.size(), static_cast(cut_rhs)); cut.sort(); + ZERO_HALF_DEBUG("build_zero_half_cut post-sort nz=%zu", cut.i.size()); const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; - CUOPT_LOG_DEBUG( - "[zero_half] build_zero_half_cut nz=%lld rhs=%g dot=%g violation=%g threshold=%g cycle=%lld " - "wheel=%lld", + ZERO_HALF_DEBUG( + "build_zero_half_cut nz=%lld rhs=%g dot=%g violation=%g threshold=%g cycle=%lld wheel=%lld", static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), @@ -601,6 +653,12 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert static_cast(min_violation), static_cast(cycle_size), static_cast(wheel_centers.size())); + // Dijkstra found a path < 0.5 − min_violation, so the violation should be + // > min_violation here (modulo wheel-lift effects, dropped near-zero + // coefficients, and FP reorder). Slight drift below the threshold is fine + // — we just won't ship the cut. A *strongly* negative violation indicates + // a real bug in cycle construction, the wheel lift, or the cut algebra. 
+ cuopt_assert(violation > -bound_tol, "Zero-half cut violation flipped sign unexpectedly"); if (violation > min_violation) { return clique_cut_build_status_t::CUT_ADDED; } return clique_cut_build_status_t::NO_CUT; } @@ -624,6 +682,18 @@ bool dijkstra_odd_cycle(i_t source_local, f_t max_work_estimate) { const i_t num_local = static_cast(local_adj.size()); + ZERO_HALF_DEBUG("dijkstra_odd_cycle enter source_local=%lld num_local=%lld weights.size=%zu", + static_cast(source_local), + static_cast(num_local), + weights.size()); + if (source_local < 0 || source_local >= num_local) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle source OUT_OF_RANGE"); + return false; + } + if (weights.size() != static_cast(num_local)) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle weights size mismatch"); + return false; + } cuopt_assert(source_local >= 0 && source_local < num_local, "Zero-half Dijkstra source out of range"); cuopt_assert(weights.size() == static_cast(num_local), @@ -642,9 +712,17 @@ bool dijkstra_odd_cycle(i_t source_local, std::priority_queue, std::greater> pq; pq.emplace(static_cast(0), source_idx); + i_t pops = 0; while (!pq.empty()) { auto [d, u] = pq.top(); pq.pop(); + ++pops; + if (u < 0 || u >= total_idx) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle popped u OUT_OF_RANGE u=%lld total_idx=%lld", + static_cast(u), + static_cast(total_idx)); + return false; + } if (d > dist[u]) { continue; } if (u == target_idx) { break; } if (cutoff > 0 && d >= cutoff) { break; } @@ -654,13 +732,36 @@ bool dijkstra_odd_cycle(i_t source_local, const i_t v_part = 1 - u_part; cuopt_assert(u_part == 0 || u_part == 1, "Bipartite part out of range"); + if (u_local < 0 || u_local >= static_cast(local_adj.size())) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle u_local OUT_OF_RANGE u_local=%lld local_adj.size=%zu", + static_cast(u_local), + local_adj.size()); + return false; + } const auto& neigh = local_adj[u_local]; if (add_work_estimate(static_cast(neigh.size()) + 4.0, work_estimate, max_work_estimate)) { + 
ZERO_HALF_DEBUG("dijkstra_odd_cycle work_limit hit pops=%lld", static_cast(pops)); return false; } for (const auto v_local : neigh) { + if (v_local < 0 || v_local >= num_local) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle neighbor OUT_OF_RANGE v_local=%lld num_local=%lld", + static_cast(v_local), + static_cast(num_local)); + return false; + } cuopt_assert(v_local >= 0 && v_local < num_local, "Zero-half Dijkstra neighbor out of range"); + // Edge weight = (1 − x_u − x_v) / 2, where x_u/x_v are the LP values of + // the literals at u and v. For a CG edge the conflict constraint + // x_u + x_v <= 1 must hold, so the weight is non-negative. Tiny + // negative values arise from FP drift; clamp them. A *significantly* + // negative weight means the LP is meaningfully violating a conflict + // constraint — that's an upstream bug we want to know about, hence + // the debug-only assert with a generous tolerance. f_t edge_w = (static_cast(1) - weights[u_local] - weights[v_local]) / 2; + cuopt_assert(edge_w >= -static_cast(1e-6), + "Zero-half edge weight significantly negative — conflict constraint violated by " + "LP?"); if (edge_w < 0) { edge_w = 0; } const i_t v = v_local + v_part * num_local; const f_t nd = d + edge_w; @@ -672,9 +773,17 @@ bool dijkstra_odd_cycle(i_t source_local, } } - if (!std::isfinite(dist[target_idx])) { return false; } + if (!std::isfinite(dist[target_idx])) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle no path pops=%lld", static_cast(pops)); + return false; + } total_weight = dist[target_idx]; - if (cutoff > 0 && total_weight >= cutoff) { return false; } + if (cutoff > 0 && total_weight >= cutoff) { + ZERO_HALF_DEBUG("dijkstra_odd_cycle path too long total=%g cutoff=%g", + static_cast(total_weight), + static_cast(cutoff)); + return false; + } path.clear(); for (i_t cur = target_idx; cur != -1; cur = prev[cur]) { @@ -686,6 +795,10 @@ bool dijkstra_odd_cycle(i_t source_local, std::reverse(path.begin(), path.end()); // bipartite path from j1 to j2 must have odd 
number of edges cuopt_assert((path.size() % 2) == 0, "Zero-half bipartite path must have even node count"); + ZERO_HALF_DEBUG("dijkstra_odd_cycle done path.size=%zu total_weight=%g pops=%lld", + path.size(), + static_cast(total_weight), + static_cast(pops)); return true; } @@ -702,30 +815,57 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, f_t* work_estimate, f_t max_work_estimate) { + ZERO_HALF_DEBUG( + "path_to_odd_cycle enter bipartite_path.size=%zu vertices.size=%zu num_local=%lld " + "num_vars=%lld", + bipartite_path.size(), + vertices.size(), + static_cast(num_local), + static_cast(num_vars)); cycle_vertices.clear(); - if (bipartite_path.size() < 4) { return false; } + if (bipartite_path.size() < 4) { + ZERO_HALF_DEBUG("path_to_odd_cycle reject short path"); + return false; + } if (add_work_estimate( static_cast(bipartite_path.size()) * 2.0, work_estimate, max_work_estimate)) { + ZERO_HALF_DEBUG("path_to_odd_cycle work_limit hit"); return false; } std::vector local_seq; local_seq.reserve(bipartite_path.size()); for (const auto bv : bipartite_path) { + if (num_local <= 0) { + ZERO_HALF_DEBUG("path_to_odd_cycle num_local <= 0 num_local=%lld", + static_cast(num_local)); + return false; + } local_seq.push_back(bv % num_local); } // First and last entry should both correspond to the source CG vertex + if (local_seq.front() != local_seq.back()) { + ZERO_HALF_DEBUG("path_to_odd_cycle endpoints mismatch front=%lld back=%lld", + static_cast(local_seq.front()), + static_cast(local_seq.back())); + return false; + } cuopt_assert(local_seq.front() == local_seq.back(), "Zero-half cycle path endpoints must match"); // Drop the duplicate end so we have a sequence covering each cycle vertex once local_seq.pop_back(); - if ((local_seq.size() % 2) == 0 || local_seq.size() < 5) { return false; } + if ((local_seq.size() % 2) == 0 || local_seq.size() < 5) { + ZERO_HALF_DEBUG("path_to_odd_cycle reject local_seq.size=%zu", local_seq.size()); + return false; + } 
std::unordered_set seen_local; seen_local.reserve(local_seq.size()); for (const auto lv : local_seq) { if (!seen_local.insert(lv).second) { // Same CG vertex appears twice in the path; reject (degenerate cycle) + ZERO_HALF_DEBUG("path_to_odd_cycle duplicate local vertex lv=%lld", + static_cast(lv)); return false; } } @@ -734,16 +874,32 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, std::unordered_set seen_var; seen_var.reserve(local_seq.size()); for (const auto lv : local_seq) { + if (lv < 0 || lv >= num_local || static_cast(lv) >= vertices.size()) { + ZERO_HALF_DEBUG( + "path_to_odd_cycle local idx OUT_OF_RANGE lv=%lld num_local=%lld vertices.size=%zu", + static_cast(lv), + static_cast(num_local), + vertices.size()); + return false; + } cuopt_assert(lv >= 0 && lv < num_local, "Zero-half local idx out of range"); const i_t global = vertices[lv]; + if (global < 0 || global >= 2 * num_vars) { + ZERO_HALF_DEBUG("path_to_odd_cycle global vertex OUT_OF_RANGE global=%lld 2*num_vars=%lld", + static_cast(global), + static_cast(2 * num_vars)); + return false; + } cuopt_assert(global >= 0 && global < 2 * num_vars, "Zero-half global vertex out of range"); const i_t var_idx = global % num_vars; if (!seen_var.insert(var_idx).second) { // Variable appears as both x and ¯x in the cycle; reject (degenerate) + ZERO_HALF_DEBUG("path_to_odd_cycle duplicate var_idx=%lld", static_cast(var_idx)); return false; } cycle_vertices.push_back(global); } + ZERO_HALF_DEBUG("path_to_odd_cycle done cycle_vertices.size=%zu", cycle_vertices.size()); return cycle_vertices.size() >= 5; } @@ -761,6 +917,13 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, f_t* work_estimate, f_t max_work_estimate) { + ZERO_HALF_DEBUG( + "extend_to_odd_wheel enter cycle.size=%zu num_vars=%lld reduced_costs.size=%zu " + "graph.n_variables=%lld", + cycle_vertices.size(), + static_cast(num_vars), + reduced_costs.size(), + static_cast(graph.n_variables)); wheel_centers.clear(); if 
(cycle_vertices.empty()) { return; } if (toc(start_time) >= time_limit) { return; } @@ -769,20 +932,34 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, i_t smallest_degree_var = -1; for (auto v : cycle_vertices) { if (toc(start_time) >= time_limit) { return; } + if (v < 0 || v >= 2 * num_vars) { + ZERO_HALF_DEBUG("extend_to_odd_wheel cycle vertex OUT_OF_RANGE v=%lld", + static_cast(v)); + return; + } i_t degree = graph.get_degree_of_var(v); if (degree < smallest_degree) { smallest_degree = degree; smallest_degree_var = v; } } + ZERO_HALF_DEBUG("extend_to_odd_wheel smallest_degree_var=%lld smallest_degree=%lld", + static_cast(smallest_degree_var), + static_cast(smallest_degree)); if (smallest_degree_var < 0) { return; } auto adj_set = graph.get_adj_set_of_var(smallest_degree_var); + ZERO_HALF_DEBUG("extend_to_odd_wheel adj_set.size=%zu", adj_set.size()); std::unordered_set cycle_members(cycle_vertices.begin(), cycle_vertices.end()); std::vector candidates; candidates.reserve(adj_set.size()); for (const auto candidate : adj_set) { if (toc(start_time) >= time_limit) { return; } + if (candidate < 0 || candidate >= 2 * num_vars) { + ZERO_HALF_DEBUG("extend_to_odd_wheel candidate OUT_OF_RANGE candidate=%lld", + static_cast(candidate)); + continue; + } if (cycle_members.count(candidate) != 0) { continue; } bool adj_to_all = true; for (const auto v : cycle_vertices) { @@ -797,6 +974,7 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, } if (adj_to_all) { candidates.push_back(candidate); } } + ZERO_HALF_DEBUG("extend_to_odd_wheel candidates.size=%zu", candidates.size()); if (candidates.empty()) { return; } const f_t candidate_size = static_cast(candidates.size()); @@ -807,11 +985,17 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, if (add_work_estimate(adj_set_cost + cycle_size_f * candidate_size + sort_cost, work_estimate, max_work_estimate)) { + ZERO_HALF_DEBUG("extend_to_odd_wheel work_limit hit pre-sort"); return; } auto 
reduced_cost = [&](i_t vertex_idx) -> f_t { i_t var_idx = vertex_idx % num_vars; + if (var_idx < 0 || static_cast(var_idx) >= reduced_costs.size()) { + ZERO_HALF_DEBUG("extend_to_odd_wheel reduced_cost OUT_OF_RANGE var_idx=%lld", + static_cast(var_idx)); + return 0.0; + } cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), "Reduced cost index out of range"); f_t rc = reduced_costs[var_idx]; @@ -841,6 +1025,7 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, } if (adj_to_wheel) { wheel_centers.push_back(candidate); } } + ZERO_HALF_DEBUG("extend_to_odd_wheel done wheel_centers.size=%zu", wheel_centers.size()); } } // namespace @@ -2434,6 +2619,146 @@ void cut_generation_t::generate_implied_bound_cuts( } } +template +void cut_generation_t::prepare_fractional_sub_cg( + const simplex_solver_settings_t& settings, + const std::vector& xstar, + f_t start_time) +{ + sub_cg_.clear(); + + if (settings.clique_cuts == 0 && settings.zero_half_cuts == 0) { return; } + if (toc(start_time) >= settings.time_limit) { return; } + + // Resolve the async clique-table future, if any. Both the clique-cut and + // zero-half routines depend on the conflict graph; the first to need it + // pays for the join here so we avoid duplicating the wait below. + if (clique_table_ == nullptr && clique_table_future_ != nullptr && + clique_table_future_->valid()) { + if (signal_extend_) { signal_extend_->store(true, std::memory_order_release); } + clique_table_ = clique_table_future_->get(); + clique_table_future_ = nullptr; + } + + if (clique_table_ == nullptr) { return; } + // small_clique_adj may carry pairwise CG edges from cliques demoted by + // remove_small_cliques; clique_table_t::empty() accounts for that. 
+ if (clique_table_->empty()) { return; } + + const i_t num_vars = user_problem_.num_cols; + cuopt_assert(clique_table_->n_variables == num_vars, + "prepare_fractional_sub_cg clique table variable count mismatch"); + cuopt_assert(static_cast(num_vars) <= xstar.size(), + "prepare_fractional_sub_cg xstar size mismatch"); + cuopt_assert(user_problem_.var_types.size() == static_cast(num_vars), + "prepare_fractional_sub_cg user problem var_types size mismatch"); + + const f_t bound_tol = settings.primal_tol; + f_t work_estimate = 0.0; + const f_t max_work_estimate = 1e8; + + sub_cg_.num_vars = num_vars; + sub_cg_.vertices.reserve(static_cast(num_vars) * 2); + sub_cg_.weights.reserve(static_cast(num_vars) * 2); + + for (i_t j = 0; j < num_vars; ++j) { + if (user_problem_.var_types[j] == variable_type_t::CONTINUOUS) { continue; } + const f_t lower_bound = user_problem_.lower[j]; + const f_t upper_bound = user_problem_.upper[j]; + if (lower_bound < -bound_tol || upper_bound > 1 + bound_tol) { continue; } + const f_t xj = xstar[j]; + if (std::abs(xj - std::round(xj)) <= settings.integer_tol) { continue; } + sub_cg_.vertices.push_back(j); + sub_cg_.weights.push_back(xj); + sub_cg_.vertices.push_back(j + num_vars); + sub_cg_.weights.push_back(static_cast(1.0) - xj); + } + work_estimate += + 4.0 * static_cast(num_vars) + 2.0 * static_cast(sub_cg_.vertices.size()); + if (work_estimate > max_work_estimate) { + sub_cg_.clear(); + return; + } + + if (sub_cg_.vertices.empty()) { + // No fractional binaries — both separators have nothing to do, but the + // build itself succeeded. Mark ready so callers don't keep retrying. 
+ sub_cg_.ready = true; + return; + } + + sub_cg_.vertex_to_local.assign(static_cast(2 * num_vars), -1); + sub_cg_.in_subgraph.assign(static_cast(2 * num_vars), 0); + for (size_t idx = 0; idx < sub_cg_.vertices.size(); ++idx) { + if (toc(start_time) >= settings.time_limit) { + sub_cg_.clear(); + return; + } + const i_t v_idx = sub_cg_.vertices[idx]; + sub_cg_.vertex_to_local[v_idx] = static_cast(idx); + sub_cg_.in_subgraph[v_idx] = 1; + } + work_estimate += 3.0 * static_cast(sub_cg_.vertices.size()); + if (work_estimate > max_work_estimate) { + sub_cg_.clear(); + return; + } + + sub_cg_.adj_local.assign(sub_cg_.vertices.size(), {}); + size_t total_adj_entries = 0; + size_t kept_adj_entries = 0; + for (size_t idx = 0; idx < sub_cg_.vertices.size(); ++idx) { + if (toc(start_time) >= settings.time_limit) { + sub_cg_.clear(); + return; + } + const i_t v_idx = sub_cg_.vertices[idx]; + auto adj_set = clique_table_->get_adj_set_of_var(v_idx); + total_adj_entries += adj_set.size(); + auto& adj = sub_cg_.adj_local[idx]; + adj.reserve(adj_set.size()); + for (const auto neighbor : adj_set) { + cuopt_assert(neighbor >= 0 && neighbor < 2 * num_vars, + "prepare_fractional_sub_cg neighbor out of range"); + if (!sub_cg_.in_subgraph[neighbor]) { continue; } + const i_t local_neighbor = sub_cg_.vertex_to_local[neighbor]; + cuopt_assert(local_neighbor >= 0, "prepare_fractional_sub_cg local_neighbor out of range"); + adj.push_back(local_neighbor); + } + kept_adj_entries += adj.size(); +#ifdef ASSERT_MODE + { + std::unordered_set adj_global; + adj_global.reserve(adj.size()); + for (const auto neighbor : adj) { + const i_t v = sub_cg_.vertices[neighbor]; + cuopt_assert(adj_global.insert(v).second, + "Duplicate neighbor in fractional sub-CG adjacency list"); + const i_t complement = (v >= num_vars) ? 
(v - num_vars) : (v + num_vars); + cuopt_assert(adj_global.find(complement) == adj_global.end(), + "Fractional sub-CG adjacency list contains complementing variable"); + } + } +#endif + } + work_estimate += static_cast(sub_cg_.vertices.size()) + static_cast(total_adj_entries) + + 2.0 * static_cast(kept_adj_entries); + if (work_estimate > max_work_estimate) { + sub_cg_.clear(); + return; + } + + sub_cg_.ready = true; + CLIQUE_CUTS_DEBUG("prepare_fractional_sub_cg ready vertices=%lld raw_adj=%lld kept_adj=%lld", + static_cast(sub_cg_.vertices.size()), + static_cast(total_adj_entries), + static_cast(kept_adj_entries)); + ZERO_HALF_DEBUG("prepare_fractional_sub_cg ready vertices=%lld raw_adj=%lld kept_adj=%lld", + static_cast(sub_cg_.vertices.size()), + static_cast(total_adj_entries), + static_cast(kept_adj_entries)); +} + template bool cut_generation_t::generate_cuts(const lp_problem_t& lp, const simplex_solver_settings_t& settings, @@ -2480,6 +2805,14 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, } } + // Build the fractional conflict-graph subgraph once (resolving the async + // clique-table future on the way) so both clique-cut and zero-half cut + // separators consume the same vertex/weight/adjacency tables instead of + // each recomputing them. Done here, after the cut routines that don't + // need the clique table, to give the background clique-table thread as + // much time as possible to finish before we join it. 
+ prepare_fractional_sub_cg(settings, xstar, start_time); + // Generate Clique cuts (last to give background clique table generation maximum time) if (settings.clique_cuts != 0) { f_t cut_start_time = tic(); @@ -2496,8 +2829,11 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, // Generate Zero-half (odd-cycle / odd-wheel) cuts; reuses the clique table built above if (settings.zero_half_cuts != 0) { + ZERO_HALF_DEBUG("generate_cuts: about to call generate_zero_half_cuts"); f_t cut_start_time = tic(); bool feasible = generate_zero_half_cuts(lp, settings, var_types, xstar, zstar, start_time); + ZERO_HALF_DEBUG("generate_cuts: returned from generate_zero_half_cuts feasible=%d", + static_cast(feasible)); if (!feasible) { settings.log.printf("Zero-half cuts proved infeasible\n"); return false; @@ -2506,6 +2842,9 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, if (cut_generation_time > 1.0) { settings.log.debug("Zero-half cut generation time %.2f seconds\n", cut_generation_time); } + } else { + ZERO_HALF_DEBUG("generate_cuts: zero_half_cuts disabled (setting=%d)", + static_cast(settings.zero_half_cuts)); } // Generate implied bound cuts @@ -2559,34 +2898,22 @@ bool cut_generation_t::generate_clique_cuts( static_cast(settings.time_limit), static_cast(toc(start_time))); - if (clique_table_ == nullptr && clique_table_future_ != nullptr && - clique_table_future_->valid()) { - CLIQUE_CUTS_DEBUG("generate_clique_cuts signaling background thread and waiting"); - if (signal_extend_) { signal_extend_->store(true, std::memory_order_release); } - clique_table_ = clique_table_future_->get(); - clique_table_future_ = nullptr; - if (clique_table_) { - CLIQUE_CUTS_DEBUG("generate_clique_cuts received clique table first=%lld addtl=%lld", - static_cast(clique_table_->first.size()), - static_cast(clique_table_->addtl_cliques.size())); - } - } - - if (clique_table_ == nullptr) { - CLIQUE_CUTS_DEBUG("generate_clique_cuts no clique table available, skipping"); + 
// The fractional conflict-graph subgraph is built once per cut pass in + // prepare_fractional_sub_cg() (called from generate_cuts) and shared with + // the zero-half cut separator. Skip if the build was unable to produce a + // useable sub-CG (clique table missing/empty, work/time budget hit, etc.). + if (!sub_cg_.ready) { + CLIQUE_CUTS_DEBUG("generate_clique_cuts sub_cg_ not ready, skipping"); return true; } - CLIQUE_CUTS_DEBUG("generate_clique_cuts using clique table first=%lld addtl=%lld", - static_cast(clique_table_->first.size()), - static_cast(clique_table_->addtl_cliques.size())); - - if (clique_table_->first.empty() && clique_table_->addtl_cliques.empty()) { - CLIQUE_CUTS_DEBUG("generate_clique_cuts empty clique table, nothing to separate"); + if (sub_cg_.empty_subgraph()) { + CLIQUE_CUTS_DEBUG("generate_clique_cuts no fractional binary vertices"); return true; } - - cuopt_assert(clique_table_->n_variables == num_vars, "Clique table variable count mismatch"); + cuopt_assert(sub_cg_.num_vars == num_vars, "generate_clique_cuts sub_cg_ num_vars mismatch"); cuopt_assert(static_cast(num_vars) <= xstar.size(), "Clique cut xstar size mismatch"); + cuopt_assert(user_problem_.var_types.size() == static_cast(num_vars), + "User problem var_types size mismatch"); const f_t min_violation = std::max(settings.primal_tol, static_cast(1e-6)); const f_t bound_tol = settings.primal_tol; @@ -2596,91 +2923,14 @@ bool cut_generation_t::generate_clique_cuts( f_t work_estimate = 0.0; const f_t max_work_estimate = 1e8; - cuopt_assert(user_problem_.var_types.size() == static_cast(num_vars), - "User problem var_types size mismatch"); - - std::vector vertices; - std::vector weights; - vertices.reserve(num_vars * 2); - weights.reserve(num_vars * 2); + const auto& vertices = sub_cg_.vertices; + const auto& weights = sub_cg_.weights; + const auto& adj_local = sub_cg_.adj_local; - // create the sub graph induced by fractional binary variables - for (i_t j = 0; j < num_vars; ++j) { - if 
(user_problem_.var_types[j] == variable_type_t::CONTINUOUS) { continue; } - const f_t lower_bound = user_problem_.lower[j]; - const f_t upper_bound = user_problem_.upper[j]; - if (lower_bound < -bound_tol || upper_bound > 1 + bound_tol) { continue; } - const f_t xj = xstar[j]; - if (std::abs(xj - std::round(xj)) <= settings.integer_tol) { continue; } - vertices.push_back(j); - weights.push_back(xj); - vertices.push_back(j + num_vars); - weights.push_back(1.0 - xj); - } - // Coarse loop estimate: variable scans + selected vertex/weight writes - work_estimate += 4.0 * static_cast(num_vars) + 2.0 * static_cast(vertices.size()); - if (work_estimate > max_work_estimate) { return true; } - - if (vertices.empty()) { - CLIQUE_CUTS_DEBUG("generate_clique_cuts no fractional binary vertices"); - return true; - } CLIQUE_CUTS_DEBUG("generate_clique_cuts fractional subgraph vertices=%lld (literals=%lld)", static_cast(vertices.size() / 2), static_cast(vertices.size())); - std::vector vertex_to_local(2 * num_vars, -1); - std::vector in_subgraph(2 * num_vars, 0); - for (size_t idx = 0; idx < vertices.size(); ++idx) { - if (toc(start_time) >= settings.time_limit) { return true; } - const i_t vertex_idx = vertices[idx]; - vertex_to_local[vertex_idx] = static_cast(idx); - in_subgraph[vertex_idx] = 1; - } - work_estimate += 3.0 * static_cast(vertices.size()); - if (work_estimate > max_work_estimate) { return true; } - - std::vector> adj_local(vertices.size()); - size_t total_adj_entries = 0; - size_t kept_adj_entries = 0; - for (size_t idx = 0; idx < vertices.size(); ++idx) { - if (toc(start_time) >= settings.time_limit) { return true; } - i_t vertex_idx = vertices[idx]; - // returns the complement as well - auto adj_set = clique_table_->get_adj_set_of_var(vertex_idx); - total_adj_entries += adj_set.size(); - auto& adj = adj_local[idx]; - adj.reserve(adj_set.size()); - for (const auto neighbor : adj_set) { - if (toc(start_time) >= settings.time_limit) { return true; } - 
cuopt_assert(neighbor >= 0 && neighbor < 2 * num_vars, "Neighbor out of range"); - if (!in_subgraph[neighbor]) { continue; } - i_t local_neighbor = vertex_to_local[neighbor]; - cuopt_assert(local_neighbor >= 0, "Local neighbor out of range"); - adj.push_back(local_neighbor); - } - kept_adj_entries += adj.size(); -#ifdef ASSERT_MODE - { - std::unordered_set adj_global; - adj_global.reserve(adj.size()); - for (const auto neighbor : adj) { - i_t v = vertices[neighbor]; - cuopt_assert(adj_global.insert(v).second, "Duplicate neighbor in adjacency list"); - i_t complement = (v >= num_vars) ? (v - num_vars) : (v + num_vars); - cuopt_assert(adj_global.find(complement) == adj_global.end(), - "Adjacency list contains complementing variable"); - } - } -#endif - } - work_estimate += static_cast(vertices.size()) + static_cast(total_adj_entries) + - 2.0 * static_cast(kept_adj_entries); - if (work_estimate > max_work_estimate) { return true; } - CLIQUE_CUTS_DEBUG("generate_clique_cuts adjacency raw_entries=%lld kept_entries=%lld", - static_cast(total_adj_entries), - static_cast(kept_adj_entries)); - const size_t words = bitset_words(vertices.size()); std::vector> adj_bitset(vertices.size(), std::vector(words, 0)); size_t local_adj_entries = 0; @@ -2834,26 +3084,41 @@ bool cut_generation_t::generate_zero_half_cuts( if (toc(start_time) >= settings.time_limit) { return true; } const i_t num_vars = user_problem_.num_cols; - CUOPT_LOG_DEBUG("[zero_half] generate_zero_half_cuts start num_vars=%lld elapsed=%g", - static_cast(num_vars), - static_cast(toc(start_time))); - - if (clique_table_ == nullptr && clique_table_future_ != nullptr && - clique_table_future_->valid()) { - if (signal_extend_) { signal_extend_->store(true, std::memory_order_release); } - clique_table_ = clique_table_future_->get(); - clique_table_future_ = nullptr; + ZERO_HALF_DEBUG( + "generate_zero_half_cuts ENTER num_vars=%lld elapsed=%g time_limit=%g xstar.size=%zu " + "reduced_costs.size=%zu var_types.size=%zu 
user_problem_.lower.size=%zu " + "user_problem_.upper.size=%zu user_problem_.var_types.size=%zu lp.num_cols=%lld " + "sub_cg_.ready=%d sub_cg_.vertices=%zu", + static_cast(num_vars), + static_cast(toc(start_time)), + static_cast(settings.time_limit), + xstar.size(), + reduced_costs.size(), + var_types.size(), + user_problem_.lower.size(), + user_problem_.upper.size(), + user_problem_.var_types.size(), + static_cast(lp.num_cols), + static_cast(sub_cg_.ready), + sub_cg_.vertices.size()); + + // The fractional conflict-graph subgraph is built once per cut pass in + // prepare_fractional_sub_cg() (called from generate_cuts) and shared with + // the clique-cut separator. Skip if the build was unable to produce a + // useable sub-CG (clique table missing/empty, work/time budget hit, etc.). + if (!sub_cg_.ready) { + ZERO_HALF_DEBUG("sub_cg_ not ready, skipping"); + return true; } - - if (clique_table_ == nullptr) { - CUOPT_LOG_DEBUG("[zero_half] no clique table available, skipping"); + if (sub_cg_.empty_subgraph()) { + ZERO_HALF_DEBUG("no fractional binary vertices"); return true; } - if (clique_table_->first.empty() && clique_table_->addtl_cliques.empty()) { - CUOPT_LOG_DEBUG("[zero_half] empty clique table, nothing to separate"); + if (clique_table_ == nullptr) { + ZERO_HALF_DEBUG("no clique table available, skipping"); return true; } - + cuopt_assert(sub_cg_.num_vars == num_vars, "generate_zero_half_cuts sub_cg_ num_vars mismatch"); cuopt_assert(clique_table_->n_variables == num_vars, "Zero-half clique table variable count mismatch"); cuopt_assert(static_cast(num_vars) <= xstar.size(), "Zero-half xstar size mismatch"); @@ -2867,60 +3132,12 @@ bool cut_generation_t::generate_zero_half_cuts( f_t work_estimate = 0.0; const f_t max_work_estimate = 1e8; - std::vector vertices; - std::vector weights; - vertices.reserve(num_vars * 2); - weights.reserve(num_vars * 2); - - for (i_t j = 0; j < num_vars; ++j) { - if (user_problem_.var_types[j] == variable_type_t::CONTINUOUS) { 
continue; } - const f_t lower_bound = user_problem_.lower[j]; - const f_t upper_bound = user_problem_.upper[j]; - if (lower_bound < -bound_tol || upper_bound > 1 + bound_tol) { continue; } - const f_t xj = xstar[j]; - if (std::abs(xj - std::round(xj)) <= settings.integer_tol) { continue; } - vertices.push_back(j); - weights.push_back(xj); - vertices.push_back(j + num_vars); - weights.push_back(1.0 - xj); - } - work_estimate += 4.0 * static_cast(num_vars) + 2.0 * static_cast(vertices.size()); - if (work_estimate > max_work_estimate) { return true; } - if (vertices.empty()) { - CUOPT_LOG_DEBUG("[zero_half] no fractional binary vertices"); - return true; - } - - const i_t num_local = static_cast(vertices.size()); - CUOPT_LOG_DEBUG("[zero_half] fractional sub-CG vertices=%lld", static_cast(num_local)); - - std::vector vertex_to_local(2 * num_vars, -1); - std::vector in_subgraph(2 * num_vars, 0); - for (i_t idx = 0; idx < num_local; ++idx) { - const i_t vertex_idx = vertices[idx]; - vertex_to_local[vertex_idx] = idx; - in_subgraph[vertex_idx] = 1; - } - work_estimate += 3.0 * static_cast(num_local); - if (work_estimate > max_work_estimate) { return true; } - - std::vector> adj_local(num_local); - for (i_t idx = 0; idx < num_local; ++idx) { - if (toc(start_time) >= settings.time_limit) { return true; } - const i_t vertex_idx = vertices[idx]; - auto adj_set = clique_table_->get_adj_set_of_var(vertex_idx); - auto& adj = adj_local[idx]; - adj.reserve(adj_set.size()); - for (const auto neighbor : adj_set) { - cuopt_assert(neighbor >= 0 && neighbor < 2 * num_vars, "Zero-half neighbor out of range"); - if (!in_subgraph[neighbor]) { continue; } - const i_t local_neighbor = vertex_to_local[neighbor]; - cuopt_assert(local_neighbor >= 0, "Zero-half local neighbor out of range"); - adj.push_back(local_neighbor); - } - work_estimate += static_cast(adj_set.size()); - } - if (work_estimate > max_work_estimate) { return true; } + const auto& vertices = sub_cg_.vertices; + const auto& 
weights = sub_cg_.weights; + const auto& adj_local = sub_cg_.adj_local; + const auto& vertex_to_local = sub_cg_.vertex_to_local; + const i_t num_local = sub_cg_.num_local(); + ZERO_HALF_DEBUG("starting separation loop num_local=%lld", static_cast(num_local)); sparse_vector_t cut(lp.num_cols, 0); f_t cut_rhs = 0.0; @@ -2937,6 +3154,9 @@ bool cut_generation_t::generate_zero_half_cuts( if (toc(start_time) >= settings.time_limit) { break; } if (work_estimate > max_work_estimate) { break; } if (already_used[s]) { continue; } + ZERO_HALF_DEBUG("separation loop s=%lld / %lld", + static_cast(s), + static_cast(num_local)); f_t total_weight = 0; if (!dijkstra_odd_cycle(s, @@ -2959,6 +3179,9 @@ bool cut_generation_t::generate_zero_half_cuts( continue; } cycles_found++; + ZERO_HALF_DEBUG("cycle found s=%lld cycle_vertices.size=%zu", + static_cast(s), + cycle_vertices.size()); extend_to_odd_wheel(cycle_vertices, wheel_centers, @@ -2970,6 +3193,9 @@ bool cut_generation_t::generate_zero_half_cuts( &work_estimate, max_work_estimate); + ZERO_HALF_DEBUG("calling build_zero_half_cut cycle=%zu wheel=%zu", + cycle_vertices.size(), + wheel_centers.size()); const auto build_status = build_zero_half_cut(cycle_vertices, wheel_centers, num_vars, @@ -2983,28 +3209,37 @@ bool cut_generation_t::generate_zero_half_cuts( cut_rhs, &work_estimate, max_work_estimate); + ZERO_HALF_DEBUG("build_zero_half_cut returned status=%d", static_cast(build_status)); if (work_estimate > max_work_estimate) { break; } if (build_status == clique_cut_build_status_t::INFEASIBLE) { - CUOPT_LOG_DEBUG("[zero_half] infeasible cycle detected"); + ZERO_HALF_DEBUG("infeasible cycle detected, returning false"); return false; } if (build_status == clique_cut_build_status_t::CUT_ADDED) { inequality_t cut_inequality; cut_inequality.vector = cut; cut_inequality.rhs = cut_rhs; + ZERO_HALF_DEBUG( + "adding cut to pool nz=%zu rhs=%g", cut.i.size(), static_cast(cut_rhs)); cut_pool_.add_cut(cut_type_t::ZERO_HALF, cut_inequality); + 
ZERO_HALF_DEBUG("cut added to pool"); cuts_added++; added_per_var++; // mark all CG vertices that participated so we do not re-derive the same // cycle from a different source vertex for (const auto v : cycle_vertices) { + if (v < 0 || v >= 2 * num_vars) { + ZERO_HALF_DEBUG("mark already_used: cycle v OUT_OF_RANGE v=%lld", + static_cast(v)); + continue; + } const i_t lv = vertex_to_local[v]; - if (lv >= 0) { already_used[lv] = 1; } + if (lv >= 0 && lv < num_local) { already_used[lv] = 1; } } } } - CUOPT_LOG_DEBUG("[zero_half] generate_zero_half_cuts done cycles=%lld cuts=%lld work=%g", + ZERO_HALF_DEBUG("generate_zero_half_cuts EXIT cycles=%lld cuts=%lld work=%g", static_cast(cycles_found), static_cast(cuts_added), static_cast(work_estimate)); diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 29aba8bbb4..adf791d9e4 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -619,6 +619,59 @@ class mixed_integer_rounding_cut_t; template class variable_bounds_t; +// Shared fractional conflict-graph subgraph used by both the clique-cut and +// zero-half cut separators. Built once per cut pass in +// cut_generation_t::generate_cuts and consumed by both routines so neither +// has to rebuild the same vertex/weight/adjacency tables. +// +// Vertex indexing: each fractional binary variable j contributes two CG +// vertices — the original literal `j` and the complement literal +// `j + num_vars`. Local indices are dense in `[0, vertices.size())`. +template +struct fractional_conflict_subgraph_t { + // Number of variables in the original problem; CG vertex indices are in + // [0, 2 * num_vars). + i_t num_vars{0}; + + // Global CG vertex indices (length = 2 * #fractional binary vars). + std::vector vertices; + + // LP value of the literal at each local index. weights[k] = x_j for the + // original copy of variable j; 1 - x_j for the complement copy. 
+ std::vector weights; + + // Inverse mapping: vertex_to_local[CG_vertex] = local_idx (or -1 if not + // in the subgraph). Sized 2 * num_vars when ready. + std::vector vertex_to_local; + + // 1 if CG_vertex is in the subgraph, 0 otherwise. Sized 2 * num_vars when + // ready. + std::vector in_subgraph; + + // For each local index l, adj_local[l] is the list of local indices of + // its neighbors (CG neighbors restricted to the subgraph). + std::vector> adj_local; + + // True iff a build completed for the current cut pass. May be true with + // an empty subgraph (no fractional binaries), in which case both + // separators have nothing to do but the build itself succeeded. + bool ready{false}; + + i_t num_local() const { return static_cast(vertices.size()); } + bool empty_subgraph() const { return vertices.empty(); } + + void clear() + { + num_vars = 0; + vertices.clear(); + weights.clear(); + vertex_to_local.clear(); + in_subgraph.clear(); + adj_local.clear(); + ready = false; + } +}; + template class cut_generation_t { public: @@ -712,6 +765,16 @@ class cut_generation_t { const std::vector& xstar, f_t start_time); + // Resolve the async clique-table future (if still pending) and build the + // fractional conflict-graph subgraph against the current xstar. Both the + // clique-cut and zero-half cut separators consume the result via sub_cg_. + // Skips cleanly (sub_cg_.ready = false) if the clique table is missing or + // empty, if budgets are exceeded, or if cut routines depending on it are + // disabled. Safe to call multiple times per cut pass. 
+ void prepare_fractional_sub_cg(const simplex_solver_settings_t& settings, + const std::vector& xstar, + f_t start_time); + cut_pool_t& cut_pool_; knapsack_generation_t knapsack_generation_; const user_problem_t& user_problem_; @@ -719,6 +782,10 @@ class cut_generation_t { std::shared_ptr> clique_table_; std::future>>* clique_table_future_{nullptr}; std::atomic* signal_extend_{nullptr}; + // Cached fractional sub-CG, rebuilt at the top of each generate_cuts call + // by prepare_fractional_sub_cg. Both clique cuts and zero-half cuts read + // from this and skip if !sub_cg_.ready. + fractional_conflict_subgraph_t sub_cg_; }; template From af65630d482943b6bbc098acde8d604518eef3d8 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 15 May 2026 16:07:23 +0200 Subject: [PATCH 10/47] fix complement bug --- cpp/src/cuts/cuts.cpp | 106 ++++++++---------- .../local_search/local_search.cu | 8 +- 2 files changed, 52 insertions(+), 62 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 5fe52ddc0f..44a9dcbb15 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -83,50 +83,21 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic cut.i.clear(); cut.x.clear(); - // P0-3 (1): two-pass complement-pair detection. The baseline returned - // NO_CUT on the first variable that appeared both as itself and as its - // complement; that hides how many such conflicts a candidate clique has - // and makes it impossible to attribute infeasibility events to specific - // clique generators. Pre-pass collects every original/complement - // occurrence per variable, counts the actual complement pairs, and only - // then decides. Accept/reject behavior matches baseline (a complement - // pair still aborts cut construction); only the diagnostics change. + // First pass: collect literal/complement occurrences per variable and the + // set of variables that appear both as themselves and as their complement + // in the clique. 
Bounds / var_type sanity checks are folded in here so we + // touch each clique vertex once. std::unordered_set seen_original; std::unordered_set seen_complement; + std::unordered_set complement_pairs; seen_original.reserve(clique_vertices.size()); seen_complement.reserve(clique_vertices.size()); for (const auto vertex_idx : clique_vertices) { cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Clique vertex out of range"); - const i_t var_idx = vertex_idx % num_vars; - const bool complement = vertex_idx >= num_vars; - if (complement) { - seen_complement.insert(var_idx); - } else { - seen_original.insert(var_idx); - } - } - i_t complement_pairs = 0; - for (const auto var_idx : seen_original) { - if (seen_complement.count(var_idx) > 0) { complement_pairs++; } - } - if (complement_pairs > 0) { - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible: %lld complement-pairs", - static_cast(complement_pairs)); - return clique_cut_build_status_t::NO_CUT; - } - - // Second pass: emit cut coefficients. We already know there are no - // complement-pair conflicts so the lookups against seen_original / - // seen_complement that the baseline performed are now redundant. - i_t num_complements = 0; - const bool has_original = !seen_original.empty(); - const bool has_complement = !seen_complement.empty(); - for (const auto vertex_idx : clique_vertices) { const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; const f_t lower_bound = lower_bounds[var_idx]; const f_t upper_bound = upper_bounds[var_idx]; - cuopt_assert(var_types[var_idx] != variable_type_t::CONTINUOUS, "Clique contains continuous variable"); cuopt_assert(lower_bound >= -bound_tol, "Clique variable lower bound below zero"); @@ -134,57 +105,73 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic static_cast(lower_bound); static_cast(upper_bound); - // Cut is stored in form sum_j a_j x_j >= rhs for direct dot-product - // violation checks. 
Complemented literals (1 - x_j) contribute +1*x_j - // to the inequality and originals contribute -1*x_j. if (complement) { - num_complements++; - cut.i.push_back(var_idx); - cut.x.push_back(1.0); + cuopt_assert(seen_complement.count(var_idx) == 0, "Duplicate complement in clique"); + if (seen_original.count(var_idx) > 0) { complement_pairs.insert(var_idx); } + seen_complement.insert(var_idx); } else { - cut.i.push_back(var_idx); - cut.x.push_back(-1.0); + cuopt_assert(seen_original.count(var_idx) == 0, "Duplicate variable in clique"); + if (seen_complement.count(var_idx) > 0) { complement_pairs.insert(var_idx); } + seen_original.insert(var_idx); } } + // >= 2 complement pairs force two distinct variables each into + // {0} \cap {1} simultaneously => node is LP-infeasible. The caller is + // expected to short-circuit the rest of cut generation on INFEASIBLE. + if (complement_pairs.size() >= 2) { + CLIQUE_CUTS_DEBUG("build_clique_cut infeasible: %lld complement-pairs", + static_cast(complement_pairs.size())); + return clique_cut_build_status_t::INFEASIBLE; + } + + // Exactly one complement pair (x + (1-x) = 1) contributes nothing to the + // sum but forces every other clique member to 0. We drop the paired + // variable from the support and bump rhs by 1, producing a fixing cut. + const bool has_pair = complement_pairs.size() == 1; + i_t num_complements = 0; + for (const auto vertex_idx : clique_vertices) { + const i_t var_idx = vertex_idx % num_vars; + const bool complement = vertex_idx >= num_vars; + if (has_pair && complement_pairs.count(var_idx) > 0) { continue; } + cut.i.push_back(var_idx); + cut.x.push_back(complement ? static_cast(1.0) : static_cast(-1.0)); + if (complement) { num_complements++; } + } + if (cut.i.empty()) { CLIQUE_CUTS_DEBUG("build_clique_cut no_cut empty support"); return clique_cut_build_status_t::NO_CUT; } - cut_rhs = static_cast(num_complements - 1); + cut_rhs = has_pair ? 
static_cast(num_complements) : static_cast(num_complements - 1); cut.sort(); - // P0-3 (4): has_pair distinguishes pure (all originals OR all - // complements) from mixed cliques in the accepted-cut log line so - // post-mortem analysis can attribute gap closure to one variant or - // the other. - const int has_pair = (has_original && has_complement) ? 1 : 0; const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; if (violation > min_violation) { CLIQUE_CUTS_DEBUG( - "build_clique_cut accepted nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " - "has_pair=%d", + "build_clique_cut accepted has_pair=%d nz=%lld rhs=%g dot=%g violation=%g threshold=%g " + "complements=%lld", + has_pair ? 1 : 0, static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements), - has_pair); + static_cast(num_complements)); return clique_cut_build_status_t::CUT_ADDED; } CLIQUE_CUTS_DEBUG( - "build_clique_cut rejected nz=%lld rhs=%g dot=%g violation=%g threshold=%g complements=%lld " - "has_pair=%d", + "build_clique_cut rejected has_pair=%d nz=%lld rhs=%g dot=%g violation=%g threshold=%g " + "complements=%lld", + has_pair ? 1 : 0, static_cast(cut.i.size()), static_cast(cut_rhs), static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements), - has_pair); + static_cast(num_complements)); return clique_cut_build_status_t::NO_CUT; } @@ -2213,14 +2200,15 @@ bool cut_generation_t::generate_clique_cuts( kept_adj_entries += adj.size(); #ifdef ASSERT_MODE { + // {k, ~k} as neighbors is legal (vertex_idx is then implicitly fixed to + // 0 by the conflict structure); build_clique_cut handles the resulting + // cliques as fixing cuts or infeasibility signals, so only duplicates + // are a real invariant here. 
std::unordered_set adj_global; adj_global.reserve(adj.size()); for (const auto neighbor : adj) { i_t v = vertices[neighbor]; cuopt_assert(adj_global.insert(v).second, "Duplicate neighbor in adjacency list"); - i_t complement = (v >= num_vars) ? (v - num_vars) : (v + num_vars); - cuopt_assert(adj_global.find(complement) == adj_global.end(), - "Adjacency list contains complementing variable"); } } #endif diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index b96b48a413..620b7831e7 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -125,12 +125,14 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = - [&population](f_t obj, const std::vector& h_vec, double /*work_units*/) { + [&population, problem_ptr = context.problem_ptr]( + f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); + (void)problem_ptr; if (obj < local_search_best_obj) { CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", - context.problem_ptr->get_user_obj_from_solver_obj(obj), - context.problem_ptr->get_user_obj_from_solver_obj( + problem_ptr->get_user_obj_from_solver_obj(obj), + problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? 
population.best_feasible().get_objective() : std::numeric_limits::max())); local_search_best_obj = obj; From f2004bcbcaf945d9fa9f04d6f8dc215d524a375a Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 15 May 2026 18:50:12 +0200 Subject: [PATCH 11/47] fix compile error --- cpp/src/cuts/cuts.cpp | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index e8875db1f7..22c16ab773 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -112,31 +112,6 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic seen_complement.reserve(clique_vertices.size()); for (const auto vertex_idx : clique_vertices) { cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Clique vertex out of range"); - const i_t var_idx = vertex_idx % num_vars; - const bool complement = vertex_idx >= num_vars; - if (complement) { - seen_complement.insert(var_idx); - } else { - seen_original.insert(var_idx); - } - } - i_t complement_pairs = 0; - for (const auto var_idx : seen_original) { - if (seen_complement.count(var_idx) > 0) { complement_pairs++; } - } - if (complement_pairs > 0) { - CLIQUE_CUTS_DEBUG("build_clique_cut infeasible: %lld complement-pairs", - static_cast(complement_pairs)); - return clique_cut_build_status_t::NO_CUT; - } - - // Second pass: emit cut coefficients. We already know there are no - // complement-pair conflicts so the lookups against seen_original / - // seen_complement that the baseline performed are now redundant. - i_t num_complements = 0; - const bool has_original = !seen_original.empty(); - const bool has_complement = !seen_complement.empty(); - for (const auto vertex_idx : clique_vertices) { const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; const f_t lower_bound = lower_bounds[var_idx]; @@ -190,11 +165,6 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic cut_rhs = has_pair ? 
static_cast(num_complements) : static_cast(num_complements - 1); cut.sort(); - // P0-3 (4): has_pair distinguishes pure (all originals OR all - // complements) from mixed cliques in the accepted-cut log line so - // post-mortem analysis can attribute gap closure to one variant or - // the other. - const int has_pair = (has_original && has_complement) ? 1 : 0; const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; if (violation > min_violation) { @@ -207,8 +177,7 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements), - has_pair); + static_cast(num_complements)); return clique_cut_build_status_t::CUT_ADDED; } CLIQUE_CUTS_DEBUG( @@ -220,8 +189,7 @@ clique_cut_build_status_t build_clique_cut(const std::vector& clique_vertic static_cast(dot), static_cast(violation), static_cast(min_violation), - static_cast(num_complements), - has_pair); + static_cast(num_complements)); return clique_cut_build_status_t::NO_CUT; } From 3f0ace164bc4bb0d0a2b3b69a26adcf08d5b30ac Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 15 May 2026 20:03:11 +0200 Subject: [PATCH 12/47] fix omp --- cpp/src/CMakeLists.txt | 1 + cpp/src/utilities/omp_helpers.cpp | 45 +++++++++++++++++++++++++++++++ cpp/src/utilities/omp_helpers.hpp | 35 ++++++++---------------- 3 files changed, 57 insertions(+), 24 deletions(-) create mode 100644 cpp/src/utilities/omp_helpers.cpp diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index c99210bf34..94821065f4 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -5,6 +5,7 @@ set(UTIL_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/utilities/seed_generator.cu ${CMAKE_CURRENT_SOURCE_DIR}/utilities/logger.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/omp_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/version_info.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/timestamp_utils.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/utilities/work_unit_scheduler.cpp) diff --git a/cpp/src/utilities/omp_helpers.cpp b/cpp/src/utilities/omp_helpers.cpp new file mode 100644 index 0000000000..974197c196 --- /dev/null +++ b/cpp/src/utilities/omp_helpers.cpp @@ -0,0 +1,45 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#ifdef _OPENMP + +#include + +namespace cuopt { + +// All operations on the underlying `omp_lock_t` are defined out-of-line so +// that `new omp_lock_t` and the matching (sized) `delete` invoked through +// `std::unique_ptr` exist in exactly one translation unit. This +// avoids ODR-induced `new-delete-type-mismatch` errors when other TUs (most +// notably NVCC host passes) end up with a differently sized `omp_lock_t`. + +omp_mutex_t::omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } + +omp_mutex_t::omp_mutex_t(omp_mutex_t&& other) noexcept { *this = std::move(other); } + +omp_mutex_t& omp_mutex_t::operator=(omp_mutex_t&& other) noexcept +{ + if (&other != this) { + if (mutex) { omp_destroy_lock(mutex.get()); } + mutex = std::move(other.mutex); + } + return *this; +} + +omp_mutex_t::~omp_mutex_t() +{ + if (mutex) { + omp_destroy_lock(mutex.get()); + mutex.reset(); + } +} + +} // namespace cuopt + +#endif // _OPENMP diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index bbf4327f81..afeb3acc27 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -17,37 +17,24 @@ namespace cuopt { // Wrapper of omp_lock_t. Optionally, you can provide a hint as defined in // https://www.openmp.org/spec-html/5.1/openmpse39.html#x224-2570003.9 +// +// The constructor / destructor / move-ops are intentionally out-of-line and +// defined in omp_helpers.cpp. 
This ensures that the `new omp_lock_t` and the +// matching (sized) `delete` instantiated through `std::unique_ptr` +// happen in exactly one translation unit. Otherwise NVCC host passes (and +// other TUs) can end up with different `sizeof(omp_lock_t)` values, which +// ODR-merges into a `new-delete-type-mismatch` at runtime under ASan. class omp_mutex_t { public: - omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - + omp_mutex_t(); omp_mutex_t(const omp_mutex_t&) = delete; - - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } - + omp_mutex_t(omp_mutex_t&& other) noexcept; omp_mutex_t& operator=(const omp_mutex_t&) = delete; - - omp_mutex_t& operator=(omp_mutex_t&& other) - { - if (&other != this) { - if (mutex) { omp_destroy_lock(mutex.get()); } - mutex = std::move(other.mutex); - } - return *this; - } - - virtual ~omp_mutex_t() - { - if (mutex) { - omp_destroy_lock(mutex.get()); - mutex.reset(); - } - } + omp_mutex_t& operator=(omp_mutex_t&& other) noexcept; + ~omp_mutex_t(); void lock() { omp_set_lock(mutex.get()); } - void unlock() { omp_unset_lock(mutex.get()); } - bool try_lock() { return omp_test_lock(mutex.get()); } private: From 7995451a7ef738b5e0c703a9715748417db31651 Mon Sep 17 00:00:00 2001 From: akif Date: Sat, 16 May 2026 08:54:37 +0200 Subject: [PATCH 13/47] with additional fix --- cpp/src/utilities/omp_helpers.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index afeb3acc27..abe0bcce40 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -24,6 +24,11 @@ namespace cuopt { // happen in exactly one translation unit. Otherwise NVCC host passes (and // other TUs) can end up with different `sizeof(omp_lock_t)` values, which // ODR-merges into a `new-delete-type-mismatch` at runtime under ASan. 
+// +// `virtual` on the destructor is preserved on purpose: it has been part of the +// class for a long time and removing it would change `sizeof(omp_mutex_t)` +// (no more vtable pointer), which would silently break any incremental build +// or any object file that wasn't rebuilt against the new header. class omp_mutex_t { public: omp_mutex_t(); @@ -31,7 +36,7 @@ class omp_mutex_t { omp_mutex_t(omp_mutex_t&& other) noexcept; omp_mutex_t& operator=(const omp_mutex_t&) = delete; omp_mutex_t& operator=(omp_mutex_t&& other) noexcept; - ~omp_mutex_t(); + virtual ~omp_mutex_t(); void lock() { omp_set_lock(mutex.get()); } void unlock() { omp_unset_lock(mutex.get()); } From 8ad61ddf3a34773d1b72143fb0a6b9202bb976a8 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 19 May 2026 22:05:37 -0400 Subject: [PATCH 14/47] add cuda error recovery for capture --- .../feasibility_jump/feasibility_jump.cu | 46 +++-- .../feasibility_jump/feasibility_jump.cuh | 4 +- cpp/src/pdlp/pdhg.cu | 30 +--- .../weighted_average_solution.cu | 9 +- .../adaptive_step_size_strategy.cu | 8 +- cpp/src/pdlp/utilities/ping_pong_graph.cu | 80 --------- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 61 +++++-- cpp/src/utilities/manual_cuda_graph.cuh | 159 ++++++++++++++++++ 8 files changed, 236 insertions(+), 161 deletions(-) create mode 100644 cpp/src/utilities/manual_cuda_graph.cuh diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu index 6b440aed4f..527a99cec3 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu @@ -108,8 +108,7 @@ fj_t::fj_t(mip_solver_context_t& context_, fj_settings_t in_ template void fj_t::reset_cuda_graph() { - if (graph_created) cudaGraphExecDestroy(graph_instance); - graph_created = false; + step_graph_.reset(); } template @@ -682,18 +681,23 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream // 
Load-balanced codepath not updated yet to handle rounding mode if (settings.mode == fj_mode_t::ROUNDING) { use_load_balancing = false; } - cudaGraph_t graph; void* kernel_args[] = {&v}; bool force_reset = false; void* reset_moves_args[] = {&v, &force_reset}; bool ignore_load_balancing = false; void* update_assignment_args[] = {&v, &ignore_load_balancing}; - if (!graph_created || !use_graph) { - // CUB temp storage initialization - size_t compaction_temp_storage_bytes = 0; - auto valid_move_iterator = thrust::make_transform_iterator( - thrust::counting_iterator(0), - cuda::proclaim_return_type([v] __device__(i_t i) -> i_t { return v.admits_move(i); })); + + // CUB temp storage probe + resize is intentionally done OUTSIDE the + // captured region: the resize would allocate, which is forbidden during + // capture, and the probe itself is a pure size calculation. We only need + // to (re)compute it on first capture for graph mode, and every time for + // eager mode -- the temp-storage size depends on n_variables only and is + // stable across iterations otherwise. + size_t compaction_temp_storage_bytes = 0; + auto valid_move_iterator = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type([v] __device__(i_t i) -> i_t { return v.admits_move(i); })); + if (!step_graph_.is_initialized() || !use_graph) { cub::DeviceSelect::Flagged((void*)nullptr, compaction_temp_storage_bytes, thrust::counting_iterator(0), @@ -705,10 +709,9 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream if (compaction_temp_storage_bytes > data.cub_storage_bytes.size()) { data.cub_storage_bytes.resize(compaction_temp_storage_bytes, climber_stream); } + } - if (use_graph) { - RAFT_CUDA_TRY(cudaStreamBeginCapture(climber_stream, cudaStreamCaptureModeThreadLocal)); - } + auto step_body = [&]() { for (i_t i = 0; i < (use_graph ? 
iterations_per_graph : 1); ++i) { { // related varialbe array has to be dynamically computed each iteration @@ -806,22 +809,13 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream 0, climber_stream)); } + }; - if (use_graph) { - RAFT_CUDA_TRY(cudaStreamEndCapture(climber_stream, &graph)); - try { - RAFT_CUDA_TRY(cudaGraphInstantiate(&graph_instance, graph)); - } catch (...) { - RAFT_CUDA_TRY(cudaGraphDestroy(graph)); - throw; - } - RAFT_CHECK_CUDA(climber_stream); - RAFT_CUDA_TRY(cudaGraphDestroy(graph)); - graph_created = true; - } + if (use_graph) { + step_graph_.run(climber_stream, step_body); + } else { + step_body(); } - - if (use_graph) RAFT_CUDA_TRY(cudaGraphLaunch(graph_instance, climber_stream)); } template diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh index 33d1ac527f..aefe5c2f28 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh @@ -18,6 +18,7 @@ #include #include +#include #include @@ -267,8 +268,7 @@ class fj_t { rmm::device_uvector work_id_to_nonbin_var_idx; rmm::device_uvector work_ids_for_related_vars; - cudaGraphExec_t graph_instance; - bool graph_created = false; + cuopt::manual_cuda_graph_t step_graph_; // kernel launch dimensions, computed once inside the constructor std::pair setval_launch_dims; diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index d9dbb083f9..dca0b828de 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -597,29 +597,23 @@ void pdhg_solver_t::compute_next_primal_dual_solution( #endif // Primal and dual steps are captured in a cuda graph since called very often - if (!graph_all.is_initialized(total_pdlp_iterations)) { - graph_all.start_capture(total_pdlp_iterations); + graph_all.run(total_pdlp_iterations, [&]() { // First compute only A_t @ y, needed later in adaptative step size compute_At_y(); // Compute 
fused primal gradient with projection compute_primal_projection_with_gradient(primal_step_size); // Compute next dual solution compute_next_dual_solution(dual_step_size); - graph_all.end_capture(total_pdlp_iterations); - } - graph_all.launch(total_pdlp_iterations); + }); } else { #ifdef PDLP_DEBUG_MODE std::cout << " Not computing A_t * Y" << std::endl; #endif // A_t * y was already computed in previous iteration - if (!graph_prim_proj_gradient_dual.is_initialized(total_pdlp_iterations)) { - graph_prim_proj_gradient_dual.start_capture(total_pdlp_iterations); + graph_prim_proj_gradient_dual.run(total_pdlp_iterations, [&]() { compute_primal_projection_with_gradient(primal_step_size); compute_next_dual_solution(dual_step_size); - graph_prim_proj_gradient_dual.end_capture(total_pdlp_iterations); - } - graph_prim_proj_gradient_dual.launch(total_pdlp_iterations); + }); } } @@ -1063,9 +1057,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Compute next primal solution reflected if (should_major) { - if (!graph_all.is_initialized(should_major)) { - graph_all.start_capture(should_major); - + graph_all.run(should_major, [&]() { compute_At_y(); if (!batch_mode_) { cub::DeviceTransform::Transform( @@ -1166,14 +1158,10 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( print("potential_next_dual_solution_", potential_next_dual_solution_); print("reflected_dual_", reflected_dual_); #endif - graph_all.end_capture(should_major); - } - graph_all.launch(should_major); + }); } else { - if (!graph_all.is_initialized(should_major)) { - graph_all.start_capture(should_major); - + graph_all.run(should_major, [&]() { // Compute next primal compute_At_y(); @@ -1281,9 +1269,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( #ifdef CUPDLP_DEBUG_MODE print("reflected_dual_", reflected_dual_); #endif - graph_all.end_capture(should_major); - } - graph_all.launch(should_major); + }); } } diff --git 
a/cpp/src/pdlp/restart_strategy/weighted_average_solution.cu b/cpp/src/pdlp/restart_strategy/weighted_average_solution.cu index 70a448a9de..098ad7911b 100644 --- a/cpp/src/pdlp/restart_strategy/weighted_average_solution.cu +++ b/cpp/src/pdlp/restart_strategy/weighted_average_solution.cu @@ -71,9 +71,7 @@ void weighted_average_solution_t::add_current_solution_to_weighted_ave // (same for primal and dual although julia repo makes it seem as though these should/could be // different) - if (!graph.is_initialized(total_pdlp_iterations)) { - graph.start_capture(total_pdlp_iterations); - + graph.run(total_pdlp_iterations, [&]() { cub::DeviceTransform::Transform( cuda::std::make_tuple(sum_primal_solutions_.data(), primal_solution), sum_primal_solutions_.data(), @@ -93,10 +91,7 @@ void weighted_average_solution_t::add_current_solution_to_weighted_ave weight.data(), sum_primal_solution_weights_.data(), sum_dual_solution_weights_.data()); - - graph.end_capture(total_pdlp_iterations); - } - graph.launch(total_pdlp_iterations); + }); iterations_since_last_restart_ += 1; } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index c95ed67ca6..1f137dc9ea 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -326,9 +326,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( cuopt_assert(!batch_mode_, "Batch mode is not supported for compute_step_sizes"); - if (!graph.is_initialized(total_pdlp_iterations)) { - graph.start_capture(total_pdlp_iterations); - + graph.run(total_pdlp_iterations, [&]() { // compute numerator and deminator of n_lim compute_interaction_and_movement(pdhg_solver.get_primal_tmp_resource(), pdhg_solver.get_cusparse_view(), @@ -339,9 +337,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( primal_step_size.data(), dual_step_size.data(), 
pdhg_solver.get_d_total_pdhg_iterations().data()); - graph.end_capture(total_pdlp_iterations); - } - graph.launch(total_pdlp_iterations); + }); // Steam sync so that next call can see modification made to host var valid_step_size RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 0df3861b5a..7e93692b39 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -7,11 +7,6 @@ #include -#include -#include - -#include - namespace cuopt::linear_programming::detail { template @@ -21,81 +16,6 @@ ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, { } -template -ping_pong_graph_t::~ping_pong_graph_t() -{ -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif -} - -template -void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) -{ -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - capture_even_active_ = true; - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - capture_odd_active_ = true; - } - } -#endif -} - -template -void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) -{ -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - capture_even_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); - even_initialized = true; - 
RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - capture_odd_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); - odd_initialized = true; - RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); - } - } -#endif -} - -template -void ping_pong_graph_t::launch(i_t total_pdlp_iterations) -{ -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); - } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); - } - } -#endif -} - -template -bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) -{ -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - return (total_pdlp_iterations % 2 == 0 && even_initialized) || - (total_pdlp_iterations % 2 == 1 && odd_initialized); - } -#endif - return false; -} - template class ping_pong_graph_t; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index dafecdd06e..0a2eb4b1d8 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -8,38 +8,63 @@ #pragma once #include +#include #include -#include +#include namespace cuopt::linear_programming::detail { -// Helper class to capture and launch CUDA graph -// No additional checks for safe usage (calling launch() before initializing the graph) use with -// caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a -// valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph +// Two-slot CUDA-graph cache for PDLP. 
PDLP swaps pointers (rather than +// copying vectors) at the end of every pdhg step, so the captured graph +// topology alternates between two layouts depending on iteration parity. +// Each slot is a manual_cuda_graph_t, which (a) builds the parent graph +// explicitly via cudaGraphCreate + cudaStreamBeginCaptureToGraph and +// (b) recovers from cudaErrorStreamCaptureInvalidated by re-executing the +// supplied work eagerly. See manual_cuda_graph.cuh for details. template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); - ~ping_pong_graph_t(); + ~ping_pong_graph_t() = default; - void start_capture(i_t total_pdlp_iterations); - void end_capture(i_t total_pdlp_iterations); - void launch(i_t total_pdlp_iterations); - bool is_initialized(i_t total_pdlp_iterations); + // Non-copyable because the underlying manual_cuda_graph_t owns a + // cudaGraphExec_t handle. Move-assignment is needed by pdlp.cu, which + // re-binds the existing slot to a freshly-constructed legacy-batch-mode + // instance after an SpMM run. + ping_pong_graph_t(const ping_pong_graph_t&) = delete; + ping_pong_graph_t& operator=(const ping_pong_graph_t&) = delete; + ping_pong_graph_t(ping_pong_graph_t&&) noexcept = default; + ping_pong_graph_t& operator=(ping_pong_graph_t&&) noexcept = default; + + // Either launch the cached graph for this parity slot, or capture `work` + // into a freshly-created parent graph, instantiate, and launch. Capture + // invalidation is recovered by re-running `work` eagerly (see + // manual_cuda_graph_t::run). In CUPDLP_DEBUG_MODE or legacy-batch mode + // the work is always executed eagerly with no graph involvement. 
+ template + void run(i_t total_pdlp_iterations, F&& work) + { +#ifdef CUPDLP_DEBUG_MODE + work(); +#else + if (is_legacy_batch_mode_) { + work(); + return; + } + if (total_pdlp_iterations % 2 == 0) { + even_graph_.run(stream_view_, std::forward(work)); + } else { + odd_graph_.run(stream_view_, std::forward(work)); + } +#endif + } private: - cudaGraph_t even_graph; - cudaGraph_t odd_graph; - cudaGraphExec_t even_instance; - cudaGraphExec_t odd_instance; + manual_cuda_graph_t even_graph_; + manual_cuda_graph_t odd_graph_; rmm::cuda_stream_view stream_view_; - bool even_initialized{false}; - bool odd_initialized{false}; - bool capture_even_active_{false}; - bool capture_odd_active_{false}; bool is_legacy_batch_mode_{false}; }; diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh new file mode 100644 index 0000000000..5ac118ed4f --- /dev/null +++ b/cpp/src/utilities/manual_cuda_graph.cuh @@ -0,0 +1,159 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +namespace cuopt { + +// Wrapper around a CUDA graph that is built via *manual* parent-graph +// construction. The cudaGraph_t object is explicitly created with +// cudaGraphCreate; user work is then captured directly into that +// manually-owned parent via cudaStreamBeginCaptureToGraph. CUB / Thrust / +// RAFT / cuSPARSE calls inside the captured region are preserved. +// +// The single public entry point is `run(stream, callable)`. It either +// launches the previously-instantiated graph or, on first call, captures the +// callable into a fresh graph, instantiates it, and launches. 
+// +// Invalidation recovery: +// If cudaStreamEndCapture returns cudaErrorStreamCaptureInvalidated +// (typically because another thread issued a synchronous CUDA call -- +// cudaDeviceSynchronize, cudaMalloc, cudaFree, or a library first-use that +// internally syncs the device -- concurrently with this capture window), +// the captured work has been recorded but NOT issued to the device. The +// wrapper discards the partial graph, re-executes `work` eagerly so the +// current iteration still produces correct results, and leaves itself +// uninitialized so the next `run` call retries capture. The cost of an +// invalidation is therefore one extra eager pass, not a crash. +// +// Not thread-safe per instance: a single manual_cuda_graph_t must be driven +// from one thread at a time. Multiple instances on per-thread streams, +// captured concurrently across threads, is the supported multi-threaded +// pattern. +class manual_cuda_graph_t { + public: + manual_cuda_graph_t() = default; + + manual_cuda_graph_t(const manual_cuda_graph_t&) = delete; + manual_cuda_graph_t& operator=(const manual_cuda_graph_t&) = delete; + + manual_cuda_graph_t(manual_cuda_graph_t&& other) noexcept { swap(other); } + manual_cuda_graph_t& operator=(manual_cuda_graph_t&& other) noexcept + { + if (this != &other) { + destroy(); + swap(other); + } + return *this; + } + + ~manual_cuda_graph_t() { destroy(); } + + template + void run(rmm::cuda_stream_view stream, F&& work) + { + if (instance_ != nullptr) { + RAFT_CUDA_TRY(cudaGraphLaunch(instance_, stream.value())); + return; + } + + cudaGraph_t parent = nullptr; + RAFT_CUDA_TRY(cudaGraphCreate(&parent, 0)); + + // RAII: if user code throws mid-capture, end capture so the stream isn't + // left in capture state. Errors here are intentionally swallowed -- we're + // already unwinding for another reason and the parent graph is being + // destroyed below. 
+ capture_guard_t guard{stream.value(), parent}; + + RAFT_CUDA_TRY(cudaStreamBeginCaptureToGraph( + stream.value(), parent, nullptr, nullptr, 0, cudaStreamCaptureModeThreadLocal)); + guard.capture_active = true; + + work(); + + cudaGraph_t captured = nullptr; + cudaError_t end_err = cudaStreamEndCapture(stream.value(), &captured); + guard.capture_active = false; + + if (end_err == cudaErrorStreamCaptureInvalidated) { + // Capture got invalidated by a concurrent synchronous CUDA call on + // another thread (cudaMalloc / cudaDeviceSynchronize / library + // first-use). The recorded work has NOT been issued to the device. + // Clear the error, drop the partial graph, and re-run eagerly so the + // current iteration still produces correct results. The wrapper stays + // uninitialized so the next call retries capture. + guard.parent = nullptr; // we're about to destroy it ourselves + if (captured != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(captured)); } + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(parent)); + // Drain the sticky error so the next CUDA call doesn't see it. + cudaGetLastError(); + work(); + return; + } + RAFT_CUDA_TRY(end_err); + + // cudaStreamBeginCaptureToGraph guarantees the returned graph IS the one + // we passed in; the captured handle is just an alias. + cuopt_assert(captured == parent, "cudaStreamEndCapture returned an unexpected graph handle"); + + RAFT_CUDA_TRY(cudaGraphInstantiate(&instance_, parent)); + guard.parent = nullptr; // ownership transferred; we destroy explicitly below + RAFT_CUDA_TRY(cudaGraphDestroy(parent)); + + RAFT_CUDA_TRY(cudaGraphLaunch(instance_, stream.value())); + } + + bool is_initialized() const noexcept { return instance_ != nullptr; } + + // Drop the instantiated graph so the next run() re-captures from scratch. + void reset() { destroy(); } + + private: + // RAII helper: cleans up a partial capture and the manually-created parent + // graph if the user-supplied callable throws between start- and end-capture. 
+ struct capture_guard_t { + cudaStream_t stream{}; + cudaGraph_t parent{nullptr}; + bool capture_active{false}; + + ~capture_guard_t() noexcept + { + if (capture_active) { + cudaGraph_t dummy = nullptr; + // best-effort; we're already unwinding + cudaStreamEndCapture(stream, &dummy); + if (dummy != nullptr) { cudaGraphDestroy(dummy); } + } + if (parent != nullptr) { cudaGraphDestroy(parent); } + } + }; + + void destroy() noexcept + { + if (instance_ != nullptr) { + RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(instance_)); + instance_ = nullptr; + } + } + + void swap(manual_cuda_graph_t& other) noexcept { std::swap(instance_, other.instance_); } + + cudaGraphExec_t instance_{nullptr}; +}; + +} // namespace cuopt From 0a5149b93c28228d5c622c703d686345491f9002 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 19 May 2026 22:14:54 -0400 Subject: [PATCH 15/47] test CI --- cpp/src/utilities/manual_cuda_graph.cuh | 1 + cpp/tests/mip/incumbent_callback_test.cu | 10 ++++------ .../linear_programming/test_incumbent_callbacks.py | 12 ++---------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh index 5ac118ed4f..e3a3db0219 100644 --- a/cpp/src/utilities/manual_cuda_graph.cuh +++ b/cpp/src/utilities/manual_cuda_graph.cuh @@ -91,6 +91,7 @@ class manual_cuda_graph_t { guard.capture_active = false; if (end_err == cudaErrorStreamCaptureInvalidated) { + CUOPT_LOG_INFO("Capture got invalidated by a concurrent synchronous CUDA call"); // Capture got invalidated by a concurrent synchronous CUDA call on // another thread (cudaMalloc / cudaDeviceSynchronize / library // first-use). The recorded work has NOT been issued to the device. 
diff --git a/cpp/tests/mip/incumbent_callback_test.cu b/cpp/tests/mip/incumbent_callback_test.cu index 236cd203fc..95a2b0a1b3 100644 --- a/cpp/tests/mip/incumbent_callback_test.cu +++ b/cpp/tests/mip/incumbent_callback_test.cu @@ -138,9 +138,8 @@ void test_incumbent_callback(std::string test_instance, bool include_set_callbac TEST(mip_solve, incumbent_get_callback_test) { - // swath1 is temporarily disabled here because this incumbent callback path can abort - // nondeterministically in CI while MIP root relaxation uses concurrent PDLP CUDA graph capture. - std::vector test_instances = {"mip/50v-10.mps", "mip/neos5-free-bound.mps"}; + std::vector test_instances = { + "mip/50v-10.mps", "mip/neos5-free-bound.mps", "mip/swath1.mps"}; for (const auto& test_instance : test_instances) { test_incumbent_callback(test_instance, false); } @@ -148,9 +147,8 @@ TEST(mip_solve, incumbent_get_callback_test) TEST(mip_solve, incumbent_get_set_callback_test) { - // swath1 is temporarily disabled here because this incumbent callback path can abort - // nondeterministically in CI while MIP root relaxation uses concurrent PDLP CUDA graph capture. 
- std::vector test_instances = {"mip/50v-10.mps", "mip/neos5-free-bound.mps"}; + std::vector test_instances = { + "mip/50v-10.mps", "mip/neos5-free-bound.mps", "mip/swath1.mps"}; for (const auto& test_instance : test_instances) { test_incumbent_callback(test_instance, true); } diff --git a/python/cuopt/cuopt/tests/linear_programming/test_incumbent_callbacks.py b/python/cuopt/cuopt/tests/linear_programming/test_incumbent_callbacks.py index 55a34016bd..0213b95358 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_incumbent_callbacks.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_incumbent_callbacks.py @@ -22,14 +22,6 @@ RAPIDS_DATASET_ROOT_DIR = os.getcwd() RAPIDS_DATASET_ROOT_DIR = os.path.join(RAPIDS_DATASET_ROOT_DIR, "datasets") -_SWATH1_GRAPH_CAPTURE_SKIP = pytest.mark.skip( - reason=( - "Temporarily disabled: swath1 incumbent callback tests can abort " - "nondeterministically in CI while MIP root relaxation uses concurrent " - "PDLP CUDA graph capture." - ) -) - def _run_incumbent_solver_callback(file_name, include_set_callback): # Callback for incumbent solution @@ -112,7 +104,7 @@ def set_solution( @pytest.mark.parametrize( "file_name", [ - pytest.param("/mip/swath1.mps", marks=_SWATH1_GRAPH_CAPTURE_SKIP), + ("/mip/swath1.mps"), ("/mip/neos5-free-bound.mps"), ], ) @@ -123,7 +115,7 @@ def test_incumbent_get_callback(file_name): @pytest.mark.parametrize( "file_name", [ - pytest.param("/mip/swath1.mps", marks=_SWATH1_GRAPH_CAPTURE_SKIP), + ("/mip/swath1.mps"), ("/mip/neos5-free-bound.mps"), ], ) From 06352dbc88190db5433b22b32efcec7f5eab79ed Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 19 May 2026 23:12:32 -0400 Subject: [PATCH 16/47] fix logger --- cpp/src/utilities/manual_cuda_graph.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh index e3a3db0219..613f944295 100644 --- a/cpp/src/utilities/manual_cuda_graph.cuh +++ 
b/cpp/src/utilities/manual_cuda_graph.cuh @@ -7,6 +7,7 @@ #pragma once +#include #include #include From 4d2fb18b27da5ff15d881a4b13e4adc13418472f Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 20 May 2026 06:51:57 -0400 Subject: [PATCH 17/47] restore the api and use api suitable for <12.3 --- cpp/src/utilities/manual_cuda_graph.cuh | 62 +++++++------------------ 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh index 613f944295..682f760494 100644 --- a/cpp/src/utilities/manual_cuda_graph.cuh +++ b/cpp/src/utilities/manual_cuda_graph.cuh @@ -20,26 +20,18 @@ namespace cuopt { -// Wrapper around a CUDA graph that is built via *manual* parent-graph -// construction. The cudaGraph_t object is explicitly created with -// cudaGraphCreate; user work is then captured directly into that -// manually-owned parent via cudaStreamBeginCaptureToGraph. CUB / Thrust / -// RAFT / cuSPARSE calls inside the captured region are preserved. -// -// The single public entry point is `run(stream, callable)`. It either -// launches the previously-instantiated graph or, on first call, captures the -// callable into a fresh graph, instantiates it, and launches. +// Wrapper around a CUDA graph captured from a callable. CUB / Thrust / RAFT / +// cuSPARSE calls inside the captured region are preserved. // // Invalidation recovery: // If cudaStreamEndCapture returns cudaErrorStreamCaptureInvalidated // (typically because another thread issued a synchronous CUDA call -- // cudaDeviceSynchronize, cudaMalloc, cudaFree, or a library first-use that // internally syncs the device -- concurrently with this capture window), -// the captured work has been recorded but NOT issued to the device. The -// wrapper discards the partial graph, re-executes `work` eagerly so the -// current iteration still produces correct results, and leaves itself -// uninitialized so the next `run` call retries capture. 
The cost of an -// invalidation is therefore one extra eager pass, not a crash. +// the captured work has NOT been issued to the device. The wrapper drains +// the sticky error, re-executes `work` eagerly so the current iteration +// still produces correct results, and leaves itself uninitialized so the +// next `run` call retries capture. // // Not thread-safe per instance: a single manual_cuda_graph_t must be driven // from one thread at a time. Multiple instances on per-thread streams, @@ -72,17 +64,11 @@ class manual_cuda_graph_t { return; } - cudaGraph_t parent = nullptr; - RAFT_CUDA_TRY(cudaGraphCreate(&parent, 0)); - // RAII: if user code throws mid-capture, end capture so the stream isn't - // left in capture state. Errors here are intentionally swallowed -- we're - // already unwinding for another reason and the parent graph is being - // destroyed below. - capture_guard_t guard{stream.value(), parent}; + // left in capture state. Errors are swallowed -- we're already unwinding. + capture_guard_t guard{stream.value()}; - RAFT_CUDA_TRY(cudaStreamBeginCaptureToGraph( - stream.value(), parent, nullptr, nullptr, 0, cudaStreamCaptureModeThreadLocal)); + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream.value(), cudaStreamCaptureModeThreadLocal)); guard.capture_active = true; work(); @@ -92,30 +78,18 @@ class manual_cuda_graph_t { guard.capture_active = false; if (end_err == cudaErrorStreamCaptureInvalidated) { - CUOPT_LOG_INFO("Capture got invalidated by a concurrent synchronous CUDA call"); - // Capture got invalidated by a concurrent synchronous CUDA call on - // another thread (cudaMalloc / cudaDeviceSynchronize / library - // first-use). The recorded work has NOT been issued to the device. - // Clear the error, drop the partial graph, and re-run eagerly so the - // current iteration still produces correct results. The wrapper stays - // uninitialized so the next call retries capture. 
- guard.parent = nullptr; // we're about to destroy it ourselves - if (captured != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(captured)); } - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(parent)); - // Drain the sticky error so the next CUDA call doesn't see it. + // The recorded work has NOT been issued; drain the sticky error and re-run + // eagerly so this iteration still produces correct results. Stay uninitialized + // so the next call retries capture. EndCapture sets `captured` to nullptr on + // invalidation, so there is no graph to free here. cudaGetLastError(); work(); return; } RAFT_CUDA_TRY(end_err); - // cudaStreamBeginCaptureToGraph guarantees the returned graph IS the one - // we passed in; the captured handle is just an alias. - cuopt_assert(captured == parent, "cudaStreamEndCapture returned an unexpected graph handle"); - - RAFT_CUDA_TRY(cudaGraphInstantiate(&instance_, parent)); - guard.parent = nullptr; // ownership transferred; we destroy explicitly below - RAFT_CUDA_TRY(cudaGraphDestroy(parent)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&instance_, captured)); + RAFT_CUDA_TRY(cudaGraphDestroy(captured)); RAFT_CUDA_TRY(cudaGraphLaunch(instance_, stream.value())); } @@ -126,11 +100,10 @@ class manual_cuda_graph_t { void reset() { destroy(); } private: - // RAII helper: cleans up a partial capture and the manually-created parent - // graph if the user-supplied callable throws between start- and end-capture. + // RAII helper: cleans up a partial capture if the user-supplied callable + // throws between start- and end-capture. 
struct capture_guard_t { cudaStream_t stream{}; - cudaGraph_t parent{nullptr}; bool capture_active{false}; ~capture_guard_t() noexcept @@ -141,7 +114,6 @@ class manual_cuda_graph_t { cudaStreamEndCapture(stream, &dummy); if (dummy != nullptr) { cudaGraphDestroy(dummy); } } - if (parent != nullptr) { cudaGraphDestroy(parent); } } }; From dba39c8b48eff3a9140428a6d03633da8e2bd3b1 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 20 May 2026 07:05:51 -0400 Subject: [PATCH 18/47] more comments --- cpp/src/utilities/manual_cuda_graph.cuh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh index 682f760494..68b37b7c71 100644 --- a/cpp/src/utilities/manual_cuda_graph.cuh +++ b/cpp/src/utilities/manual_cuda_graph.cuh @@ -32,6 +32,10 @@ namespace cuopt { // the sticky error, re-executes `work` eagerly so the current iteration // still produces correct results, and leaves itself uninitialized so the // next `run` call retries capture. +// IMPORTANT: because `work` is invoked a second time on recovery, any +// host-side mutations inside the callable will run twice -- keep `work` +// host-idempotent or move host bookkeeping (counters, flags, hash updates, +// etc.) outside the callable. // // Not thread-safe per instance: a single manual_cuda_graph_t must be driven // from one thread at a time. Multiple instances on per-thread streams, @@ -88,8 +92,12 @@ class manual_cuda_graph_t { } RAFT_CUDA_TRY(end_err); - RAFT_CUDA_TRY(cudaGraphInstantiate(&instance_, captured)); - RAFT_CUDA_TRY(cudaGraphDestroy(captured)); + // Destroy the source graph regardless of whether instantiation succeeded: + // on failure cudaGraphInstantiate leaves instance_ at nullptr per the API + // contract, and the source graph is unconditionally not needed any more. 
+ cudaError_t inst_err = cudaGraphInstantiate(&instance_, captured); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(captured)); + RAFT_CUDA_TRY(inst_err); RAFT_CUDA_TRY(cudaGraphLaunch(instance_, stream.value())); } From 56b6e846a896b1bce2976d9e188aef3122fb90d2 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 20 May 2026 09:16:55 -0400 Subject: [PATCH 19/47] fix ping pong graph major, non-major logic --- cpp/src/pdlp/pdhg.cu | 25 ++++++++++++++++++++----- cpp/src/pdlp/pdhg.hpp | 13 +++++++++++-- cpp/src/pdlp/pdlp.cu | 11 +++++++---- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index dca0b828de..15b306eb15 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -97,6 +97,7 @@ pdhg_solver_t::pdhg_solver_t( // Currently graph capture is not supported for cuSparse SpMM // TODO enable once cuSparse SpMM supports graph capture graph_all{stream_view_, is_legacy_batch_mode || batch_mode_}, + graph_all_non_major{stream_view_, is_legacy_batch_mode || batch_mode_}, graph_prim_proj_gradient_dual{stream_view_, is_legacy_batch_mode}, d_total_pdhg_iterations_{0, stream_view_}, climber_strategies_(climber_strategies), @@ -363,6 +364,12 @@ ping_pong_graph_t& pdhg_solver_t::get_graph_all() return graph_all; } +template +ping_pong_graph_t& pdhg_solver_t::get_graph_all_non_major() +{ + return graph_all_non_major; +} + template rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() { @@ -1048,16 +1055,23 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, const rmm::device_uvector& bound_rescaling, - bool should_major) + bool should_major, + i_t total_pdlp_iterations) { raft::common::nvtx::range fun_scope("compute_next_primal_dual_solution_reflected"); using f_t2 = typename type_2::type; - // Compute next primal solution reflected + // Compute next primal solution reflected. 
+ // + // The major and non-major branches build different graph topologies, and update_solution() + // swaps the primal/dual ping-pong buffers between outer pdlp iterations — so the captured + // graph's baked-in pointers depend on `total_pdlp_iterations` parity, not on `should_major`. + // Use a dedicated ping-pong cache per branch and key each on `total_pdlp_iterations` so each + // (branch, parity) pair maps to its own cached executable. if (should_major) { - graph_all.run(should_major, [&]() { + graph_all.run(total_pdlp_iterations, [&]() { compute_At_y(); if (!batch_mode_) { cub::DeviceTransform::Transform( @@ -1161,7 +1175,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( }); } else { - graph_all.run(should_major, [&]() { + graph_all_non_major.run(total_pdlp_iterations, [&]() { // Compute next primal compute_At_y(); @@ -1301,7 +1315,8 @@ void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_si dual_step_size, bound_rescaling, is_major_iteration || - ((total_pdlp_iterations + 2) % conditional_major(total_pdlp_iterations + 2)) == 0); + ((total_pdlp_iterations + 2) % conditional_major(total_pdlp_iterations + 2)) == 0, + total_pdlp_iterations); } total_pdhg_iterations_ += 1; } diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 52f45dc83d..9c284a6aa7 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -60,6 +60,7 @@ class pdhg_solver_t { const thrust::universal_host_pinned_vector>& swap_pairs, i_t new_size); void resize_context(i_t new_size); ping_pong_graph_t& get_graph_all(); + ping_pong_graph_t& get_graph_all_non_major(); rmm::device_uvector& get_new_bounds_climber_id() { return new_bounds_climber_id_; } rmm::device_uvector& get_new_bounds_idx() { return new_bounds_idx_; } @@ -89,7 +90,8 @@ class pdhg_solver_t { rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, const rmm::device_uvector& bound_rescaling, // Only used in batch mode - bool should_major); + bool should_major, + i_t 
total_pdlp_iterations); void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& primal_step_size); @@ -127,8 +129,15 @@ class pdhg_solver_t { // Different graphs for each case // Either compute the whole next primal step - // Or skip the SpMV (most cases) if it was done at the previous iteration + // Or skip the SpMV (most cases) if it was done at the previous iteration. + // The reflected primal/dual path branches on `should_major`, and the two branches build + // different graph topologies. They get separate ping-pong caches so each branch can key its + // 2-slot cache on `total_pdlp_iterations` parity (the swap state of the primal/dual buffers + // baked into the captured graph) without colliding with the other branch's topology. + // graph_all serves the non-reflected path and the major reflected branch (mutually exclusive + // at runtime); graph_all_non_major serves the non-major reflected branch. ping_pong_graph_t graph_all; + ping_pong_graph_t graph_all_non_major; ping_pong_graph_t graph_prim_proj_gradient_dual; // Needed for faster graph launch diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 49c77e44dc..8179a166b8 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1970,10 +1970,13 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( stream_view_); #endif - // Set PDHG graph to unitilized so that next call can start a new graph - // Currently graph capture is not supported for cuSparse SpMM - // TODO enable once cuSparse SpMM supports graph capture - pdhg_solver_.get_graph_all() = ping_pong_graph_t(stream_view_, true); + // Set PDHG graphs to uninitialized so that next call can start a new graph. + // Currently graph capture is not supported for cuSparse SpMM. + // TODO enable once cuSparse SpMM supports graph capture. 
+ // Reset both reflected-path caches: graph_all (non-reflected + reflected major) and + // graph_all_non_major (reflected non-major). + pdhg_solver_.get_graph_all() = ping_pong_graph_t(stream_view_, true); + pdhg_solver_.get_graph_all_non_major() = ping_pong_graph_t(stream_view_, true); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } From 4de79a06b3881218995952b3cb28d1c7048f2cd8 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 2 Jun 2026 09:20:04 +0200 Subject: [PATCH 20/47] fix timer, better jaccard --- cpp/src/branch_and_bound/branch_and_bound.cpp | 23 +- cpp/src/cuts/cuts.cpp | 5 +- cpp/src/cuts/cuts.hpp | 7 +- cpp/src/utilities/omp_helpers.hpp | 4 +- cut_gap_timing_stats.patch | 979 ++++++++++++++++++ 5 files changed, 1005 insertions(+), 13 deletions(-) create mode 100644 cut_gap_timing_stats.patch diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index cc1ba3b77c..2bf15f49ff 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2677,8 +2677,20 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut }; cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); - f_t cut_generation_start_time = tic(); - i_t cut_pool_size = 0; + f_t cut_generation_start_time = tic(); + auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { + if (settings_.benchmark_info_ptr == nullptr) { return; } + f_t cut_generation_time = toc(cut_generation_start_time); + if (force_time_limit_value || cut_generation_time > settings_.time_limit) { + cut_generation_time = settings_.time_limit; + } + if (cut_generation_time < static_cast(0.0)) { + cut_generation_time = static_cast(0.0); + } + settings_.benchmark_info_ptr->cut_generation_time_sec = + static_cast(cut_generation_time); + }; + i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { // LP relaxation 
is already integer-feasible — solved at the root @@ -2689,6 +2701,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut static_cast(compute_user_objective(original_lp_, root_objective_)); } set_solution_at_root(solution, cut_info); + publish_cut_generation_time(); signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return mip_status_t::OPTIMAL; @@ -2728,6 +2741,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } if (cut_pass_result.action == cut_pass_action_t::RETURN) { + publish_cut_generation_time(cut_pass_result.status == mip_status_t::TIME_LIMIT); signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return cut_pass_result.status; @@ -2768,10 +2782,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut // emit it alongside gap_closed_pct. Always set when the cut loop ran, // even if no cuts were added (the time still measures real work in // generate_cuts + score_cuts + dedup + LP resolves). - if (settings_.benchmark_info_ptr != nullptr) { - settings_.benchmark_info_ptr->cut_generation_time_sec = - static_cast(cut_generation_time); - } + publish_cut_generation_time(); if (cut_info.has_cuts()) { settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); settings_.log.printf("Cut pool size : %d\n", cut_pool_size); diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index f1a31f0536..8f612bbeef 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -5830,12 +5830,13 @@ void apply_cut_sweep_config(cut_pool_t& cut_pool, cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); break; case 3: - // 03_cousin_loose: looser Jaccard threshold (0.85 -> 0.95). Closer + // 03_cousin_loose: looser Jaccard threshold (0.85 -> 0.875). Allows + // 7/8 min-hash agreement to qualify as cousins when k=8. Closer // to no-filter behavior. 
Gap should match config 0 if the cousin // filter is mostly absorbing redundancy that the orthogonality // scan would catch anyway. cut_pool.set_clique_cousin_filter_enable(true); - cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.95)); + cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.875)); cut_pool.set_clique_cousin_minhash_k(8); cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); break; diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index fc4497056e..c4dea8dfee 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -329,7 +329,7 @@ class cut_pool_t { // higher-scoring representative (or, if no score was supplied, the // earlier-inserted one). // - // Defaults: jaccard_tau=0.95, k=8, enable=true, size_weight=0.0. + // Defaults: jaccard_tau=0.875, k=8, enable=true, size_weight=0.0. // These match "config 3 / cousin_loose" from the clique-sweep on // commit 0b04683b — the configuration that won the gap-closed-pct // comparison and was promoted to be the production default for the @@ -401,7 +401,7 @@ class cut_pool_t { std::vector> clique_support_minhash_; std::vector clique_cousin_score_; std::unordered_map> clique_cousin_buckets_; - f_t clique_cousin_jaccard_tau_{static_cast(0.95)}; + f_t clique_cousin_jaccard_tau_{static_cast(0.875)}; i_t clique_cousin_minhash_k_{8}; bool clique_cousin_filter_enable_{true}; // When > 0, the cousin filter's "score" used to pick a winner is @@ -439,7 +439,8 @@ class cut_pool_t { // (the cut_scoring final-version P2-4 baseline) // 2 cousin_strict cousin filter on, tau=0.70 (more aggressive // cousin removal — favors quantity reduction) -// 3 cousin_loose cousin filter on, tau=0.95 (closer to no-filter +// 3 cousin_loose cousin filter on, tau=0.875 (allows 7/8 min-hash +// agreement with k=8; still closer to no-filter // extreme — selection-stage absorbs cousins) // 4 cousin_size_tilt cousin filter on, tau=0.85, score = violation * // (1 + 0.5 * log2(1 + clique_size)) — picks the diff --git 
a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index 38f941ba00..337b52d83e 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -47,8 +47,8 @@ namespace cuopt { // or any object file that wasn't rebuilt against the new header. class omp_mutex_t { public: - omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } + omp_mutex_t(); + omp_mutex_t(omp_mutex_t&& other) noexcept; omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; diff --git a/cut_gap_timing_stats.patch b/cut_gap_timing_stats.patch new file mode 100644 index 0000000000..7530d47e46 --- /dev/null +++ b/cut_gap_timing_stats.patch @@ -0,0 +1,979 @@ +diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +new file mode 100644 +index 00000000..7f6826a5 +--- /dev/null ++++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +@@ -0,0 +1,476 @@ ++/* clang-format off */ ++/* ++ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ * SPDX-License-Identifier: Apache-2.0 ++ */ ++/* clang-format on */ ++ ++// MIPLIB2017 best-known objective ("optimum") lookup for the MIP ++// benchmark runner. Self-contained: no env vars, no external CSV. ++// ++// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 ++// instances). Of those, 232 have a known optimum and live in ++// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible ++// so the printer can label them clearly instead of returning "no opt". ++// ++// Lookup uses the basename without directory and stripped of ++// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So ++// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" ++// all hit the same entry. 
++// ++// Returns std::optional: nullopt means "instance is in our ++// benchmark set but infeasible" *or* "we don't have an entry for it". ++// is_known_infeasible() distinguishes the two. ++ ++#pragma once ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++namespace cuopt_bench { ++ ++// Strip directory prefix and any .mps/.lp suffix (with optional .gz), ++// then lower-case. Designed to match how MPS instance files are named ++// across MIPLIB downloads (case- and extension-insensitive). ++inline std::string normalize_instance_name(const std::string& raw) ++{ ++ std::string s = raw; ++ const auto slash = s.find_last_of("/\\"); ++ if (slash != std::string::npos) { s = s.substr(slash + 1); } ++ auto endswith = [&](const std::string& suf) { ++ if (s.size() < suf.size()) { return false; } ++ for (size_t i = 0; i < suf.size(); ++i) { ++ if (std::tolower(static_cast(s[s.size() - suf.size() + i])) != ++ std::tolower(static_cast(suf[i]))) { ++ return false; ++ } ++ } ++ return true; ++ }; ++ for (const char* suf : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) { ++ if (endswith(suf)) { ++ s.resize(s.size() - std::char_traits::length(suf)); ++ break; ++ } ++ } ++ for (char& c : s) { ++ c = static_cast(std::tolower(static_cast(c))); ++ } ++ return s; ++} ++ ++// MIPLIB2017 benchmark-set best-known objectives (n=232). Source: ++// https://miplib.zib.de "The Benchmark Set". Values are stored in the ++// double precision they were published at; unit tests should compare ++// with a tolerance of ~|opt|*1e-9 rather than exact equality. 
++inline const std::unordered_map& kBenchmarkOptima() ++{ ++ static const std::unordered_map kOptima = { ++ {"30n20b8", 302}, ++ {"50v-10", 3311.1799841000002}, ++ {"academictimetablesmall", 0}, ++ {"air05", 26374}, ++ {"app1-1", -3}, ++ {"app1-2", -41}, ++ {"assign1-5-8", 211.99999999999801}, ++ {"atlanta-ip", 90.009878614000002}, ++ {"b1c1s1", 24544.25}, ++ {"bab2", -357544.31150000001}, ++ {"bab6", -284248.23070000007}, ++ {"beasleyc3", 753.9999999999128}, ++ {"binkar10_1", 6742.1998835000004}, ++ {"blp-ar98", 6205.2147103999996}, ++ {"blp-ic98", 4491.4475839500001}, ++ {"bnatt400", 1}, ++ {"bppc4-08", 53}, ++ {"brazil3", 24}, ++ {"buildingenergy", 33283.853236000003}, ++ {"cbs-cta", 0}, ++ {"chromaticindex1024-7", 4}, ++ {"chromaticindex512-7", 4}, ++ {"cmflsp50-24-8-8", 55789389.886}, ++ {"cms750_4", 252}, ++ {"co-100", 2639942.0600000001}, ++ {"cod105", -12}, ++ {"comp07-2idx", 6}, ++ {"comp21-2idx", 74}, ++ {"cost266-uue", 25148940.55999998}, ++ {"cryptanalysiskb128n5obj16", 0}, ++ {"csched007", 350.99999999999551}, ++ {"csched008", 173}, ++ {"cvs16r128-89", -97}, ++ {"dano3_3", 576.34463302999995}, ++ {"dano3_5", 576.9249159565619}, ++ {"decomp2", -160}, ++ {"drayage-100-23", 103333.87407000001}, ++ {"drayage-25-23", 101282.647018}, ++ {"dws008-01", 37412.604587945083}, ++ {"eil33-2", 934.007915999999}, ++ {"eila101-2", 880.92010799999991}, ++ {"enlight_hard", 37}, ++ {"ex10", 100}, ++ {"ex9", 81}, ++ {"exp-1-500-5-5", 65887}, ++ {"fast0507", 174}, ++ {"fastxgemm-n2r6s0t2", 230}, ++ {"fhnw-binpack4-48", 0}, ++ {"fiball", 138}, ++ {"gen-ip002", -4783.7333920000001}, ++ {"gen-ip054", 6840.9656417899996}, ++ {"germanrr", 47095869.648999996}, ++ {"gfd-schedulen180f7d50m30k18", 1}, ++ {"glass-sc", 23}, ++ {"glass4", 1200012599.972384}, ++ {"gmu-35-40", -2406733.3687999998}, ++ {"gmu-35-50", -2607958.3300000001}, ++ {"graph20-20-1rand", -9}, ++ {"graphdraw-domain", 19685.999975500381}, ++ {"h80x6320d", 6382.0990482459993}, ++ {"highschool1-aigio", 0}, ++ 
{"hypothyroid-k1", -2851}, ++ {"ic97_potential", 3941.9999309022501}, ++ {"icir97_tension", 6375}, ++ {"irish-electricity", 3723497.5913959998}, ++ {"irp", 12159.492835396981}, ++ {"istanbul-no-cutoff", 204.08170701}, ++ {"k1mushroom", -3288}, ++ {"lectsched-5-obj", 24}, ++ {"leo1", 404227536.16000003}, ++ {"leo2", 404077441.12}, ++ {"lotsize", 1480195}, ++ {"mad", 0.026800000000000001}, ++ {"map10", -495}, ++ {"map16715-04", -111}, ++ {"markshare2", 1}, ++ {"markshare_4_0", 1}, ++ {"mas74", 11801.185719999999}, ++ {"mas76", 40005.053989999993}, ++ {"mc11", 11688.99999999966}, ++ {"mcsched", 211913}, ++ {"mik-250-20-75-4", -52301}, ++ {"milo-v12-6-r2-40-1", 326481.14282799}, ++ {"momentum1", 109143.4935}, ++ {"mushroom-best", 0.055333761199999998}, ++ {"mzzv11", -21718}, ++ {"mzzv42z", -20540}, ++ {"n2seq36q", 52200}, ++ {"n3div36", 130800}, ++ {"n5-3", 8104.9999999939992}, ++ {"neos-1122047", 161}, ++ {"neos-1171448", -309}, ++ {"neos-1171737", -195}, ++ {"neos-1354092", 46}, ++ {"neos-1445765", -17783}, ++ {"neos-1456979", 176}, ++ {"neos-1582420", 90.999999999999957}, ++ {"neos-2657525-crna", 1.810748}, ++ {"neos-2746589-doon", 2008.1999999999989}, ++ {"neos-2978193-inde", -2.3880616899999998}, ++ {"neos-2987310-joes", -607702988.29999995}, ++ {"neos-3004026-krka", 0}, ++ {"neos-3024952-loue", 26756}, ++ {"neos-3046615-murg", 1600}, ++ {"neos-3083819-nubu", 6307996}, ++ {"neos-3216931-puriri", 71320}, ++ {"neos-3381206-awhea", 453}, ++ {"neos-3402294-bobin", 0.067249999999999491}, ++ {"neos-3555904-turama", -34.700000000000003}, ++ {"neos-3627168-kasai", 988585.61999999976}, ++ {"neos-3656078-kumeu", -13172.200000000001}, ++ {"neos-3754480-nidda", 12941.73838561778}, ++ {"neos-4300652-rahue", 2.1415999999999999}, ++ {"neos-4338804-snowy", 1471}, ++ {"neos-4387871-tavua", 33.384729927000002}, ++ {"neos-4413714-turia", 45.370167019999798}, ++ {"neos-4532248-waihi", 61.599999999999987}, ++ {"neos-4647030-tutaki", 27265.705999999958}, ++ {"neos-4722843-widden", 
25009.662227000001}, ++ {"neos-4738912-atrato", 283627956.59500003}, ++ {"neos-4763324-toguru", 1613.0388458499999}, ++ {"neos-4954672-berkel", 2612710}, ++ {"neos-5049753-cuanza", 561.99999716889999}, ++ {"neos-5052403-cygnet", 182}, ++ {"neos-5093327-huahum", 6259.9999971258949}, ++ {"neos-5104907-jarama", 935}, ++ {"neos-5107597-kakapo", 3644.9999999995198}, ++ {"neos-5114902-kasavu", 655}, ++ {"neos-5188808-nattai", 0.110283622999984}, ++ {"neos-5195221-niemur", 0.0038354325999999999}, ++ {"neos-631710", 203}, ++ {"neos-662469", 184379.99999999991}, ++ {"neos-787933", 30}, ++ {"neos-827175", 112.00152}, ++ {"neos-848589", 2351.40309999697}, ++ {"neos-860300", 3200.9999999999982}, ++ {"neos-873061", 113.6562385063}, ++ {"neos-911970", 54.759999999999998}, ++ {"neos-933966", 318}, ++ {"neos-950242", 4}, ++ {"neos-957323", -237.75668150000001}, ++ {"neos-960392", -238}, ++ {"neos17", 0.1500025774}, ++ {"neos5", 15}, ++ {"neos8", -3719}, ++ {"net12", 214}, ++ {"netdiversion", 242}, ++ {"nexp-150-20-8-5", 231}, ++ {"ns1116954", 0}, ++ {"ns1208400", 2}, ++ {"ns1644855", -1524.3333333333301}, ++ {"ns1760995", -549.21438505000003}, ++ {"ns1830653", 20622}, ++ {"ns1952667", 0}, ++ {"nu25-pr12", 53904.999999999993}, ++ {"nursesched-medium-hint03", 115}, ++ {"nursesched-sprint02", 57.999999999999993}, ++ {"nw04", 16862}, ++ {"opm2-z10-s4", -33269}, ++ {"p200x1188c", 15078}, ++ {"peg-solitaire-a3", 1}, ++ {"pg", -8674.3426071199992}, ++ {"pg5_34", -14339.353450000001}, ++ {"physiciansched3-3", 2623271.3266670001}, ++ {"physiciansched6-2", 49324}, ++ {"piperout-08", 125054.9999999999}, ++ {"piperout-27", 8123.9999999999727}, ++ {"pk1", 11}, ++ {"proteindesign121hz512p9", 1473}, ++ {"proteindesign122trx11p8", 1747}, ++ {"qap10", 339.99999999838712}, ++ {"radiationm18-12-05", 17566}, ++ {"radiationm40-10-02", 155328}, ++ {"rail01", -70.569964299999995}, ++ {"rail02", -200.44990770000001}, ++ {"rail507", 174}, ++ {"ran14x18-disj-8", 3712}, ++ {"rd-rplusc-21", 165395.275295}, 
++ {"reblock115", -36800603.233199999}, ++ {"rmatr100-p10", 423}, ++ {"rmatr200-p5", 4521}, ++ {"roci-4-11", -6020203}, ++ {"rocii-5-11", -6.6755047315380001}, ++ {"rococob10-011000", 19449}, ++ {"rocococ10-001000", 11460}, ++ {"roi2alpha3n4", -63.208495030000002}, ++ {"roi5alpha10n8", -52.322274350999997}, ++ {"roll3000", 12889.999991999999}, ++ {"s100", -0.16972352705829999}, ++ {"s250r10", -0.17178048342319999}, ++ {"satellites2-40", -19}, ++ {"satellites2-60-fs", -19.000000000099998}, ++ {"savsched1", 3217.6999999999998}, ++ {"sct2", -230.9891623}, ++ {"seymour", 423}, ++ {"seymour1", 410.76370138999999}, ++ {"sing326", 7753674.8537600003}, ++ {"sing44", 8128831.1771999998}, ++ {"snp-02-004-104", 586803238.65672886}, ++ {"sorrell3", -16}, ++ {"sp150x300d", 69}, ++ {"sp97ar", 660705645.75899994}, ++ {"sp98ar", 529740623.19999999}, ++ {"splice1k1", -394}, ++ {"square41", 15}, ++ {"square47", 15.9999999997877}, ++ {"supportcase10", 7}, ++ {"supportcase12", -7559.5330538170001}, ++ {"supportcase18", 48}, ++ {"supportcase19", 12677205.999920519}, ++ {"supportcase22", 110}, // best-known marked "*" in MIPLIB2017 (not proven optimal) ++ {"supportcase26", 1745.1238129999999}, ++ {"supportcase33", -345}, ++ {"supportcase40", 24256.3122898}, ++ {"supportcase42", 7.7586307222700004}, ++ {"supportcase6", 51906.477370000001}, ++ {"supportcase7", -1132.2231770000001}, ++ {"swath1", 379.07129574999999}, ++ {"swath3", 397.76134365000001}, ++ {"tbfp-network", 24.163194440000002}, ++ {"thor50dday", 40417}, ++ {"timtab1", 764771.99999977998}, ++ {"tr12-30", 130595.9999999999}, ++ {"traininstance2", 71820}, ++ {"traininstance6", 28290}, ++ {"trento1", 5189487}, ++ {"triptim1", 22.868099999999899}, ++ {"uccase12", 11507.4050616}, ++ {"uccase9", 10993.131409}, ++ {"uct-subprob", 314}, ++ {"unitcal_7", 19635558.243999999}, ++ {"var-smallemery-m6j6", -149.37501}, ++ {"wachplan", -8}, ++ }; ++ return kOptima; ++} ++ ++// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7). 
++// Solver should return Infeasible status; we use this set to label ++// the printer line with status_extra=KnownInfeasible so a downstream ++// "did the run agree with MIPLIB?" check can be a single grep. ++inline const std::unordered_set& kBenchmarkInfeasible() ++{ ++ static const std::unordered_set kInfeas = { ++ "bnatt500", ++ "cryptanalysiskb128n5obj14", ++ "fhnw-binpack4-4", ++ "neos-2075418-temuka", ++ "neos-3402454-bohle", ++ "neos-3988577-wolgan", ++ "neos859080", ++ }; ++ return kInfeas; ++} ++ ++inline std::optional lookup_miplib_optimum(const std::string& filename) ++{ ++ const auto& m = kBenchmarkOptima(); ++ const auto it = m.find(normalize_instance_name(filename)); ++ if (it == m.end()) { return std::nullopt; } ++ return it->second; ++} ++ ++inline bool is_known_infeasible(const std::string& filename) ++{ ++ return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; ++} ++ ++// Single grep-friendly per-instance line. Emits to stdout via printf ++// so the output survives unconditionally regardless of the project's ++// settings_.log routing (NFS-backed log files, gated debug levels) ++// and is trivially cross-compared between cuts-config branches. ++// ++// "Gap closed" is reported relative to the *root LP after cuts*, not ++// relative to the final dual bound at the end of solve. The standard ++// MIP cutting-plane definition is: ++// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) ++// / (opt - root_lp_no_cuts) ++// On a minimization-form problem all three differences are >= 0 and ++// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the ++// formula also holds verbatim for maximization (numerator and ++// denominator flip sign together). NaN is emitted when either root ++// bound was not published (e.g. B&B never entered the cut loop). 
++// ++// Other field semantics (signed for minimization): ++// abs_root_dual_gap = opt - root_lp_with_cuts ++// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) ++// abs_primal_gap = primal - opt ++// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) ++// ++// The line still also reports `final_dual` (solver's bound at the end ++// of solve) so the new metric and the previous one can be compared ++// without re-running. ++// ++// "TBD" is emitted when the optimum is unknown so downstream parsers ++// can join lines on (instance, field) without dropping rows. "NaN" is ++// emitted for root_lp_* when the value is unavailable. ++template ++inline void print_miplib_gap_stat( ++ const std::string& filename, ++ const Solution& solution, ++ double solve_time_seconds, ++ const std::string& termination_status, ++ double root_lp_no_cuts, ++ double root_lp_with_cuts, ++ double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) ++{ ++ const std::string norm = normalize_instance_name(filename); ++ const auto opt = lookup_miplib_optimum(filename); ++ const double primal = solution.get_objective_value(); ++ const double final_dual = solution.get_solution_bound(); ++ const double mip_gap = solution.get_mip_gap(); ++ const bool primal_finite = std::isfinite(primal); ++ const bool root0_finite = std::isfinite(root_lp_no_cuts); ++ const bool root1_finite = std::isfinite(root_lp_with_cuts); ++ constexpr double NaN = std::numeric_limits::quiet_NaN(); ++ ++ if (is_known_infeasible(filename)) { ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " ++ "abs_primal_gap=NA rel_primal_gap_pct=NA " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ 
termination_status.c_str()); ++ } else if (opt.has_value()) { ++ const double o = *opt; ++ const double denom = std::max(std::abs(o), 1.0); ++ ++ const double abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; ++ const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; ++ ++ // Classical gap-closed-by-cuts. Skip when either root bound is ++ // missing, when the LP relaxation already proves optimality ++ // (denominator = opt - root_lp_no_cuts ~= 0), or when the bound ++ // moved the wrong way (numerical noise in either direction). ++ double gap_closed_pct = NaN; ++ if (root0_finite && root1_finite) { ++ const double total_gap = o - root_lp_no_cuts; ++ if (std::abs(total_gap) > 1e-12 * denom) { ++ gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; ++ } else { ++ // LP relaxation already (numerically) optimal -> 100% closed ++ // by definition. Avoid /0 noise. ++ gap_closed_pct = 100.0; ++ } ++ } ++ ++ const double abs_pgap = primal_finite ? (primal - o) : NaN; ++ const double rel_pgap_pct = primal_finite ? 
100.0 * abs_pgap / denom : NaN; ++ ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " ++ "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ o, ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ abs_root_dgap, ++ rel_root_dgap_pct, ++ gap_closed_pct, ++ abs_pgap, ++ rel_pgap_pct, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ termination_status.c_str()); ++ } else { ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " ++ "abs_primal_gap=TBD rel_primal_gap_pct=TBD " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ termination_status.c_str()); ++ } ++ std::fflush(stdout); ++} ++ ++} // namespace cuopt_bench +diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp +index f3554369..bed4e453 100644 +--- a/benchmarks/linear_programming/cuopt/run_mip.cpp ++++ b/benchmarks/linear_programming/cuopt/run_mip.cpp +@@ -6,6 +6,7 @@ + /* clang-format on */ + #include "initial_solution_reader.hpp" + #include "mip_test_instances.hpp" ++#include "miplib2017_optima.hpp" + + #include + #include +@@ -23,15 +24,20 @@ + #include + #include + ++#include + #include + #include + #include + #include + #include ++#include + #include ++#include + #include ++#include + #include + #include ++#include + #include + #include + #include +@@ -239,6 +245,43 @@ int run_single_file(std::string file_path, + } else { + CUOPT_LOG_INFO("%s: no solution 
found", base_filename.c_str()); + } ++ ++ // Per-instance "gap closed to optimum" stat. Emits a single ++ // grep-friendly "MIPLIBGapStat ..." line via printf so cross-branch ++ // comparison is just `grep '^MIPLIBGapStat' branchA.log` then diff. ++ // Optima are looked up from the in-source MIPLIB2017 benchmark-set ++ // table (miplib2017_optima.hpp); unknown instances emit "opt=TBD" ++ // and infeasibility-flagged instances emit "opt=Infeasible". ++ { ++ const double _gap_seconds = std::chrono::duration_cast( ++ std::chrono::high_resolution_clock::now() - start_run_solver) ++ .count() / ++ 1000.0; ++ std::string _status_str; ++ switch (solution.get_termination_status()) { ++ case cuopt::linear_programming::mip_termination_status_t::Optimal: ++ _status_str = "Optimal"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::FeasibleFound: ++ _status_str = "FeasibleFound"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::TimeLimit: ++ _status_str = "TimeLimit"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::Infeasible: ++ _status_str = "Infeasible"; ++ break; ++ default: _status_str = "Other"; break; ++ } ++ cuopt_bench::print_miplib_gap_stat(base_filename, ++ solution, ++ _gap_seconds, ++ _status_str, ++ benchmark_info.root_lp_no_cuts, ++ benchmark_info.root_lp_with_cuts, ++ benchmark_info.cut_generation_time_sec); ++ } ++ + std::stringstream ss; + int decimal_places = 2; + double mip_gap = solution.get_mip_gap(); +@@ -293,6 +336,157 @@ void run_single_file_mp(std::string file_path, + exit(sol_found); + } + ++// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the ++// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it ++// requires no extra dependencies (NVML / hwloc). 
++static std::vector get_gpu_numa_nodes(int n_gpus) ++{ ++ std::vector nodes(static_cast(std::max(0, n_gpus)), -1); ++ for (int i = 0; i < n_gpus; ++i) { ++ char pci_id[32] = {0}; ++ if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } ++ for (char* c = pci_id; *c; ++c) { ++ *c = static_cast(std::tolower(static_cast(*c))); ++ } ++ std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); ++ if (!f) { continue; } ++ int node = -1; ++ f >> node; ++ nodes[i] = node; ++ } ++ return nodes; ++} ++ ++// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. ++// Returns empty on any read or parse failure. ++static std::vector read_numa_cpulist(int numa_node) ++{ ++ std::vector cpus; ++ if (numa_node < 0) { return cpus; } ++ std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + ++ "/cpulist"); ++ if (!f) { return cpus; } ++ std::string line; ++ if (!std::getline(f, line)) { return cpus; } ++ size_t pos = 0; ++ while (pos < line.size()) { ++ const size_t comma = line.find(',', pos); ++ const size_t end = (comma == std::string::npos) ? line.size() : comma; ++ const std::string range = line.substr(pos, end - pos); ++ if (!range.empty()) { ++ try { ++ const size_t dash = range.find('-'); ++ const int lo = std::stoi(range.substr(0, dash)); ++ const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); ++ for (int c = lo; c <= hi; ++c) { ++ cpus.push_back(c); ++ } ++ } catch (...) { ++ return std::vector{}; ++ } ++ } ++ if (comma == std::string::npos) { break; } ++ pos = comma + 1; ++ } ++ std::sort(cpus.begin(), cpus.end()); ++ return cpus; ++} ++ ++// Bind the current process to a fair partition of the inherited CPU mask, ++// preferring CPUs on the same NUMA node as the GPU. Returns the actual ++// number of CPUs the child was pinned to, or -1 if the partition could not ++// be applied (caller must then choose a fallback). 
++// ++// Algorithm: ++// 1. Read inherited (parent) affinity mask -> visible_cpus. ++// 2. Look up each GPU's NUMA node via PCI BDF. ++// 3. If this GPU's NUMA node is known and has visible CPUs, partition ++// that NUMA node's CPUs among the GPUs that landed on the same node ++// (siblings, ordered by gpu_id). ++// 4. Otherwise fall back to a contiguous global partition of visible_cpus. ++// ++// The function always emits a single stdout line per child summarising the ++// partition (NUMA-local vs contiguous-fallback), so the parent's log isn't ++// interleaved per-CPU across n_gpus children. ++int bind_process_to_cpu_partition(int gpu_id, int n_gpus) ++{ ++ if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } ++ ++ cpu_set_t parent_mask; ++ CPU_ZERO(&parent_mask); ++ if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { ++ perror("sched_getaffinity"); ++ return -1; ++ } ++ ++ std::vector visible_cpus; ++ for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { ++ if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } ++ } ++ if (visible_cpus.empty()) { return -1; } ++ std::sort(visible_cpus.begin(), visible_cpus.end()); ++ ++ std::vector chosen_cpus; ++ bool numa_aware = false; ++ ++ const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); ++ const int my_numa = gpu_numa_nodes[gpu_id]; ++ if (my_numa >= 0) { ++ std::vector siblings; ++ for (int i = 0; i < n_gpus; ++i) { ++ if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } ++ } ++ std::vector numa_cpus = read_numa_cpulist(my_numa); ++ if (!numa_cpus.empty() && !siblings.empty()) { ++ std::vector local_visible; ++ std::set_intersection(visible_cpus.begin(), ++ visible_cpus.end(), ++ numa_cpus.begin(), ++ numa_cpus.end(), ++ std::back_inserter(local_visible)); ++ if (!local_visible.empty()) { ++ const int siblings_count = static_cast(siblings.size()); ++ const int my_idx = ++ static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); ++ const 
int local_per_gpu = ++ std::max(1, static_cast(local_visible.size()) / siblings_count); ++ const int s = my_idx * local_per_gpu; ++ const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); ++ if (s < e) { ++ chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); ++ numa_aware = true; ++ } ++ } ++ } ++ } ++ ++ if (!numa_aware) { ++ const int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); ++ const int start = gpu_id * cpus_per_gpu; ++ if (start >= static_cast(visible_cpus.size())) { return -1; } ++ const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); ++ chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); ++ } ++ ++ cpu_set_t child_mask; ++ CPU_ZERO(&child_mask); ++ std::ostringstream oss; ++ oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" ++ << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") ++ << "):"; ++ for (int c : chosen_cpus) { ++ CPU_SET(c, &child_mask); ++ oss << ' ' << c; ++ } ++ std::cout << oss.str() << std::endl; ++ ++ if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { ++ perror("sched_setaffinity"); ++ return -1; ++ } ++ return static_cast(chosen_cpus.size()); ++} ++ + void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, + std::unordered_map& pid_file_map, + std::queue& gpu_queue) +@@ -417,6 +611,11 @@ int main(int argc, char* argv[]) + int reliability_branching = program.get("--reliability-branching"); + bool deterministic = program.get("--determinism"); + ++ if (run_dir && program.is_used("--num-cpu-threads")) { ++ std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " ++ "thread count is set per process from the bound CPU partition.\n"; ++ } ++ + if (num_cpu_threads < 0) { + num_cpu_threads = omp_get_max_threads() / n_gpus; + // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); +@@ -502,6 +701,18 @@ int main(int 
argc, char* argv[]) + } + if (sys_pid == 0) { + RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); ++ int assigned_cpus = bind_process_to_cpu_partition(gpu_id, n_gpus); ++ if (assigned_cpus <= 0) { ++ assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); ++ std::cerr << "[gpu " << gpu_id << "] CPU pin failed; falling back to " ++ << assigned_cpus << " threads\n"; ++ } ++ // Directory-run mode owns the thread count: --num-cpu-threads is ++ // intentionally ignored here so per-process thread budgets match ++ // the bound CPU partition. The single-run path below still ++ // honours --num-cpu-threads. ++ omp_set_num_threads(assigned_cpus); ++ num_cpu_threads = assigned_cpus; + run_single_file_mp(file_name, + gpu_id, + batch_num, +@@ -534,31 +745,36 @@ int main(int argc, char* argv[]) + merge_result_files(out_dir, result_file, n_gpus, batch_num); + } else { + auto memory_resource = make_async(); ++ auto run_single = [&]() { ++ run_single_file(path, ++ 0, ++ 0, ++ n_gpus, ++ out_dir, ++ initial_solution_file, ++ heuristics_only, ++ num_cpu_threads, ++ write_log_file, ++ log_to_console, ++ reliability_branching, ++ time_limit, ++ work_limit, ++ deterministic); ++ }; + if (memory_limit > 0) { + auto limiting_adaptor = + rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); + rmm::mr::set_current_device_resource(limiting_adaptor); ++ run_single(); + } else if (track_allocations) { + rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, + /*capture_stacks=*/true); + rmm::mr::set_current_device_resource(tracking_adaptor); ++ run_single(); + } else { + rmm::mr::set_current_device_resource(memory_resource); ++ run_single(); + } +- run_single_file(path, +- 0, +- 0, +- n_gpus, +- out_dir, +- initial_solution_file, +- heuristics_only, +- num_cpu_threads, +- write_log_file, +- log_to_console, +- reliability_branching, +- time_limit, +- work_limit, +- deterministic); + } + + return 0; +diff --git 
a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +index 685b1360..b2231b1a 100644 +--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp ++++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +@@ -26,6 +26,27 @@ struct benchmark_info_t { + double last_improvement_of_best_feasible = 0; + double last_improvement_after_recombination = 0; + double objective_of_initial_population = std::numeric_limits::max(); ++ // LP relaxation objective at the root node, BEFORE any cuts have been ++ // added. quiet_NaN() means "B&B did not run cut passes / value was ++ // never written" — distinguishes it from a legitimate 0.0. ++ double root_lp_no_cuts = std::numeric_limits::quiet_NaN(); ++ // LP relaxation objective at the root node, AFTER the full cut loop ++ // (final pass result). The dual gap "by cuts at the root" is then ++ // gap_after_cuts = opt - root_lp_with_cuts (in B&B's solver ++ // objective sense) ++ // and the classical "gap closed by cuts" metric is ++ // gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) ++ // / (opt - root_lp_no_cuts). ++ // quiet_NaN() means "B&B did not finish the cut loop / value not written". ++ double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); ++ ++ // Wall-clock time spent inside the root-node cut generation loop ++ // (sum of generate_cuts + score_cuts + check_for_duplicate_cuts + ++ // get_best_cuts + add_cuts + post-cut LP resolves), in seconds. ++ // Published by branch_and_bound.cpp::solve() at the same point that ++ // root_lp_with_cuts is finalised. quiet_NaN() means "cut loop did ++ // not run / value never written". 
++ double cut_generation_time_sec = std::numeric_limits::quiet_NaN(); + }; + + // Forward declare solver_settings_t for friend class +diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp +index 0222ad6f..96d05d6b 100644 +--- a/cpp/src/branch_and_bound/branch_and_bound.cpp ++++ b/cpp/src/branch_and_bound/branch_and_bound.cpp +@@ -11,6 +11,8 @@ + #include + #include + ++#include // benchmark_info_t ++ + #include + #include + #include +@@ -2361,6 +2363,15 @@ auto branch_and_bound_t::do_cut_pass( + } + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + ++ // Publish after every successful post-cut LP resolve so any ++ // early-exit path below (NUMERICAL, TIME_LIMIT, gap-tolerance ++ // exit) still leaves benchmark_info->root_lp_with_cuts pointing ++ // at the most recent valid LP-with-cuts objective. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } ++ + f_t remove_cuts_start_time = tic(); + mutex_original_lp_.lock(); + remove_cuts(original_lp_, +@@ -2479,7 +2490,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + user_problem_t problem_copy = original_problem_; + timer_t timer(std::numeric_limits::infinity()); + detail::find_initial_cliques( +- problem_copy, tolerances_for_clique, &clique_table_, timer, false, clique_signal); ++ problem_copy, tolerances_for_clique, &clique_table_, timer, clique_signal); + } + } + +@@ -2588,6 +2599,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + cut_info_t cut_info; + + if (num_fractional == 0) { ++ // LP relaxation already integer-feasible — solved at the root with ++ // no cuts. Publish both bounds equal to the root LP value so the ++ // gap-closed-by-cuts line still has a finite, meaningful entry ++ // (the printer reports 100% closed when total integrality gap ~= 0). 
++ if (settings_.benchmark_info_ptr != nullptr) { ++ const double v = static_cast(compute_user_objective(original_lp_, root_objective_)); ++ settings_.benchmark_info_ptr->root_lp_no_cuts = v; ++ settings_.benchmark_info_ptr->root_lp_with_cuts = v; ++ } + set_solution_at_root(solution, cut_info); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) +@@ -2624,6 +2644,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + f_t last_objective = root_objective_; + f_t root_relax_objective = root_objective_; + ++ // Publish the no-cuts root LP value once. The with-cuts companion is ++ // published below after the cut loop terminates. Both go to the ++ // benchmark_info_t so callers (run_mip.cpp) can compute ++ // gap-closed-by-cuts without instrumenting the cut loop directly. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_no_cuts = ++ static_cast(compute_user_objective(original_lp_, root_relax_objective)); ++ } ++ + constexpr bool enable_root_cut_cpufj = true; + std::unique_ptr> root_cut_cpufj_task; + auto root_cut_cpufj_improvement_callback = +@@ -2649,10 +2678,29 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); + + f_t cut_generation_start_time = tic(); ++ auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { ++ if (settings_.benchmark_info_ptr == nullptr) { return; } ++ f_t cut_generation_time = toc(cut_generation_start_time); ++ if (force_time_limit_value || cut_generation_time > settings_.time_limit) { ++ cut_generation_time = settings_.time_limit; ++ } ++ if (cut_generation_time < static_cast(0.0)) { ++ cut_generation_time = static_cast(0.0); ++ } ++ settings_.benchmark_info_ptr->cut_generation_time_sec = static_cast(cut_generation_time); ++ }; + i_t cut_pool_size = 0; + for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; 
cut_pass++) { + if (num_fractional == 0) { ++ // LP relaxation is already integer-feasible — solved at the root ++ // by the cuts added so far (possibly zero). Publish the with-cuts ++ // value so the gap-closed line still has a non-NaN dual bound. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } + set_solution_at_root(solution, cut_info); ++ publish_cut_generation_time(); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) + return mip_status_t::OPTIMAL; +@@ -2692,6 +2740,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + } + + if (cut_pass_result.action == cut_pass_action_t::RETURN) { ++ publish_cut_generation_time(cut_pass_result.status == mip_status_t::TIME_LIMIT); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) + return cut_pass_result.status; +@@ -2714,8 +2763,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + } + } + ++ // Cut loop terminated (max_cut_passes hit, num_fractional==0 break, ++ // negligible-objective-change break, or time-limit break). Publish ++ // the post-cuts root LP value so benchmark drivers can compute ++ // gap-closed-by-cuts. We use compute_user_objective to flip the sign ++ // back into user space when the LP was dualized, matching the ++ // convention used for root_lp_no_cuts above and for the per-pass ++ // "Bound" column in the search log. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } ++ + print_cut_info(settings_, cut_info); + f_t cut_generation_time = toc(cut_generation_start_time); ++ // Publish the cut generation wall time so MIPLIBGapStat / run_mip can ++ // emit it alongside gap_closed_pct. 
Always set when the cut loop ran, ++ // even if no cuts were added (the time still measures real work in ++ // generate_cuts + score_cuts + dedup + LP resolves). ++ publish_cut_generation_time(); + if (cut_info.has_cuts()) { + settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); + settings_.log.printf("Cut pool size : %d\n", cut_pool_size); +diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu +index c25ade0c..712be213 100644 +--- a/cpp/src/mip_heuristics/solver.cu ++++ b/cpp/src/mip_heuristics/solver.cu +@@ -377,6 +377,9 @@ solution_t mip_solver_t::run_solver() + context.settings.strong_chvatal_gomory_cuts; + branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; + branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; ++ // Forward the run-level benchmark_info_t so B&B can publish root LP ++ // bounds (before / after cuts) for gap-closed-by-cuts measurement. ++ branch_and_bound_settings.benchmark_info_ptr = context.settings.benchmark_info_ptr; + branch_and_bound_settings.mip_batch_pdlp_strong_branching = + context.settings.mip_batch_pdlp_strong_branching; + branch_and_bound_settings.mip_batch_pdlp_reliability_branching = From a17ffa38d8b1570e81bf31ba9a5d740807489c7c Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 2 Jun 2026 09:26:37 +0200 Subject: [PATCH 21/47] simplify comments --- cpp/src/branch_and_bound/branch_and_bound.cpp | 13 +--- cpp/src/cuts/cuts.cpp | 59 ++++-------------- cpp/src/cuts/cuts.hpp | 61 +++++-------------- cut_gap_timing_stats.patch | 31 ++++------ 4 files changed, 43 insertions(+), 121 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 2bf15f49ff..5420b88221 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2764,13 +2764,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& 
solut } } - // Cut loop terminated (max_cut_passes hit, num_fractional==0 break, - // negligible-objective-change break, or time-limit break). Publish - // the post-cuts root LP value so benchmark drivers can compute - // gap-closed-by-cuts. We use compute_user_objective to flip the sign - // back into user space when the LP was dualized, matching the - // convention used for root_lp_no_cuts above and for the per-pass - // "Bound" column in the search log. + // Publish the post-cuts root LP value. if (settings_.benchmark_info_ptr != nullptr) { settings_.benchmark_info_ptr->root_lp_with_cuts = static_cast(compute_user_objective(original_lp_, root_objective_)); @@ -2778,10 +2772,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut print_cut_info(settings_, cut_info); f_t cut_generation_time = toc(cut_generation_start_time); - // Publish the cut generation wall time so MIPLIBGapStat / run_mip can - // emit it alongside gap_closed_pct. Always set when the cut loop ran, - // even if no cuts were added (the time still measures real work in - // generate_cuts + score_cuts + dedup + LP resolves). + // Publish cut-generation time for reporting. publish_cut_generation_time(); if (cut_info.has_cuts()) { settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 8f612bbeef..fc0329817e 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -610,20 +610,12 @@ void cut_pool_t::add_cut(cut_type_t cut_type, return; } - // P2-4 at-insert cousin filter for the clique-cut family. Bron-Kerbosch - // emits all maximal cliques, so dense conflict graphs can produce - // hundreds of "cousin" cliques per round whose support sets agree in - // |k - 1| of |k| vertices. The selection-stage orthogonality scan - // catches them only after each cousin has paid the full insert + dedup - // + score cost. 
The cousin filter intercepts at insert: we estimate - // Jaccard via min-hash and, on a collision >= clique_cousin_jaccard_tau_ - // with an existing pool entry, keep the higher-scoring representative. + // At insert time, use min-hash to detect similar clique cuts and keep + // one representative when estimated Jaccard passes the threshold. std::vector new_sketch; i_t cousin_replace_row = -1; bool cousin_invariant_path = false; - // Apply the size-tilt boost to the caller's score so larger cliques win - // ties on cousin replacement (more variables covered = more constraint - // strength; a proxy for "integer support" since clique vars are 0-1). + // Optional size tilt so larger cliques can win close score comparisons. f_t effective_score = cut_score; if (effective_score >= static_cast(0.0) && clique_cousin_size_weight_ > static_cast(0.0) && cut_type == cut_type_t::CLIQUE) { @@ -643,10 +635,7 @@ void cut_pool_t::add_cut(cut_type_t cut_type, if (bucket_it != clique_cousin_buckets_.end()) { const i_t pool_size = cut_storage_.m; const i_t k = clique_cousin_minhash_k_; - // Walk the bucket, computing min-hash agreement with each peer. - // Bucket sizes are O(distinct max-clique families per round) so - // this loop is short on every realistic instance even when the - // pool is large. + // Compare the new sketch with peers in the same bucket. auto& bucket_rows = bucket_it->second; for (size_t b = 0; b < bucket_rows.size(); b++) { const i_t row = bucket_rows[b]; @@ -672,18 +661,10 @@ void cut_pool_t::add_cut(cut_type_t cut_type, cousin_drops_++; return; } - // New cut beats the existing representative. We "soft-replace": - // clear the loser's sketch so future cousins don't anchor against - // it (restoring the bucket invariant for new inserts), and - // reroute the bucket entry to the new row below. 
The loser stays - // in cut_storage_ for now and will be filtered by the standard - // orthogonality scan in score_cuts() — main_baselin has no - // mid-pass eviction primitive, and adding one would invalidate - // the per-pass cut_pool_size accounting. + // Soft-replace: redirect bucket entry to the new row and clear + // the old sketch so future inserts ignore the old representative. cousin_replace_row = row; - // Replace at most one peer per insert; a transitive cousin of - // the loser at the same bucket is filtered next time. Matches - // the SCIP / Mops "pairwise" family invariant. + // Replace at most one peer per insert. break; } } @@ -5802,20 +5783,15 @@ void apply_cut_sweep_config(cut_pool_t& cut_pool, config_id = 0; } - // Defaults match cut_pool_t's initializers: cousin filter OFF, tau=0.85, - // k=8, size_weight=0.0. Each case below documents what it tweaks. + // Defaults come from cut_pool_t initializers. Each case overrides + // only the needed cousin-filter parameters. switch (config_id) { case 0: - // 00_baseline_no_cousin: clique algorithmic changes only (8f2cf00a). - // Cousin filter disabled — isolates the impact of the - // build_clique_cut two-pass refactor and the addtl_cliques_scan_cost - // work-accounting. + // 00_baseline_no_cousin: cousin filter off. cut_pool.set_clique_cousin_filter_enable(false); break; case 1: - // 01_cousin_default: P2-4 cousin filter on with the cut_scoring branch - // defaults (tau=0.85, k=8, no size tilt). Score is the caller-supplied - // violation; ties prefer the earlier insert. + // 01_cousin_default: tau=0.85, k=8, no size tilt. cut_pool.set_clique_cousin_filter_enable(true); cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); cut_pool.set_clique_cousin_minhash_k(8); @@ -5830,23 +5806,14 @@ void apply_cut_sweep_config(cut_pool_t& cut_pool, cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); break; case 3: - // 03_cousin_loose: looser Jaccard threshold (0.85 -> 0.875). 
Allows - // 7/8 min-hash agreement to qualify as cousins when k=8. Closer - // to no-filter behavior. Gap should match config 0 if the cousin - // filter is mostly absorbing redundancy that the orthogonality - // scan would catch anyway. + // 03_cousin_loose: tau=0.875 (7/8 with k=8). cut_pool.set_clique_cousin_filter_enable(true); cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.875)); cut_pool.set_clique_cousin_minhash_k(8); cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); break; case 4: - // 04_cousin_size_tilt: cousin filter on at default tau=0.85, but the - // score used for cousin replacement is multiplied by - // (1 + 0.5 * log2(1 + clique_size)) - // so larger cliques win on ties / near-ties. For clique cuts every - // variable is binary, so clique size is the integer-support count - // — this is the "clique integer support" knob the user requested. + // 04_cousin_size_tilt: tau=0.85 with size-tilted replacement score. cut_pool.set_clique_cousin_filter_enable(true); cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); cut_pool.set_clique_cousin_minhash_k(8); diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index c4dea8dfee..e1ffab9fa2 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -290,12 +290,10 @@ class cut_pool_t { // Add a cut in the form: cut'*x >= rhs. // We expect that the cut is violated by the current relaxation xstar. // - // cut_score is an optional caller-supplied quality score used by the - // P2-4 clique cousin filter (only consulted for cut_type == CLIQUE - // when the cousin filter is enabled). Pass a non-negative value to - // enable score-aware cousin replacement; the default (-1.0) reverts - // to "first-write-wins" cousin policy. Other cut types ignore this - // parameter. + // Optional score used by the clique cousin filter. + // Only used for cut_type == CLIQUE when the filter is enabled. 
+ // Non-negative values enable score-based replacement; -1.0 means + // keep the first inserted representative. void add_cut(cut_type_t cut_type, const inequality_t& cut, f_t cut_score = static_cast(-1.0)); @@ -317,24 +315,10 @@ class cut_pool_t { void check_for_duplicate_cuts(); - // ----- P2-4 clique cousin filter knobs / counters ----------------------- - // - // The clique cut family (Bron-Kerbosch + extension) emits cousin - // cliques whose support sets agree in |k-1| of |k| vertices. The - // selection-stage orthogonality scan catches them but only after the - // full insert + dedup + score cost has been paid. The cousin filter - // intercepts at insert: we min-hash the cut's column-support set, - // bucket on the first sketch hash, and when an existing pool entry - // collides with estimated Jaccard >= jaccard_tau we keep the - // higher-scoring representative (or, if no score was supplied, the - // earlier-inserted one). - // + // Clique cousin filter settings. + // At insert time, we compare min-hash sketches and keep one + // representative when estimated Jaccard >= jaccard_tau. // Defaults: jaccard_tau=0.875, k=8, enable=true, size_weight=0.0. - // These match "config 3 / cousin_loose" from the clique-sweep on - // commit 0b04683b — the configuration that won the gap-closed-pct - // comparison and was promoted to be the production default for the - // clique cut family. Callers can still override at runtime via - // set_clique_cousin_* if they want to experiment. void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } @@ -390,14 +374,9 @@ class cut_pool_t { std::vector best_cuts_; const f_t min_cut_distance_{1e-4}; - // P2-4 cousin filter state. 
clique_support_minhash_ is sized in - // lock-step with cut_storage_; non-CLIQUE rows carry an empty - // sketch and are skipped by rebuild_clique_cousin_buckets() and the - // cousin loop in add_cut. clique_cousin_score_ holds the - // caller-supplied score (raw violation, or violation * size-tilt) so - // we can decide which representative to keep when two cliques - // collide. clique_cousin_buckets_ maps the first sketch hash to the - // list of pool rows whose sketches start with that hash. + // Cousin filter state. + // Vectors are kept aligned with cut_storage_. Non-CLIQUE rows keep + // empty sketches. Buckets map sketch[0] to candidate rows. std::vector> clique_support_minhash_; std::vector clique_cousin_score_; std::unordered_map> clique_cousin_buckets_; @@ -420,32 +399,22 @@ class cut_pool_t { // --------------------------------------------------------------------------- // Cut-pool sweep configuration dispatch. // -// Selected by the CUOPT_CONFIG_ID environment variable; range-checked -// against CUOPT_MAX_CONFIG (caller-asserted upper bound). One env-var -// dispatch covers the entire clique cut family because the only knobs -// we vary on this branch live on cut_pool_t (cousin filter on/off, -// Jaccard tau, integer-support size tilt). The deterministic -// measurement path (no concurrent root LP, no in-cut-pass RCS, exit -// after the cut loop) is unconditional and lives in branch_and_bound. +// Selected by CUOPT_CONFIG_ID and range-checked against +// CUOPT_MAX_CONFIG. Configs control the clique cousin filter knobs. // // Keep kCutSweepNumConfigs in sync with the switch table in // apply_cut_sweep_config() (see cuts.cpp) and with cut_sweep_config_name() // below. 
// // Layout: -// 0 baseline_no_cousin clique cut algorithmic changes only -// (cousin filter off; isolates 8f2cf00a impact) +// 0 baseline_no_cousin cousin filter off // 1 cousin_default cousin filter on, tau=0.85, k=8, score=violation -// (the cut_scoring final-version P2-4 baseline) // 2 cousin_strict cousin filter on, tau=0.70 (more aggressive // cousin removal — favors quantity reduction) // 3 cousin_loose cousin filter on, tau=0.875 (allows 7/8 min-hash -// agreement with k=8; still closer to no-filter -// extreme — selection-stage absorbs cousins) +// agreement with k=8) // 4 cousin_size_tilt cousin filter on, tau=0.85, score = violation * -// (1 + 0.5 * log2(1 + clique_size)) — picks the -// larger clique on cousin replacement (integer -// support proxy, since clique vars are 0-1) +// (1 + 0.5 * log2(1 + clique_size)) constexpr int kCutSweepNumConfigs = 5; inline const char* cut_sweep_config_name(int config_id) diff --git a/cut_gap_timing_stats.patch b/cut_gap_timing_stats.patch index 7530d47e46..0e87cc9d95 100644 --- a/cut_gap_timing_stats.patch +++ b/cut_gap_timing_stats.patch @@ -830,7 +830,7 @@ index 685b1360..b2231b1a 100644 // Forward declare solver_settings_t for friend class diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp -index 0222ad6f..96d05d6b 100644 +index 0222ad6f..5420b882 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -11,6 +11,8 @@ @@ -899,10 +899,13 @@ index 0222ad6f..96d05d6b 100644 constexpr bool enable_root_cut_cpufj = true; std::unique_ptr> root_cut_cpufj_task; auto root_cut_cpufj_improvement_callback = -@@ -2649,10 +2678,29 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut +@@ -2648,11 +2677,31 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + }; cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); - f_t cut_generation_start_time = tic(); +- f_t 
cut_generation_start_time = tic(); +- i_t cut_pool_size = 0; ++ f_t cut_generation_start_time = tic(); + auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { + if (settings_.benchmark_info_ptr == nullptr) { return; } + f_t cut_generation_time = toc(cut_generation_start_time); @@ -912,9 +915,10 @@ index 0222ad6f..96d05d6b 100644 + if (cut_generation_time < static_cast(0.0)) { + cut_generation_time = static_cast(0.0); + } -+ settings_.benchmark_info_ptr->cut_generation_time_sec = static_cast(cut_generation_time); ++ settings_.benchmark_info_ptr->cut_generation_time_sec = ++ static_cast(cut_generation_time); + }; - i_t cut_pool_size = 0; ++ i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { + // LP relaxation is already integer-feasible — solved at the root @@ -929,7 +933,7 @@ index 0222ad6f..96d05d6b 100644 signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return mip_status_t::OPTIMAL; -@@ -2692,6 +2740,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut +@@ -2692,6 +2741,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } if (cut_pass_result.action == cut_pass_action_t::RETURN) { @@ -937,17 +941,11 @@ index 0222ad6f..96d05d6b 100644 signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return cut_pass_result.status; -@@ -2714,8 +2763,25 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut +@@ -2714,8 +2764,16 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } } -+ // Cut loop terminated (max_cut_passes hit, num_fractional==0 break, -+ // negligible-objective-change break, or time-limit break). Publish -+ // the post-cuts root LP value so benchmark drivers can compute -+ // gap-closed-by-cuts. 
We use compute_user_objective to flip the sign -+ // back into user space when the LP was dualized, matching the -+ // convention used for root_lp_no_cuts above and for the per-pass -+ // "Bound" column in the search log. ++ // Publish the post-cuts root LP value. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); @@ -955,10 +953,7 @@ index 0222ad6f..96d05d6b 100644 + print_cut_info(settings_, cut_info); f_t cut_generation_time = toc(cut_generation_start_time); -+ // Publish the cut generation wall time so MIPLIBGapStat / run_mip can -+ // emit it alongside gap_closed_pct. Always set when the cut loop ran, -+ // even if no cuts were added (the time still measures real work in -+ // generate_cuts + score_cuts + dedup + LP resolves). ++ // Publish cut-generation time for reporting. + publish_cut_generation_time(); if (cut_info.has_cuts()) { settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); From e5bc991a28596eb6b2974c52b32764f240259ea2 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 2 Jun 2026 09:49:42 +0200 Subject: [PATCH 22/47] cut stats --- .../cuopt/miplib2017_optima.hpp | 476 +++++++++ .../linear_programming/cuopt/run_mip.cpp | 244 ++++- .../mip/solver_settings.hpp | 21 + cpp/src/branch_and_bound/branch_and_bound.cpp | 64 +- .../dual_simplex/simplex_solver_settings.hpp | 6 + cpp/src/mip_heuristics/solver.cu | 3 + cut_gap_timing_stats.patch | 974 ++++++++++++++++++ 7 files changed, 1771 insertions(+), 17 deletions(-) create mode 100644 benchmarks/linear_programming/cuopt/miplib2017_optima.hpp create mode 100644 cut_gap_timing_stats.patch diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp new file mode 100644 index 0000000000..7f6826a5ce --- /dev/null +++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp @@ -0,0 +1,476 @@ +/* 
clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +// MIPLIB2017 best-known objective ("optimum") lookup for the MIP +// benchmark runner. Self-contained: no env vars, no external CSV. +// +// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 +// instances). Of those, 232 have a known optimum and live in +// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible +// so the printer can label them clearly instead of returning "no opt". +// +// Lookup uses the basename without directory and stripped of +// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So +// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" +// all hit the same entry. +// +// Returns std::optional: nullopt means "instance is in our +// benchmark set but infeasible" *or* "we don't have an entry for it". +// is_known_infeasible() distinguishes the two. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuopt_bench { + +// Strip directory prefix and any .mps/.lp suffix (with optional .gz), +// then lower-case. Designed to match how MPS instance files are named +// across MIPLIB downloads (case- and extension-insensitive). 
/// Reduce an instance path to the key used by the optimum tables.
///
/// @param raw  File path or bare instance name, e.g. "miplib2017/MAS74.mps.gz".
/// @return     The basename, lower-cased, with at most ONE of the suffixes
///             .mps.gz / .lp.gz / .mps / .lp / .gz removed (first match in
///             that order wins, so "a.mps.mps" becomes "a.mps").
inline std::string normalize_instance_name(const std::string& raw)
{
  // Keep only what follows the last '/' or '\\' separator.
  const auto slash = raw.find_last_of("/\\");
  std::string name = (slash == std::string::npos) ? raw : raw.substr(slash + 1);

  // Lower-case up front; the suffix list below is lower-case ASCII, so a
  // plain compare afterwards is equivalent to a case-insensitive match.
  // The unsigned char cast keeps std::tolower defined for negative chars.
  for (char& c : name) {
    c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
  }

  // Longest suffixes first so ".mps.gz" is not mistaken for a bare ".gz".
  for (const char* suffix : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) {
    const std::size_t len = std::char_traits<char>::length(suffix);
    if (name.size() >= len && name.compare(name.size() - len, len, suffix) == 0) {
      name.resize(name.size() - len);
      break;
    }
  }
  return name;
}

// MIPLIB2017 benchmark-set best-known objectives. The table holds 233
// entries: presumably the 232 proven optima plus supportcase22, whose
// value is flagged in the table as best-known rather than proven. Source:
// https://miplib.zib.de "The Benchmark Set". Values are stored in the
// double precision they were published at; unit tests should compare
// with a tolerance of ~|opt|*1e-9 rather than exact equality.
/// Best-known objective value for each MIPLIB2017 benchmark instance.
///
/// Keys are instance names without directory or extension, in lower case
/// (the form produced by normalize_instance_name()).
///
/// @return Reference to a function-local static map; it is built on first
///         call and the same object is returned on every later call.
inline const std::unordered_map<std::string, double>& kBenchmarkOptima()
{
  static const std::unordered_map<std::string, double> kOptima = {
    {"30n20b8", 302},
    {"50v-10", 3311.1799841000002},
    {"academictimetablesmall", 0},
    {"air05", 26374},
    {"app1-1", -3},
    {"app1-2", -41},
    {"assign1-5-8", 211.99999999999801},
    {"atlanta-ip", 90.009878614000002},
    {"b1c1s1", 24544.25},
    {"bab2", -357544.31150000001},
    {"bab6", -284248.23070000007},
    {"beasleyc3", 753.9999999999128},
    {"binkar10_1", 6742.1998835000004},
    {"blp-ar98", 6205.2147103999996},
    {"blp-ic98", 4491.4475839500001},
    {"bnatt400", 1},
    {"bppc4-08", 53},
    {"brazil3", 24},
    {"buildingenergy", 33283.853236000003},
    {"cbs-cta", 0},
    {"chromaticindex1024-7", 4},
    {"chromaticindex512-7", 4},
    {"cmflsp50-24-8-8", 55789389.886},
    {"cms750_4", 252},
    {"co-100", 2639942.0600000001},
    {"cod105", -12},
    {"comp07-2idx", 6},
    {"comp21-2idx", 74},
    {"cost266-uue", 25148940.55999998},
    {"cryptanalysiskb128n5obj16", 0},
    {"csched007", 350.99999999999551},
    {"csched008", 173},
    {"cvs16r128-89", -97},
    {"dano3_3", 576.34463302999995},
    {"dano3_5", 576.9249159565619},
    {"decomp2", -160},
    {"drayage-100-23", 103333.87407000001},
    {"drayage-25-23", 101282.647018},
    {"dws008-01", 37412.604587945083},
    {"eil33-2", 934.007915999999},
    {"eila101-2", 880.92010799999991},
    {"enlight_hard", 37},
    {"ex10", 100},
    {"ex9", 81},
    {"exp-1-500-5-5", 65887},
    {"fast0507", 174},
    {"fastxgemm-n2r6s0t2", 230},
    {"fhnw-binpack4-48", 0},
    {"fiball", 138},
    {"gen-ip002", -4783.7333920000001},
    {"gen-ip054", 6840.9656417899996},
    {"germanrr", 47095869.648999996},
    {"gfd-schedulen180f7d50m30k18", 1},
    {"glass-sc", 23},
    {"glass4", 1200012599.972384},
    {"gmu-35-40", -2406733.3687999998},
    {"gmu-35-50", -2607958.3300000001},
    {"graph20-20-1rand", -9},
    {"graphdraw-domain", 19685.999975500381},
    {"h80x6320d", 6382.0990482459993},
    {"highschool1-aigio", 0},
    {"hypothyroid-k1", -2851},
    {"ic97_potential", 3941.9999309022501},
    {"icir97_tension", 6375},
    {"irish-electricity", 3723497.5913959998},
    {"irp", 12159.492835396981},
    {"istanbul-no-cutoff", 204.08170701},
    {"k1mushroom", -3288},
    {"lectsched-5-obj", 24},
    {"leo1", 404227536.16000003},
    {"leo2", 404077441.12},
    {"lotsize", 1480195},
    {"mad", 0.026800000000000001},
    {"map10", -495},
    {"map16715-04", -111},
    {"markshare2", 1},
    {"markshare_4_0", 1},
    {"mas74", 11801.185719999999},
    {"mas76", 40005.053989999993},
    {"mc11", 11688.99999999966},
    {"mcsched", 211913},
    {"mik-250-20-75-4", -52301},
    {"milo-v12-6-r2-40-1", 326481.14282799},
    {"momentum1", 109143.4935},
    {"mushroom-best", 0.055333761199999998},
    {"mzzv11", -21718},
    {"mzzv42z", -20540},
    {"n2seq36q", 52200},
    {"n3div36", 130800},
    {"n5-3", 8104.9999999939992},
    {"neos-1122047", 161},
    {"neos-1171448", -309},
    {"neos-1171737", -195},
    {"neos-1354092", 46},
    {"neos-1445765", -17783},
    {"neos-1456979", 176},
    {"neos-1582420", 90.999999999999957},
    {"neos-2657525-crna", 1.810748},
    {"neos-2746589-doon", 2008.1999999999989},
    {"neos-2978193-inde", -2.3880616899999998},
    {"neos-2987310-joes", -607702988.29999995},
    {"neos-3004026-krka", 0},
    {"neos-3024952-loue", 26756},
    {"neos-3046615-murg", 1600},
    {"neos-3083819-nubu", 6307996},
    {"neos-3216931-puriri", 71320},
    {"neos-3381206-awhea", 453},
    {"neos-3402294-bobin", 0.067249999999999491},
    {"neos-3555904-turama", -34.700000000000003},
    {"neos-3627168-kasai", 988585.61999999976},
    {"neos-3656078-kumeu", -13172.200000000001},
    {"neos-3754480-nidda", 12941.73838561778},
    {"neos-4300652-rahue", 2.1415999999999999},
    {"neos-4338804-snowy", 1471},
    {"neos-4387871-tavua", 33.384729927000002},
    {"neos-4413714-turia", 45.370167019999798},
    {"neos-4532248-waihi", 61.599999999999987},
    {"neos-4647030-tutaki", 27265.705999999958},
    {"neos-4722843-widden", 25009.662227000001},
    {"neos-4738912-atrato", 283627956.59500003},
    {"neos-4763324-toguru", 1613.0388458499999},
    {"neos-4954672-berkel", 2612710},
    {"neos-5049753-cuanza", 561.99999716889999},
    {"neos-5052403-cygnet", 182},
    {"neos-5093327-huahum", 6259.9999971258949},
    {"neos-5104907-jarama", 935},
    {"neos-5107597-kakapo", 3644.9999999995198},
    {"neos-5114902-kasavu", 655},
    {"neos-5188808-nattai", 0.110283622999984},
    {"neos-5195221-niemur", 0.0038354325999999999},
    {"neos-631710", 203},
    {"neos-662469", 184379.99999999991},
    {"neos-787933", 30},
    {"neos-827175", 112.00152},
    {"neos-848589", 2351.40309999697},
    {"neos-860300", 3200.9999999999982},
    {"neos-873061", 113.6562385063},
    {"neos-911970", 54.759999999999998},
    {"neos-933966", 318},
    {"neos-950242", 4},
    {"neos-957323", -237.75668150000001},
    {"neos-960392", -238},
    {"neos17", 0.1500025774},
    {"neos5", 15},
    {"neos8", -3719},
    {"net12", 214},
    {"netdiversion", 242},
    {"nexp-150-20-8-5", 231},
    {"ns1116954", 0},
    {"ns1208400", 2},
    {"ns1644855", -1524.3333333333301},
    {"ns1760995", -549.21438505000003},
    {"ns1830653", 20622},
    {"ns1952667", 0},
    {"nu25-pr12", 53904.999999999993},
    {"nursesched-medium-hint03", 115},
    {"nursesched-sprint02", 57.999999999999993},
    {"nw04", 16862},
    {"opm2-z10-s4", -33269},
    {"p200x1188c", 15078},
    {"peg-solitaire-a3", 1},
    {"pg", -8674.3426071199992},
    {"pg5_34", -14339.353450000001},
    {"physiciansched3-3", 2623271.3266670001},
    {"physiciansched6-2", 49324},
    {"piperout-08", 125054.9999999999},
    {"piperout-27", 8123.9999999999727},
    {"pk1", 11},
    {"proteindesign121hz512p9", 1473},
    {"proteindesign122trx11p8", 1747},
    {"qap10", 339.99999999838712},
    {"radiationm18-12-05", 17566},
    {"radiationm40-10-02", 155328},
    {"rail01", -70.569964299999995},
    {"rail02", -200.44990770000001},
    {"rail507", 174},
    {"ran14x18-disj-8", 3712},
    {"rd-rplusc-21", 165395.275295},
    {"reblock115", -36800603.233199999},
    {"rmatr100-p10", 423},
    {"rmatr200-p5", 4521},
    {"roci-4-11", -6020203},
    {"rocii-5-11", -6.6755047315380001},
    {"rococob10-011000", 19449},
    {"rocococ10-001000", 11460},
    {"roi2alpha3n4", -63.208495030000002},
    {"roi5alpha10n8", -52.322274350999997},
    {"roll3000", 12889.999991999999},
    {"s100", -0.16972352705829999},
    {"s250r10", -0.17178048342319999},
    {"satellites2-40", -19},
    {"satellites2-60-fs", -19.000000000099998},
    {"savsched1", 3217.6999999999998},
    {"sct2", -230.9891623},
    {"seymour", 423},
    {"seymour1", 410.76370138999999},
    {"sing326", 7753674.8537600003},
    {"sing44", 8128831.1771999998},
    {"snp-02-004-104", 586803238.65672886},
    {"sorrell3", -16},
    {"sp150x300d", 69},
    {"sp97ar", 660705645.75899994},
    {"sp98ar", 529740623.19999999},
    {"splice1k1", -394},
    {"square41", 15},
    {"square47", 15.9999999997877},
    {"supportcase10", 7},
    {"supportcase12", -7559.5330538170001},
    {"supportcase18", 48},
    {"supportcase19", 12677205.999920519},
    {"supportcase22", 110},  // best-known marked "*" in MIPLIB2017 (not proven optimal)
    {"supportcase26", 1745.1238129999999},
    {"supportcase33", -345},
    {"supportcase40", 24256.3122898},
    {"supportcase42", 7.7586307222700004},
    {"supportcase6", 51906.477370000001},
    {"supportcase7", -1132.2231770000001},
    {"swath1", 379.07129574999999},
    {"swath3", 397.76134365000001},
    {"tbfp-network", 24.163194440000002},
    {"thor50dday", 40417},
    {"timtab1", 764771.99999977998},
    {"tr12-30", 130595.9999999999},
    {"traininstance2", 71820},
    {"traininstance6", 28290},
    {"trento1", 5189487},
    {"triptim1", 22.868099999999899},
    {"uccase12", 11507.4050616},
    {"uccase9", 10993.131409},
    {"uct-subprob", 314},
    {"unitcal_7", 19635558.243999999},
    {"var-smallemery-m6j6", -149.37501},
    {"wachplan", -8},
  };
  return kOptima;
}

// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7).
// Solver should return Infeasible status; we use this set to label
// the printer line with opt=Infeasible so a downstream
// "did the run agree with MIPLIB?" check can be a single grep.
+inline const std::unordered_set& kBenchmarkInfeasible() +{ + static const std::unordered_set kInfeas = { + "bnatt500", + "cryptanalysiskb128n5obj14", + "fhnw-binpack4-4", + "neos-2075418-temuka", + "neos-3402454-bohle", + "neos-3988577-wolgan", + "neos859080", + }; + return kInfeas; +} + +inline std::optional lookup_miplib_optimum(const std::string& filename) +{ + const auto& m = kBenchmarkOptima(); + const auto it = m.find(normalize_instance_name(filename)); + if (it == m.end()) { return std::nullopt; } + return it->second; +} + +inline bool is_known_infeasible(const std::string& filename) +{ + return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; +} + +// Single grep-friendly per-instance line. Emits to stdout via printf +// so the output survives unconditionally regardless of the project's +// settings_.log routing (NFS-backed log files, gated debug levels) +// and is trivially cross-compared between cuts-config branches. +// +// "Gap closed" is reported relative to the *root LP after cuts*, not +// relative to the final dual bound at the end of solve. The standard +// MIP cutting-plane definition is: +// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) +// / (opt - root_lp_no_cuts) +// On a minimization-form problem all three differences are >= 0 and +// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the +// formula also holds verbatim for maximization (numerator and +// denominator flip sign together). NaN is emitted when either root +// bound was not published (e.g. B&B never entered the cut loop). 
+// +// Other field semantics (signed for minimization): +// abs_root_dual_gap = opt - root_lp_with_cuts +// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) +// abs_primal_gap = primal - opt +// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) +// +// The line still also reports `final_dual` (solver's bound at the end +// of solve) so the new metric and the previous one can be compared +// without re-running. +// +// "TBD" is emitted when the optimum is unknown so downstream parsers +// can join lines on (instance, field) without dropping rows. "NaN" is +// emitted for root_lp_* when the value is unavailable. +template +inline void print_miplib_gap_stat( + const std::string& filename, + const Solution& solution, + double solve_time_seconds, + const std::string& termination_status, + double root_lp_no_cuts, + double root_lp_with_cuts, + double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) +{ + const std::string norm = normalize_instance_name(filename); + const auto opt = lookup_miplib_optimum(filename); + const double primal = solution.get_objective_value(); + const double final_dual = solution.get_solution_bound(); + const double mip_gap = solution.get_mip_gap(); + const bool primal_finite = std::isfinite(primal); + const bool root0_finite = std::isfinite(root_lp_no_cuts); + const bool root1_finite = std::isfinite(root_lp_with_cuts); + constexpr double NaN = std::numeric_limits::quiet_NaN(); + + if (is_known_infeasible(filename)) { + std::printf( + "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " + "abs_primal_gap=NA rel_primal_gap_pct=NA " + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", + norm.c_str(), + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + mip_gap, + solve_time_seconds, + cut_gen_time_sec, + termination_status.c_str()); + } else if 
(opt.has_value()) { + const double o = *opt; + const double denom = std::max(std::abs(o), 1.0); + + const double abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; + const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; + + // Classical gap-closed-by-cuts. Skip when either root bound is + // missing, when the LP relaxation already proves optimality + // (denominator = opt - root_lp_no_cuts ~= 0), or when the bound + // moved the wrong way (numerical noise in either direction). + double gap_closed_pct = NaN; + if (root0_finite && root1_finite) { + const double total_gap = o - root_lp_no_cuts; + if (std::abs(total_gap) > 1e-12 * denom) { + gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; + } else { + // LP relaxation already (numerically) optimal -> 100% closed + // by definition. Avoid /0 noise. + gap_closed_pct = 100.0; + } + } + + const double abs_pgap = primal_finite ? (primal - o) : NaN; + const double rel_pgap_pct = primal_finite ? 
100.0 * abs_pgap / denom : NaN; + + std::printf( + "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " + "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", + norm.c_str(), + o, + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + abs_root_dgap, + rel_root_dgap_pct, + gap_closed_pct, + abs_pgap, + rel_pgap_pct, + mip_gap, + solve_time_seconds, + cut_gen_time_sec, + termination_status.c_str()); + } else { + std::printf( + "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " + "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " + "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " + "abs_primal_gap=TBD rel_primal_gap_pct=TBD " + "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", + norm.c_str(), + primal, + final_dual, + root_lp_no_cuts, + root_lp_with_cuts, + mip_gap, + solve_time_seconds, + cut_gen_time_sec, + termination_status.c_str()); + } + std::fflush(stdout); +} + +} // namespace cuopt_bench diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index f35543696f..bed4e453b0 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include "initial_solution_reader.hpp" #include "mip_test_instances.hpp" +#include "miplib2017_optima.hpp" #include #include @@ -23,15 +24,20 @@ #include #include +#include #include #include #include #include #include +#include #include +#include #include +#include #include #include +#include #include #include #include @@ -239,6 +245,43 @@ int run_single_file(std::string file_path, } else { CUOPT_LOG_INFO("%s: no solution found", base_filename.c_str()); } + + // Per-instance "gap closed to optimum" stat. 
Emits a single + // grep-friendly "MIPLIBGapStat ..." line via printf so cross-branch + // comparison is just `grep '^MIPLIBGapStat' branchA.log` then diff. + // Optima are looked up from the in-source MIPLIB2017 benchmark-set + // table (miplib2017_optima.hpp); unknown instances emit "opt=TBD" + // and infeasibility-flagged instances emit "opt=Infeasible". + { + const double _gap_seconds = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_run_solver) + .count() / + 1000.0; + std::string _status_str; + switch (solution.get_termination_status()) { + case cuopt::linear_programming::mip_termination_status_t::Optimal: + _status_str = "Optimal"; + break; + case cuopt::linear_programming::mip_termination_status_t::FeasibleFound: + _status_str = "FeasibleFound"; + break; + case cuopt::linear_programming::mip_termination_status_t::TimeLimit: + _status_str = "TimeLimit"; + break; + case cuopt::linear_programming::mip_termination_status_t::Infeasible: + _status_str = "Infeasible"; + break; + default: _status_str = "Other"; break; + } + cuopt_bench::print_miplib_gap_stat(base_filename, + solution, + _gap_seconds, + _status_str, + benchmark_info.root_lp_no_cuts, + benchmark_info.root_lp_with_cuts, + benchmark_info.cut_generation_time_sec); + } + std::stringstream ss; int decimal_places = 2; double mip_gap = solution.get_mip_gap(); @@ -293,6 +336,157 @@ void run_single_file_mp(std::string file_path, exit(sol_found); } +// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the +// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it +// requires no extra dependencies (NVML / hwloc). 
+static std::vector get_gpu_numa_nodes(int n_gpus) +{ + std::vector nodes(static_cast(std::max(0, n_gpus)), -1); + for (int i = 0; i < n_gpus; ++i) { + char pci_id[32] = {0}; + if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } + for (char* c = pci_id; *c; ++c) { + *c = static_cast(std::tolower(static_cast(*c))); + } + std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); + if (!f) { continue; } + int node = -1; + f >> node; + nodes[i] = node; + } + return nodes; +} + +// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. +// Returns empty on any read or parse failure. +static std::vector read_numa_cpulist(int numa_node) +{ + std::vector cpus; + if (numa_node < 0) { return cpus; } + std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + + "/cpulist"); + if (!f) { return cpus; } + std::string line; + if (!std::getline(f, line)) { return cpus; } + size_t pos = 0; + while (pos < line.size()) { + const size_t comma = line.find(',', pos); + const size_t end = (comma == std::string::npos) ? line.size() : comma; + const std::string range = line.substr(pos, end - pos); + if (!range.empty()) { + try { + const size_t dash = range.find('-'); + const int lo = std::stoi(range.substr(0, dash)); + const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); + for (int c = lo; c <= hi; ++c) { + cpus.push_back(c); + } + } catch (...) { + return std::vector{}; + } + } + if (comma == std::string::npos) { break; } + pos = comma + 1; + } + std::sort(cpus.begin(), cpus.end()); + return cpus; +} + +// Bind the current process to a fair partition of the inherited CPU mask, +// preferring CPUs on the same NUMA node as the GPU. Returns the actual +// number of CPUs the child was pinned to, or -1 if the partition could not +// be applied (caller must then choose a fallback). +// +// Algorithm: +// 1. 
Read inherited (parent) affinity mask -> visible_cpus. +// 2. Look up each GPU's NUMA node via PCI BDF. +// 3. If this GPU's NUMA node is known and has visible CPUs, partition +// that NUMA node's CPUs among the GPUs that landed on the same node +// (siblings, ordered by gpu_id). +// 4. Otherwise fall back to a contiguous global partition of visible_cpus. +// +// The function always emits a single stdout line per child summarising the +// partition (NUMA-local vs contiguous-fallback), so the parent's log isn't +// interleaved per-CPU across n_gpus children. +int bind_process_to_cpu_partition(int gpu_id, int n_gpus) +{ + if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } + + cpu_set_t parent_mask; + CPU_ZERO(&parent_mask); + if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { + perror("sched_getaffinity"); + return -1; + } + + std::vector visible_cpus; + for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { + if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } + } + if (visible_cpus.empty()) { return -1; } + std::sort(visible_cpus.begin(), visible_cpus.end()); + + std::vector chosen_cpus; + bool numa_aware = false; + + const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); + const int my_numa = gpu_numa_nodes[gpu_id]; + if (my_numa >= 0) { + std::vector siblings; + for (int i = 0; i < n_gpus; ++i) { + if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } + } + std::vector numa_cpus = read_numa_cpulist(my_numa); + if (!numa_cpus.empty() && !siblings.empty()) { + std::vector local_visible; + std::set_intersection(visible_cpus.begin(), + visible_cpus.end(), + numa_cpus.begin(), + numa_cpus.end(), + std::back_inserter(local_visible)); + if (!local_visible.empty()) { + const int siblings_count = static_cast(siblings.size()); + const int my_idx = + static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); + const int local_per_gpu = + std::max(1, static_cast(local_visible.size()) / 
siblings_count); + const int s = my_idx * local_per_gpu; + const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); + if (s < e) { + chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); + numa_aware = true; + } + } + } + } + + if (!numa_aware) { + const int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); + const int start = gpu_id * cpus_per_gpu; + if (start >= static_cast(visible_cpus.size())) { return -1; } + const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); + chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); + } + + cpu_set_t child_mask; + CPU_ZERO(&child_mask); + std::ostringstream oss; + oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" + << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") + << "):"; + for (int c : chosen_cpus) { + CPU_SET(c, &child_mask); + oss << ' ' << c; + } + std::cout << oss.str() << std::endl; + + if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { + perror("sched_setaffinity"); + return -1; + } + return static_cast(chosen_cpus.size()); +} + void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, std::unordered_map& pid_file_map, std::queue& gpu_queue) @@ -417,6 +611,11 @@ int main(int argc, char* argv[]) int reliability_branching = program.get("--reliability-branching"); bool deterministic = program.get("--determinism"); + if (run_dir && program.is_used("--num-cpu-threads")) { + std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " + "thread count is set per process from the bound CPU partition.\n"; + } + if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); @@ -502,6 +701,18 @@ int main(int argc, char* argv[]) } if (sys_pid == 0) { RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); + int assigned_cpus = 
bind_process_to_cpu_partition(gpu_id, n_gpus); + if (assigned_cpus <= 0) { + assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); + std::cerr << "[gpu " << gpu_id << "] CPU pin failed; falling back to " + << assigned_cpus << " threads\n"; + } + // Directory-run mode owns the thread count: --num-cpu-threads is + // intentionally ignored here so per-process thread budgets match + // the bound CPU partition. The single-run path below still + // honours --num-cpu-threads. + omp_set_num_threads(assigned_cpus); + num_cpu_threads = assigned_cpus; run_single_file_mp(file_name, gpu_id, batch_num, @@ -534,31 +745,36 @@ int main(int argc, char* argv[]) merge_result_files(out_dir, result_file, n_gpus, batch_num); } else { auto memory_resource = make_async(); + auto run_single = [&]() { + run_single_file(path, + 0, + 0, + n_gpus, + out_dir, + initial_solution_file, + heuristics_only, + num_cpu_threads, + write_log_file, + log_to_console, + reliability_branching, + time_limit, + work_limit, + deterministic); + }; if (memory_limit > 0) { auto limiting_adaptor = rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); rmm::mr::set_current_device_resource(limiting_adaptor); + run_single(); } else if (track_allocations) { rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, /*capture_stacks=*/true); rmm::mr::set_current_device_resource(tracking_adaptor); + run_single(); } else { rmm::mr::set_current_device_resource(memory_resource); + run_single(); } - run_single_file(path, - 0, - 0, - n_gpus, - out_dir, - initial_solution_file, - heuristics_only, - num_cpu_threads, - write_log_file, - log_to_console, - reliability_branching, - time_limit, - work_limit, - deterministic); } return 0; diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 685b1360b8..b2231b1a3b 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ 
b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -26,6 +26,27 @@ struct benchmark_info_t { double last_improvement_of_best_feasible = 0; double last_improvement_after_recombination = 0; double objective_of_initial_population = std::numeric_limits::max(); + // LP relaxation objective at the root node, BEFORE any cuts have been + // added. quiet_NaN() means "B&B did not run cut passes / value was + // never written" — distinguishes it from a legitimate 0.0. + double root_lp_no_cuts = std::numeric_limits::quiet_NaN(); + // LP relaxation objective at the root node, AFTER the full cut loop + // (final pass result). The dual gap "by cuts at the root" is then + // gap_after_cuts = opt - root_lp_with_cuts (in B&B's solver + // objective sense) + // and the classical "gap closed by cuts" metric is + // gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) + // / (opt - root_lp_no_cuts). + // quiet_NaN() means "B&B did not finish the cut loop / value not written". + double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); + + // Wall-clock time spent inside the root-node cut generation loop + // (sum of generate_cuts + score_cuts + check_for_duplicate_cuts + + // get_best_cuts + add_cuts + post-cut LP resolves), in seconds. + // Published by branch_and_bound.cpp::solve() at the same point that + // root_lp_with_cuts is finalised. quiet_NaN() means "cut loop did + // not run / value never written". 
+ double cut_generation_time_sec = std::numeric_limits::quiet_NaN(); }; // Forward declare solver_settings_t for friend class diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 0222ad6fe9..5420b88221 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -11,6 +11,8 @@ #include #include +#include // benchmark_info_t + #include #include #include @@ -2361,6 +2363,15 @@ auto branch_and_bound_t::do_cut_pass( } root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + // Publish after every successful post-cut LP resolve so any + // early-exit path below (NUMERICAL, TIME_LIMIT, gap-tolerance + // exit) still leaves benchmark_info->root_lp_with_cuts pointing + // at the most recent valid LP-with-cuts objective. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } + f_t remove_cuts_start_time = tic(); mutex_original_lp_.lock(); remove_cuts(original_lp_, @@ -2479,7 +2490,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut user_problem_t problem_copy = original_problem_; timer_t timer(std::numeric_limits::infinity()); detail::find_initial_cliques( - problem_copy, tolerances_for_clique, &clique_table_, timer, false, clique_signal); + problem_copy, tolerances_for_clique, &clique_table_, timer, clique_signal); } } @@ -2588,6 +2599,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut cut_info_t cut_info; if (num_fractional == 0) { + // LP relaxation already integer-feasible — solved at the root with + // no cuts. Publish both bounds equal to the root LP value so the + // gap-closed-by-cuts line still has a finite, meaningful entry + // (the printer reports 100% closed when total integrality gap ~= 0). 
+ if (settings_.benchmark_info_ptr != nullptr) { + const double v = static_cast(compute_user_objective(original_lp_, root_objective_)); + settings_.benchmark_info_ptr->root_lp_no_cuts = v; + settings_.benchmark_info_ptr->root_lp_with_cuts = v; + } set_solution_at_root(solution, cut_info); signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) @@ -2624,6 +2644,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut f_t last_objective = root_objective_; f_t root_relax_objective = root_objective_; + // Publish the no-cuts root LP value once. The with-cuts companion is + // published below after the cut loop terminates. Both go to the + // benchmark_info_t so callers (run_mip.cpp) can compute + // gap-closed-by-cuts without instrumenting the cut loop directly. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_no_cuts = + static_cast(compute_user_objective(original_lp_, root_relax_objective)); + } + constexpr bool enable_root_cut_cpufj = true; std::unique_ptr> root_cut_cpufj_task; auto root_cut_cpufj_improvement_callback = @@ -2648,11 +2677,31 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut }; cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); - f_t cut_generation_start_time = tic(); - i_t cut_pool_size = 0; + f_t cut_generation_start_time = tic(); + auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { + if (settings_.benchmark_info_ptr == nullptr) { return; } + f_t cut_generation_time = toc(cut_generation_start_time); + if (force_time_limit_value || cut_generation_time > settings_.time_limit) { + cut_generation_time = settings_.time_limit; + } + if (cut_generation_time < static_cast(0.0)) { + cut_generation_time = static_cast(0.0); + } + settings_.benchmark_info_ptr->cut_generation_time_sec = + static_cast(cut_generation_time); + }; + i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < 
settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { + // LP relaxation is already integer-feasible — solved at the root + // by the cuts added so far (possibly zero). Publish the with-cuts + // value so the gap-closed line still has a non-NaN dual bound. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } set_solution_at_root(solution, cut_info); + publish_cut_generation_time(); signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return mip_status_t::OPTIMAL; @@ -2692,6 +2741,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } if (cut_pass_result.action == cut_pass_action_t::RETURN) { + publish_cut_generation_time(cut_pass_result.status == mip_status_t::TIME_LIMIT); signal_extend_cliques_.store(true, std::memory_order_release); #pragma omp taskwait depend(in : *clique_signal) return cut_pass_result.status; @@ -2714,8 +2764,16 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } } + // Publish the post-cuts root LP value. + if (settings_.benchmark_info_ptr != nullptr) { + settings_.benchmark_info_ptr->root_lp_with_cuts = + static_cast(compute_user_objective(original_lp_, root_objective_)); + } + print_cut_info(settings_, cut_info); f_t cut_generation_time = toc(cut_generation_start_time); + // Publish cut-generation time for reporting. 
+ publish_cut_generation_time(); if (cut_info.has_cuts()) { settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); settings_.log.printf("Cut pool size : %d\n", cut_pool_size); diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 27eac7f985..d5eacc673d 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -18,6 +18,10 @@ #include #include +namespace cuopt::linear_programming { +struct benchmark_info_t; +} + namespace cuopt::linear_programming::dual_simplex { template @@ -241,6 +245,8 @@ struct simplex_solver_settings_t { mutable logger_t log; std::atomic* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should // continue, 1 if solver should halt + // Optional non-owning pointer to run-level benchmark stats. + cuopt::linear_programming::benchmark_info_t* benchmark_info_ptr = nullptr; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index c25ade0c05..712be21342 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -377,6 +377,9 @@ solution_t mip_solver_t::run_solver() context.settings.strong_chvatal_gomory_cuts; branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; + // Forward the run-level benchmark_info_t so B&B can publish root LP + // bounds (before / after cuts) for gap-closed-by-cuts measurement. 
+ branch_and_bound_settings.benchmark_info_ptr = context.settings.benchmark_info_ptr; branch_and_bound_settings.mip_batch_pdlp_strong_branching = context.settings.mip_batch_pdlp_strong_branching; branch_and_bound_settings.mip_batch_pdlp_reliability_branching = diff --git a/cut_gap_timing_stats.patch b/cut_gap_timing_stats.patch new file mode 100644 index 0000000000..0e87cc9d95 --- /dev/null +++ b/cut_gap_timing_stats.patch @@ -0,0 +1,974 @@ +diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +new file mode 100644 +index 00000000..7f6826a5 +--- /dev/null ++++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +@@ -0,0 +1,476 @@ ++/* clang-format off */ ++/* ++ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ * SPDX-License-Identifier: Apache-2.0 ++ */ ++/* clang-format on */ ++ ++// MIPLIB2017 best-known objective ("optimum") lookup for the MIP ++// benchmark runner. Self-contained: no env vars, no external CSV. ++// ++// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 ++// instances). Of those, 233 have a known optimum and live in ++// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible ++// so the printer can label them clearly instead of returning "no opt". ++// ++// Lookup uses the basename without directory and stripped of ++// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So ++// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" ++// all hit the same entry. ++// ++// Returns std::optional: nullopt means "instance is in our ++// benchmark set but infeasible" *or* "we don't have an entry for it". ++// is_known_infeasible() distinguishes the two. 
++ ++#pragma once ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++namespace cuopt_bench { ++ ++// Strip directory prefix and any .mps/.lp suffix (with optional .gz), ++// then lower-case. Designed to match how MPS instance files are named ++// across MIPLIB downloads (case- and extension-insensitive). ++inline std::string normalize_instance_name(const std::string& raw) ++{ ++ std::string s = raw; ++ const auto slash = s.find_last_of("/\\"); ++ if (slash != std::string::npos) { s = s.substr(slash + 1); } ++ auto endswith = [&](const std::string& suf) { ++ if (s.size() < suf.size()) { return false; } ++ for (size_t i = 0; i < suf.size(); ++i) { ++ if (std::tolower(static_cast(s[s.size() - suf.size() + i])) != ++ std::tolower(static_cast(suf[i]))) { ++ return false; ++ } ++ } ++ return true; ++ }; ++ for (const char* suf : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) { ++ if (endswith(suf)) { ++ s.resize(s.size() - std::char_traits::length(suf)); ++ break; ++ } ++ } ++ for (char& c : s) { ++ c = static_cast(std::tolower(static_cast(c))); ++ } ++ return s; ++} ++ ++// MIPLIB2017 benchmark-set best-known objectives (n=233). Source: ++// https://miplib.zib.de "The Benchmark Set". Values are stored in the ++// double precision they were published at; unit tests should compare ++// with a tolerance of ~|opt|*1e-9 rather than exact equality. 
++inline const std::unordered_map& kBenchmarkOptima() ++{ ++ static const std::unordered_map kOptima = { ++ {"30n20b8", 302}, ++ {"50v-10", 3311.1799841000002}, ++ {"academictimetablesmall", 0}, ++ {"air05", 26374}, ++ {"app1-1", -3}, ++ {"app1-2", -41}, ++ {"assign1-5-8", 211.99999999999801}, ++ {"atlanta-ip", 90.009878614000002}, ++ {"b1c1s1", 24544.25}, ++ {"bab2", -357544.31150000001}, ++ {"bab6", -284248.23070000007}, ++ {"beasleyc3", 753.9999999999128}, ++ {"binkar10_1", 6742.1998835000004}, ++ {"blp-ar98", 6205.2147103999996}, ++ {"blp-ic98", 4491.4475839500001}, ++ {"bnatt400", 1}, ++ {"bppc4-08", 53}, ++ {"brazil3", 24}, ++ {"buildingenergy", 33283.853236000003}, ++ {"cbs-cta", 0}, ++ {"chromaticindex1024-7", 4}, ++ {"chromaticindex512-7", 4}, ++ {"cmflsp50-24-8-8", 55789389.886}, ++ {"cms750_4", 252}, ++ {"co-100", 2639942.0600000001}, ++ {"cod105", -12}, ++ {"comp07-2idx", 6}, ++ {"comp21-2idx", 74}, ++ {"cost266-uue", 25148940.55999998}, ++ {"cryptanalysiskb128n5obj16", 0}, ++ {"csched007", 350.99999999999551}, ++ {"csched008", 173}, ++ {"cvs16r128-89", -97}, ++ {"dano3_3", 576.34463302999995}, ++ {"dano3_5", 576.9249159565619}, ++ {"decomp2", -160}, ++ {"drayage-100-23", 103333.87407000001}, ++ {"drayage-25-23", 101282.647018}, ++ {"dws008-01", 37412.604587945083}, ++ {"eil33-2", 934.007915999999}, ++ {"eila101-2", 880.92010799999991}, ++ {"enlight_hard", 37}, ++ {"ex10", 100}, ++ {"ex9", 81}, ++ {"exp-1-500-5-5", 65887}, ++ {"fast0507", 174}, ++ {"fastxgemm-n2r6s0t2", 230}, ++ {"fhnw-binpack4-48", 0}, ++ {"fiball", 138}, ++ {"gen-ip002", -4783.7333920000001}, ++ {"gen-ip054", 6840.9656417899996}, ++ {"germanrr", 47095869.648999996}, ++ {"gfd-schedulen180f7d50m30k18", 1}, ++ {"glass-sc", 23}, ++ {"glass4", 1200012599.972384}, ++ {"gmu-35-40", -2406733.3687999998}, ++ {"gmu-35-50", -2607958.3300000001}, ++ {"graph20-20-1rand", -9}, ++ {"graphdraw-domain", 19685.999975500381}, ++ {"h80x6320d", 6382.0990482459993}, ++ {"highschool1-aigio", 0}, ++ 
{"hypothyroid-k1", -2851}, ++ {"ic97_potential", 3941.9999309022501}, ++ {"icir97_tension", 6375}, ++ {"irish-electricity", 3723497.5913959998}, ++ {"irp", 12159.492835396981}, ++ {"istanbul-no-cutoff", 204.08170701}, ++ {"k1mushroom", -3288}, ++ {"lectsched-5-obj", 24}, ++ {"leo1", 404227536.16000003}, ++ {"leo2", 404077441.12}, ++ {"lotsize", 1480195}, ++ {"mad", 0.026800000000000001}, ++ {"map10", -495}, ++ {"map16715-04", -111}, ++ {"markshare2", 1}, ++ {"markshare_4_0", 1}, ++ {"mas74", 11801.185719999999}, ++ {"mas76", 40005.053989999993}, ++ {"mc11", 11688.99999999966}, ++ {"mcsched", 211913}, ++ {"mik-250-20-75-4", -52301}, ++ {"milo-v12-6-r2-40-1", 326481.14282799}, ++ {"momentum1", 109143.4935}, ++ {"mushroom-best", 0.055333761199999998}, ++ {"mzzv11", -21718}, ++ {"mzzv42z", -20540}, ++ {"n2seq36q", 52200}, ++ {"n3div36", 130800}, ++ {"n5-3", 8104.9999999939992}, ++ {"neos-1122047", 161}, ++ {"neos-1171448", -309}, ++ {"neos-1171737", -195}, ++ {"neos-1354092", 46}, ++ {"neos-1445765", -17783}, ++ {"neos-1456979", 176}, ++ {"neos-1582420", 90.999999999999957}, ++ {"neos-2657525-crna", 1.810748}, ++ {"neos-2746589-doon", 2008.1999999999989}, ++ {"neos-2978193-inde", -2.3880616899999998}, ++ {"neos-2987310-joes", -607702988.29999995}, ++ {"neos-3004026-krka", 0}, ++ {"neos-3024952-loue", 26756}, ++ {"neos-3046615-murg", 1600}, ++ {"neos-3083819-nubu", 6307996}, ++ {"neos-3216931-puriri", 71320}, ++ {"neos-3381206-awhea", 453}, ++ {"neos-3402294-bobin", 0.067249999999999491}, ++ {"neos-3555904-turama", -34.700000000000003}, ++ {"neos-3627168-kasai", 988585.61999999976}, ++ {"neos-3656078-kumeu", -13172.200000000001}, ++ {"neos-3754480-nidda", 12941.73838561778}, ++ {"neos-4300652-rahue", 2.1415999999999999}, ++ {"neos-4338804-snowy", 1471}, ++ {"neos-4387871-tavua", 33.384729927000002}, ++ {"neos-4413714-turia", 45.370167019999798}, ++ {"neos-4532248-waihi", 61.599999999999987}, ++ {"neos-4647030-tutaki", 27265.705999999958}, ++ {"neos-4722843-widden", 
25009.662227000001}, ++ {"neos-4738912-atrato", 283627956.59500003}, ++ {"neos-4763324-toguru", 1613.0388458499999}, ++ {"neos-4954672-berkel", 2612710}, ++ {"neos-5049753-cuanza", 561.99999716889999}, ++ {"neos-5052403-cygnet", 182}, ++ {"neos-5093327-huahum", 6259.9999971258949}, ++ {"neos-5104907-jarama", 935}, ++ {"neos-5107597-kakapo", 3644.9999999995198}, ++ {"neos-5114902-kasavu", 655}, ++ {"neos-5188808-nattai", 0.110283622999984}, ++ {"neos-5195221-niemur", 0.0038354325999999999}, ++ {"neos-631710", 203}, ++ {"neos-662469", 184379.99999999991}, ++ {"neos-787933", 30}, ++ {"neos-827175", 112.00152}, ++ {"neos-848589", 2351.40309999697}, ++ {"neos-860300", 3200.9999999999982}, ++ {"neos-873061", 113.6562385063}, ++ {"neos-911970", 54.759999999999998}, ++ {"neos-933966", 318}, ++ {"neos-950242", 4}, ++ {"neos-957323", -237.75668150000001}, ++ {"neos-960392", -238}, ++ {"neos17", 0.1500025774}, ++ {"neos5", 15}, ++ {"neos8", -3719}, ++ {"net12", 214}, ++ {"netdiversion", 242}, ++ {"nexp-150-20-8-5", 231}, ++ {"ns1116954", 0}, ++ {"ns1208400", 2}, ++ {"ns1644855", -1524.3333333333301}, ++ {"ns1760995", -549.21438505000003}, ++ {"ns1830653", 20622}, ++ {"ns1952667", 0}, ++ {"nu25-pr12", 53904.999999999993}, ++ {"nursesched-medium-hint03", 115}, ++ {"nursesched-sprint02", 57.999999999999993}, ++ {"nw04", 16862}, ++ {"opm2-z10-s4", -33269}, ++ {"p200x1188c", 15078}, ++ {"peg-solitaire-a3", 1}, ++ {"pg", -8674.3426071199992}, ++ {"pg5_34", -14339.353450000001}, ++ {"physiciansched3-3", 2623271.3266670001}, ++ {"physiciansched6-2", 49324}, ++ {"piperout-08", 125054.9999999999}, ++ {"piperout-27", 8123.9999999999727}, ++ {"pk1", 11}, ++ {"proteindesign121hz512p9", 1473}, ++ {"proteindesign122trx11p8", 1747}, ++ {"qap10", 339.99999999838712}, ++ {"radiationm18-12-05", 17566}, ++ {"radiationm40-10-02", 155328}, ++ {"rail01", -70.569964299999995}, ++ {"rail02", -200.44990770000001}, ++ {"rail507", 174}, ++ {"ran14x18-disj-8", 3712}, ++ {"rd-rplusc-21", 165395.275295}, 
++ {"reblock115", -36800603.233199999}, ++ {"rmatr100-p10", 423}, ++ {"rmatr200-p5", 4521}, ++ {"roci-4-11", -6020203}, ++ {"rocii-5-11", -6.6755047315380001}, ++ {"rococob10-011000", 19449}, ++ {"rocococ10-001000", 11460}, ++ {"roi2alpha3n4", -63.208495030000002}, ++ {"roi5alpha10n8", -52.322274350999997}, ++ {"roll3000", 12889.999991999999}, ++ {"s100", -0.16972352705829999}, ++ {"s250r10", -0.17178048342319999}, ++ {"satellites2-40", -19}, ++ {"satellites2-60-fs", -19.000000000099998}, ++ {"savsched1", 3217.6999999999998}, ++ {"sct2", -230.9891623}, ++ {"seymour", 423}, ++ {"seymour1", 410.76370138999999}, ++ {"sing326", 7753674.8537600003}, ++ {"sing44", 8128831.1771999998}, ++ {"snp-02-004-104", 586803238.65672886}, ++ {"sorrell3", -16}, ++ {"sp150x300d", 69}, ++ {"sp97ar", 660705645.75899994}, ++ {"sp98ar", 529740623.19999999}, ++ {"splice1k1", -394}, ++ {"square41", 15}, ++ {"square47", 15.9999999997877}, ++ {"supportcase10", 7}, ++ {"supportcase12", -7559.5330538170001}, ++ {"supportcase18", 48}, ++ {"supportcase19", 12677205.999920519}, ++ {"supportcase22", 110}, // best-known marked "*" in MIPLIB2017 (not proven optimal) ++ {"supportcase26", 1745.1238129999999}, ++ {"supportcase33", -345}, ++ {"supportcase40", 24256.3122898}, ++ {"supportcase42", 7.7586307222700004}, ++ {"supportcase6", 51906.477370000001}, ++ {"supportcase7", -1132.2231770000001}, ++ {"swath1", 379.07129574999999}, ++ {"swath3", 397.76134365000001}, ++ {"tbfp-network", 24.163194440000002}, ++ {"thor50dday", 40417}, ++ {"timtab1", 764771.99999977998}, ++ {"tr12-30", 130595.9999999999}, ++ {"traininstance2", 71820}, ++ {"traininstance6", 28290}, ++ {"trento1", 5189487}, ++ {"triptim1", 22.868099999999899}, ++ {"uccase12", 11507.4050616}, ++ {"uccase9", 10993.131409}, ++ {"uct-subprob", 314}, ++ {"unitcal_7", 19635558.243999999}, ++ {"var-smallemery-m6j6", -149.37501}, ++ {"wachplan", -8}, ++ }; ++ return kOptima; ++} ++ ++// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7). 
++// Solver should return Infeasible status; we use this set to label ++// the printer line with status_extra=KnownInfeasible so a downstream ++// "did the run agree with MIPLIB?" check can be a single grep. ++inline const std::unordered_set& kBenchmarkInfeasible() ++{ ++ static const std::unordered_set kInfeas = { ++ "bnatt500", ++ "cryptanalysiskb128n5obj14", ++ "fhnw-binpack4-4", ++ "neos-2075418-temuka", ++ "neos-3402454-bohle", ++ "neos-3988577-wolgan", ++ "neos859080", ++ }; ++ return kInfeas; ++} ++ ++inline std::optional lookup_miplib_optimum(const std::string& filename) ++{ ++ const auto& m = kBenchmarkOptima(); ++ const auto it = m.find(normalize_instance_name(filename)); ++ if (it == m.end()) { return std::nullopt; } ++ return it->second; ++} ++ ++inline bool is_known_infeasible(const std::string& filename) ++{ ++ return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; ++} ++ ++// Single grep-friendly per-instance line. Emits to stdout via printf ++// so the output survives unconditionally regardless of the project's ++// settings_.log routing (NFS-backed log files, gated debug levels) ++// and is trivially cross-compared between cuts-config branches. ++// ++// "Gap closed" is reported relative to the *root LP after cuts*, not ++// relative to the final dual bound at the end of solve. The standard ++// MIP cutting-plane definition is: ++// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) ++// / (opt - root_lp_no_cuts) ++// On a minimization-form problem all three differences are >= 0 and ++// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the ++// formula also holds verbatim for maximization (numerator and ++// denominator flip sign together). NaN is emitted when either root ++// bound was not published (e.g. B&B never entered the cut loop). 
++// ++// Other field semantics (signed for minimization): ++// abs_root_dual_gap = opt - root_lp_with_cuts ++// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) ++// abs_primal_gap = primal - opt ++// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) ++// ++// The line still also reports `final_dual` (solver's bound at the end ++// of solve) so the new metric and the previous one can be compared ++// without re-running. ++// ++// "TBD" is emitted when the optimum is unknown so downstream parsers ++// can join lines on (instance, field) without dropping rows. "NaN" is ++// emitted for root_lp_* when the value is unavailable. ++template ++inline void print_miplib_gap_stat( ++ const std::string& filename, ++ const Solution& solution, ++ double solve_time_seconds, ++ const std::string& termination_status, ++ double root_lp_no_cuts, ++ double root_lp_with_cuts, ++ double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) ++{ ++ const std::string norm = normalize_instance_name(filename); ++ const auto opt = lookup_miplib_optimum(filename); ++ const double primal = solution.get_objective_value(); ++ const double final_dual = solution.get_solution_bound(); ++ const double mip_gap = solution.get_mip_gap(); ++ const bool primal_finite = std::isfinite(primal); ++ const bool root0_finite = std::isfinite(root_lp_no_cuts); ++ const bool root1_finite = std::isfinite(root_lp_with_cuts); ++ constexpr double NaN = std::numeric_limits::quiet_NaN(); ++ ++ if (is_known_infeasible(filename)) { ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " ++ "abs_primal_gap=NA rel_primal_gap_pct=NA " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ 
termination_status.c_str()); ++ } else if (opt.has_value()) { ++ const double o = *opt; ++ const double denom = std::max(std::abs(o), 1.0); ++ ++ const double abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; ++ const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; ++ ++ // Classical gap-closed-by-cuts. Left as NaN when either root bound ++ // is missing; reported as 100 when the LP relaxation already proves ++ // optimality (denominator = opt - root_lp_no_cuts ~= 0). A bound that ++ // moved the wrong way is not filtered and yields a negative value. ++ double gap_closed_pct = NaN; ++ if (root0_finite && root1_finite) { ++ const double total_gap = o - root_lp_no_cuts; ++ if (std::abs(total_gap) > 1e-12 * denom) { ++ gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; ++ } else { ++ // LP relaxation already (numerically) optimal -> 100% closed ++ // by definition. Avoid /0 noise. ++ gap_closed_pct = 100.0; ++ } ++ } ++ ++ const double abs_pgap = primal_finite ? (primal - o) : NaN; ++ const double rel_pgap_pct = primal_finite ? 
100.0 * abs_pgap / denom : NaN; ++ ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " ++ "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ o, ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ abs_root_dgap, ++ rel_root_dgap_pct, ++ gap_closed_pct, ++ abs_pgap, ++ rel_pgap_pct, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ termination_status.c_str()); ++ } else { ++ std::printf( ++ "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " ++ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " ++ "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " ++ "abs_primal_gap=TBD rel_primal_gap_pct=TBD " ++ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", ++ norm.c_str(), ++ primal, ++ final_dual, ++ root_lp_no_cuts, ++ root_lp_with_cuts, ++ mip_gap, ++ solve_time_seconds, ++ cut_gen_time_sec, ++ termination_status.c_str()); ++ } ++ std::fflush(stdout); ++} ++ ++} // namespace cuopt_bench +diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp +index f3554369..bed4e453 100644 +--- a/benchmarks/linear_programming/cuopt/run_mip.cpp ++++ b/benchmarks/linear_programming/cuopt/run_mip.cpp +@@ -6,6 +6,7 @@ + /* clang-format on */ + #include "initial_solution_reader.hpp" + #include "mip_test_instances.hpp" ++#include "miplib2017_optima.hpp" + + #include + #include +@@ -23,15 +24,20 @@ + #include + #include + ++#include + #include + #include + #include + #include + #include ++#include + #include ++#include + #include ++#include + #include + #include ++#include + #include + #include + #include +@@ -239,6 +245,43 @@ int run_single_file(std::string file_path, + } else { + CUOPT_LOG_INFO("%s: no solution 
found", base_filename.c_str()); + } ++ ++ // Per-instance "gap closed to optimum" stat. Emits a single ++ // grep-friendly "MIPLIBGapStat ..." line via printf so cross-branch ++ // comparison is just `grep '^MIPLIBGapStat' branchA.log` then diff. ++ // Optima are looked up from the in-source MIPLIB2017 benchmark-set ++ // table (miplib2017_optima.hpp); unknown instances emit "opt=TBD" ++ // and infeasibility-flagged instances emit "opt=Infeasible". ++ { ++ const double _gap_seconds = std::chrono::duration_cast( ++ std::chrono::high_resolution_clock::now() - start_run_solver) ++ .count() / ++ 1000.0; ++ std::string _status_str; ++ switch (solution.get_termination_status()) { ++ case cuopt::linear_programming::mip_termination_status_t::Optimal: ++ _status_str = "Optimal"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::FeasibleFound: ++ _status_str = "FeasibleFound"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::TimeLimit: ++ _status_str = "TimeLimit"; ++ break; ++ case cuopt::linear_programming::mip_termination_status_t::Infeasible: ++ _status_str = "Infeasible"; ++ break; ++ default: _status_str = "Other"; break; ++ } ++ cuopt_bench::print_miplib_gap_stat(base_filename, ++ solution, ++ _gap_seconds, ++ _status_str, ++ benchmark_info.root_lp_no_cuts, ++ benchmark_info.root_lp_with_cuts, ++ benchmark_info.cut_generation_time_sec); ++ } ++ + std::stringstream ss; + int decimal_places = 2; + double mip_gap = solution.get_mip_gap(); +@@ -293,6 +336,157 @@ void run_single_file_mp(std::string file_path, + exit(sol_found); + } + ++// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the ++// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it ++// requires no extra dependencies (NVML / hwloc). 
++static std::vector get_gpu_numa_nodes(int n_gpus) ++{ ++ std::vector nodes(static_cast(std::max(0, n_gpus)), -1); ++ for (int i = 0; i < n_gpus; ++i) { ++ char pci_id[32] = {0}; ++ if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } ++ for (char* c = pci_id; *c; ++c) { ++ *c = static_cast(std::tolower(static_cast(*c))); ++ } ++ std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); ++ if (!f) { continue; } ++ int node = -1; ++ f >> node; ++ nodes[i] = node; ++ } ++ return nodes; ++} ++ ++// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. ++// Returns empty on any read or parse failure. ++static std::vector read_numa_cpulist(int numa_node) ++{ ++ std::vector cpus; ++ if (numa_node < 0) { return cpus; } ++ std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + ++ "/cpulist"); ++ if (!f) { return cpus; } ++ std::string line; ++ if (!std::getline(f, line)) { return cpus; } ++ size_t pos = 0; ++ while (pos < line.size()) { ++ const size_t comma = line.find(',', pos); ++ const size_t end = (comma == std::string::npos) ? line.size() : comma; ++ const std::string range = line.substr(pos, end - pos); ++ if (!range.empty()) { ++ try { ++ const size_t dash = range.find('-'); ++ const int lo = std::stoi(range.substr(0, dash)); ++ const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); ++ for (int c = lo; c <= hi; ++c) { ++ cpus.push_back(c); ++ } ++ } catch (...) { ++ return std::vector{}; ++ } ++ } ++ if (comma == std::string::npos) { break; } ++ pos = comma + 1; ++ } ++ std::sort(cpus.begin(), cpus.end()); ++ return cpus; ++} ++ ++// Bind the current process to a fair partition of the inherited CPU mask, ++// preferring CPUs on the same NUMA node as the GPU. Returns the actual ++// number of CPUs the child was pinned to, or -1 if the partition could not ++// be applied (caller must then choose a fallback). 
++// ++// Algorithm: ++// 1. Read inherited (parent) affinity mask -> visible_cpus. ++// 2. Look up each GPU's NUMA node via PCI BDF. ++// 3. If this GPU's NUMA node is known and has visible CPUs, partition ++// that NUMA node's CPUs among the GPUs that landed on the same node ++// (siblings, ordered by gpu_id). ++// 4. Otherwise fall back to a contiguous global partition of visible_cpus. ++// ++// The function always emits a single stdout line per child summarising the ++// partition (NUMA-local vs contiguous-fallback), so the parent's log isn't ++// interleaved per-CPU across n_gpus children. ++int bind_process_to_cpu_partition(int gpu_id, int n_gpus) ++{ ++ if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } ++ ++ cpu_set_t parent_mask; ++ CPU_ZERO(&parent_mask); ++ if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { ++ perror("sched_getaffinity"); ++ return -1; ++ } ++ ++ std::vector visible_cpus; ++ for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { ++ if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } ++ } ++ if (visible_cpus.empty()) { return -1; } ++ std::sort(visible_cpus.begin(), visible_cpus.end()); ++ ++ std::vector chosen_cpus; ++ bool numa_aware = false; ++ ++ const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); ++ const int my_numa = gpu_numa_nodes[gpu_id]; ++ if (my_numa >= 0) { ++ std::vector siblings; ++ for (int i = 0; i < n_gpus; ++i) { ++ if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } ++ } ++ std::vector numa_cpus = read_numa_cpulist(my_numa); ++ if (!numa_cpus.empty() && !siblings.empty()) { ++ std::vector local_visible; ++ std::set_intersection(visible_cpus.begin(), ++ visible_cpus.end(), ++ numa_cpus.begin(), ++ numa_cpus.end(), ++ std::back_inserter(local_visible)); ++ if (!local_visible.empty()) { ++ const int siblings_count = static_cast(siblings.size()); ++ const int my_idx = ++ static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); ++ const 
int local_per_gpu = ++ std::max(1, static_cast(local_visible.size()) / siblings_count); ++ const int s = my_idx * local_per_gpu; ++ const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); ++ if (s < e) { ++ chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); ++ numa_aware = true; ++ } ++ } ++ } ++ } ++ ++ if (!numa_aware) { ++ const int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); ++ const int start = gpu_id * cpus_per_gpu; ++ if (start >= static_cast(visible_cpus.size())) { return -1; } ++ const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); ++ chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); ++ } ++ ++ cpu_set_t child_mask; ++ CPU_ZERO(&child_mask); ++ std::ostringstream oss; ++ oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" ++ << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") ++ << "):"; ++ for (int c : chosen_cpus) { ++ CPU_SET(c, &child_mask); ++ oss << ' ' << c; ++ } ++ std::cout << oss.str() << std::endl; ++ ++ if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { ++ perror("sched_setaffinity"); ++ return -1; ++ } ++ return static_cast(chosen_cpus.size()); ++} ++ + void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, + std::unordered_map& pid_file_map, + std::queue& gpu_queue) +@@ -417,6 +611,11 @@ int main(int argc, char* argv[]) + int reliability_branching = program.get("--reliability-branching"); + bool deterministic = program.get("--determinism"); + ++ if (run_dir && program.is_used("--num-cpu-threads")) { ++ std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " ++ "thread count is set per process from the bound CPU partition.\n"; ++ } ++ + if (num_cpu_threads < 0) { + num_cpu_threads = omp_get_max_threads() / n_gpus; + // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); +@@ -502,6 +701,18 @@ int main(int 
argc, char* argv[]) + } + if (sys_pid == 0) { + RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); ++ int assigned_cpus = bind_process_to_cpu_partition(gpu_id, n_gpus); ++ if (assigned_cpus <= 0) { ++ assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); ++ std::cerr << "[gpu " << gpu_id << "] CPU pin failed; falling back to " ++ << assigned_cpus << " threads\n"; ++ } ++ // Directory-run mode owns the thread count: --num-cpu-threads is ++ // intentionally ignored here so per-process thread budgets match ++ // the bound CPU partition. The single-run path below still ++ // honours --num-cpu-threads. ++ omp_set_num_threads(assigned_cpus); ++ num_cpu_threads = assigned_cpus; + run_single_file_mp(file_name, + gpu_id, + batch_num, +@@ -534,31 +745,36 @@ int main(int argc, char* argv[]) + merge_result_files(out_dir, result_file, n_gpus, batch_num); + } else { + auto memory_resource = make_async(); ++ auto run_single = [&]() { ++ run_single_file(path, ++ 0, ++ 0, ++ n_gpus, ++ out_dir, ++ initial_solution_file, ++ heuristics_only, ++ num_cpu_threads, ++ write_log_file, ++ log_to_console, ++ reliability_branching, ++ time_limit, ++ work_limit, ++ deterministic); ++ }; + if (memory_limit > 0) { + auto limiting_adaptor = + rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); + rmm::mr::set_current_device_resource(limiting_adaptor); ++ run_single(); + } else if (track_allocations) { + rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, + /*capture_stacks=*/true); + rmm::mr::set_current_device_resource(tracking_adaptor); ++ run_single(); + } else { + rmm::mr::set_current_device_resource(memory_resource); ++ run_single(); + } +- run_single_file(path, +- 0, +- 0, +- n_gpus, +- out_dir, +- initial_solution_file, +- heuristics_only, +- num_cpu_threads, +- write_log_file, +- log_to_console, +- reliability_branching, +- time_limit, +- work_limit, +- deterministic); + } + + return 0; +diff --git 
a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +index 685b1360..b2231b1a 100644 +--- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp ++++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +@@ -26,6 +26,27 @@ struct benchmark_info_t { + double last_improvement_of_best_feasible = 0; + double last_improvement_after_recombination = 0; + double objective_of_initial_population = std::numeric_limits::max(); ++ // LP relaxation objective at the root node, BEFORE any cuts have been ++ // added. quiet_NaN() means "B&B did not run cut passes / value was ++ // never written" — distinguishes it from a legitimate 0.0. ++ double root_lp_no_cuts = std::numeric_limits::quiet_NaN(); ++ // LP relaxation objective at the root node, AFTER the full cut loop ++ // (final pass result). The dual gap "by cuts at the root" is then ++ // gap_after_cuts = opt - root_lp_with_cuts (in B&B's solver ++ // objective sense) ++ // and the classical "gap closed by cuts" metric is ++ // gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) ++ // / (opt - root_lp_no_cuts). ++ // quiet_NaN() means "B&B did not finish the cut loop / value not written". ++ double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); ++ ++ // Wall-clock time spent inside the root-node cut generation loop ++ // (sum of generate_cuts + score_cuts + check_for_duplicate_cuts + ++ // get_best_cuts + add_cuts + post-cut LP resolves), in seconds. ++ // Published by branch_and_bound.cpp::solve() at the same point that ++ // root_lp_with_cuts is finalised. quiet_NaN() means "cut loop did ++ // not run / value never written". 
++ double cut_generation_time_sec = std::numeric_limits::quiet_NaN(); + }; + + // Forward declare solver_settings_t for friend class +diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp +index 0222ad6f..5420b882 100644 +--- a/cpp/src/branch_and_bound/branch_and_bound.cpp ++++ b/cpp/src/branch_and_bound/branch_and_bound.cpp +@@ -11,6 +11,8 @@ + #include + #include + ++#include // benchmark_info_t ++ + #include + #include + #include +@@ -2361,6 +2363,15 @@ auto branch_and_bound_t::do_cut_pass( + } + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + ++ // Publish after every successful post-cut LP resolve so any ++ // early-exit path below (NUMERICAL, TIME_LIMIT, gap-tolerance ++ // exit) still leaves benchmark_info->root_lp_with_cuts pointing ++ // at the most recent valid LP-with-cuts objective. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } ++ + f_t remove_cuts_start_time = tic(); + mutex_original_lp_.lock(); + remove_cuts(original_lp_, +@@ -2479,7 +2490,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + user_problem_t problem_copy = original_problem_; + timer_t timer(std::numeric_limits::infinity()); + detail::find_initial_cliques( +- problem_copy, tolerances_for_clique, &clique_table_, timer, false, clique_signal); ++ problem_copy, tolerances_for_clique, &clique_table_, timer, clique_signal); + } + } + +@@ -2588,6 +2599,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + cut_info_t cut_info; + + if (num_fractional == 0) { ++ // LP relaxation already integer-feasible — solved at the root with ++ // no cuts. Publish both bounds equal to the root LP value so the ++ // gap-closed-by-cuts line still has a finite, meaningful entry ++ // (the printer reports 100% closed when total integrality gap ~= 0). 
++ if (settings_.benchmark_info_ptr != nullptr) { ++ const double v = static_cast(compute_user_objective(original_lp_, root_objective_)); ++ settings_.benchmark_info_ptr->root_lp_no_cuts = v; ++ settings_.benchmark_info_ptr->root_lp_with_cuts = v; ++ } + set_solution_at_root(solution, cut_info); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) +@@ -2624,6 +2644,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + f_t last_objective = root_objective_; + f_t root_relax_objective = root_objective_; + ++ // Publish the no-cuts root LP value once. The with-cuts companion is ++ // published below after the cut loop terminates. Both go to the ++ // benchmark_info_t so callers (run_mip.cpp) can compute ++ // gap-closed-by-cuts without instrumenting the cut loop directly. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_no_cuts = ++ static_cast(compute_user_objective(original_lp_, root_relax_objective)); ++ } ++ + constexpr bool enable_root_cut_cpufj = true; + std::unique_ptr> root_cut_cpufj_task; + auto root_cut_cpufj_improvement_callback = +@@ -2648,11 +2677,31 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + }; + cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); + +- f_t cut_generation_start_time = tic(); +- i_t cut_pool_size = 0; ++ f_t cut_generation_start_time = tic(); ++ auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { ++ if (settings_.benchmark_info_ptr == nullptr) { return; } ++ f_t cut_generation_time = toc(cut_generation_start_time); ++ if (force_time_limit_value || cut_generation_time > settings_.time_limit) { ++ cut_generation_time = settings_.time_limit; ++ } ++ if (cut_generation_time < static_cast(0.0)) { ++ cut_generation_time = static_cast(0.0); ++ } ++ settings_.benchmark_info_ptr->cut_generation_time_sec = ++ static_cast(cut_generation_time); ++ }; ++ i_t 
cut_pool_size = 0; + for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { + if (num_fractional == 0) { ++ // LP relaxation is already integer-feasible — solved at the root ++ // by the cuts added so far (possibly zero). Publish the with-cuts ++ // value so the gap-closed line still has a non-NaN dual bound. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } + set_solution_at_root(solution, cut_info); ++ publish_cut_generation_time(); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) + return mip_status_t::OPTIMAL; +@@ -2692,6 +2741,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + } + + if (cut_pass_result.action == cut_pass_action_t::RETURN) { ++ publish_cut_generation_time(cut_pass_result.status == mip_status_t::TIME_LIMIT); + signal_extend_cliques_.store(true, std::memory_order_release); + #pragma omp taskwait depend(in : *clique_signal) + return cut_pass_result.status; +@@ -2714,8 +2764,16 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut + } + } + ++ // Publish the post-cuts root LP value. ++ if (settings_.benchmark_info_ptr != nullptr) { ++ settings_.benchmark_info_ptr->root_lp_with_cuts = ++ static_cast(compute_user_objective(original_lp_, root_objective_)); ++ } ++ + print_cut_info(settings_, cut_info); + f_t cut_generation_time = toc(cut_generation_start_time); ++ // Publish cut-generation time for reporting. 
++ publish_cut_generation_time(); + if (cut_info.has_cuts()) { + settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); + settings_.log.printf("Cut pool size : %d\n", cut_pool_size); +diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu +index c25ade0c..712be213 100644 +--- a/cpp/src/mip_heuristics/solver.cu ++++ b/cpp/src/mip_heuristics/solver.cu +@@ -377,6 +377,9 @@ solution_t mip_solver_t::run_solver() + context.settings.strong_chvatal_gomory_cuts; + branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; + branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; ++ // Forward the run-level benchmark_info_t so B&B can publish root LP ++ // bounds (before / after cuts) for gap-closed-by-cuts measurement. ++ branch_and_bound_settings.benchmark_info_ptr = context.settings.benchmark_info_ptr; + branch_and_bound_settings.mip_batch_pdlp_strong_branching = + context.settings.mip_batch_pdlp_strong_branching; + branch_and_bound_settings.mip_batch_pdlp_reliability_branching = From b5bd4b22b3a99d0850ed55702e5b40ef91865fce Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 2 Jun 2026 13:15:59 +0200 Subject: [PATCH 23/47] Revert PDLP/PDHG cuda-graph changes to baseline --- cpp/src/pdlp/pdhg.cu | 13 ++----------- cpp/src/pdlp/pdhg.hpp | 5 +---- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index e6e5ed14bb..e88366a295 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -97,7 +97,6 @@ pdhg_solver_t::pdhg_solver_t( // Currently graph capture is not supported for cuSparse SpMM // TODO enable once cuSparse SpMM supports graph capture graph_all{stream_view_, is_legacy_batch_mode || batch_mode_}, - graph_all_non_major{stream_view_, is_legacy_batch_mode || batch_mode_}, graph_prim_proj_gradient_dual{stream_view_, is_legacy_batch_mode}, d_total_pdhg_iterations_{0, stream_view_}, 
climber_strategies_(climber_strategies), @@ -364,12 +363,6 @@ ping_pong_graph_t& pdhg_solver_t::get_graph_all() return graph_all; } -template -ping_pong_graph_t& pdhg_solver_t::get_graph_all_non_major() -{ - return graph_all_non_major; -} - template rmm::device_scalar& pdhg_solver_t::get_d_total_pdhg_iterations() { @@ -1114,8 +1107,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, const rmm::device_uvector& bound_rescaling, - bool should_major, - i_t total_pdlp_iterations) + bool should_major) { raft::common::nvtx::range fun_scope("compute_next_primal_dual_solution_reflected"); @@ -1368,8 +1360,7 @@ void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_si dual_step_size, bound_rescaling, is_major_iteration || - ((total_pdlp_iterations + 2) % conditional_major(total_pdlp_iterations + 2)) == 0, - total_pdlp_iterations); + ((total_pdlp_iterations + 2) % conditional_major(total_pdlp_iterations + 2)) == 0); } total_pdhg_iterations_ += 1; } diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index b0c28a886f..01eabe967a 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -60,7 +60,6 @@ class pdhg_solver_t { const thrust::universal_host_pinned_vector>& swap_pairs, i_t new_size); void resize_context(i_t new_size); ping_pong_graph_t& get_graph_all(); - ping_pong_graph_t& get_graph_all_non_major(); rmm::device_uvector& get_new_bounds_climber_id() { return new_bounds_climber_id_; } rmm::device_uvector& get_new_bounds_idx() { return new_bounds_idx_; } @@ -90,8 +89,7 @@ class pdhg_solver_t { rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, const rmm::device_uvector& bound_rescaling, // Only used in batch mode - bool should_major, - i_t total_pdlp_iterations); + bool should_major); void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& 
primal_step_size); @@ -139,7 +137,6 @@ class pdhg_solver_t { // graph_all serves the non-reflected path and the major reflected branch (mutually exclusive // at runtime); graph_all_non_major serves the non-major reflected branch. ping_pong_graph_t graph_all; - ping_pong_graph_t graph_all_non_major; ping_pong_graph_t graph_prim_proj_gradient_dual; // Needed for faster graph launch From c35573a52cd6b40888c03e0007cfc8b232863595 Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 2 Jun 2026 17:36:52 +0200 Subject: [PATCH 24/47] revery cpu affinity mapping --- .../linear_programming/cuopt/run_mip.cpp | 173 ------------------ 1 file changed, 173 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index bed4e453b0..5c6e50288b 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -24,20 +24,15 @@ #include #include -#include #include #include #include #include #include -#include #include -#include #include -#include #include #include -#include #include #include #include @@ -336,157 +331,6 @@ void run_single_file_mp(std::string file_path, exit(sol_found); } -// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the -// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it -// requires no extra dependencies (NVML / hwloc). 
-static std::vector get_gpu_numa_nodes(int n_gpus) -{ - std::vector nodes(static_cast(std::max(0, n_gpus)), -1); - for (int i = 0; i < n_gpus; ++i) { - char pci_id[32] = {0}; - if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } - for (char* c = pci_id; *c; ++c) { - *c = static_cast(std::tolower(static_cast(*c))); - } - std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); - if (!f) { continue; } - int node = -1; - f >> node; - nodes[i] = node; - } - return nodes; -} - -// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. -// Returns empty on any read or parse failure. -static std::vector read_numa_cpulist(int numa_node) -{ - std::vector cpus; - if (numa_node < 0) { return cpus; } - std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + - "/cpulist"); - if (!f) { return cpus; } - std::string line; - if (!std::getline(f, line)) { return cpus; } - size_t pos = 0; - while (pos < line.size()) { - const size_t comma = line.find(',', pos); - const size_t end = (comma == std::string::npos) ? line.size() : comma; - const std::string range = line.substr(pos, end - pos); - if (!range.empty()) { - try { - const size_t dash = range.find('-'); - const int lo = std::stoi(range.substr(0, dash)); - const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); - for (int c = lo; c <= hi; ++c) { - cpus.push_back(c); - } - } catch (...) { - return std::vector{}; - } - } - if (comma == std::string::npos) { break; } - pos = comma + 1; - } - std::sort(cpus.begin(), cpus.end()); - return cpus; -} - -// Bind the current process to a fair partition of the inherited CPU mask, -// preferring CPUs on the same NUMA node as the GPU. Returns the actual -// number of CPUs the child was pinned to, or -1 if the partition could not -// be applied (caller must then choose a fallback). -// -// Algorithm: -// 1. 
Read inherited (parent) affinity mask -> visible_cpus. -// 2. Look up each GPU's NUMA node via PCI BDF. -// 3. If this GPU's NUMA node is known and has visible CPUs, partition -// that NUMA node's CPUs among the GPUs that landed on the same node -// (siblings, ordered by gpu_id). -// 4. Otherwise fall back to a contiguous global partition of visible_cpus. -// -// The function always emits a single stdout line per child summarising the -// partition (NUMA-local vs contiguous-fallback), so the parent's log isn't -// interleaved per-CPU across n_gpus children. -int bind_process_to_cpu_partition(int gpu_id, int n_gpus) -{ - if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } - - cpu_set_t parent_mask; - CPU_ZERO(&parent_mask); - if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { - perror("sched_getaffinity"); - return -1; - } - - std::vector visible_cpus; - for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { - if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } - } - if (visible_cpus.empty()) { return -1; } - std::sort(visible_cpus.begin(), visible_cpus.end()); - - std::vector chosen_cpus; - bool numa_aware = false; - - const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); - const int my_numa = gpu_numa_nodes[gpu_id]; - if (my_numa >= 0) { - std::vector siblings; - for (int i = 0; i < n_gpus; ++i) { - if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } - } - std::vector numa_cpus = read_numa_cpulist(my_numa); - if (!numa_cpus.empty() && !siblings.empty()) { - std::vector local_visible; - std::set_intersection(visible_cpus.begin(), - visible_cpus.end(), - numa_cpus.begin(), - numa_cpus.end(), - std::back_inserter(local_visible)); - if (!local_visible.empty()) { - const int siblings_count = static_cast(siblings.size()); - const int my_idx = - static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); - const int local_per_gpu = - std::max(1, static_cast(local_visible.size()) / 
siblings_count); - const int s = my_idx * local_per_gpu; - const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); - if (s < e) { - chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); - numa_aware = true; - } - } - } - } - - if (!numa_aware) { - const int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); - const int start = gpu_id * cpus_per_gpu; - if (start >= static_cast(visible_cpus.size())) { return -1; } - const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); - chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); - } - - cpu_set_t child_mask; - CPU_ZERO(&child_mask); - std::ostringstream oss; - oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" - << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") - << "):"; - for (int c : chosen_cpus) { - CPU_SET(c, &child_mask); - oss << ' ' << c; - } - std::cout << oss.str() << std::endl; - - if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { - perror("sched_setaffinity"); - return -1; - } - return static_cast(chosen_cpus.size()); -} - void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, std::unordered_map& pid_file_map, std::queue& gpu_queue) @@ -611,11 +455,6 @@ int main(int argc, char* argv[]) int reliability_branching = program.get("--reliability-branching"); bool deterministic = program.get("--determinism"); - if (run_dir && program.is_used("--num-cpu-threads")) { - std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " - "thread count is set per process from the bound CPU partition.\n"; - } - if (num_cpu_threads < 0) { num_cpu_threads = omp_get_max_threads() / n_gpus; // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); @@ -701,18 +540,6 @@ int main(int argc, char* argv[]) } if (sys_pid == 0) { RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); - int assigned_cpus = 
bind_process_to_cpu_partition(gpu_id, n_gpus); - if (assigned_cpus <= 0) { - assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); - std::cerr << "[gpu " << gpu_id << "] CPU pin failed; falling back to " - << assigned_cpus << " threads\n"; - } - // Directory-run mode owns the thread count: --num-cpu-threads is - // intentionally ignored here so per-process thread budgets match - // the bound CPU partition. The single-run path below still - // honours --num-cpu-threads. - omp_set_num_threads(assigned_cpus); - num_cpu_threads = assigned_cpus; run_single_file_mp(file_name, gpu_id, batch_num, From 85a4024fb03ac9cca598e2e05b44c4d3e99bc5f3 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 3 Jun 2026 08:26:28 +0200 Subject: [PATCH 25/47] try without cousin filter --- cpp/src/cuts/cuts.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index e1ffab9fa2..3c2a4919df 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -382,7 +382,7 @@ class cut_pool_t { std::unordered_map> clique_cousin_buckets_; f_t clique_cousin_jaccard_tau_{static_cast(0.875)}; i_t clique_cousin_minhash_k_{8}; - bool clique_cousin_filter_enable_{true}; + bool clique_cousin_filter_enable_{false}; // When > 0, the cousin filter's "score" used to pick a winner is // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). 
// This biases cousin replacement toward larger cliques (more variables From 92d9996e61798a0b297cf7f41a55e627ab4a6989 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 3 Jun 2026 10:06:38 +0200 Subject: [PATCH 26/47] remove cut configs and cousin logic --- cpp/src/cuts/cuts.cpp | 287 +----------------------------------------- cpp/src/cuts/cuts.hpp | 105 +--------------- 2 files changed, 3 insertions(+), 389 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index fc0329817e..c1d6aa3721 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -531,66 +531,8 @@ std::vector> find_maximal_cliques_for_test( return ctx.cliques; } -namespace { - -// 64-bit integer mixer (SplitMix64). Used as the building block for the -// cousin filter's per-slot independent hash family. -inline uint64_t splitmix64_mix(uint64_t x) -{ - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; - x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; - x = x ^ (x >> 31); - return x; -} - -inline uint64_t hash64_with_seed(uint64_t value, uint64_t seed) -{ - return splitmix64_mix(value ^ (seed * 0xbf58476d1ce4e5b9ULL + 0x9e3779b97f4a7c15ULL)); -} - -} // namespace - -template -void cut_pool_t::compute_clique_minhash_sketch(const inequality_t& cut, - std::vector& sketch) const -{ - // Min-hash over the cut's column-support set. With clique_cousin_minhash_k_ - // independent random orderings of the variable index space, the expected - // number of agreements between two sketches is k * Jaccard(supp_a, supp_b), - // so sketch comparison estimates Jaccard in O(k) regardless of support - // sizes. 
- const i_t k = clique_cousin_minhash_k_; - sketch.assign(k, std::numeric_limits::max()); - const i_t nz = cut.size(); - for (i_t p = 0; p < nz; p++) { - const uint64_t j = static_cast(cut.index(p)); - for (i_t s = 0; s < k; s++) { - const uint64_t h = hash64_with_seed(j, static_cast(s)); - if (h < sketch[s]) { sketch[s] = h; } - } - } -} - template -void cut_pool_t::rebuild_clique_cousin_buckets() -{ - // Buckets index CLIQUE rows by the first sketch hash. Compaction - // routines (check_for_duplicate_cuts) shift row indices, so they call - // this after the parallel sketch vector has been remapped to make - // sure bucket entries point to the post-compaction rows. - clique_cousin_buckets_.clear(); - const i_t m = static_cast(clique_support_minhash_.size()); - for (i_t i = 0; i < m; i++) { - if (clique_support_minhash_[i].empty()) { continue; } - const uint64_t key = clique_support_minhash_[i][0]; - clique_cousin_buckets_[key].push_back(i); - } -} - -template -void cut_pool_t::add_cut(cut_type_t cut_type, - const inequality_t& cut, - f_t cut_score) +void cut_pool_t::add_cut(cut_type_t cut_type, const inequality_t& cut) { // TODO: Add fast duplicate check and only add if the cut is not already in the pool @@ -610,106 +552,10 @@ void cut_pool_t::add_cut(cut_type_t cut_type, return; } - // At insert time, use min-hash to detect similar clique cuts and keep - // one representative when estimated Jaccard passes the threshold. - std::vector new_sketch; - i_t cousin_replace_row = -1; - bool cousin_invariant_path = false; - // Optional size tilt so larger cliques can win close score comparisons. 
- f_t effective_score = cut_score; - if (effective_score >= static_cast(0.0) && - clique_cousin_size_weight_ > static_cast(0.0) && cut_type == cut_type_t::CLIQUE) { - const f_t sz = static_cast(cut_squeezed.size()); - const f_t mult = - static_cast(1.0) + - clique_cousin_size_weight_ * static_cast(std::log2(1.0 + static_cast(sz))); - effective_score *= mult; - } - if (cut_type == cut_type_t::CLIQUE) { clique_inserts_++; } - if (cut_type == cut_type_t::CLIQUE && clique_cousin_filter_enable_ && - clique_cousin_minhash_k_ > 0) { - cousin_invariant_path = true; - compute_clique_minhash_sketch(cut_squeezed, new_sketch); - const uint64_t bucket_key = new_sketch[0]; - auto bucket_it = clique_cousin_buckets_.find(bucket_key); - if (bucket_it != clique_cousin_buckets_.end()) { - const i_t pool_size = cut_storage_.m; - const i_t k = clique_cousin_minhash_k_; - // Compare the new sketch with peers in the same bucket. - auto& bucket_rows = bucket_it->second; - for (size_t b = 0; b < bucket_rows.size(); b++) { - const i_t row = bucket_rows[b]; - if (row < 0 || row >= pool_size) { continue; } - if (static_cast(clique_support_minhash_[row].size()) != k) { continue; } - i_t agree = 0; - for (i_t s = 0; s < k; s++) { - if (clique_support_minhash_[row][s] == new_sketch[s]) { agree++; } - } - const f_t jaccard_est = static_cast(agree) / static_cast(k); - if (jaccard_est < clique_cousin_jaccard_tau_) { continue; } - // Cousin found. Compare scores; keep the better representative. - const f_t existing_score = clique_cousin_score_[row]; - if (effective_score < static_cast(0.0)) { - // Caller did not supply a score — be conservative and drop the - // new cut; the existing entry stays as the bucket invariant - // winner ("first-write-wins" policy). - cousin_drops_++; - return; - } - if (effective_score <= existing_score) { - // Existing representative is at least as good; drop the new cut. 
- cousin_drops_++; - return; - } - // Soft-replace: redirect bucket entry to the new row and clear - // the old sketch so future inserts ignore the old representative. - cousin_replace_row = row; - // Replace at most one peer per insert. - break; - } - } - } - cut_storage_.append_row(cut_squeezed.vector); rhs_storage_.push_back(cut_squeezed.rhs); cut_type_.push_back(cut_type); cut_age_.push_back(0); - - // Keep the cousin-filter side tables sized like cut_storage_ regardless - // of cut type. Non-CLIQUE rows carry an empty sketch and a zero score; - // they are skipped by rebuild_clique_cousin_buckets(). - const i_t new_row = cut_storage_.m - 1; - clique_support_minhash_.resize(cut_storage_.m); - clique_cousin_score_.resize(cut_storage_.m, static_cast(0.0)); - if (cousin_invariant_path) { - clique_support_minhash_[new_row] = std::move(new_sketch); - clique_cousin_score_[new_row] = effective_score; - if (cousin_replace_row >= 0) { - // Reroute the bucket entry from the loser's row to the new row. - // Other peers in the same bucket (if any) keep their entries. - const uint64_t bucket_key = clique_support_minhash_[new_row][0]; - auto& rows = clique_cousin_buckets_[bucket_key]; - bool replaced = false; - for (auto& r : rows) { - if (r == cousin_replace_row) { - r = new_row; - replaced = true; - break; - } - } - if (!replaced) { rows.push_back(new_row); } - // Clear the loser's sketch so it's a no-op on subsequent inserts - // and rebuild_clique_cousin_buckets() ignores it. Loser's row - // lingers in cut_storage_ until score_cuts compacts via dedup or - // filters via orthogonality. 
- clique_support_minhash_[cousin_replace_row].clear(); - clique_cousin_score_[cousin_replace_row] = static_cast(0.0); - cousin_replaces_++; - } else { - const uint64_t bucket_key = clique_support_minhash_[new_row][0]; - clique_cousin_buckets_[bucket_key].push_back(new_row); - } - } } template @@ -884,35 +730,17 @@ void cut_pool_t::check_for_duplicate_cuts() cut_storage_.remove_rows(cuts_to_remove, new_cut_storage); cut_storage_ = new_cut_storage; i_t write = 0; - // Cousin-filter parallel arrays: only meaningful when populated, but - // size them up before the loop so the row remap is uniform. - const bool cousin_active = !clique_support_minhash_.empty(); - if (cousin_active) { - clique_support_minhash_.resize(m); - clique_cousin_score_.resize(m, static_cast(0.0)); - } for (i_t i = 0; i < m; i++) { if (cuts_to_remove[i] == 0) { rhs_storage_[write] = rhs_storage_[i]; cut_type_[write] = cut_type_[i]; cut_age_[write] = cut_age_[i]; - if (cousin_active) { - clique_support_minhash_[write] = std::move(clique_support_minhash_[i]); - clique_cousin_score_[write] = clique_cousin_score_[i]; - } write++; } } rhs_storage_.resize(write); cut_type_.resize(write); cut_age_.resize(write); - if (cousin_active) { - clique_support_minhash_.resize(write); - clique_cousin_score_.resize(write); - // Row indices changed — bucket entries point to old row IDs and - // would corrupt the next at-insert filter. Rebuild from scratch. - rebuild_clique_cousin_buckets(); - } } } @@ -3110,11 +2938,7 @@ bool cut_generation_t::generate_clique_cuts( inequality_t cut_inequality; cut_inequality.vector = cut; cut_inequality.rhs = cut_rhs; - // Pass the LP violation as the cousin-filter score so add_cut can - // pick the stronger representative on a Jaccard collision (>= tau). - // build_clique_cut has already verified violation > min_violation. 
- const f_t cut_violation = cut_rhs - cut.dot(xstar); - cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality, cut_violation); + cut_pool_.add_cut(cut_type_t::CLIQUE, cut_inequality); #if DEBUG_CLIQUE_CUTS added_cuts++; CLIQUE_CUTS_DEBUG("generate_clique_cuts added cut nz=%lld rhs=%g clique_size=%lld", @@ -5732,110 +5556,6 @@ void verify_cuts_against_saved_solution(const csr_matrix_t& cuts, } } -// Cut-pool sweep configuration dispatch (declared in cuts.hpp). -// -// Driven by the same CUOPT_CONFIG_ID / CUOPT_MAX_CONFIG env vars the -// diversity manager uses (see diversity_manager.cu). One integer -// selects one of kCutSweepNumConfigs hard-coded cut-pool configurations. -// Caller side is just: -// CUOPT_MAX_CONFIG=5 CUOPT_CONFIG_ID=$id $RUN_MIP ... -// -// CUOPT_MAX_CONFIG is the caller's expected upper bound; when set -// we additionally range-check CUOPT_CONFIG_ID against it. -// CUOPT_CONFIG_ID unset / unparsable -> baseline (config 0). -// CUOPT_CONFIG_ID < 0 or >= valid range -> baseline + warning. -// -// Banner printf is gated to a single emission per process so B&B -// restarts (which re-construct cut_pool_t) don't spam the log. 
-template -void apply_cut_sweep_config(cut_pool_t& cut_pool, - const simplex_solver_settings_t& settings) -{ - static std::atomic banner_emitted{false}; - - const char* env_config_id_raw = std::getenv("CUOPT_CONFIG_ID"); - int config_id = -1; - if (env_config_id_raw != nullptr && env_config_id_raw[0] != '\0') { - try { - config_id = std::stoi(env_config_id_raw); - } catch (const std::exception&) { - config_id = -1; - } - } - - int max_config = kCutSweepNumConfigs; - const char* env_max_config = std::getenv("CUOPT_MAX_CONFIG"); - if (env_max_config != nullptr && env_max_config[0] != '\0') { - try { - max_config = std::stoi(env_max_config); - } catch (const std::exception&) { - max_config = kCutSweepNumConfigs; - } - } - - if (config_id < 0 || config_id >= max_config || config_id >= kCutSweepNumConfigs) { - if (!banner_emitted.load()) { - settings.log.printf( - "CutPoolConfig WARN config_id=%d out of range [0,%d), falling back to baseline\n", - config_id, - std::min(max_config, kCutSweepNumConfigs)); - } - config_id = 0; - } - - // Defaults come from cut_pool_t initializers. Each case overrides - // only the needed cousin-filter parameters. - switch (config_id) { - case 0: - // 00_baseline_no_cousin: cousin filter off. - cut_pool.set_clique_cousin_filter_enable(false); - break; - case 1: - // 01_cousin_default: tau=0.85, k=8, no size tilt. - cut_pool.set_clique_cousin_filter_enable(true); - cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); - cut_pool.set_clique_cousin_minhash_k(8); - cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); - break; - case 2: - // 02_cousin_strict: tighter Jaccard threshold (0.85 -> 0.70). Calls - // more cliques "cousins" so we drop / replace more aggressively. 
- cut_pool.set_clique_cousin_filter_enable(true); - cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.70)); - cut_pool.set_clique_cousin_minhash_k(8); - cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); - break; - case 3: - // 03_cousin_loose: tau=0.875 (7/8 with k=8). - cut_pool.set_clique_cousin_filter_enable(true); - cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.875)); - cut_pool.set_clique_cousin_minhash_k(8); - cut_pool.set_clique_cousin_size_weight(static_cast(0.0)); - break; - case 4: - // 04_cousin_size_tilt: tau=0.85 with size-tilted replacement score. - cut_pool.set_clique_cousin_filter_enable(true); - cut_pool.set_clique_cousin_jaccard_tau(static_cast(0.85)); - cut_pool.set_clique_cousin_minhash_k(8); - cut_pool.set_clique_cousin_size_weight(static_cast(0.5)); - break; - default: break; // unreachable due to range check above - } - - if (!banner_emitted.exchange(true)) { - settings.log.printf( - "CutPoolConfig id=%d name=%s clique_cousin_enable=%d clique_cousin_tau=%g " - "clique_cousin_k=%d clique_cousin_size_weight=%g\n", - config_id, - cut_sweep_config_name(config_id), - static_cast(cut_pool.clique_cousin_filter_enable() ? 
1 : 0), - static_cast(cut_pool.clique_cousin_jaccard_tau()), - static_cast(cut_pool.clique_cousin_minhash_k()), - static_cast(cut_pool.clique_cousin_size_weight())); - std::fflush(stdout); - } -} - #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class cut_pool_t; template class cut_generation_t; @@ -5845,9 +5565,6 @@ template class tableau_equality_t; template class complemented_mixed_integer_rounding_cut_t; template class variable_bounds_t; -template void apply_cut_sweep_config( - cut_pool_t& cut_pool, const simplex_solver_settings_t& settings); - template int add_cuts(const simplex_solver_settings_t& settings, const csr_matrix_t& cuts, const std::vector& cut_rhs, diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 3c2a4919df..1a7af97611 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -289,14 +289,7 @@ class cut_pool_t { // Add a cut in the form: cut'*x >= rhs. // We expect that the cut is violated by the current relaxation xstar. - // - // Optional score used by the clique cousin filter. - // Only used for cut_type == CLIQUE when the filter is enabled. - // Non-negative values enable score-based replacement; -1.0 means - // keep the first inserted representative. - void add_cut(cut_type_t cut_type, - const inequality_t& cut, - f_t cut_score = static_cast(-1.0)); + void add_cut(cut_type_t cut_type, const inequality_t& cut); void score_cuts(std::vector& x_relax); @@ -315,49 +308,11 @@ class cut_pool_t { void check_for_duplicate_cuts(); - // Clique cousin filter settings. - // At insert time, we compare min-hash sketches and keep one - // representative when estimated Jaccard >= jaccard_tau. - // Defaults: jaccard_tau=0.875, k=8, enable=true, size_weight=0.0. 
- void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } - void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } - void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } - void set_clique_cousin_size_weight(f_t v) { clique_cousin_size_weight_ = v; } - - bool clique_cousin_filter_enable() const { return clique_cousin_filter_enable_; } - f_t clique_cousin_jaccard_tau() const { return clique_cousin_jaccard_tau_; } - i_t clique_cousin_minhash_k() const { return clique_cousin_minhash_k_; } - f_t clique_cousin_size_weight() const { return clique_cousin_size_weight_; } - - // Per-pool tally for log lines (instance-level diagnostic). All three - // counters are reset by reset_cousin_stats() and incremented inside - // add_cut() / cousin replacement. - i_t cousin_drops() const { return cousin_drops_; } - i_t cousin_replaces() const { return cousin_replaces_; } - i_t clique_inserts() const { return clique_inserts_; } - void reset_cousin_stats() - { - cousin_drops_ = 0; - cousin_replaces_ = 0; - clique_inserts_ = 0; - } - private: f_t cut_distance(i_t row, const std::vector& x, f_t& cut_violation, f_t& cut_norm); f_t cut_density(i_t row); f_t cut_orthogonality(i_t i, i_t j); - // Cousin filter helpers. compute_clique_minhash_sketch() fills - // `sketch` (length = clique_cousin_minhash_k_) with k independent - // min-hashes over the cut's column-support set. Two sketches agree - // on slot s with probability Jaccard(supp_a, supp_b), so element-wise - // agreement count divided by k estimates the Jaccard similarity. - void compute_clique_minhash_sketch(const inequality_t& cut, - std::vector& sketch) const; - // Rebuilds clique_cousin_buckets_ from clique_support_minhash_ after - // any compaction that remaps row indices (e.g. dedup). 
- void rebuild_clique_cousin_buckets(); - i_t original_vars_; const simplex_solver_settings_t& settings_; @@ -373,66 +328,8 @@ class cut_pool_t { std::vector cut_scores_; std::vector best_cuts_; const f_t min_cut_distance_{1e-4}; - - // Cousin filter state. - // Vectors are kept aligned with cut_storage_. Non-CLIQUE rows keep - // empty sketches. Buckets map sketch[0] to candidate rows. - std::vector> clique_support_minhash_; - std::vector clique_cousin_score_; - std::unordered_map> clique_cousin_buckets_; - f_t clique_cousin_jaccard_tau_{static_cast(0.875)}; - i_t clique_cousin_minhash_k_{8}; - bool clique_cousin_filter_enable_{false}; - // When > 0, the cousin filter's "score" used to pick a winner is - // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). - // This biases cousin replacement toward larger cliques (more variables - // covered, larger integer support). 0 disables the tilt. - f_t clique_cousin_size_weight_{static_cast(0.0)}; - - // Diagnostic counters reset at the start of each cut pass via - // reset_cousin_stats(). - i_t cousin_drops_{0}; - i_t cousin_replaces_{0}; - i_t clique_inserts_{0}; }; -// --------------------------------------------------------------------------- -// Cut-pool sweep configuration dispatch. -// -// Selected by CUOPT_CONFIG_ID and range-checked against -// CUOPT_MAX_CONFIG. Configs control the clique cousin filter knobs. -// -// Keep kCutSweepNumConfigs in sync with the switch table in -// apply_cut_sweep_config() (see cuts.cpp) and with cut_sweep_config_name() -// below. 
-// -// Layout: -// 0 baseline_no_cousin cousin filter off -// 1 cousin_default cousin filter on, tau=0.85, k=8, score=violation -// 2 cousin_strict cousin filter on, tau=0.70 (more aggressive -// cousin removal — favors quantity reduction) -// 3 cousin_loose cousin filter on, tau=0.875 (allows 7/8 min-hash -// agreement with k=8) -// 4 cousin_size_tilt cousin filter on, tau=0.85, score = violation * -// (1 + 0.5 * log2(1 + clique_size)) -constexpr int kCutSweepNumConfigs = 5; - -inline const char* cut_sweep_config_name(int config_id) -{ - switch (config_id) { - case 0: return "00_baseline_no_cousin"; - case 1: return "01_cousin_default"; - case 2: return "02_cousin_strict"; - case 3: return "03_cousin_loose"; - case 4: return "04_cousin_size_tilt"; - default: return "unknown"; - } -} - -template -void apply_cut_sweep_config(cut_pool_t& cut_pool, - const simplex_solver_settings_t& settings); - template class variable_bounds_t; From 14b0ab83fb6ba94e42e570319a9419e605ce0fb9 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 3 Jun 2026 11:48:47 +0200 Subject: [PATCH 27/47] Separate small adjacency-list clique cuts generate_clique_cuts() used a hand-rolled emptiness test on only first + addtl_cliques, so it early-exited and skipped separation when only small_clique_adj (the small/adjacency-list cliques) was populated. Use the canonical clique_table_t::empty(), which also accounts for small_clique_adj, so those smaller clique cuts are generated. 
Signed-off-by: akif --- cpp/src/cuts/cuts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index c1d6aa3721..cea30c915d 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -2728,7 +2728,7 @@ bool cut_generation_t::generate_clique_cuts( static_cast(clique_table_->first.size()), static_cast(clique_table_->addtl_cliques.size())); - if (clique_table_->first.empty() && clique_table_->addtl_cliques.empty()) { + if (clique_table_->empty()) { CLIQUE_CUTS_DEBUG("generate_clique_cuts empty clique table, nothing to separate"); return true; } From ee7f2b5da5ccbb3c397f5e1b5a85020ef5e368f2 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 3 Jun 2026 22:11:02 +0200 Subject: [PATCH 28/47] revert omp inlining --- cpp/src/CMakeLists.txt | 1 - cpp/src/utilities/omp_helpers.cpp | 45 ------------------------------- cpp/src/utilities/omp_helpers.hpp | 42 +++++++++++++++++------------ 3 files changed, 25 insertions(+), 63 deletions(-) delete mode 100644 cpp/src/utilities/omp_helpers.cpp diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index 9465be5331..1ae6988466 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -5,7 +5,6 @@ set(UTIL_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/utilities/seed_generator.cu ${CMAKE_CURRENT_SOURCE_DIR}/utilities/logger.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/utilities/omp_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/version_info.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/timestamp_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/utilities/work_unit_scheduler.cpp) diff --git a/cpp/src/utilities/omp_helpers.cpp b/cpp/src/utilities/omp_helpers.cpp deleted file mode 100644 index 974197c196..0000000000 --- a/cpp/src/utilities/omp_helpers.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -#ifdef _OPENMP - -#include - -namespace cuopt { - -// All operations on the underlying `omp_lock_t` are defined out-of-line so -// that `new omp_lock_t` and the matching (sized) `delete` invoked through -// `std::unique_ptr` exist in exactly one translation unit. This -// avoids ODR-induced `new-delete-type-mismatch` errors when other TUs (most -// notably NVCC host passes) end up with a differently sized `omp_lock_t`. - -omp_mutex_t::omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - -omp_mutex_t::omp_mutex_t(omp_mutex_t&& other) noexcept { *this = std::move(other); } - -omp_mutex_t& omp_mutex_t::operator=(omp_mutex_t&& other) noexcept -{ - if (&other != this) { - if (mutex) { omp_destroy_lock(mutex.get()); } - mutex = std::move(other.mutex); - } - return *this; -} - -omp_mutex_t::~omp_mutex_t() -{ - if (mutex) { - omp_destroy_lock(mutex.get()); - mutex.reset(); - } -} - -} // namespace cuopt - -#endif // _OPENMP diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index 337b52d83e..24416c2b1c 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -28,35 +28,43 @@ std::pair calculate_index_range(i_t k, double total, double n) #include #include +#include namespace cuopt { // Wrapper of omp_lock_t. Optionally, you can provide a hint as defined in // https://www.openmp.org/spec-html/5.1/openmpse39.html#x224-2570003.9 -// -// The constructor / destructor / move-ops are intentionally out-of-line and -// defined in omp_helpers.cpp. This ensures that the `new omp_lock_t` and the -// matching (sized) `delete` instantiated through `std::unique_ptr` -// happen in exactly one translation unit. Otherwise NVCC host passes (and -// other TUs) can end up with different `sizeof(omp_lock_t)` values, which -// ODR-merges into a `new-delete-type-mismatch` at runtime under ASan. 
-// -// `virtual` on the destructor is preserved on purpose: it has been part of the -// class for a long time and removing it would change `sizeof(omp_mutex_t)` -// (no more vtable pointer), which would silently break any incremental build -// or any object file that wasn't rebuilt against the new header. class omp_mutex_t { public: - omp_mutex_t(); - omp_mutex_t(omp_mutex_t&& other) noexcept; + omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } + + omp_mutex_t(const omp_mutex_t&) = delete; + + omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } - omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; - omp_mutex_t& operator=(omp_mutex_t&& other) noexcept; - virtual ~omp_mutex_t(); + + omp_mutex_t& operator=(omp_mutex_t&& other) + { + if (&other != this) { + if (mutex) { omp_destroy_lock(mutex.get()); } + mutex = std::move(other.mutex); + } + return *this; + } + + virtual ~omp_mutex_t() + { + if (mutex) { + omp_destroy_lock(mutex.get()); + mutex.reset(); + } + } void lock() { omp_set_lock(mutex.get()); } + void unlock() { omp_unset_lock(mutex.get()); } + bool try_lock() { return omp_test_lock(mutex.get()); } private: From 98b1ee2a8678f5cad1475f56abf637701db8aa25 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 3 Jun 2026 22:12:03 +0200 Subject: [PATCH 29/47] remove patch --- cut_gap_timing_stats.patch | 974 ------------------------------------- 1 file changed, 974 deletions(-) delete mode 100644 cut_gap_timing_stats.patch diff --git a/cut_gap_timing_stats.patch b/cut_gap_timing_stats.patch deleted file mode 100644 index 0e87cc9d95..0000000000 --- a/cut_gap_timing_stats.patch +++ /dev/null @@ -1,974 +0,0 @@ -diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp -new file mode 100644 -index 00000000..7f6826a5 ---- /dev/null -+++ b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp -@@ -0,0 +1,476 @@ -+/* clang-format 
off */ -+/* -+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -+ * SPDX-License-Identifier: Apache-2.0 -+ */ -+/* clang-format on */ -+ -+// MIPLIB2017 best-known objective ("optimum") lookup for the MIP -+// benchmark runner. Self-contained: no env vars, no external CSV. -+// -+// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 -+// instances). Of those, 232 have a known optimum and live in -+// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible -+// so the printer can label them clearly instead of returning "no opt". -+// -+// Lookup uses the basename without directory and stripped of -+// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So -+// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" -+// all hit the same entry. -+// -+// Returns std::optional: nullopt means "instance is in our -+// benchmark set but infeasible" *or* "we don't have an entry for it". -+// is_known_infeasible() distinguishes the two. -+ -+#pragma once -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+namespace cuopt_bench { -+ -+// Strip directory prefix and any .mps/.lp suffix (with optional .gz), -+// then lower-case. Designed to match how MPS instance files are named -+// across MIPLIB downloads (case- and extension-insensitive). 
-+inline std::string normalize_instance_name(const std::string& raw) -+{ -+ std::string s = raw; -+ const auto slash = s.find_last_of("/\\"); -+ if (slash != std::string::npos) { s = s.substr(slash + 1); } -+ auto endswith = [&](const std::string& suf) { -+ if (s.size() < suf.size()) { return false; } -+ for (size_t i = 0; i < suf.size(); ++i) { -+ if (std::tolower(static_cast(s[s.size() - suf.size() + i])) != -+ std::tolower(static_cast(suf[i]))) { -+ return false; -+ } -+ } -+ return true; -+ }; -+ for (const char* suf : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) { -+ if (endswith(suf)) { -+ s.resize(s.size() - std::char_traits::length(suf)); -+ break; -+ } -+ } -+ for (char& c : s) { -+ c = static_cast(std::tolower(static_cast(c))); -+ } -+ return s; -+} -+ -+// MIPLIB2017 benchmark-set best-known objectives (n=232). Source: -+// https://miplib.zib.de "The Benchmark Set". Values are stored in the -+// double precision they were published at; unit tests should compare -+// with a tolerance of ~|opt|*1e-9 rather than exact equality. 
-+inline const std::unordered_map& kBenchmarkOptima() -+{ -+ static const std::unordered_map kOptima = { -+ {"30n20b8", 302}, -+ {"50v-10", 3311.1799841000002}, -+ {"academictimetablesmall", 0}, -+ {"air05", 26374}, -+ {"app1-1", -3}, -+ {"app1-2", -41}, -+ {"assign1-5-8", 211.99999999999801}, -+ {"atlanta-ip", 90.009878614000002}, -+ {"b1c1s1", 24544.25}, -+ {"bab2", -357544.31150000001}, -+ {"bab6", -284248.23070000007}, -+ {"beasleyc3", 753.9999999999128}, -+ {"binkar10_1", 6742.1998835000004}, -+ {"blp-ar98", 6205.2147103999996}, -+ {"blp-ic98", 4491.4475839500001}, -+ {"bnatt400", 1}, -+ {"bppc4-08", 53}, -+ {"brazil3", 24}, -+ {"buildingenergy", 33283.853236000003}, -+ {"cbs-cta", 0}, -+ {"chromaticindex1024-7", 4}, -+ {"chromaticindex512-7", 4}, -+ {"cmflsp50-24-8-8", 55789389.886}, -+ {"cms750_4", 252}, -+ {"co-100", 2639942.0600000001}, -+ {"cod105", -12}, -+ {"comp07-2idx", 6}, -+ {"comp21-2idx", 74}, -+ {"cost266-uue", 25148940.55999998}, -+ {"cryptanalysiskb128n5obj16", 0}, -+ {"csched007", 350.99999999999551}, -+ {"csched008", 173}, -+ {"cvs16r128-89", -97}, -+ {"dano3_3", 576.34463302999995}, -+ {"dano3_5", 576.9249159565619}, -+ {"decomp2", -160}, -+ {"drayage-100-23", 103333.87407000001}, -+ {"drayage-25-23", 101282.647018}, -+ {"dws008-01", 37412.604587945083}, -+ {"eil33-2", 934.007915999999}, -+ {"eila101-2", 880.92010799999991}, -+ {"enlight_hard", 37}, -+ {"ex10", 100}, -+ {"ex9", 81}, -+ {"exp-1-500-5-5", 65887}, -+ {"fast0507", 174}, -+ {"fastxgemm-n2r6s0t2", 230}, -+ {"fhnw-binpack4-48", 0}, -+ {"fiball", 138}, -+ {"gen-ip002", -4783.7333920000001}, -+ {"gen-ip054", 6840.9656417899996}, -+ {"germanrr", 47095869.648999996}, -+ {"gfd-schedulen180f7d50m30k18", 1}, -+ {"glass-sc", 23}, -+ {"glass4", 1200012599.972384}, -+ {"gmu-35-40", -2406733.3687999998}, -+ {"gmu-35-50", -2607958.3300000001}, -+ {"graph20-20-1rand", -9}, -+ {"graphdraw-domain", 19685.999975500381}, -+ {"h80x6320d", 6382.0990482459993}, -+ {"highschool1-aigio", 0}, -+ 
{"hypothyroid-k1", -2851}, -+ {"ic97_potential", 3941.9999309022501}, -+ {"icir97_tension", 6375}, -+ {"irish-electricity", 3723497.5913959998}, -+ {"irp", 12159.492835396981}, -+ {"istanbul-no-cutoff", 204.08170701}, -+ {"k1mushroom", -3288}, -+ {"lectsched-5-obj", 24}, -+ {"leo1", 404227536.16000003}, -+ {"leo2", 404077441.12}, -+ {"lotsize", 1480195}, -+ {"mad", 0.026800000000000001}, -+ {"map10", -495}, -+ {"map16715-04", -111}, -+ {"markshare2", 1}, -+ {"markshare_4_0", 1}, -+ {"mas74", 11801.185719999999}, -+ {"mas76", 40005.053989999993}, -+ {"mc11", 11688.99999999966}, -+ {"mcsched", 211913}, -+ {"mik-250-20-75-4", -52301}, -+ {"milo-v12-6-r2-40-1", 326481.14282799}, -+ {"momentum1", 109143.4935}, -+ {"mushroom-best", 0.055333761199999998}, -+ {"mzzv11", -21718}, -+ {"mzzv42z", -20540}, -+ {"n2seq36q", 52200}, -+ {"n3div36", 130800}, -+ {"n5-3", 8104.9999999939992}, -+ {"neos-1122047", 161}, -+ {"neos-1171448", -309}, -+ {"neos-1171737", -195}, -+ {"neos-1354092", 46}, -+ {"neos-1445765", -17783}, -+ {"neos-1456979", 176}, -+ {"neos-1582420", 90.999999999999957}, -+ {"neos-2657525-crna", 1.810748}, -+ {"neos-2746589-doon", 2008.1999999999989}, -+ {"neos-2978193-inde", -2.3880616899999998}, -+ {"neos-2987310-joes", -607702988.29999995}, -+ {"neos-3004026-krka", 0}, -+ {"neos-3024952-loue", 26756}, -+ {"neos-3046615-murg", 1600}, -+ {"neos-3083819-nubu", 6307996}, -+ {"neos-3216931-puriri", 71320}, -+ {"neos-3381206-awhea", 453}, -+ {"neos-3402294-bobin", 0.067249999999999491}, -+ {"neos-3555904-turama", -34.700000000000003}, -+ {"neos-3627168-kasai", 988585.61999999976}, -+ {"neos-3656078-kumeu", -13172.200000000001}, -+ {"neos-3754480-nidda", 12941.73838561778}, -+ {"neos-4300652-rahue", 2.1415999999999999}, -+ {"neos-4338804-snowy", 1471}, -+ {"neos-4387871-tavua", 33.384729927000002}, -+ {"neos-4413714-turia", 45.370167019999798}, -+ {"neos-4532248-waihi", 61.599999999999987}, -+ {"neos-4647030-tutaki", 27265.705999999958}, -+ {"neos-4722843-widden", 
25009.662227000001}, -+ {"neos-4738912-atrato", 283627956.59500003}, -+ {"neos-4763324-toguru", 1613.0388458499999}, -+ {"neos-4954672-berkel", 2612710}, -+ {"neos-5049753-cuanza", 561.99999716889999}, -+ {"neos-5052403-cygnet", 182}, -+ {"neos-5093327-huahum", 6259.9999971258949}, -+ {"neos-5104907-jarama", 935}, -+ {"neos-5107597-kakapo", 3644.9999999995198}, -+ {"neos-5114902-kasavu", 655}, -+ {"neos-5188808-nattai", 0.110283622999984}, -+ {"neos-5195221-niemur", 0.0038354325999999999}, -+ {"neos-631710", 203}, -+ {"neos-662469", 184379.99999999991}, -+ {"neos-787933", 30}, -+ {"neos-827175", 112.00152}, -+ {"neos-848589", 2351.40309999697}, -+ {"neos-860300", 3200.9999999999982}, -+ {"neos-873061", 113.6562385063}, -+ {"neos-911970", 54.759999999999998}, -+ {"neos-933966", 318}, -+ {"neos-950242", 4}, -+ {"neos-957323", -237.75668150000001}, -+ {"neos-960392", -238}, -+ {"neos17", 0.1500025774}, -+ {"neos5", 15}, -+ {"neos8", -3719}, -+ {"net12", 214}, -+ {"netdiversion", 242}, -+ {"nexp-150-20-8-5", 231}, -+ {"ns1116954", 0}, -+ {"ns1208400", 2}, -+ {"ns1644855", -1524.3333333333301}, -+ {"ns1760995", -549.21438505000003}, -+ {"ns1830653", 20622}, -+ {"ns1952667", 0}, -+ {"nu25-pr12", 53904.999999999993}, -+ {"nursesched-medium-hint03", 115}, -+ {"nursesched-sprint02", 57.999999999999993}, -+ {"nw04", 16862}, -+ {"opm2-z10-s4", -33269}, -+ {"p200x1188c", 15078}, -+ {"peg-solitaire-a3", 1}, -+ {"pg", -8674.3426071199992}, -+ {"pg5_34", -14339.353450000001}, -+ {"physiciansched3-3", 2623271.3266670001}, -+ {"physiciansched6-2", 49324}, -+ {"piperout-08", 125054.9999999999}, -+ {"piperout-27", 8123.9999999999727}, -+ {"pk1", 11}, -+ {"proteindesign121hz512p9", 1473}, -+ {"proteindesign122trx11p8", 1747}, -+ {"qap10", 339.99999999838712}, -+ {"radiationm18-12-05", 17566}, -+ {"radiationm40-10-02", 155328}, -+ {"rail01", -70.569964299999995}, -+ {"rail02", -200.44990770000001}, -+ {"rail507", 174}, -+ {"ran14x18-disj-8", 3712}, -+ {"rd-rplusc-21", 165395.275295}, 
-+ {"reblock115", -36800603.233199999}, -+ {"rmatr100-p10", 423}, -+ {"rmatr200-p5", 4521}, -+ {"roci-4-11", -6020203}, -+ {"rocii-5-11", -6.6755047315380001}, -+ {"rococob10-011000", 19449}, -+ {"rocococ10-001000", 11460}, -+ {"roi2alpha3n4", -63.208495030000002}, -+ {"roi5alpha10n8", -52.322274350999997}, -+ {"roll3000", 12889.999991999999}, -+ {"s100", -0.16972352705829999}, -+ {"s250r10", -0.17178048342319999}, -+ {"satellites2-40", -19}, -+ {"satellites2-60-fs", -19.000000000099998}, -+ {"savsched1", 3217.6999999999998}, -+ {"sct2", -230.9891623}, -+ {"seymour", 423}, -+ {"seymour1", 410.76370138999999}, -+ {"sing326", 7753674.8537600003}, -+ {"sing44", 8128831.1771999998}, -+ {"snp-02-004-104", 586803238.65672886}, -+ {"sorrell3", -16}, -+ {"sp150x300d", 69}, -+ {"sp97ar", 660705645.75899994}, -+ {"sp98ar", 529740623.19999999}, -+ {"splice1k1", -394}, -+ {"square41", 15}, -+ {"square47", 15.9999999997877}, -+ {"supportcase10", 7}, -+ {"supportcase12", -7559.5330538170001}, -+ {"supportcase18", 48}, -+ {"supportcase19", 12677205.999920519}, -+ {"supportcase22", 110}, // best-known marked "*" in MIPLIB2017 (not proven optimal) -+ {"supportcase26", 1745.1238129999999}, -+ {"supportcase33", -345}, -+ {"supportcase40", 24256.3122898}, -+ {"supportcase42", 7.7586307222700004}, -+ {"supportcase6", 51906.477370000001}, -+ {"supportcase7", -1132.2231770000001}, -+ {"swath1", 379.07129574999999}, -+ {"swath3", 397.76134365000001}, -+ {"tbfp-network", 24.163194440000002}, -+ {"thor50dday", 40417}, -+ {"timtab1", 764771.99999977998}, -+ {"tr12-30", 130595.9999999999}, -+ {"traininstance2", 71820}, -+ {"traininstance6", 28290}, -+ {"trento1", 5189487}, -+ {"triptim1", 22.868099999999899}, -+ {"uccase12", 11507.4050616}, -+ {"uccase9", 10993.131409}, -+ {"uct-subprob", 314}, -+ {"unitcal_7", 19635558.243999999}, -+ {"var-smallemery-m6j6", -149.37501}, -+ {"wachplan", -8}, -+ }; -+ return kOptima; -+} -+ -+// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7). 
-+// Solver should return Infeasible status; we use this set to label -+// the printer line with status_extra=KnownInfeasible so a downstream -+// "did the run agree with MIPLIB?" check can be a single grep. -+inline const std::unordered_set& kBenchmarkInfeasible() -+{ -+ static const std::unordered_set kInfeas = { -+ "bnatt500", -+ "cryptanalysiskb128n5obj14", -+ "fhnw-binpack4-4", -+ "neos-2075418-temuka", -+ "neos-3402454-bohle", -+ "neos-3988577-wolgan", -+ "neos859080", -+ }; -+ return kInfeas; -+} -+ -+inline std::optional lookup_miplib_optimum(const std::string& filename) -+{ -+ const auto& m = kBenchmarkOptima(); -+ const auto it = m.find(normalize_instance_name(filename)); -+ if (it == m.end()) { return std::nullopt; } -+ return it->second; -+} -+ -+inline bool is_known_infeasible(const std::string& filename) -+{ -+ return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; -+} -+ -+// Single grep-friendly per-instance line. Emits to stdout via printf -+// so the output survives unconditionally regardless of the project's -+// settings_.log routing (NFS-backed log files, gated debug levels) -+// and is trivially cross-compared between cuts-config branches. -+// -+// "Gap closed" is reported relative to the *root LP after cuts*, not -+// relative to the final dual bound at the end of solve. The standard -+// MIP cutting-plane definition is: -+// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) -+// / (opt - root_lp_no_cuts) -+// On a minimization-form problem all three differences are >= 0 and -+// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the -+// formula also holds verbatim for maximization (numerator and -+// denominator flip sign together). NaN is emitted when either root -+// bound was not published (e.g. B&B never entered the cut loop). 
-+// -+// Other field semantics (signed for minimization): -+// abs_root_dual_gap = opt - root_lp_with_cuts -+// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) -+// abs_primal_gap = primal - opt -+// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) -+// -+// The line still also reports `final_dual` (solver's bound at the end -+// of solve) so the new metric and the previous one can be compared -+// without re-running. -+// -+// "TBD" is emitted when the optimum is unknown so downstream parsers -+// can join lines on (instance, field) without dropping rows. "NaN" is -+// emitted for root_lp_* when the value is unavailable. -+template -+inline void print_miplib_gap_stat( -+ const std::string& filename, -+ const Solution& solution, -+ double solve_time_seconds, -+ const std::string& termination_status, -+ double root_lp_no_cuts, -+ double root_lp_with_cuts, -+ double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) -+{ -+ const std::string norm = normalize_instance_name(filename); -+ const auto opt = lookup_miplib_optimum(filename); -+ const double primal = solution.get_objective_value(); -+ const double final_dual = solution.get_solution_bound(); -+ const double mip_gap = solution.get_mip_gap(); -+ const bool primal_finite = std::isfinite(primal); -+ const bool root0_finite = std::isfinite(root_lp_no_cuts); -+ const bool root1_finite = std::isfinite(root_lp_with_cuts); -+ constexpr double NaN = std::numeric_limits::quiet_NaN(); -+ -+ if (is_known_infeasible(filename)) { -+ std::printf( -+ "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " -+ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " -+ "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " -+ "abs_primal_gap=NA rel_primal_gap_pct=NA " -+ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", -+ norm.c_str(), -+ primal, -+ final_dual, -+ root_lp_no_cuts, -+ root_lp_with_cuts, -+ mip_gap, -+ solve_time_seconds, -+ cut_gen_time_sec, -+ 
termination_status.c_str()); -+ } else if (opt.has_value()) { -+ const double o = *opt; -+ const double denom = std::max(std::abs(o), 1.0); -+ -+ const double abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; -+ const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; -+ -+ // Classical gap-closed-by-cuts. Skip when either root bound is -+ // missing, when the LP relaxation already proves optimality -+ // (denominator = opt - root_lp_no_cuts ~= 0), or when the bound -+ // moved the wrong way (numerical noise in either direction). -+ double gap_closed_pct = NaN; -+ if (root0_finite && root1_finite) { -+ const double total_gap = o - root_lp_no_cuts; -+ if (std::abs(total_gap) > 1e-12 * denom) { -+ gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; -+ } else { -+ // LP relaxation already (numerically) optimal -> 100% closed -+ // by definition. Avoid /0 noise. -+ gap_closed_pct = 100.0; -+ } -+ } -+ -+ const double abs_pgap = primal_finite ? (primal - o) : NaN; -+ const double rel_pgap_pct = primal_finite ? 
100.0 * abs_pgap / denom : NaN; -+ -+ std::printf( -+ "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " -+ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " -+ "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " -+ "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " -+ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", -+ norm.c_str(), -+ o, -+ primal, -+ final_dual, -+ root_lp_no_cuts, -+ root_lp_with_cuts, -+ abs_root_dgap, -+ rel_root_dgap_pct, -+ gap_closed_pct, -+ abs_pgap, -+ rel_pgap_pct, -+ mip_gap, -+ solve_time_seconds, -+ cut_gen_time_sec, -+ termination_status.c_str()); -+ } else { -+ std::printf( -+ "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " -+ "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " -+ "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " -+ "abs_primal_gap=TBD rel_primal_gap_pct=TBD " -+ "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", -+ norm.c_str(), -+ primal, -+ final_dual, -+ root_lp_no_cuts, -+ root_lp_with_cuts, -+ mip_gap, -+ solve_time_seconds, -+ cut_gen_time_sec, -+ termination_status.c_str()); -+ } -+ std::fflush(stdout); -+} -+ -+} // namespace cuopt_bench -diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp -index f3554369..bed4e453 100644 ---- a/benchmarks/linear_programming/cuopt/run_mip.cpp -+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp -@@ -6,6 +6,7 @@ - /* clang-format on */ - #include "initial_solution_reader.hpp" - #include "mip_test_instances.hpp" -+#include "miplib2017_optima.hpp" - - #include - #include -@@ -23,15 +24,20 @@ - #include - #include - -+#include - #include - #include - #include - #include - #include -+#include - #include -+#include - #include -+#include - #include - #include -+#include - #include - #include - #include -@@ -239,6 +245,43 @@ int run_single_file(std::string file_path, - } else { - CUOPT_LOG_INFO("%s: no solution 
found", base_filename.c_str()); - } -+ -+ // Per-instance "gap closed to optimum" stat. Emits a single -+ // grep-friendly "MIPLIBGapStat ..." line via printf so cross-branch -+ // comparison is just `grep '^MIPLIBGapStat' branchA.log` then diff. -+ // Optima are looked up from the in-source MIPLIB2017 benchmark-set -+ // table (miplib2017_optima.hpp); unknown instances emit "opt=TBD" -+ // and infeasibility-flagged instances emit "opt=Infeasible". -+ { -+ const double _gap_seconds = std::chrono::duration_cast( -+ std::chrono::high_resolution_clock::now() - start_run_solver) -+ .count() / -+ 1000.0; -+ std::string _status_str; -+ switch (solution.get_termination_status()) { -+ case cuopt::linear_programming::mip_termination_status_t::Optimal: -+ _status_str = "Optimal"; -+ break; -+ case cuopt::linear_programming::mip_termination_status_t::FeasibleFound: -+ _status_str = "FeasibleFound"; -+ break; -+ case cuopt::linear_programming::mip_termination_status_t::TimeLimit: -+ _status_str = "TimeLimit"; -+ break; -+ case cuopt::linear_programming::mip_termination_status_t::Infeasible: -+ _status_str = "Infeasible"; -+ break; -+ default: _status_str = "Other"; break; -+ } -+ cuopt_bench::print_miplib_gap_stat(base_filename, -+ solution, -+ _gap_seconds, -+ _status_str, -+ benchmark_info.root_lp_no_cuts, -+ benchmark_info.root_lp_with_cuts, -+ benchmark_info.cut_generation_time_sec); -+ } -+ - std::stringstream ss; - int decimal_places = 2; - double mip_gap = solution.get_mip_gap(); -@@ -293,6 +336,157 @@ void run_single_file_mp(std::string file_path, - exit(sol_found); - } - -+// Return the NUMA node of each GPU (one entry per gpu_id), or -1 when the -+// node can't be determined. Reads /sys/bus/pci/devices//numa_node so it -+// requires no extra dependencies (NVML / hwloc). 
-+static std::vector get_gpu_numa_nodes(int n_gpus) -+{ -+ std::vector nodes(static_cast(std::max(0, n_gpus)), -1); -+ for (int i = 0; i < n_gpus; ++i) { -+ char pci_id[32] = {0}; -+ if (cudaDeviceGetPCIBusId(pci_id, sizeof(pci_id), i) != cudaSuccess) { continue; } -+ for (char* c = pci_id; *c; ++c) { -+ *c = static_cast(std::tolower(static_cast(*c))); -+ } -+ std::ifstream f(std::string("/sys/bus/pci/devices/") + pci_id + "/numa_node"); -+ if (!f) { continue; } -+ int node = -1; -+ f >> node; -+ nodes[i] = node; -+ } -+ return nodes; -+} -+ -+// Parse a sysfs cpulist string ("0-71,144-215") into a sorted list of CPU IDs. -+// Returns empty on any read or parse failure. -+static std::vector read_numa_cpulist(int numa_node) -+{ -+ std::vector cpus; -+ if (numa_node < 0) { return cpus; } -+ std::ifstream f(std::string("/sys/devices/system/node/node") + std::to_string(numa_node) + -+ "/cpulist"); -+ if (!f) { return cpus; } -+ std::string line; -+ if (!std::getline(f, line)) { return cpus; } -+ size_t pos = 0; -+ while (pos < line.size()) { -+ const size_t comma = line.find(',', pos); -+ const size_t end = (comma == std::string::npos) ? line.size() : comma; -+ const std::string range = line.substr(pos, end - pos); -+ if (!range.empty()) { -+ try { -+ const size_t dash = range.find('-'); -+ const int lo = std::stoi(range.substr(0, dash)); -+ const int hi = (dash == std::string::npos) ? lo : std::stoi(range.substr(dash + 1)); -+ for (int c = lo; c <= hi; ++c) { -+ cpus.push_back(c); -+ } -+ } catch (...) { -+ return std::vector{}; -+ } -+ } -+ if (comma == std::string::npos) { break; } -+ pos = comma + 1; -+ } -+ std::sort(cpus.begin(), cpus.end()); -+ return cpus; -+} -+ -+// Bind the current process to a fair partition of the inherited CPU mask, -+// preferring CPUs on the same NUMA node as the GPU. Returns the actual -+// number of CPUs the child was pinned to, or -1 if the partition could not -+// be applied (caller must then choose a fallback). 
-+// -+// Algorithm: -+// 1. Read inherited (parent) affinity mask -> visible_cpus. -+// 2. Look up each GPU's NUMA node via PCI BDF. -+// 3. If this GPU's NUMA node is known and has visible CPUs, partition -+// that NUMA node's CPUs among the GPUs that landed on the same node -+// (siblings, ordered by gpu_id). -+// 4. Otherwise fall back to a contiguous global partition of visible_cpus. -+// -+// The function always emits a single stdout line per child summarising the -+// partition (NUMA-local vs contiguous-fallback), so the parent's log isn't -+// interleaved per-CPU across n_gpus children. -+int bind_process_to_cpu_partition(int gpu_id, int n_gpus) -+{ -+ if (gpu_id < 0 || n_gpus <= 0 || gpu_id >= n_gpus) { return -1; } -+ -+ cpu_set_t parent_mask; -+ CPU_ZERO(&parent_mask); -+ if (sched_getaffinity(0, sizeof(parent_mask), &parent_mask) != 0) { -+ perror("sched_getaffinity"); -+ return -1; -+ } -+ -+ std::vector visible_cpus; -+ for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { -+ if (CPU_ISSET(cpu, &parent_mask)) { visible_cpus.push_back(cpu); } -+ } -+ if (visible_cpus.empty()) { return -1; } -+ std::sort(visible_cpus.begin(), visible_cpus.end()); -+ -+ std::vector chosen_cpus; -+ bool numa_aware = false; -+ -+ const std::vector gpu_numa_nodes = get_gpu_numa_nodes(n_gpus); -+ const int my_numa = gpu_numa_nodes[gpu_id]; -+ if (my_numa >= 0) { -+ std::vector siblings; -+ for (int i = 0; i < n_gpus; ++i) { -+ if (gpu_numa_nodes[i] == my_numa) { siblings.push_back(i); } -+ } -+ std::vector numa_cpus = read_numa_cpulist(my_numa); -+ if (!numa_cpus.empty() && !siblings.empty()) { -+ std::vector local_visible; -+ std::set_intersection(visible_cpus.begin(), -+ visible_cpus.end(), -+ numa_cpus.begin(), -+ numa_cpus.end(), -+ std::back_inserter(local_visible)); -+ if (!local_visible.empty()) { -+ const int siblings_count = static_cast(siblings.size()); -+ const int my_idx = -+ static_cast(std::find(siblings.begin(), siblings.end(), gpu_id) - siblings.begin()); -+ const 
int local_per_gpu = -+ std::max(1, static_cast(local_visible.size()) / siblings_count); -+ const int s = my_idx * local_per_gpu; -+ const int e = std::min(s + local_per_gpu, static_cast(local_visible.size())); -+ if (s < e) { -+ chosen_cpus.assign(local_visible.begin() + s, local_visible.begin() + e); -+ numa_aware = true; -+ } -+ } -+ } -+ } -+ -+ if (!numa_aware) { -+ const int cpus_per_gpu = std::max(1, static_cast(visible_cpus.size()) / n_gpus); -+ const int start = gpu_id * cpus_per_gpu; -+ if (start >= static_cast(visible_cpus.size())) { return -1; } -+ const int end = std::min(start + cpus_per_gpu, static_cast(visible_cpus.size())); -+ chosen_cpus.assign(visible_cpus.begin() + start, visible_cpus.begin() + end); -+ } -+ -+ cpu_set_t child_mask; -+ CPU_ZERO(&child_mask); -+ std::ostringstream oss; -+ oss << "[gpu " << gpu_id << "] bound to " << chosen_cpus.size() << " CPUs (" -+ << (numa_aware ? "NUMA-local node " + std::to_string(my_numa) : "contiguous-fallback") -+ << "):"; -+ for (int c : chosen_cpus) { -+ CPU_SET(c, &child_mask); -+ oss << ' ' << c; -+ } -+ std::cout << oss.str() << std::endl; -+ -+ if (sched_setaffinity(0, sizeof(child_mask), &child_mask) != 0) { -+ perror("sched_setaffinity"); -+ return -1; -+ } -+ return static_cast(chosen_cpus.size()); -+} -+ - void return_gpu_to_the_queue(std::unordered_map& pid_gpu_map, - std::unordered_map& pid_file_map, - std::queue& gpu_queue) -@@ -417,6 +611,11 @@ int main(int argc, char* argv[]) - int reliability_branching = program.get("--reliability-branching"); - bool deterministic = program.get("--determinism"); - -+ if (run_dir && program.is_used("--num-cpu-threads")) { -+ std::cerr << "Warning: --num-cpu-threads is ignored in directory-run mode; " -+ "thread count is set per process from the bound CPU partition.\n"; -+ } -+ - if (num_cpu_threads < 0) { - num_cpu_threads = omp_get_max_threads() / n_gpus; - // std::ifstream smt_file("/sys/devices/system/cpu/smt/active"); -@@ -502,6 +701,18 @@ int main(int 
argc, char* argv[]) - } - if (sys_pid == 0) { - RAFT_CUDA_TRY(cudaSetDevice(gpu_id)); -+ int assigned_cpus = bind_process_to_cpu_partition(gpu_id, n_gpus); -+ if (assigned_cpus <= 0) { -+ assigned_cpus = std::max(1, omp_get_max_threads() / n_gpus); -+ std::cerr << "[gpu " << gpu_id << "] CPU pin failed; falling back to " -+ << assigned_cpus << " threads\n"; -+ } -+ // Directory-run mode owns the thread count: --num-cpu-threads is -+ // intentionally ignored here so per-process thread budgets match -+ // the bound CPU partition. The single-run path below still -+ // honours --num-cpu-threads. -+ omp_set_num_threads(assigned_cpus); -+ num_cpu_threads = assigned_cpus; - run_single_file_mp(file_name, - gpu_id, - batch_num, -@@ -534,31 +745,36 @@ int main(int argc, char* argv[]) - merge_result_files(out_dir, result_file, n_gpus, batch_num); - } else { - auto memory_resource = make_async(); -+ auto run_single = [&]() { -+ run_single_file(path, -+ 0, -+ 0, -+ n_gpus, -+ out_dir, -+ initial_solution_file, -+ heuristics_only, -+ num_cpu_threads, -+ write_log_file, -+ log_to_console, -+ reliability_branching, -+ time_limit, -+ work_limit, -+ deterministic); -+ }; - if (memory_limit > 0) { - auto limiting_adaptor = - rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); - rmm::mr::set_current_device_resource(limiting_adaptor); -+ run_single(); - } else if (track_allocations) { - rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, - /*capture_stacks=*/true); - rmm::mr::set_current_device_resource(tracking_adaptor); -+ run_single(); - } else { - rmm::mr::set_current_device_resource(memory_resource); -+ run_single(); - } -- run_single_file(path, -- 0, -- 0, -- n_gpus, -- out_dir, -- initial_solution_file, -- heuristics_only, -- num_cpu_threads, -- write_log_file, -- log_to_console, -- reliability_branching, -- time_limit, -- work_limit, -- deterministic); - } - - return 0; -diff --git 
a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp -index 685b1360..b2231b1a 100644 ---- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp -+++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp -@@ -26,6 +26,27 @@ struct benchmark_info_t { - double last_improvement_of_best_feasible = 0; - double last_improvement_after_recombination = 0; - double objective_of_initial_population = std::numeric_limits::max(); -+ // LP relaxation objective at the root node, BEFORE any cuts have been -+ // added. quiet_NaN() means "B&B did not run cut passes / value was -+ // never written" — distinguishes it from a legitimate 0.0. -+ double root_lp_no_cuts = std::numeric_limits::quiet_NaN(); -+ // LP relaxation objective at the root node, AFTER the full cut loop -+ // (final pass result). The dual gap "by cuts at the root" is then -+ // gap_after_cuts = opt - root_lp_with_cuts (in B&B's solver -+ // objective sense) -+ // and the classical "gap closed by cuts" metric is -+ // gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) -+ // / (opt - root_lp_no_cuts). -+ // quiet_NaN() means "B&B did not finish the cut loop / value not written". -+ double root_lp_with_cuts = std::numeric_limits::quiet_NaN(); -+ -+ // Wall-clock time spent inside the root-node cut generation loop -+ // (sum of generate_cuts + score_cuts + check_for_duplicate_cuts + -+ // get_best_cuts + add_cuts + post-cut LP resolves), in seconds. -+ // Published by branch_and_bound.cpp::solve() at the same point that -+ // root_lp_with_cuts is finalised. quiet_NaN() means "cut loop did -+ // not run / value never written". 
-+ double cut_generation_time_sec = std::numeric_limits::quiet_NaN(); - }; - - // Forward declare solver_settings_t for friend class -diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp -index 0222ad6f..5420b882 100644 ---- a/cpp/src/branch_and_bound/branch_and_bound.cpp -+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp -@@ -11,6 +11,8 @@ - #include - #include - -+#include // benchmark_info_t -+ - #include - #include - #include -@@ -2361,6 +2363,15 @@ auto branch_and_bound_t::do_cut_pass( - } - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); - -+ // Publish after every successful post-cut LP resolve so any -+ // early-exit path below (NUMERICAL, TIME_LIMIT, gap-tolerance -+ // exit) still leaves benchmark_info->root_lp_with_cuts pointing -+ // at the most recent valid LP-with-cuts objective. -+ if (settings_.benchmark_info_ptr != nullptr) { -+ settings_.benchmark_info_ptr->root_lp_with_cuts = -+ static_cast(compute_user_objective(original_lp_, root_objective_)); -+ } -+ - f_t remove_cuts_start_time = tic(); - mutex_original_lp_.lock(); - remove_cuts(original_lp_, -@@ -2479,7 +2490,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - user_problem_t problem_copy = original_problem_; - timer_t timer(std::numeric_limits::infinity()); - detail::find_initial_cliques( -- problem_copy, tolerances_for_clique, &clique_table_, timer, false, clique_signal); -+ problem_copy, tolerances_for_clique, &clique_table_, timer, clique_signal); - } - } - -@@ -2588,6 +2599,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - cut_info_t cut_info; - - if (num_fractional == 0) { -+ // LP relaxation already integer-feasible — solved at the root with -+ // no cuts. Publish both bounds equal to the root LP value so the -+ // gap-closed-by-cuts line still has a finite, meaningful entry -+ // (the printer reports 100% closed when total integrality gap ~= 0). 
-+ if (settings_.benchmark_info_ptr != nullptr) { -+ const double v = static_cast(compute_user_objective(original_lp_, root_objective_)); -+ settings_.benchmark_info_ptr->root_lp_no_cuts = v; -+ settings_.benchmark_info_ptr->root_lp_with_cuts = v; -+ } - set_solution_at_root(solution, cut_info); - signal_extend_cliques_.store(true, std::memory_order_release); - #pragma omp taskwait depend(in : *clique_signal) -@@ -2624,6 +2644,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - f_t last_objective = root_objective_; - f_t root_relax_objective = root_objective_; - -+ // Publish the no-cuts root LP value once. The with-cuts companion is -+ // published below after the cut loop terminates. Both go to the -+ // benchmark_info_t so callers (run_mip.cpp) can compute -+ // gap-closed-by-cuts without instrumenting the cut loop directly. -+ if (settings_.benchmark_info_ptr != nullptr) { -+ settings_.benchmark_info_ptr->root_lp_no_cuts = -+ static_cast(compute_user_objective(original_lp_, root_relax_objective)); -+ } -+ - constexpr bool enable_root_cut_cpufj = true; - std::unique_ptr> root_cut_cpufj_task; - auto root_cut_cpufj_improvement_callback = -@@ -2648,11 +2677,31 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - }; - cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); - -- f_t cut_generation_start_time = tic(); -- i_t cut_pool_size = 0; -+ f_t cut_generation_start_time = tic(); -+ auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { -+ if (settings_.benchmark_info_ptr == nullptr) { return; } -+ f_t cut_generation_time = toc(cut_generation_start_time); -+ if (force_time_limit_value || cut_generation_time > settings_.time_limit) { -+ cut_generation_time = settings_.time_limit; -+ } -+ if (cut_generation_time < static_cast(0.0)) { -+ cut_generation_time = static_cast(0.0); -+ } -+ settings_.benchmark_info_ptr->cut_generation_time_sec = -+ static_cast(cut_generation_time); -+ }; -+ i_t 
cut_pool_size = 0; - for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { - if (num_fractional == 0) { -+ // LP relaxation is already integer-feasible — solved at the root -+ // by the cuts added so far (possibly zero). Publish the with-cuts -+ // value so the gap-closed line still has a non-NaN dual bound. -+ if (settings_.benchmark_info_ptr != nullptr) { -+ settings_.benchmark_info_ptr->root_lp_with_cuts = -+ static_cast(compute_user_objective(original_lp_, root_objective_)); -+ } - set_solution_at_root(solution, cut_info); -+ publish_cut_generation_time(); - signal_extend_cliques_.store(true, std::memory_order_release); - #pragma omp taskwait depend(in : *clique_signal) - return mip_status_t::OPTIMAL; -@@ -2692,6 +2741,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - } - - if (cut_pass_result.action == cut_pass_action_t::RETURN) { -+ publish_cut_generation_time(cut_pass_result.status == mip_status_t::TIME_LIMIT); - signal_extend_cliques_.store(true, std::memory_order_release); - #pragma omp taskwait depend(in : *clique_signal) - return cut_pass_result.status; -@@ -2714,8 +2764,16 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut - } - } - -+ // Publish the post-cuts root LP value. -+ if (settings_.benchmark_info_ptr != nullptr) { -+ settings_.benchmark_info_ptr->root_lp_with_cuts = -+ static_cast(compute_user_objective(original_lp_, root_objective_)); -+ } -+ - print_cut_info(settings_, cut_info); - f_t cut_generation_time = toc(cut_generation_start_time); -+ // Publish cut-generation time for reporting. 
-+ publish_cut_generation_time(); - if (cut_info.has_cuts()) { - settings_.log.printf("Cut generation time: %.2f seconds\n", cut_generation_time); - settings_.log.printf("Cut pool size : %d\n", cut_pool_size); -diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu -index c25ade0c..712be213 100644 ---- a/cpp/src/mip_heuristics/solver.cu -+++ b/cpp/src/mip_heuristics/solver.cu -@@ -377,6 +377,9 @@ solution_t mip_solver_t::run_solver() - context.settings.strong_chvatal_gomory_cuts; - branch_and_bound_settings.cut_change_threshold = context.settings.cut_change_threshold; - branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; -+ // Forward the run-level benchmark_info_t so B&B can publish root LP -+ // bounds (before / after cuts) for gap-closed-by-cuts measurement. -+ branch_and_bound_settings.benchmark_info_ptr = context.settings.benchmark_info_ptr; - branch_and_bound_settings.mip_batch_pdlp_strong_branching = - context.settings.mip_batch_pdlp_strong_branching; - branch_and_bound_settings.mip_batch_pdlp_reliability_branching = From 59a05acb9abc7758a418190d4e833ea7c273494f Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 10:33:10 +0200 Subject: [PATCH 30/47] handle ai review --- .../presolve/conflict_graph/clique_table.cuh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh index 8968fd1e7e..da35ceae6e 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh @@ -80,8 +80,14 @@ struct csr_var_map_t { } i_t n_keys() const { return offsets.empty() ? 
0 : static_cast(offsets.size() - 1); } i_t slice_size(i_t v) const { return offsets[v + 1] - offsets[v]; } - const i_t* slice_begin(i_t v) const { return indices.data() + offsets[v]; } - const i_t* slice_end(i_t v) const { return indices.data() + offsets[v + 1]; } + const i_t* slice_begin(i_t v) const + { + return indices.empty() ? nullptr : indices.data() + offsets[v]; + } + const i_t* slice_end(i_t v) const + { + return indices.empty() ? nullptr : indices.data() + offsets[v + 1]; + } // O(1) summary used by cut/extension cost-budget heuristics. double avg_slice_size() const { @@ -92,6 +98,7 @@ struct csr_var_map_t { { const i_t* b = slice_begin(v); const i_t* e = slice_end(v); + if (b == nullptr) { return false; } return std::binary_search(b, e, value); } From 6a75d1e1f5ce6790d6d8e847d3f67cf02321ecaf Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 10:58:04 +0200 Subject: [PATCH 31/47] fix tests --- cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu index 554de45812..2a87f6ffbb 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu @@ -231,7 +231,7 @@ void remove_small_cliques(clique_table_t& clique_table, cuopt::timer_t for (size_t clique_idx = 0; clique_idx < clique_table.first.size(); clique_idx++) { if (timer.check_time_limit()) { return; } const auto& clique = clique_table.first[clique_idx]; - if (clique.size() < (size_t)clique_table.min_clique_size) { + if (clique.size() <= (size_t)clique_table.min_clique_size) { for (size_t i = 0; i < clique.size(); i++) { for (size_t j = 0; j < clique.size(); j++) { if (i == j) { continue; } From b3e0eefa2ff8c6a0c63037773b0d93f50b0c36b5 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 11:04:45 +0200 Subject: 
[PATCH 32/47] fix timer stats --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 5420b88221..2000788ff2 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2681,9 +2681,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { if (settings_.benchmark_info_ptr == nullptr) { return; } f_t cut_generation_time = toc(cut_generation_start_time); - if (force_time_limit_value || cut_generation_time > settings_.time_limit) { - cut_generation_time = settings_.time_limit; - } + if (force_time_limit_value) { cut_generation_time = settings_.time_limit; } if (cut_generation_time < static_cast(0.0)) { cut_generation_time = static_cast(0.0); } From 03a7b3f581893b9958b49d05813cc414b44c3f02 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 12:16:53 +0200 Subject: [PATCH 33/47] add a skill --- skills/cuopt-developer/references/contributing.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/skills/cuopt-developer/references/contributing.md b/skills/cuopt-developer/references/contributing.md index 34fb75aab1..b11c91f114 100644 --- a/skills/cuopt-developer/references/contributing.md +++ b/skills/cuopt-developer/references/contributing.md @@ -74,6 +74,18 @@ A few non-YAGNI points worth keeping in mind: When in doubt, mirror how the surrounding cuOpt code handles the same concern. +## Resolving Merge Conflicts + +Don't resolve a conflict by mechanically picking the side that looks like a superset. A small, local conflict (a few changed lines in one function) often sits on top of a larger architectural divergence — one branch refactored a mechanism the other left alone — and the conflict markers only show the tip of it. 
Picking "the bigger hunk" then strands the rest of that mechanism. + +Before choosing a side, reconstruct what each branch actually did: + +- Diff the conflicting symbols across **both branches and the merge base**, not just the two conflict hunks: `git show <branch>:<path>` and `git merge-base A B`. Watch for changes to a member's *type*, an ownership/lifetime model, or a synchronization/threading model (e.g. `std::future` → OpenMP task, `std::atomic` → `omp_atomic_t`). Those changes ripple beyond the conflict region. +- Check how the **already-merged, non-conflicted files** use the symbol. If a caller (constructor call, factory, task spawn) was auto-merged to one branch's signature, the conflicted file must conform to that branch — keeping the other branch's member or wait logic leaves it dead. +- When one branch *removed* a mechanism and the other *built on top of it*, the correct resolution is usually to adopt the removal (the newer baseline) and re-port the feature onto the new mechanism — not to keep both, which yields a member that is never set and a guard that never fires. + +A wrong merge resolution frequently **compiles cleanly and fails silently**: a dead pointer stays `nullptr`, the guard that depended on it never triggers, and a whole feature quietly disables itself with no error. Compilation is not evidence of a correct merge — trace the runtime wiring (who sets this field? who waits on it? is that path still reachable?) before declaring the conflict resolved. 
+ ## Common Tasks ### Adding a Solver Parameter From 9cfd2b99a4375ae87fde10309278c5e467987908 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 12:27:09 +0200 Subject: [PATCH 34/47] more asserts --- cpp/src/cuts/cuts.cpp | 65 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 6142fb1329..ef38eea63d 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -531,8 +531,14 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert "Zero-half cut lower bounds size mismatch"); cuopt_assert(static_cast(num_vars) <= xstar.size(), "Zero-half cut xstar size mismatch"); - const i_t m = static_cast((cycle_size - 1) / 2); - const f_t f_m = static_cast(m); + const i_t m = static_cast((cycle_size - 1) / 2); + const f_t f_m = static_cast(m); + // The guard above rejects even or <5 cycles, so the cycle decomposes as + // exactly 2m+1 literals with m >= 2. The whole zero-half lift (rhs = -m, + // unit cycle coefficients, m-weighted wheel centers) depends on this. + cuopt_assert(2 * m + 1 == static_cast(cycle_size), + "Zero-half cut: cycle_size must equal 2m+1 (odd cycle)"); + cuopt_assert(m >= 2, "Zero-half cut: odd cycle must have length >= 5 (m >= 2)"); const f_t total_size = static_cast(cycle_size + wheel_centers.size()); const f_t estimated_work = 8.0 * total_size + 2.0 * total_size * std::log2(total_size + 1.0); if (add_work_estimate(estimated_work, work_estimate, max_work_estimate)) { @@ -617,9 +623,18 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert cut.x.reserve(coeff_by_var.size()); for (const auto& kv : coeff_by_var) { if (std::abs(kv.second) <= coeff_zero_tol) { continue; } + // Each variable appears at most once on the cycle (contributing +/-1) and + // at most once among the wheel centers (contributing +/-m), so no final + // coefficient can exceed 1 + m in magnitude. 
A larger value means a vertex + // was double-counted in accumulation. + cuopt_assert(std::abs(kv.second) <= f_m + static_cast(1) + bound_tol, + "Zero-half coefficient exceeds 1 + m (vertex double-counted?)"); cut.i.push_back(kv.first); cut.x.push_back(kv.second); } + // Support is bounded by the number of distinct accumulated vertices. + cuopt_assert(cut.i.size() <= cycle_size + wheel_centers.size(), + "Zero-half cut support exceeds accumulated vertex count"); if (cut.i.empty()) { ZERO_HALF_DEBUG("build_zero_half_cut empty support after accumulation"); @@ -768,6 +783,10 @@ bool dijkstra_odd_cycle(i_t source_local, return false; } total_weight = dist[target_idx]; + // All G' edge weights are clamped to >= 0, so the shortest-path distance must + // be non-negative; a negative total means the clamp/relaxation invariant broke. + cuopt_assert(total_weight >= -static_cast(1e-9), + "Zero-half Dijkstra shortest-path distance must be non-negative"); if (cutoff > 0 && total_weight >= cutoff) { ZERO_HALF_DEBUG("dijkstra_odd_cycle path too long total=%g cutoff=%g", static_cast(total_weight), @@ -783,8 +802,18 @@ bool dijkstra_odd_cycle(i_t source_local, cuopt_assert(!path.empty(), "Zero-half Dijkstra path empty"); cuopt_assert(path.back() == source_idx, "Zero-half Dijkstra path missing source"); std::reverse(path.begin(), path.end()); + cuopt_assert(path.front() == source_idx, "Zero-half Dijkstra path must start at source"); + cuopt_assert(path.back() == target_idx, "Zero-half Dijkstra path must end at target"); // bipartite path from j1 to j2 must have odd number of edges cuopt_assert((path.size() % 2) == 0, "Zero-half bipartite path must have even node count"); +#ifdef ASSERT_MODE + // Every G' edge crosses between the two bipartite copies, so consecutive path + // nodes must live in opposite parts (part = bipartite_idx / num_local). 
+ for (size_t k = 0; k + 1 < path.size(); ++k) { + cuopt_assert((path[k] / num_local) != (path[k + 1] / num_local), + "Zero-half Dijkstra path must alternate bipartite parts"); + } +#endif ZERO_HALF_DEBUG("dijkstra_odd_cycle done path.size=%zu total_weight=%g pops=%lld", path.size(), static_cast(total_weight), @@ -889,6 +918,12 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, } cycle_vertices.push_back(global); } + // Each local-sequence entry maps to exactly one distinct CG vertex (duplicates + // were rejected above), so the extracted cycle keeps the odd length of the + // de-duplicated path. + cuopt_assert(cycle_vertices.size() == local_seq.size(), + "Zero-half cycle dropped vertices during global mapping"); + cuopt_assert((cycle_vertices.size() % 2) == 1, "Zero-half extracted cycle must have odd length"); ZERO_HALF_DEBUG("path_to_odd_cycle done cycle_vertices.size=%zu", cycle_vertices.size()); return cycle_vertices.size() >= 5; } @@ -1015,6 +1050,22 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, } if (adj_to_wheel) { wheel_centers.push_back(candidate); } } +#ifdef ASSERT_MODE + // Post-condition: the selected centers must form a clique that is fully + // adjacent to the cycle — each center adjacent to every cycle vertex and to + // every other center. This is exactly what makes the m-weighted wheel lift a + // valid zero-half inequality. 
+ for (size_t a = 0; a < wheel_centers.size(); ++a) { + for (const auto cv : cycle_vertices) { + cuopt_assert(graph.check_adjacency(wheel_centers[a], cv), + "Zero-half wheel center not adjacent to every cycle vertex"); + } + for (size_t b = a + 1; b < wheel_centers.size(); ++b) { + cuopt_assert(graph.check_adjacency(wheel_centers[a], wheel_centers[b]), + "Zero-half wheel centers must be mutually adjacent (clique)"); + } + } +#endif ZERO_HALF_DEBUG("extend_to_odd_wheel done wheel_centers.size=%zu", wheel_centers.size()); } @@ -3791,6 +3842,12 @@ bool cut_generation_t::generate_zero_half_cuts( continue; } cycles_found++; + cuopt_assert(cycle_vertices.size() >= 5 && (cycle_vertices.size() % 2) == 1, + "Zero-half separated cycle must be odd with length >= 5"); + // dijkstra_odd_cycle only returns true when the path stays below the + // half-integer cutoff, the precondition for the cycle to yield a violation. + cuopt_assert(cutoff <= static_cast(0) || total_weight < cutoff, + "Zero-half cycle weight must be below cutoff"); ZERO_HALF_DEBUG("cycle found s=%lld cycle_vertices.size=%zu", static_cast(s), cycle_vertices.size()); @@ -3828,6 +3885,10 @@ bool cut_generation_t::generate_zero_half_cuts( return false; } if (build_status == clique_cut_build_status_t::CUT_ADDED) { + // Only violated cuts are worth pooling; build_zero_half_cut promised a + // violation > min_violation, so re-check it before we commit. 
+ cuopt_assert(cut_rhs - cut.dot(xstar) > min_violation - bound_tol, + "Zero-half cut added to pool must be violated by xstar"); inequality_t cut_inequality; cut_inequality.vector = cut; cut_inequality.rhs = cut_rhs; From b7f2d0b30a15240e2eac1c4ec31ed3612b79c996 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 5 Jun 2026 12:45:52 +0200 Subject: [PATCH 35/47] change implied bound cut order --- cpp/src/cuts/cuts.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index ef38eea63d..8b15cba6db 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -3451,6 +3451,16 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, } } + // Generate implied bound cuts + if (settings.implied_bound_cuts != 0) { + f_t cut_start_time = tic(); + generate_implied_bound_cuts(lp, settings, var_types, xstar, start_time); + f_t cut_generation_time = toc(cut_start_time); + if (cut_generation_time > 1.0) { + settings.log.debug("Implied bounds cut generation time %.2f seconds\n", cut_generation_time); + } + } + // Build the fractional conflict-graph subgraph once (resolving the async // clique-table future on the way) so both clique-cut and zero-half cut // separators consume the same vertex/weight/adjacency tables instead of @@ -3492,16 +3502,6 @@ bool cut_generation_t::generate_cuts(const lp_problem_t& lp, ZERO_HALF_DEBUG("generate_cuts: zero_half_cuts disabled (setting=%d)", static_cast(settings.zero_half_cuts)); } - - // Generate implied bound cuts - if (settings.implied_bound_cuts != 0) { - f_t cut_start_time = tic(); - generate_implied_bound_cuts(lp, settings, var_types, xstar, start_time); - f_t cut_generation_time = toc(cut_start_time); - if (cut_generation_time > 1.0) { - settings.log.debug("Implied bounds cut generation time %.2f seconds\n", cut_generation_time); - } - } return true; } From 4cda3f5e6b373d6384916a642208e462a0d53601 Mon Sep 17 00:00:00 2001 From: akif Date: Mon, 8 Jun 
2026 11:56:26 +0200 Subject: [PATCH 36/47] remove debug lines --- cpp/src/cuts/cuts.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 8b15cba6db..bcbf5df681 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -757,16 +757,17 @@ bool dijkstra_odd_cycle(i_t source_local, } cuopt_assert(v_local >= 0 && v_local < num_local, "Zero-half Dijkstra neighbor out of range"); // Edge weight = (1 − x_u − x_v) / 2, where x_u/x_v are the LP values of - // the literals at u and v. For a CG edge the conflict constraint - // x_u + x_v <= 1 must hold, so the weight is non-negative. Tiny - // negative values arise from FP drift; clamp them. A *significantly* - // negative weight means the LP is meaningfully violating a conflict - // constraint — that's an upstream bug we want to know about, hence - // the debug-only assert with a generous tolerance. + // the literals at u and v. A conflict-graph edge x_u + x_v <= 1 is an + // *implied* clique inequality (e.g. derived from a knapsack constraint + // when a_i + a_j > rhs, see clique_table.cu): it is valid for every + // integer-feasible point but is NOT explicitly enforced in the LP. So a + // fractional LP optimum routinely violates x_u + x_v <= 1 — that is + // exactly the violation the odd-cycle / zero-half separator exists to + // exploit. A negative raw edge weight is therefore expected, not a bug. + // We clamp it to 0 so the bipartite shortest path stays non-negative; a + // strongly violated edge then becomes a 0-weight (very attractive) edge, + // which is the desired behavior. 
f_t edge_w = (static_cast(1) - weights[u_local] - weights[v_local]) / 2; - cuopt_assert(edge_w >= -static_cast(1e-6), - "Zero-half edge weight significantly negative — conflict constraint violated by " - "LP?"); if (edge_w < 0) { edge_w = 0; } const i_t v = v_local + v_part * num_local; const f_t nd = d + edge_w; From 1ec6536983021dd478706782197410428bbcc40b Mon Sep 17 00:00:00 2001 From: akif Date: Mon, 8 Jun 2026 15:12:14 +0200 Subject: [PATCH 37/47] clean up debug logs and comments --- cpp/src/cuts/cuts.cpp | 112 +----------------------------------------- 1 file changed, 2 insertions(+), 110 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index bcbf5df681..655b4fe308 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -426,12 +426,6 @@ void extend_clique_vertices(std::vector& clique_vertices, const f_t sort_work = candidate_size > 0.0 ? 2.0 * candidate_size * std::log2(candidate_size + 1.0) : 0.0; const f_t adj_set_build_cost = 2.0 * static_cast(adj_set.size()); - // P0-3 (2): account for the addtl_cliques scan that - // clique_table_t::check_adjacency performs on every adjacency probe. - // Baseline ignored this, so on instances with many addtl_clique entries - // the extension loop dominated cut-generation wall time without being - // attributed to clique cuts. avg_slice_size of var_clique_addtl is a - // robust proxy for the per-call addtl scan cost. 
const f_t addtl_cliques_scan_cost = 1.0 + static_cast(graph.var_clique_addtl.avg_slice_size()); const f_t adj_check_cost = 5.0 + addtl_cliques_scan_cost; @@ -568,19 +562,9 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert ZERO_HALF_DEBUG(" acc vertex_idx=%lld (range [0, %lld))", static_cast(vertex_idx), static_cast(2 * num_vars)); - if (vertex_idx < 0 || vertex_idx >= 2 * num_vars) { - ZERO_HALF_DEBUG(" acc OUT_OF_RANGE vertex_idx=%lld", static_cast(vertex_idx)); - return clique_cut_build_status_t::NO_CUT; - } cuopt_assert(vertex_idx >= 0 && vertex_idx < 2 * num_vars, "Zero-half vertex out of range"); const i_t var_idx = vertex_idx % num_vars; const bool complement = vertex_idx >= num_vars; - if (var_idx < 0 || static_cast(var_idx) >= lower_bounds.size() || - static_cast(var_idx) >= upper_bounds.size() || - static_cast(var_idx) >= var_types.size()) { - ZERO_HALF_DEBUG(" acc var_idx OUT_OF_RANGE var_idx=%lld", static_cast(var_idx)); - return clique_cut_build_status_t::NO_CUT; - } const f_t lower_bound = lower_bounds[var_idx]; const f_t upper_bound = upper_bounds[var_idx]; cuopt_assert(var_types[var_idx] != variable_type_t::CONTINUOUS, @@ -588,10 +572,6 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert cuopt_assert(lower_bound >= -bound_tol, "Zero-half variable lower bound below zero"); cuopt_assert(upper_bound <= 1 + bound_tol, "Zero-half variable upper bound above one"); - // is_cycle is currently informational only; both cycle and wheel paths - // share the same accumulation logic - (void)is_cycle; - if (complement) { if (seen_original.count(var_idx) > 0) { return clique_cut_build_status_t::NO_CUT; } seen_complement.insert(var_idx); @@ -642,10 +622,7 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert } cut_rhs = rhs_acc; - ZERO_HALF_DEBUG( - "build_zero_half_cut pre-sort nz=%zu rhs=%g", cut.i.size(), static_cast(cut_rhs)); cut.sort(); - ZERO_HALF_DEBUG("build_zero_half_cut 
post-sort nz=%zu", cut.i.size()); const f_t dot = cut.dot(xstar); const f_t violation = cut_rhs - dot; @@ -687,18 +664,8 @@ bool dijkstra_odd_cycle(i_t source_local, f_t max_work_estimate) { const i_t num_local = static_cast(local_adj.size()); - ZERO_HALF_DEBUG("dijkstra_odd_cycle enter source_local=%lld num_local=%lld weights.size=%zu", - static_cast(source_local), - static_cast(num_local), - weights.size()); - if (source_local < 0 || source_local >= num_local) { - ZERO_HALF_DEBUG("dijkstra_odd_cycle source OUT_OF_RANGE"); - return false; - } - if (weights.size() != static_cast(num_local)) { - ZERO_HALF_DEBUG("dijkstra_odd_cycle weights size mismatch"); - return false; - } + if (source_local < 0 || source_local >= num_local) { return false; } + if (weights.size() != static_cast(num_local)) { return false; } cuopt_assert(source_local >= 0 && source_local < num_local, "Zero-half Dijkstra source out of range"); cuopt_assert(weights.size() == static_cast(num_local), @@ -722,12 +689,6 @@ bool dijkstra_odd_cycle(i_t source_local, auto [d, u] = pq.top(); pq.pop(); ++pops; - if (u < 0 || u >= total_idx) { - ZERO_HALF_DEBUG("dijkstra_odd_cycle popped u OUT_OF_RANGE u=%lld total_idx=%lld", - static_cast(u), - static_cast(total_idx)); - return false; - } if (d > dist[u]) { continue; } if (u == target_idx) { break; } if (cutoff > 0 && d >= cutoff) { break; } @@ -737,36 +698,13 @@ bool dijkstra_odd_cycle(i_t source_local, const i_t v_part = 1 - u_part; cuopt_assert(u_part == 0 || u_part == 1, "Bipartite part out of range"); - if (u_local < 0 || u_local >= static_cast(local_adj.size())) { - ZERO_HALF_DEBUG("dijkstra_odd_cycle u_local OUT_OF_RANGE u_local=%lld local_adj.size=%zu", - static_cast(u_local), - local_adj.size()); - return false; - } const auto& neigh = local_adj[u_local]; if (add_work_estimate(static_cast(neigh.size()) + 4.0, work_estimate, max_work_estimate)) { ZERO_HALF_DEBUG("dijkstra_odd_cycle work_limit hit pops=%lld", static_cast(pops)); return false; } for 
(const auto v_local : neigh) { - if (v_local < 0 || v_local >= num_local) { - ZERO_HALF_DEBUG("dijkstra_odd_cycle neighbor OUT_OF_RANGE v_local=%lld num_local=%lld", - static_cast(v_local), - static_cast(num_local)); - return false; - } cuopt_assert(v_local >= 0 && v_local < num_local, "Zero-half Dijkstra neighbor out of range"); - // Edge weight = (1 − x_u − x_v) / 2, where x_u/x_v are the LP values of - // the literals at u and v. A conflict-graph edge x_u + x_v <= 1 is an - // *implied* clique inequality (e.g. derived from a knapsack constraint - // when a_i + a_j > rhs, see clique_table.cu): it is valid for every - // integer-feasible point but is NOT explicitly enforced in the LP. So a - // fractional LP optimum routinely violates x_u + x_v <= 1 — that is - // exactly the violation the odd-cycle / zero-half separator exists to - // exploit. A negative raw edge weight is therefore expected, not a bug. - // We clamp it to 0 so the bipartite shortest path stays non-negative; a - // strongly violated edge then becomes a 0-weight (very attractive) edge, - // which is the desired behavior. 
f_t edge_w = (static_cast(1) - weights[u_local] - weights[v_local]) / 2; if (edge_w < 0) { edge_w = 0; } const i_t v = v_local + v_part * num_local; @@ -856,28 +794,12 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, std::vector local_seq; local_seq.reserve(bipartite_path.size()); for (const auto bv : bipartite_path) { - if (num_local <= 0) { - ZERO_HALF_DEBUG("path_to_odd_cycle num_local <= 0 num_local=%lld", - static_cast(num_local)); - return false; - } local_seq.push_back(bv % num_local); } - // First and last entry should both correspond to the source CG vertex - if (local_seq.front() != local_seq.back()) { - ZERO_HALF_DEBUG("path_to_odd_cycle endpoints mismatch front=%lld back=%lld", - static_cast(local_seq.front()), - static_cast(local_seq.back())); - return false; - } cuopt_assert(local_seq.front() == local_seq.back(), "Zero-half cycle path endpoints must match"); // Drop the duplicate end so we have a sequence covering each cycle vertex once local_seq.pop_back(); - if ((local_seq.size() % 2) == 0 || local_seq.size() < 5) { - ZERO_HALF_DEBUG("path_to_odd_cycle reject local_seq.size=%zu", local_seq.size()); - return false; - } std::unordered_set seen_local; seen_local.reserve(local_seq.size()); @@ -894,22 +816,7 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, std::unordered_set seen_var; seen_var.reserve(local_seq.size()); for (const auto lv : local_seq) { - if (lv < 0 || lv >= num_local || static_cast(lv) >= vertices.size()) { - ZERO_HALF_DEBUG( - "path_to_odd_cycle local idx OUT_OF_RANGE lv=%lld num_local=%lld vertices.size=%zu", - static_cast(lv), - static_cast(num_local), - vertices.size()); - return false; - } - cuopt_assert(lv >= 0 && lv < num_local, "Zero-half local idx out of range"); const i_t global = vertices[lv]; - if (global < 0 || global >= 2 * num_vars) { - ZERO_HALF_DEBUG("path_to_odd_cycle global vertex OUT_OF_RANGE global=%lld 2*num_vars=%lld", - static_cast(global), - static_cast(2 * num_vars)); - return false; - } 
cuopt_assert(global >= 0 && global < 2 * num_vars, "Zero-half global vertex out of range"); const i_t var_idx = global % num_vars; if (!seen_var.insert(var_idx).second) { @@ -958,11 +865,6 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, i_t smallest_degree_var = -1; for (auto v : cycle_vertices) { if (toc(start_time) >= time_limit) { return; } - if (v < 0 || v >= 2 * num_vars) { - ZERO_HALF_DEBUG("extend_to_odd_wheel cycle vertex OUT_OF_RANGE v=%lld", - static_cast(v)); - return; - } i_t degree = graph.get_degree_of_var(v); if (degree < smallest_degree) { smallest_degree = degree; @@ -981,11 +883,6 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, candidates.reserve(adj_set.size()); for (const auto candidate : adj_set) { if (toc(start_time) >= time_limit) { return; } - if (candidate < 0 || candidate >= 2 * num_vars) { - ZERO_HALF_DEBUG("extend_to_odd_wheel candidate OUT_OF_RANGE candidate=%lld", - static_cast(candidate)); - continue; - } if (cycle_members.count(candidate) != 0) { continue; } bool adj_to_all = true; for (const auto v : cycle_vertices) { @@ -1017,11 +914,6 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, auto reduced_cost = [&](i_t vertex_idx) -> f_t { i_t var_idx = vertex_idx % num_vars; - if (var_idx < 0 || static_cast(var_idx) >= reduced_costs.size()) { - ZERO_HALF_DEBUG("extend_to_odd_wheel reduced_cost OUT_OF_RANGE var_idx=%lld", - static_cast(var_idx)); - return 0.0; - } cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), "Reduced cost index out of range"); f_t rc = reduced_costs[var_idx]; From 61653c4f0c0f58768755e72466569ca5b0e316f2 Mon Sep 17 00:00:00 2001 From: akif Date: Mon, 8 Jun 2026 16:43:27 +0200 Subject: [PATCH 38/47] tidy logs --- cpp/src/cuts/cuts.cpp | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 655b4fe308..998ddd4bdc 100644 --- a/cpp/src/cuts/cuts.cpp 
+++ b/cpp/src/cuts/cuts.cpp @@ -38,33 +38,32 @@ namespace { enum class clique_cut_build_status_t : int8_t { NO_CUT = 0, CUT_ADDED = 1, INFEASIBLE = 2 }; -#if DEBUG_CLIQUE_CUTS -#define CLIQUE_CUTS_DEBUG(...) \ - do { \ - std::fprintf(stderr, "[DEBUG_CLIQUE_CUTS] "); \ - std::fprintf(stderr, __VA_ARGS__); \ - std::fprintf(stderr, "\n"); \ +// Shared crash-tolerant debug logger: writes a prefixed line to stderr and +// flushes immediately so the last line is visible even if the process +// aborts/terminates right after. Each channel below enables it through its own +// DEBUG_* flag and supplies its own prefix; when the flag is 0 the call expands +// to a no-op that still consumes its arguments. +#define CUTS_DEBUG_LOG(prefix, ...) \ + do { \ + std::fprintf(stderr, prefix " "); \ + std::fprintf(stderr, __VA_ARGS__); \ + std::fprintf(stderr, "\n"); \ + std::fflush(stderr); \ } while (0) -#else -#define CLIQUE_CUTS_DEBUG(...) \ - do { \ +#define CUTS_DEBUG_NOOP(...) \ + do { \ } while (0) + +#if DEBUG_CLIQUE_CUTS +#define CLIQUE_CUTS_DEBUG(...) CUTS_DEBUG_LOG("[DEBUG_CLIQUE_CUTS]", __VA_ARGS__) +#else +#define CLIQUE_CUTS_DEBUG(...) CUTS_DEBUG_NOOP(__VA_ARGS__) #endif -// Crash-tolerant logger: writes to stderr and flushes immediately so the -// last log line is visible even if the process aborts/terminates right after. #if DEBUG_ZERO_HALF_CUTS -#define ZERO_HALF_DEBUG(...) \ - do { \ - std::fprintf(stderr, "[zero_half] "); \ - std::fprintf(stderr, __VA_ARGS__); \ - std::fprintf(stderr, "\n"); \ - std::fflush(stderr); \ - } while (0) +#define ZERO_HALF_DEBUG(...) CUTS_DEBUG_LOG("[zero_half]", __VA_ARGS__) #else -#define ZERO_HALF_DEBUG(...) \ - do { \ - } while (0) +#define ZERO_HALF_DEBUG(...) 
CUTS_DEBUG_NOOP(__VA_ARGS__) #endif template From 79d5a6e3d5d94c1c3882a655e30df7f974028d0d Mon Sep 17 00:00:00 2001 From: akif Date: Mon, 8 Jun 2026 17:32:01 +0200 Subject: [PATCH 39/47] optimize dijstra in-loop allocation and work unit tracking --- cpp/src/cuts/cuts.cpp | 76 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 998ddd4bdc..d6e59ef348 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -644,6 +644,37 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert return clique_cut_build_status_t::NO_CUT; } +// Reusable scratch for dijkstra_odd_cycle. The separation loop runs Dijkstra +// once per source vertex; re-allocating and re-initializing dist/prev (size +// 2 * num_local) on every call is O(num_local) per call, i.e. O(num_local^2) +// over a pass — and that cost is invisible to the work-estimate budget. Instead +// we allocate the buffers once and reset them in O(1) using a generation stamp: +// dist[v]/prev[v] are considered valid for the current call only when +// stamp[v] == gen. Bumping `gen` at the start of each call logically clears the +// whole array without touching memory; entries are (re)stamped lazily as they +// are relaxed, so per-call work is O(touched) rather than O(num_local). +template +struct dijkstra_scratch_t { + std::vector dist; + std::vector prev; + std::vector stamp; // stamp[v] == gen <=> dist[v]/prev[v] valid this call + std::uint64_t gen{0}; + + // Ensure buffers cover `n` bipartite nodes. On growth, stamp is zeroed and + // gen reset so no stale entry can spuriously match a future gen; dist/prev + // need no initialization because they are only read when their stamp matches + // the current gen. 
+ void ensure_size(std::size_t n) + { + if (stamp.size() < n) { + dist.resize(n); + prev.resize(n); + stamp.assign(n, 0); + gen = 0; + } + } +}; + // Run Dijkstra over the bipartite auxiliary graph G' built from the fractional // sub-CG. local_adj is the adjacency in CG (local indices). weights[v] is the // LP value of vertex v in CG. The auxiliary graph has 2 * num_local vertices, @@ -652,6 +683,8 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert // shortest path from `source_local + 0 * num_local` to `source_local + num_local`. // On success, returns true and fills `path` with the path (sequence of bipartite // indices) and `total_weight` with its cost. Otherwise returns false. +// `scratch` holds reusable dist/prev/stamp buffers (see dijkstra_scratch_t); the +// caller owns it and reuses it across all sources in a cut pass. template bool dijkstra_odd_cycle(i_t source_local, const std::vector>& local_adj, @@ -660,7 +693,8 @@ bool dijkstra_odd_cycle(i_t source_local, std::vector& path, f_t& total_weight, f_t* work_estimate, - f_t max_work_estimate) + f_t max_work_estimate, + dijkstra_scratch_t& scratch) { const i_t num_local = static_cast(local_adj.size()); if (source_local < 0 || source_local >= num_local) { return false; } @@ -675,9 +709,19 @@ bool dijkstra_odd_cycle(i_t source_local, const i_t total_idx = 2 * num_local; const f_t f_inf = std::numeric_limits::infinity(); - std::vector dist(static_cast(total_idx), f_inf); - std::vector prev(static_cast(total_idx), -1); - dist[source_idx] = 0; + scratch.ensure_size(static_cast(total_idx)); + ++scratch.gen; + const std::uint64_t gen = scratch.gen; + auto& dist = scratch.dist; + auto& prev = scratch.prev; + auto& stamp = scratch.stamp; + // dist[v]/prev[v] are valid only if last written this call (stamp[v] == gen); + // otherwise the node is unreached, i.e. distance infinity. + auto cur_dist = [&](i_t v) -> f_t { return stamp[v] == gen ? 
dist[v] : f_inf; }; + + dist[source_idx] = 0; + prev[source_idx] = -1; + stamp[source_idx] = gen; using node_t = std::pair; std::priority_queue, std::greater> pq; @@ -688,7 +732,7 @@ bool dijkstra_odd_cycle(i_t source_local, auto [d, u] = pq.top(); pq.pop(); ++pops; - if (d > dist[u]) { continue; } + if (d > cur_dist(u)) { continue; } if (u == target_idx) { break; } if (cutoff > 0 && d >= cutoff) { break; } @@ -708,19 +752,21 @@ bool dijkstra_odd_cycle(i_t source_local, if (edge_w < 0) { edge_w = 0; } const i_t v = v_local + v_part * num_local; const f_t nd = d + edge_w; - if (nd < dist[v]) { - dist[v] = nd; - prev[v] = u; + if (nd < cur_dist(v)) { + dist[v] = nd; + prev[v] = u; + stamp[v] = gen; pq.emplace(nd, v); } } } - if (!std::isfinite(dist[target_idx])) { + const f_t target_dist = cur_dist(target_idx); + if (!std::isfinite(target_dist)) { ZERO_HALF_DEBUG("dijkstra_odd_cycle no path pops=%lld", static_cast(pops)); return false; } - total_weight = dist[target_idx]; + total_weight = target_dist; // All G' edge weights are clamped to >= 0, so the shortest-path distance must // be non-negative; a negative total means the clamp/relaxation invariant broke. 
cuopt_assert(total_weight >= -static_cast(1e-9), @@ -1050,6 +1096,7 @@ std::vector> find_violated_odd_cycles_for_test( std::vector bipartite_path; std::vector cycle_local; std::vector already_used(n_vertices, 0); + dijkstra_scratch_t dijkstra_scratch; for (int s = 0; s < num_local; ++s) { if (toc(start_time) >= time_limit) { break; } @@ -1063,7 +1110,8 @@ std::vector> find_violated_odd_cycles_for_test( bipartite_path, total_weight, &work_estimate, - max_work_estimate)) { + max_work_estimate, + dijkstra_scratch)) { continue; } cycle_local.clear(); @@ -3186,7 +3234,7 @@ void cut_generation_t::prepare_fractional_sub_cg( const f_t bound_tol = settings.primal_tol; f_t work_estimate = 0.0; - const f_t max_work_estimate = 1e8; + const f_t max_work_estimate = 1e7; sub_cg_.num_vars = num_vars; sub_cg_.vertices.reserve(static_cast(num_vars) * 2); @@ -3704,6 +3752,7 @@ bool cut_generation_t::generate_zero_half_cuts( i_t cuts_added = 0; i_t added_per_var = 0; std::vector already_used(num_local, 0); + dijkstra_scratch_t dijkstra_scratch; for (i_t s = 0; s < num_local; ++s) { if (toc(start_time) >= settings.time_limit) { break; } @@ -3721,7 +3770,8 @@ bool cut_generation_t::generate_zero_half_cuts( bipartite_path, total_weight, &work_estimate, - max_work_estimate)) { + max_work_estimate, + dijkstra_scratch)) { continue; } if (!path_to_odd_cycle(bipartite_path, From 9f8d1a4d4e115928e86ac279c8c7d944a7fbf69a Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 9 Jun 2026 13:38:37 +0200 Subject: [PATCH 40/47] remove logs and comments --- cpp/src/cuts/cuts.cpp | 253 +++++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 137 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index d6e59ef348..8fd7ab5210 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -33,7 +33,7 @@ namespace cuopt::linear_programming::dual_simplex { namespace { #define DEBUG_CLIQUE_CUTS 0 -#define DEBUG_ZERO_HALF_CUTS 1 +#define DEBUG_ZERO_HALF_CUTS 0 #define 
CHECK_WORKSPACE 0 enum class clique_cut_build_status_t : int8_t { NO_CUT = 0, CUT_ADDED = 1, INFEASIBLE = 2 }; @@ -371,6 +371,85 @@ void bron_kerbosch(bk_bitset_context_t& ctx, } } +// ---- Shared helpers for greedy CG-based set extension (clique & odd-wheel) ---- + +// Pick the seed vertex with the smallest conflict-graph degree. Returns -1 if +// the seed is empty or the time limit is hit while scanning. +template +i_t min_degree_anchor(const std::vector& seed, + detail::clique_table_t& graph, + f_t start_time, + f_t time_limit) +{ + i_t smallest_degree = std::numeric_limits::max(); + i_t smallest_degree_var = -1; + for (auto v : seed) { + if (toc(start_time) >= time_limit) { return -1; } + i_t degree = graph.get_degree_of_var(v); + if (degree < smallest_degree) { + smallest_degree = degree; + smallest_degree_var = v; + } + } + return smallest_degree_var; +} + +// Reduced-cost key for a CG vertex. A complement vertex (idx >= num_vars) maps +// to the original variable and flips the sign. Sorting candidates by this key +// keeps xstar minimally disturbed so the resulting cut stays binding and the +// dual simplex resolve stays cheap. +template +f_t cg_reduced_cost(i_t vertex_idx, const std::vector& reduced_costs, i_t num_vars) +{ + i_t var_idx = vertex_idx % num_vars; + cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), + "Reduced cost index out of range"); + f_t rc = reduced_costs[var_idx]; + if (!std::isfinite(rc)) { rc = 0.0; } + return vertex_idx >= num_vars ? -rc : rc; +} + +template +void sort_candidates_by_reduced_cost(std::vector& candidates, + const std::vector& reduced_costs, + i_t num_vars) +{ + std::sort(candidates.begin(), candidates.end(), [&](i_t a, i_t b) { + return cg_reduced_cost(a, reduced_costs, num_vars) < + cg_reduced_cost(b, reduced_costs, num_vars); + }); +} + +// Greedily grow `selected` by appending candidates (assumed already ordered by +// reduced cost) that are adjacent to every current member of `selected`. 
The +// resulting `selected` is therefore a clique. Stops early when the time or work +// budget is exhausted. +template +void greedy_extend_clique(std::vector& selected, + const std::vector& candidates, + detail::clique_table_t& graph, + f_t adj_check_cost, + f_t start_time, + f_t time_limit, + f_t* work_estimate, + f_t max_work_estimate) +{ + for (const auto candidate : candidates) { + if (toc(start_time) >= time_limit) { return; } + bool add = true; + i_t checks = 0; + for (const auto v : selected) { + checks++; + if (!graph.check_adjacency(candidate, v)) { + add = false; + break; + } + } + if (add_work_estimate(adj_check_cost * checks, work_estimate, max_work_estimate)) { break; } + if (add) { selected.push_back(candidate); } + } +} + template void extend_clique_vertices(std::vector& clique_vertices, detail::clique_table_t& graph, @@ -392,16 +471,8 @@ void extend_clique_vertices(std::vector& clique_vertices, static_cast(clique_vertices.size())); const f_t initial_clique_size = static_cast(clique_vertices.size()); - i_t smallest_degree = std::numeric_limits::max(); - i_t smallest_degree_var = -1; - for (auto v : clique_vertices) { - if (toc(start_time) >= time_limit) { return; } - i_t degree = graph.get_degree_of_var(v); - if (degree < smallest_degree) { - smallest_degree = degree; - smallest_degree_var = v; - } - } + const i_t smallest_degree_var = min_degree_anchor(clique_vertices, graph, start_time, time_limit); + if (smallest_degree_var < 0) { return; } auto adj_set = graph.get_adj_set_of_var(smallest_degree_var); std::unordered_set clique_members(clique_vertices.begin(), clique_vertices.end()); @@ -415,12 +486,10 @@ void extend_clique_vertices(std::vector& clique_vertices, f_t value = candidate >= num_vars ? 
(1.0 - xstar[var_idx]) : xstar[var_idx]; if (std::abs(value - std::round(value)) <= integer_tol) { candidates.push_back(candidate); } } - CLIQUE_CUTS_DEBUG( - "extend_clique_vertices anchor=%lld degree=%lld adj_size=%lld integer_candidates=%lld", - static_cast(smallest_degree_var), - static_cast(smallest_degree), - static_cast(adj_set.size()), - static_cast(candidates.size())); + CLIQUE_CUTS_DEBUG("extend_clique_vertices anchor=%lld adj_size=%lld integer_candidates=%lld", + static_cast(smallest_degree_var), + static_cast(adj_set.size()), + static_cast(candidates.size())); const f_t candidate_size = static_cast(candidates.size()); const f_t sort_work = candidate_size > 0.0 ? 2.0 * candidate_size * std::log2(candidate_size + 1.0) : 0.0; @@ -444,40 +513,18 @@ void extend_clique_vertices(std::vector& clique_vertices, // less refactors and less iterations after resolve. // it also increases the cut's effectiveness by keeping xstar not disturbed much // if it is disturbed too much, the cut might become non-binding - auto reduced_cost = [&](i_t vertex_idx) -> f_t { - i_t var_idx = vertex_idx % num_vars; - cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), - "Variable index out of range"); - f_t rc = reduced_costs[var_idx]; - if (!std::isfinite(rc)) { rc = 0.0; } - return vertex_idx >= num_vars ? -rc : rc; - }; - - std::sort(candidates.begin(), candidates.end(), [&](i_t a, i_t b) { - return reduced_cost(a) < reduced_cost(b); - }); - - for (const auto candidate : candidates) { - bool add = true; - i_t checks = 0; - for (const auto v : clique_vertices) { - checks++; - if (!graph.check_adjacency(candidate, v)) { - add = false; - break; - } - } - // Each check_adjacency now charges its own addtl_cliques_scan_cost - // term so the per-iteration budget reflects the addtl scan cost. 
- if (add_work_estimate( - adj_check_cost * static_cast(checks), work_estimate, max_work_estimate)) { - break; - } - if (add) { - clique_vertices.push_back(candidate); - clique_members.insert(candidate); - } - } + sort_candidates_by_reduced_cost(candidates, reduced_costs, num_vars); + + // adj_check_cost folds in addtl_cliques_scan_cost so each check_adjacency + // charges its own addtl scan cost as the clique grows. + greedy_extend_clique(clique_vertices, + candidates, + graph, + adj_check_cost, + start_time, + time_limit, + work_estimate, + max_work_estimate); CLIQUE_CUTS_DEBUG("extend_clique_vertices done start=%lld final=%lld added=%lld", static_cast(initial_clique_vertices), static_cast(clique_vertices.size()), @@ -505,16 +552,6 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert f_t max_work_estimate) { const size_t cycle_size = cycle_vertices.size(); - ZERO_HALF_DEBUG( - "build_zero_half_cut enter cycle_size=%zu wheel_centers=%zu num_vars=%lld var_types.size=%zu " - "lower.size=%zu upper.size=%zu xstar.size=%zu", - cycle_size, - wheel_centers.size(), - static_cast(num_vars), - var_types.size(), - lower_bounds.size(), - upper_bounds.size(), - xstar.size()); if (cycle_size < 5 || (cycle_size % 2) == 0) { ZERO_HALF_DEBUG("build_zero_half_cut reject cycle_size=%zu", cycle_size); return clique_cut_build_status_t::NO_CUT; @@ -634,25 +671,16 @@ clique_cut_build_status_t build_zero_half_cut(const std::vector& cycle_vert static_cast(min_violation), static_cast(cycle_size), static_cast(wheel_centers.size())); - // Dijkstra found a path < 0.5 − min_violation, so the violation should be - // > min_violation here (modulo wheel-lift effects, dropped near-zero - // coefficients, and FP reorder). Slight drift below the threshold is fine - // — we just won't ship the cut. A *strongly* negative violation indicates - // a real bug in cycle construction, the wheel lift, or the cut algebra. 
cuopt_assert(violation > -bound_tol, "Zero-half cut violation flipped sign unexpectedly"); if (violation > min_violation) { return clique_cut_build_status_t::CUT_ADDED; } return clique_cut_build_status_t::NO_CUT; } // Reusable scratch for dijkstra_odd_cycle. The separation loop runs Dijkstra -// once per source vertex; re-allocating and re-initializing dist/prev (size -// 2 * num_local) on every call is O(num_local) per call, i.e. O(num_local^2) -// over a pass — and that cost is invisible to the work-estimate budget. Instead +// once per source vertex; re-allocating and re-initializing dist/prev. Instead // we allocate the buffers once and reset them in O(1) using a generation stamp: // dist[v]/prev[v] are considered valid for the current call only when -// stamp[v] == gen. Bumping `gen` at the start of each call logically clears the -// whole array without touching memory; entries are (re)stamped lazily as they -// are relaxed, so per-call work is O(touched) rather than O(num_local). +// stamp[v] == gen. template struct dijkstra_scratch_t { std::vector dist; @@ -660,10 +688,6 @@ struct dijkstra_scratch_t { std::vector stamp; // stamp[v] == gen <=> dist[v]/prev[v] valid this call std::uint64_t gen{0}; - // Ensure buffers cover `n` bipartite nodes. On growth, stamp is zeroed and - // gen reset so no stale entry can spuriously match a future gen; dist/prev - // need no initialization because they are only read when their stamp matches - // the current gen. void ensure_size(std::size_t n) { if (stamp.size() < n) { @@ -675,16 +699,6 @@ struct dijkstra_scratch_t { } }; -// Run Dijkstra over the bipartite auxiliary graph G' built from the fractional -// sub-CG. local_adj is the adjacency in CG (local indices). weights[v] is the -// LP value of vertex v in CG. The auxiliary graph has 2 * num_local vertices, -// with bipartite_idx = local_idx + part * num_local, part in {0, 1}. -// Edge weight in G' is max(0, (1 - weights[u] - weights[v]) / 2). 
We seek the -// shortest path from `source_local + 0 * num_local` to `source_local + num_local`. -// On success, returns true and fills `path` with the path (sequence of bipartite -// indices) and `total_weight` with its cost. Otherwise returns false. -// `scratch` holds reusable dist/prev/stamp buffers (see dijkstra_scratch_t); the -// caller owns it and reuses it across all sources in a cut pass. template bool dijkstra_odd_cycle(i_t source_local, const std::vector>& local_adj, @@ -805,10 +819,6 @@ bool dijkstra_odd_cycle(i_t source_local, return true; } -// Convert a bipartite-graph path (sequence of bipartite indices) into a simple -// odd cycle expressed as global CG vertex indices in [0, 2*num_vars). Returns -// true and fills `cycle_vertices` if a simple cycle of odd length >= 5 (so > -// triangle) was successfully extracted. template bool path_to_odd_cycle(const std::vector& bipartite_path, const std::vector& vertices, @@ -871,9 +881,6 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, } cycle_vertices.push_back(global); } - // Each local-sequence entry maps to exactly one distinct CG vertex (duplicates - // were rejected above), so the extracted cycle keeps the odd length of the - // de-duplicated path. cuopt_assert(cycle_vertices.size() == local_seq.size(), "Zero-half cycle dropped vertices during global mapping"); cuopt_assert((cycle_vertices.size() % 2) == 1, "Zero-half extracted cycle must have odd length"); @@ -882,8 +889,7 @@ bool path_to_odd_cycle(const std::vector& bipartite_path, } // Greedy lifting: extend an odd cycle by attaching a clique of "wheel center" -// vertices that are adjacent (in CG) to every vertex of the cycle. Mirrors the -// behavior of extend_clique_vertices but uses the cycle as the seed. +// vertices that are adjacent (in CG) to every vertex of the cycle. 
template void extend_to_odd_wheel(const std::vector& cycle_vertices, std::vector& wheel_centers, @@ -906,19 +912,9 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, if (cycle_vertices.empty()) { return; } if (toc(start_time) >= time_limit) { return; } - i_t smallest_degree = std::numeric_limits::max(); - i_t smallest_degree_var = -1; - for (auto v : cycle_vertices) { - if (toc(start_time) >= time_limit) { return; } - i_t degree = graph.get_degree_of_var(v); - if (degree < smallest_degree) { - smallest_degree = degree; - smallest_degree_var = v; - } - } - ZERO_HALF_DEBUG("extend_to_odd_wheel smallest_degree_var=%lld smallest_degree=%lld", - static_cast(smallest_degree_var), - static_cast(smallest_degree)); + const i_t smallest_degree_var = min_degree_anchor(cycle_vertices, graph, start_time, time_limit); + ZERO_HALF_DEBUG("extend_to_odd_wheel smallest_degree_var=%lld", + static_cast(smallest_degree_var)); if (smallest_degree_var < 0) { return; } auto adj_set = graph.get_adj_set_of_var(smallest_degree_var); @@ -957,37 +953,20 @@ void extend_to_odd_wheel(const std::vector& cycle_vertices, return; } - auto reduced_cost = [&](i_t vertex_idx) -> f_t { - i_t var_idx = vertex_idx % num_vars; - cuopt_assert(var_idx >= 0 && var_idx < static_cast(reduced_costs.size()), - "Reduced cost index out of range"); - f_t rc = reduced_costs[var_idx]; - if (!std::isfinite(rc)) { rc = 0.0; } - return vertex_idx >= num_vars ? -rc : rc; - }; - - std::sort(candidates.begin(), candidates.end(), [&](i_t a, i_t b) { - return reduced_cost(a) < reduced_cost(b); - }); + sort_candidates_by_reduced_cost(candidates, reduced_costs, num_vars); + // Candidates are already adjacent to every cycle vertex (filtered above), so + // growing a clique among them yields centers adjacent to the whole cycle and + // to each other. 
const f_t adj_check_cost = 5.0; - for (const auto candidate : candidates) { - if (toc(start_time) >= time_limit) { return; } - bool adj_to_wheel = true; - i_t checks = 0; - for (const auto w : wheel_centers) { - checks++; - if (!graph.check_adjacency(candidate, w)) { - adj_to_wheel = false; - break; - } - } - if (add_work_estimate( - adj_check_cost * static_cast(checks), work_estimate, max_work_estimate)) { - break; - } - if (adj_to_wheel) { wheel_centers.push_back(candidate); } - } + greedy_extend_clique(wheel_centers, + candidates, + graph, + adj_check_cost, + start_time, + time_limit, + work_estimate, + max_work_estimate); #ifdef ASSERT_MODE // Post-condition: the selected centers must form a clique that is fully // adjacent to the cycle — each center adjacent to every cycle vertex and to From 09bdecdce0d6b9936cb8da99dd92eb585d55e8db Mon Sep 17 00:00:00 2001 From: akif Date: Tue, 9 Jun 2026 17:53:23 +0200 Subject: [PATCH 41/47] test the cublas fix --- .../linear_programming/pdlp/solver_settings.hpp | 4 ++++ cpp/src/barrier/sparse_cholesky.cuh | 13 +++++++++++++ cpp/src/dual_simplex/simplex_solver_settings.hpp | 5 +++++ cpp/src/pdlp/solve.cu | 9 +++++++++ 4 files changed, 31 insertions(+) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index b30286f9ce..7b4aa745f5 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -311,6 +311,10 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr}; + // Set by run_concurrent: 1 while PDLP is running, 0 once it exits. Forwarded to + // the barrier's settings so the cuDSS barrier waits for PDLP to exit before + // tearing down (avoids cuDSS teardown corrupting a live PDLP capture). 
+ std::atomic* pdlp_running{nullptr}; // Shared strong branching solved flags for cooperative DS + PDLP cuda::std::span> shared_sb_solved; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh index c2223e5080..5c02bd35c7 100644 --- a/cpp/src/barrier/sparse_cholesky.cuh +++ b/cpp/src/barrier/sparse_cholesky.cuh @@ -20,6 +20,9 @@ #include "cudss.h" +#include +#include + namespace cuopt::linear_programming::dual_simplex { template @@ -354,6 +357,16 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { ~sparse_cholesky_cudss_t() override { + // In a concurrent solve, wait until the PDLP peer has exited before tearing + // down cuDSS. cuDSS destroy device-synchronizes, which corrupts a live PDLP + // CUDA graph capture (surfaces as CUBLAS_STATUS_INTERNAL_ERROR). The flag is + // cleared right after run_pdlp returns, so this cannot deadlock. + if (settings_.pdlp_running != nullptr) { + while (settings_.pdlp_running->load(std::memory_order_acquire) != 0) { + std::this_thread::yield(); + } + } + cudaFreeAsync(csr_values_d, stream); cudaFreeAsync(csr_columns_d, stream); cudaFreeAsync(csr_offset_d, stream); diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 000ff4bcef..7811e898b7 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -247,6 +247,11 @@ struct simplex_solver_settings_t { mutable logger_t log; std::atomic* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should // continue, 1 if solver should halt + // Non-owning flag set by a concurrent PDLP peer: 1 while PDLP is still running + // (possibly capturing a CUDA graph), 0 once it has exited. The barrier waits for + // this to reach 0 before destroying its cuDSS solver, so cuDSS teardown (which + // device-synchronizes) cannot corrupt a live PDLP capture. 
nullptr = no wait. + std::atomic* pdlp_running = nullptr; // Optional non-owning pointer to run-level benchmark stats. cuopt::linear_programming::benchmark_info_t* benchmark_info_ptr = nullptr; }; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 07a4676120..022f6f35ed 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -501,6 +501,7 @@ run_barrier(dual_simplex::user_problem_t& user_problem, barrier_settings.time_limit = settings.time_limit; barrier_settings.iteration_limit = settings.iteration_limit; barrier_settings.concurrent_halt = settings.concurrent_halt; + barrier_settings.pdlp_running = settings.pdlp_running; barrier_settings.folding = settings.folding; barrier_settings.augmented = settings.augmented; barrier_settings.dualize = settings.dualize; @@ -1528,6 +1529,12 @@ optimization_problem_solution_t run_concurrent( settings_pdlp.concurrent_halt = &global_concurrent_halt; } + // 1 while PDLP runs, 0 once it exits. The barrier task waits for this to reach 0 + // before destroying its cuDSS solver, so cuDSS teardown (which device-syncs) + // never collides with a live PDLP CUDA graph capture. + std::atomic pdlp_running{1}; + settings_pdlp.pdlp_running = &pdlp_running; + // Make sure allocations are done on the original stream problem.handle_ptr->sync_stream(); @@ -1639,6 +1646,8 @@ optimization_problem_solution_t run_concurrent( pdlp_exception = std::current_exception(); request_concurrent_halt(); } + // PDLP has exited (no more capture): let the barrier proceed to teardown. + pdlp_running.store(0, std::memory_order_release); // Implicit taskgroup barrier joins all spawned tasks below. } }; From e39f6dbf4b2c1359427a68fb62979ef4a078be09 Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 10 Jun 2026 10:22:48 +0200 Subject: [PATCH 42/47] Revert "test the cublas fix" This reverts commit 09bdecdce0d6b9936cb8da99dd92eb585d55e8db. 
--- .../linear_programming/pdlp/solver_settings.hpp | 4 ---- cpp/src/barrier/sparse_cholesky.cuh | 13 ------------- cpp/src/dual_simplex/simplex_solver_settings.hpp | 5 ----- cpp/src/pdlp/solve.cu | 9 --------- 4 files changed, 31 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 7b4aa745f5..b30286f9ce 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -311,10 +311,6 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr}; - // Set by run_concurrent: 1 while PDLP is running, 0 once it exits. Forwarded to - // the barrier's settings so the cuDSS barrier waits for PDLP to exit before - // tearing down (avoids cuDSS teardown corrupting a live PDLP capture). - std::atomic* pdlp_running{nullptr}; // Shared strong branching solved flags for cooperative DS + PDLP cuda::std::span> shared_sb_solved; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh index 5c02bd35c7..c2223e5080 100644 --- a/cpp/src/barrier/sparse_cholesky.cuh +++ b/cpp/src/barrier/sparse_cholesky.cuh @@ -20,9 +20,6 @@ #include "cudss.h" -#include -#include - namespace cuopt::linear_programming::dual_simplex { template @@ -357,16 +354,6 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { ~sparse_cholesky_cudss_t() override { - // In a concurrent solve, wait until the PDLP peer has exited before tearing - // down cuDSS. cuDSS destroy device-synchronizes, which corrupts a live PDLP - // CUDA graph capture (surfaces as CUBLAS_STATUS_INTERNAL_ERROR). The flag is - // cleared right after run_pdlp returns, so this cannot deadlock. 
- if (settings_.pdlp_running != nullptr) { - while (settings_.pdlp_running->load(std::memory_order_acquire) != 0) { - std::this_thread::yield(); - } - } - cudaFreeAsync(csr_values_d, stream); cudaFreeAsync(csr_columns_d, stream); cudaFreeAsync(csr_offset_d, stream); diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 7811e898b7..000ff4bcef 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -247,11 +247,6 @@ struct simplex_solver_settings_t { mutable logger_t log; std::atomic* concurrent_halt; // if nullptr ignored, if !nullptr, 0 if solver should // continue, 1 if solver should halt - // Non-owning flag set by a concurrent PDLP peer: 1 while PDLP is still running - // (possibly capturing a CUDA graph), 0 once it has exited. The barrier waits for - // this to reach 0 before destroying its cuDSS solver, so cuDSS teardown (which - // device-synchronizes) cannot corrupt a live PDLP capture. nullptr = no wait. - std::atomic* pdlp_running = nullptr; // Optional non-owning pointer to run-level benchmark stats. cuopt::linear_programming::benchmark_info_t* benchmark_info_ptr = nullptr; }; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 022f6f35ed..07a4676120 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -501,7 +501,6 @@ run_barrier(dual_simplex::user_problem_t& user_problem, barrier_settings.time_limit = settings.time_limit; barrier_settings.iteration_limit = settings.iteration_limit; barrier_settings.concurrent_halt = settings.concurrent_halt; - barrier_settings.pdlp_running = settings.pdlp_running; barrier_settings.folding = settings.folding; barrier_settings.augmented = settings.augmented; barrier_settings.dualize = settings.dualize; @@ -1529,12 +1528,6 @@ optimization_problem_solution_t run_concurrent( settings_pdlp.concurrent_halt = &global_concurrent_halt; } - // 1 while PDLP runs, 0 once it exits. 
The barrier task waits for this to reach 0 - // before destroying its cuDSS solver, so cuDSS teardown (which device-syncs) - // never collides with a live PDLP CUDA graph capture. - std::atomic pdlp_running{1}; - settings_pdlp.pdlp_running = &pdlp_running; - // Make sure allocations are done on the original stream problem.handle_ptr->sync_stream(); @@ -1646,8 +1639,6 @@ optimization_problem_solution_t run_concurrent( pdlp_exception = std::current_exception(); request_concurrent_halt(); } - // PDLP has exited (no more capture): let the barrier proceed to teardown. - pdlp_running.store(0, std::memory_order_release); // Implicit taskgroup barrier joins all spawned tasks below. } }; From 933778ac2d9225bbf8ee207906591af674c2ff1c Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 10 Jun 2026 15:02:11 +0200 Subject: [PATCH 43/47] fix hang and cublas bug --- .../diversity/diversity_manager.cu | 51 +++++++++++++------ cpp/src/utilities/manual_cuda_graph.cuh | 43 ++++++++++++---- det_one.py | 40 +++++++++++++++ determinism_milp_test.py | 43 ++++++++++++++++ 4 files changed, 153 insertions(+), 24 deletions(-) create mode 100644 det_one.py create mode 100644 determinism_milp_test.py diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index 7b038d6fa6..b88099d8ae 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -493,15 +493,22 @@ solution_t diversity_manager_t::run_solver() timer_t lp_timer(lp_time_limit); auto lp_result = solve_lp_with_method(*problem_ptr, pdlp_settings, lp_timer); + // The concurrent root LP can fail to produce a usable solution -- e.g. the barrier + // hits a numerical error on an infeasible problem and PDLP returns NumericalError + // with empty primal/dual. 
In that case we must not copy or hand off the empty + // result (copying n elements from an empty buffer throws), and we must still + // release B&B's root-relaxation wait so it proceeds with its own dual-simplex root + // instead of spinning forever. + const bool root_lp_usable = + lp_result.get_termination_status() != pdlp_termination_status_t::NumericalError && + lp_result.get_primal_solution().size() == lp_optimal_solution.size() && + lp_result.get_dual_solution().size() == lp_dual_optimal_solution.size(); + bool use_staged_simplex_solution = false; { std::lock_guard guard(relaxed_solution_mutex); use_staged_simplex_solution = simplex_solution_exists.load(); - if (!use_staged_simplex_solution) { - cuopt_assert(lp_result.get_primal_solution().size() == lp_optimal_solution.size(), - "LP optimal solution size mismatch"); - cuopt_assert(lp_result.get_dual_solution().size() == lp_dual_optimal_solution.size(), - "LP dual optimal solution size mismatch"); + if (!use_staged_simplex_solution && root_lp_usable) { raft::copy(lp_optimal_solution.data(), lp_result.get_primal_solution().data(), lp_optimal_solution.size(), @@ -513,14 +520,26 @@ solution_t diversity_manager_t::run_solver() } } if (use_staged_simplex_solution) { consume_staged_simplex_solution(lp_state); } - cuopt_assert(thrust::all_of(problem_ptr->handle_ptr->get_thrust_policy(), - lp_optimal_solution.begin(), - lp_optimal_solution.end(), - [] __host__ __device__(f_t val) { return std::isfinite(val); }), - "LP optimal solution contains non-finite values"); + if (use_staged_simplex_solution || root_lp_usable) { + cuopt_assert(thrust::all_of(problem_ptr->handle_ptr->get_thrust_policy(), + lp_optimal_solution.begin(), + lp_optimal_solution.end(), + [] __host__ __device__(f_t val) { return std::isfinite(val); }), + "LP optimal solution contains non-finite values"); + } ls.lp_optimal_exists = true; if (!use_staged_simplex_solution) { - if (lp_result.get_termination_status() == pdlp_termination_status_t::Optimal) { 
+ if (!root_lp_usable) { + // The concurrent root LP produced no usable solution. Do not hand an empty + // solution to B&B; instead release its root-relaxation wait loop so it falls + // back to its own dual-simplex root rather than deadlocking. + CUOPT_LOG_DEBUG("Root LP produced no usable solution (status %d); releasing B&B root solve", + (int)lp_result.get_termination_status()); + ls.lp_optimal_exists = false; + if (context.branch_and_bound_ptr != nullptr) { + context.branch_and_bound_ptr->set_root_concurrent_halt(1); + } + } else if (lp_result.get_termination_status() == pdlp_termination_status_t::Optimal) { solution_t lp_sol(*problem_ptr); lp_sol.copy_new_assignment(lp_optimal_solution); const bool consider_integrality = false; @@ -541,9 +560,11 @@ solution_t diversity_manager_t::run_solver() } } - // Send relaxed solution to branch and bound only if PDLP found it (not dual simplex via - // set_simplex_solution) - if (!use_staged_simplex_solution && + // Hand the root relaxation off to branch and bound when we have a usable solution + // (sets root_crossover_solution_set_, releasing B&B's wait). When the root LP failed + // the wait is instead released above via set_root_concurrent_halt, and a staged + // dual-simplex solution is owned by B&B already, so neither needs this hand-off. 
+ if (!use_staged_simplex_solution && root_lp_usable && problem_ptr->set_root_relaxation_solution_callback != nullptr) { auto& d_primal_solution = lp_result.get_primal_solution(); auto& d_dual_solution = lp_result.get_dual_solution(); @@ -576,7 +597,7 @@ solution_t diversity_manager_t::run_solver() host_primal, host_dual, host_reduced_costs, solver_obj, user_obj, iterations, method); } - if (!use_staged_simplex_solution) { + if (!use_staged_simplex_solution && root_lp_usable) { // in case the pdlp returned var boudns that are out of bounds clamp_within_var_bounds(lp_optimal_solution, problem_ptr, problem_ptr->handle_ptr); } diff --git a/cpp/src/utilities/manual_cuda_graph.cuh b/cpp/src/utilities/manual_cuda_graph.cuh index 68b37b7c71..d61cf04af8 100644 --- a/cpp/src/utilities/manual_cuda_graph.cuh +++ b/cpp/src/utilities/manual_cuda_graph.cuh @@ -24,14 +24,21 @@ namespace cuopt { // cuSPARSE calls inside the captured region are preserved. // // Invalidation recovery: -// If cudaStreamEndCapture returns cudaErrorStreamCaptureInvalidated -// (typically because another thread issued a synchronous CUDA call -- +// A concurrent thread that issues a capture-hostile CUDA call -- // cudaDeviceSynchronize, cudaMalloc, cudaFree, or a library first-use that -// internally syncs the device -- concurrently with this capture window), -// the captured work has NOT been issued to the device. The wrapper drains -// the sticky error, re-executes `work` eagerly so the current iteration -// still produces correct results, and leaves itself uninitialized so the -// next `run` call retries capture. +// internally syncs the device (e.g. the cuDSS barrier's handle init) -- during +// this capture window invalidates the capture. That shows up in one of two ways, +// both handled here: +// 1. cudaStreamEndCapture returns cudaErrorStreamCaptureInvalidated, or +// 2. a CUDA / cuBLAS / cuSPARSE call inside `work` observes the invalidated +// capture and throws. 
Because cuBLAS/cuSPARSE cannot return a CUDA error +// code, this surfaces as e.g. CUBLAS_STATUS_INTERNAL_ERROR rather than the +// clean cudaErrorStreamCaptureInvalidated. +// In both cases the captured work has NOT been issued to the device. The wrapper +// drains the sticky error, re-executes `work` eagerly (no capture, so the +// concurrent op cannot break it) so the current iteration still produces correct +// results, and leaves itself uninitialized so the next `run` retries capture. +// A throw whose capture is NOT invalidated is a genuine error and is rethrown. // IMPORTANT: because `work` is invoked a second time on recovery, any // host-side mutations inside the callable will run twice -- keep `work` // host-idempotent or move host bookkeeping (counters, flags, hash updates, @@ -75,9 +82,27 @@ class manual_cuda_graph_t { RAFT_CUDA_TRY(cudaStreamBeginCapture(stream.value(), cudaStreamCaptureModeThreadLocal)); guard.capture_active = true; - work(); - cudaGraph_t captured = nullptr; + try { + work(); + } catch (...) { + // A CUDA / cuBLAS / cuSPARSE call inside `work` threw mid-capture (commonly + // CUBLAS_STATUS_INTERNAL_ERROR when a concurrent capture-hostile op + // invalidated this capture and the failure was observed inside the library + // call). End the capture and let its status disambiguate: if the capture was + // invalidated the recorded work was never issued, so recover by re-running + // `work` eagerly; otherwise the error is genuine and is rethrown. 
+ cudaError_t catch_end_err = cudaStreamEndCapture(stream.value(), &captured); + guard.capture_active = false; + if (catch_end_err == cudaErrorStreamCaptureInvalidated) { + cudaGetLastError(); + work(); + return; + } + if (captured != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(captured)); } + throw; + } + cudaError_t end_err = cudaStreamEndCapture(stream.value(), &captured); guard.capture_active = false; diff --git a/det_one.py b/det_one.py new file mode 100644 index 0000000000..31b57edb0c --- /dev/null +++ b/det_one.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import sys +import numpy as np +from cuopt.linear_programming.problem import Problem, INTEGER, MINIMIZE +from cuopt.linear_programming import SolverSettings +from cuopt.linear_programming.solver.solver_parameters import ( + CUOPT_MIP_DETERMINISM_MODE, + CUOPT_TIME_LIMIT, +) + +mode = sys.argv[1] +rng = np.random.default_rng(0) +n = 120 +a = rng.uniform(1.0, 10.0, n) +b = a + rng.uniform(-0.5, 0.5, n) +need_a = 0.80 * a.sum() +cap_b = 0.30 * b.sum() + +p = Problem("infeasible_milp") +x = [ + p.addVariable(lb=0.0, ub=1.0, vtype=INTEGER, name=f"x{i}") + for i in range(n) +] +p.setObjective(sum(x), sense=MINIMIZE) +p.addConstraint( + sum(float(a[i]) * x[i] for i in range(n)) >= float(need_a), name="need_a" +) +p.addConstraint( + sum(float(b[i]) * x[i] for i in range(n)) <= float(cap_b), name="cap_b" +) + +s = SolverSettings() +s.set_parameter(CUOPT_TIME_LIMIT, 15.0) +if mode == "deterministic": + s.set_parameter(CUOPT_MIP_DETERMINISM_MODE, 1) +print(f"[det_one] solving mode={mode}", flush=True) +p.solve(s) +print("STATUS=" + str(getattr(p.Status, "name", p.Status)), flush=True) diff --git a/determinism_milp_test.py b/determinism_milp_test.py new file mode 100644 index 0000000000..24b445c4a2 --- /dev/null +++ b/determinism_milp_test.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright 
(c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import sys +import textwrap + +WORKER = textwrap.dedent(""" + import sys + import numpy as np + from cuopt.linear_programming.problem import Problem, INTEGER, MINIMIZE + from cuopt.linear_programming import SolverSettings + from cuopt.linear_programming.solver.solver_parameters import ( + CUOPT_MIP_DETERMINISM_MODE, CUOPT_TIME_LIMIT) + + mode = sys.argv[1] + rng = np.random.default_rng(0) + n = 120 + a = rng.uniform(1.0, 10.0, n) + b = a + rng.uniform(-0.5, 0.5, n) # correlated second weight vector + need_a = 0.80 * a.sum() # capture >=80% of a-value ... + cap_b = 0.30 * b.sum() # ... using <=30% of b-value -> infeasible (a~b) + + p = Problem("infeasible_milp") + x = [p.addVariable(lb=0.0, ub=1.0, vtype=INTEGER, name=f"x{i}") for i in range(n)] + p.setObjective(sum(x), sense=MINIMIZE) + p.addConstraint(sum(float(a[i]) * x[i] for i in range(n)) >= float(need_a), name="need_a") + p.addConstraint(sum(float(b[i]) * x[i] for i in range(n)) <= float(cap_b), name="cap_b") + + s = SolverSettings() + s.set_parameter(CUOPT_TIME_LIMIT, 15.0) + if mode == "deterministic": + s.set_parameter(CUOPT_MIP_DETERMINISM_MODE, 1) + p.solve(s) + print("STATUS=" + str(getattr(p.Status, "name", p.Status))) +""") + +for mode in ["deterministic", "opportunistic"]: + r = subprocess.run( + [sys.executable, "-c", WORKER, mode], capture_output=True, text=True + ) + out = r.stdout.strip() or (r.stderr.strip().splitlines() or [""])[-1] + print(f"{mode:>13} -> exit {r.returncode:<4} | {out}") From e246d1f7b64296796b591eb9dad2c7cf8f2b38fd Mon Sep 17 00:00:00 2001 From: akif Date: Wed, 10 Jun 2026 15:02:40 +0200 Subject: [PATCH 44/47] remove test files --- det_one.py | 40 ------------------------------------- determinism_milp_test.py | 43 ---------------------------------------- 2 files changed, 83 deletions(-) delete mode 100644 det_one.py delete mode 100644 
determinism_milp_test.py diff --git a/det_one.py b/det_one.py deleted file mode 100644 index 31b57edb0c..0000000000 --- a/det_one.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import sys -import numpy as np -from cuopt.linear_programming.problem import Problem, INTEGER, MINIMIZE -from cuopt.linear_programming import SolverSettings -from cuopt.linear_programming.solver.solver_parameters import ( - CUOPT_MIP_DETERMINISM_MODE, - CUOPT_TIME_LIMIT, -) - -mode = sys.argv[1] -rng = np.random.default_rng(0) -n = 120 -a = rng.uniform(1.0, 10.0, n) -b = a + rng.uniform(-0.5, 0.5, n) -need_a = 0.80 * a.sum() -cap_b = 0.30 * b.sum() - -p = Problem("infeasible_milp") -x = [ - p.addVariable(lb=0.0, ub=1.0, vtype=INTEGER, name=f"x{i}") - for i in range(n) -] -p.setObjective(sum(x), sense=MINIMIZE) -p.addConstraint( - sum(float(a[i]) * x[i] for i in range(n)) >= float(need_a), name="need_a" -) -p.addConstraint( - sum(float(b[i]) * x[i] for i in range(n)) <= float(cap_b), name="cap_b" -) - -s = SolverSettings() -s.set_parameter(CUOPT_TIME_LIMIT, 15.0) -if mode == "deterministic": - s.set_parameter(CUOPT_MIP_DETERMINISM_MODE, 1) -print(f"[det_one] solving mode={mode}", flush=True) -p.solve(s) -print("STATUS=" + str(getattr(p.Status, "name", p.Status)), flush=True) diff --git a/determinism_milp_test.py b/determinism_milp_test.py deleted file mode 100644 index 24b445c4a2..0000000000 --- a/determinism_milp_test.py +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import subprocess -import sys -import textwrap - -WORKER = textwrap.dedent(""" - import sys - import numpy as np - from cuopt.linear_programming.problem import Problem, INTEGER, MINIMIZE - from cuopt.linear_programming import SolverSettings - from cuopt.linear_programming.solver.solver_parameters import ( - CUOPT_MIP_DETERMINISM_MODE, CUOPT_TIME_LIMIT) - - mode = sys.argv[1] - rng = np.random.default_rng(0) - n = 120 - a = rng.uniform(1.0, 10.0, n) - b = a + rng.uniform(-0.5, 0.5, n) # correlated second weight vector - need_a = 0.80 * a.sum() # capture >=80% of a-value ... - cap_b = 0.30 * b.sum() # ... using <=30% of b-value -> infeasible (a~b) - - p = Problem("infeasible_milp") - x = [p.addVariable(lb=0.0, ub=1.0, vtype=INTEGER, name=f"x{i}") for i in range(n)] - p.setObjective(sum(x), sense=MINIMIZE) - p.addConstraint(sum(float(a[i]) * x[i] for i in range(n)) >= float(need_a), name="need_a") - p.addConstraint(sum(float(b[i]) * x[i] for i in range(n)) <= float(cap_b), name="cap_b") - - s = SolverSettings() - s.set_parameter(CUOPT_TIME_LIMIT, 15.0) - if mode == "deterministic": - s.set_parameter(CUOPT_MIP_DETERMINISM_MODE, 1) - p.solve(s) - print("STATUS=" + str(getattr(p.Status, "name", p.Status))) -""") - -for mode in ["deterministic", "opportunistic"]: - r = subprocess.run( - [sys.executable, "-c", WORKER, mode], capture_output=True, text=True - ) - out = r.stdout.strip() or (r.stderr.strip().splitlines() or [""])[-1] - print(f"{mode:>13} -> exit {r.returncode:<4} | {out}") From 1b162fd16d3877dfcbe3526d54d91012d5ccb9da Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 12 Jun 2026 15:14:09 +0200 Subject: [PATCH 45/47] clean up comments and unused code --- .../cuopt/miplib2017_optima.hpp | 476 ------------------ cpp/src/branch_and_bound/branch_and_bound.cpp | 14 +- cpp/src/cuts/cuts.hpp | 121 ----- cpp/src/mip_heuristics/solver.cu | 18 - 4 files changed, 2 insertions(+), 627 deletions(-) delete mode 100644
benchmarks/linear_programming/cuopt/miplib2017_optima.hpp diff --git a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp b/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp deleted file mode 100644 index 7f6826a5ce..0000000000 --- a/benchmarks/linear_programming/cuopt/miplib2017_optima.hpp +++ /dev/null @@ -1,476 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -// MIPLIB2017 best-known objective ("optimum") lookup for the MIP -// benchmark runner. Self-contained: no env vars, no external CSV. -// -// Coverage: every instance in the MIPLIB2017 *benchmark* set (240 -// instances). Of those, 232 have a known optimum and live in -// kBenchmarkOptima; 7 are infeasible and live in kBenchmarkInfeasible -// so the printer can label them clearly instead of returning "no opt". -// -// Lookup uses the basename without directory and stripped of -// .mps / .mps.gz / .lp / .lp.gz / .gz suffixes, lower-cased. So -// "miplib2017/MAS74.mps.gz" / "mas74.mps" / "mas74" -// all hit the same entry. -// -// Returns std::optional: nullopt means "instance is in our -// benchmark set but infeasible" *or* "we don't have an entry for it". -// is_known_infeasible() distinguishes the two. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuopt_bench { - -// Strip directory prefix and any .mps/.lp suffix (with optional .gz), -// then lower-case. Designed to match how MPS instance files are named -// across MIPLIB downloads (case- and extension-insensitive). 
-inline std::string normalize_instance_name(const std::string& raw) -{ - std::string s = raw; - const auto slash = s.find_last_of("/\\"); - if (slash != std::string::npos) { s = s.substr(slash + 1); } - auto endswith = [&](const std::string& suf) { - if (s.size() < suf.size()) { return false; } - for (size_t i = 0; i < suf.size(); ++i) { - if (std::tolower(static_cast(s[s.size() - suf.size() + i])) != - std::tolower(static_cast(suf[i]))) { - return false; - } - } - return true; - }; - for (const char* suf : {".mps.gz", ".lp.gz", ".mps", ".lp", ".gz"}) { - if (endswith(suf)) { - s.resize(s.size() - std::char_traits::length(suf)); - break; - } - } - for (char& c : s) { - c = static_cast(std::tolower(static_cast(c))); - } - return s; -} - -// MIPLIB2017 benchmark-set best-known objectives (n=232). Source: -// https://miplib.zib.de "The Benchmark Set". Values are stored in the -// double precision they were published at; unit tests should compare -// with a tolerance of ~|opt|*1e-9 rather than exact equality. 
-inline const std::unordered_map& kBenchmarkOptima() -{ - static const std::unordered_map kOptima = { - {"30n20b8", 302}, - {"50v-10", 3311.1799841000002}, - {"academictimetablesmall", 0}, - {"air05", 26374}, - {"app1-1", -3}, - {"app1-2", -41}, - {"assign1-5-8", 211.99999999999801}, - {"atlanta-ip", 90.009878614000002}, - {"b1c1s1", 24544.25}, - {"bab2", -357544.31150000001}, - {"bab6", -284248.23070000007}, - {"beasleyc3", 753.9999999999128}, - {"binkar10_1", 6742.1998835000004}, - {"blp-ar98", 6205.2147103999996}, - {"blp-ic98", 4491.4475839500001}, - {"bnatt400", 1}, - {"bppc4-08", 53}, - {"brazil3", 24}, - {"buildingenergy", 33283.853236000003}, - {"cbs-cta", 0}, - {"chromaticindex1024-7", 4}, - {"chromaticindex512-7", 4}, - {"cmflsp50-24-8-8", 55789389.886}, - {"cms750_4", 252}, - {"co-100", 2639942.0600000001}, - {"cod105", -12}, - {"comp07-2idx", 6}, - {"comp21-2idx", 74}, - {"cost266-uue", 25148940.55999998}, - {"cryptanalysiskb128n5obj16", 0}, - {"csched007", 350.99999999999551}, - {"csched008", 173}, - {"cvs16r128-89", -97}, - {"dano3_3", 576.34463302999995}, - {"dano3_5", 576.9249159565619}, - {"decomp2", -160}, - {"drayage-100-23", 103333.87407000001}, - {"drayage-25-23", 101282.647018}, - {"dws008-01", 37412.604587945083}, - {"eil33-2", 934.007915999999}, - {"eila101-2", 880.92010799999991}, - {"enlight_hard", 37}, - {"ex10", 100}, - {"ex9", 81}, - {"exp-1-500-5-5", 65887}, - {"fast0507", 174}, - {"fastxgemm-n2r6s0t2", 230}, - {"fhnw-binpack4-48", 0}, - {"fiball", 138}, - {"gen-ip002", -4783.7333920000001}, - {"gen-ip054", 6840.9656417899996}, - {"germanrr", 47095869.648999996}, - {"gfd-schedulen180f7d50m30k18", 1}, - {"glass-sc", 23}, - {"glass4", 1200012599.972384}, - {"gmu-35-40", -2406733.3687999998}, - {"gmu-35-50", -2607958.3300000001}, - {"graph20-20-1rand", -9}, - {"graphdraw-domain", 19685.999975500381}, - {"h80x6320d", 6382.0990482459993}, - {"highschool1-aigio", 0}, - {"hypothyroid-k1", -2851}, - {"ic97_potential", 3941.9999309022501}, - 
{"icir97_tension", 6375}, - {"irish-electricity", 3723497.5913959998}, - {"irp", 12159.492835396981}, - {"istanbul-no-cutoff", 204.08170701}, - {"k1mushroom", -3288}, - {"lectsched-5-obj", 24}, - {"leo1", 404227536.16000003}, - {"leo2", 404077441.12}, - {"lotsize", 1480195}, - {"mad", 0.026800000000000001}, - {"map10", -495}, - {"map16715-04", -111}, - {"markshare2", 1}, - {"markshare_4_0", 1}, - {"mas74", 11801.185719999999}, - {"mas76", 40005.053989999993}, - {"mc11", 11688.99999999966}, - {"mcsched", 211913}, - {"mik-250-20-75-4", -52301}, - {"milo-v12-6-r2-40-1", 326481.14282799}, - {"momentum1", 109143.4935}, - {"mushroom-best", 0.055333761199999998}, - {"mzzv11", -21718}, - {"mzzv42z", -20540}, - {"n2seq36q", 52200}, - {"n3div36", 130800}, - {"n5-3", 8104.9999999939992}, - {"neos-1122047", 161}, - {"neos-1171448", -309}, - {"neos-1171737", -195}, - {"neos-1354092", 46}, - {"neos-1445765", -17783}, - {"neos-1456979", 176}, - {"neos-1582420", 90.999999999999957}, - {"neos-2657525-crna", 1.810748}, - {"neos-2746589-doon", 2008.1999999999989}, - {"neos-2978193-inde", -2.3880616899999998}, - {"neos-2987310-joes", -607702988.29999995}, - {"neos-3004026-krka", 0}, - {"neos-3024952-loue", 26756}, - {"neos-3046615-murg", 1600}, - {"neos-3083819-nubu", 6307996}, - {"neos-3216931-puriri", 71320}, - {"neos-3381206-awhea", 453}, - {"neos-3402294-bobin", 0.067249999999999491}, - {"neos-3555904-turama", -34.700000000000003}, - {"neos-3627168-kasai", 988585.61999999976}, - {"neos-3656078-kumeu", -13172.200000000001}, - {"neos-3754480-nidda", 12941.73838561778}, - {"neos-4300652-rahue", 2.1415999999999999}, - {"neos-4338804-snowy", 1471}, - {"neos-4387871-tavua", 33.384729927000002}, - {"neos-4413714-turia", 45.370167019999798}, - {"neos-4532248-waihi", 61.599999999999987}, - {"neos-4647030-tutaki", 27265.705999999958}, - {"neos-4722843-widden", 25009.662227000001}, - {"neos-4738912-atrato", 283627956.59500003}, - {"neos-4763324-toguru", 1613.0388458499999}, - 
{"neos-4954672-berkel", 2612710}, - {"neos-5049753-cuanza", 561.99999716889999}, - {"neos-5052403-cygnet", 182}, - {"neos-5093327-huahum", 6259.9999971258949}, - {"neos-5104907-jarama", 935}, - {"neos-5107597-kakapo", 3644.9999999995198}, - {"neos-5114902-kasavu", 655}, - {"neos-5188808-nattai", 0.110283622999984}, - {"neos-5195221-niemur", 0.0038354325999999999}, - {"neos-631710", 203}, - {"neos-662469", 184379.99999999991}, - {"neos-787933", 30}, - {"neos-827175", 112.00152}, - {"neos-848589", 2351.40309999697}, - {"neos-860300", 3200.9999999999982}, - {"neos-873061", 113.6562385063}, - {"neos-911970", 54.759999999999998}, - {"neos-933966", 318}, - {"neos-950242", 4}, - {"neos-957323", -237.75668150000001}, - {"neos-960392", -238}, - {"neos17", 0.1500025774}, - {"neos5", 15}, - {"neos8", -3719}, - {"net12", 214}, - {"netdiversion", 242}, - {"nexp-150-20-8-5", 231}, - {"ns1116954", 0}, - {"ns1208400", 2}, - {"ns1644855", -1524.3333333333301}, - {"ns1760995", -549.21438505000003}, - {"ns1830653", 20622}, - {"ns1952667", 0}, - {"nu25-pr12", 53904.999999999993}, - {"nursesched-medium-hint03", 115}, - {"nursesched-sprint02", 57.999999999999993}, - {"nw04", 16862}, - {"opm2-z10-s4", -33269}, - {"p200x1188c", 15078}, - {"peg-solitaire-a3", 1}, - {"pg", -8674.3426071199992}, - {"pg5_34", -14339.353450000001}, - {"physiciansched3-3", 2623271.3266670001}, - {"physiciansched6-2", 49324}, - {"piperout-08", 125054.9999999999}, - {"piperout-27", 8123.9999999999727}, - {"pk1", 11}, - {"proteindesign121hz512p9", 1473}, - {"proteindesign122trx11p8", 1747}, - {"qap10", 339.99999999838712}, - {"radiationm18-12-05", 17566}, - {"radiationm40-10-02", 155328}, - {"rail01", -70.569964299999995}, - {"rail02", -200.44990770000001}, - {"rail507", 174}, - {"ran14x18-disj-8", 3712}, - {"rd-rplusc-21", 165395.275295}, - {"reblock115", -36800603.233199999}, - {"rmatr100-p10", 423}, - {"rmatr200-p5", 4521}, - {"roci-4-11", -6020203}, - {"rocii-5-11", -6.6755047315380001}, - {"rococob10-011000", 
19449}, - {"rocococ10-001000", 11460}, - {"roi2alpha3n4", -63.208495030000002}, - {"roi5alpha10n8", -52.322274350999997}, - {"roll3000", 12889.999991999999}, - {"s100", -0.16972352705829999}, - {"s250r10", -0.17178048342319999}, - {"satellites2-40", -19}, - {"satellites2-60-fs", -19.000000000099998}, - {"savsched1", 3217.6999999999998}, - {"sct2", -230.9891623}, - {"seymour", 423}, - {"seymour1", 410.76370138999999}, - {"sing326", 7753674.8537600003}, - {"sing44", 8128831.1771999998}, - {"snp-02-004-104", 586803238.65672886}, - {"sorrell3", -16}, - {"sp150x300d", 69}, - {"sp97ar", 660705645.75899994}, - {"sp98ar", 529740623.19999999}, - {"splice1k1", -394}, - {"square41", 15}, - {"square47", 15.9999999997877}, - {"supportcase10", 7}, - {"supportcase12", -7559.5330538170001}, - {"supportcase18", 48}, - {"supportcase19", 12677205.999920519}, - {"supportcase22", 110}, // best-known marked "*" in MIPLIB2017 (not proven optimal) - {"supportcase26", 1745.1238129999999}, - {"supportcase33", -345}, - {"supportcase40", 24256.3122898}, - {"supportcase42", 7.7586307222700004}, - {"supportcase6", 51906.477370000001}, - {"supportcase7", -1132.2231770000001}, - {"swath1", 379.07129574999999}, - {"swath3", 397.76134365000001}, - {"tbfp-network", 24.163194440000002}, - {"thor50dday", 40417}, - {"timtab1", 764771.99999977998}, - {"tr12-30", 130595.9999999999}, - {"traininstance2", 71820}, - {"traininstance6", 28290}, - {"trento1", 5189487}, - {"triptim1", 22.868099999999899}, - {"uccase12", 11507.4050616}, - {"uccase9", 10993.131409}, - {"uct-subprob", 314}, - {"unitcal_7", 19635558.243999999}, - {"var-smallemery-m6j6", -149.37501}, - {"wachplan", -8}, - }; - return kOptima; -} - -// MIPLIB2017 benchmark-set instances flagged as infeasible (n=7). -// Solver should return Infeasible status; we use this set to label -// the printer line with status_extra=KnownInfeasible so a downstream -// "did the run agree with MIPLIB?" check can be a single grep. 
-inline const std::unordered_set& kBenchmarkInfeasible() -{ - static const std::unordered_set kInfeas = { - "bnatt500", - "cryptanalysiskb128n5obj14", - "fhnw-binpack4-4", - "neos-2075418-temuka", - "neos-3402454-bohle", - "neos-3988577-wolgan", - "neos859080", - }; - return kInfeas; -} - -inline std::optional lookup_miplib_optimum(const std::string& filename) -{ - const auto& m = kBenchmarkOptima(); - const auto it = m.find(normalize_instance_name(filename)); - if (it == m.end()) { return std::nullopt; } - return it->second; -} - -inline bool is_known_infeasible(const std::string& filename) -{ - return kBenchmarkInfeasible().count(normalize_instance_name(filename)) != 0; -} - -// Single grep-friendly per-instance line. Emits to stdout via printf -// so the output survives unconditionally regardless of the project's -// settings_.log routing (NFS-backed log files, gated debug levels) -// and is trivially cross-compared between cuts-config branches. -// -// "Gap closed" is reported relative to the *root LP after cuts*, not -// relative to the final dual bound at the end of solve. The standard -// MIP cutting-plane definition is: -// gap_closed_pct = 100 * (root_lp_with_cuts - root_lp_no_cuts) -// / (opt - root_lp_no_cuts) -// On a minimization-form problem all three differences are >= 0 and -// gap_closed_pct lies in [0, 100]. The ratio is sign-symmetric so the -// formula also holds verbatim for maximization (numerator and -// denominator flip sign together). NaN is emitted when either root -// bound was not published (e.g. B&B never entered the cut loop). 
-// -// Other field semantics (signed for minimization): -// abs_root_dual_gap = opt - root_lp_with_cuts -// rel_root_dual_gap_pct = 100 * abs_root_dual_gap / max(|opt|, 1) -// abs_primal_gap = primal - opt -// rel_primal_gap_pct = 100 * abs_primal_gap / max(|opt|, 1) -// -// The line still also reports `final_dual` (solver's bound at the end -// of solve) so the new metric and the previous one can be compared -// without re-running. -// -// "TBD" is emitted when the optimum is unknown so downstream parsers -// can join lines on (instance, field) without dropping rows. "NaN" is -// emitted for root_lp_* when the value is unavailable. -template -inline void print_miplib_gap_stat( - const std::string& filename, - const Solution& solution, - double solve_time_seconds, - const std::string& termination_status, - double root_lp_no_cuts, - double root_lp_with_cuts, - double cut_gen_time_sec = std::numeric_limits::quiet_NaN()) -{ - const std::string norm = normalize_instance_name(filename); - const auto opt = lookup_miplib_optimum(filename); - const double primal = solution.get_objective_value(); - const double final_dual = solution.get_solution_bound(); - const double mip_gap = solution.get_mip_gap(); - const bool primal_finite = std::isfinite(primal); - const bool root0_finite = std::isfinite(root_lp_no_cuts); - const bool root1_finite = std::isfinite(root_lp_with_cuts); - constexpr double NaN = std::numeric_limits::quiet_NaN(); - - if (is_known_infeasible(filename)) { - std::printf( - "MIPLIBGapStat instance=%s opt=Infeasible primal=%.10g final_dual=%.10g " - "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " - "abs_root_dual_gap=NA rel_root_dual_gap_pct=NA gap_closed_pct=NA " - "abs_primal_gap=NA rel_primal_gap_pct=NA " - "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", - norm.c_str(), - primal, - final_dual, - root_lp_no_cuts, - root_lp_with_cuts, - mip_gap, - solve_time_seconds, - cut_gen_time_sec, - termination_status.c_str()); - } else if 
(opt.has_value()) { - const double o = *opt; - const double denom = std::max(std::abs(o), 1.0); - - const double abs_root_dgap = root1_finite ? (o - root_lp_with_cuts) : NaN; - const double rel_root_dgap_pct = root1_finite ? 100.0 * abs_root_dgap / denom : NaN; - - // Classical gap-closed-by-cuts. Skip when either root bound is - // missing, when the LP relaxation already proves optimality - // (denominator = opt - root_lp_no_cuts ~= 0), or when the bound - // moved the wrong way (numerical noise in either direction). - double gap_closed_pct = NaN; - if (root0_finite && root1_finite) { - const double total_gap = o - root_lp_no_cuts; - if (std::abs(total_gap) > 1e-12 * denom) { - gap_closed_pct = 100.0 * (root_lp_with_cuts - root_lp_no_cuts) / total_gap; - } else { - // LP relaxation already (numerically) optimal -> 100% closed - // by definition. Avoid /0 noise. - gap_closed_pct = 100.0; - } - } - - const double abs_pgap = primal_finite ? (primal - o) : NaN; - const double rel_pgap_pct = primal_finite ? 
100.0 * abs_pgap / denom : NaN; - - std::printf( - "MIPLIBGapStat instance=%s opt=%.10g primal=%.10g final_dual=%.10g " - "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " - "abs_root_dual_gap=%.10g rel_root_dual_gap_pct=%.6g gap_closed_pct=%.6g " - "abs_primal_gap=%.10g rel_primal_gap_pct=%.6g " - "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", - norm.c_str(), - o, - primal, - final_dual, - root_lp_no_cuts, - root_lp_with_cuts, - abs_root_dgap, - rel_root_dgap_pct, - gap_closed_pct, - abs_pgap, - rel_pgap_pct, - mip_gap, - solve_time_seconds, - cut_gen_time_sec, - termination_status.c_str()); - } else { - std::printf( - "MIPLIBGapStat instance=%s opt=TBD primal=%.10g final_dual=%.10g " - "root_lp_no_cuts=%.10g root_lp_with_cuts=%.10g " - "abs_root_dual_gap=TBD rel_root_dual_gap_pct=TBD gap_closed_pct=TBD " - "abs_primal_gap=TBD rel_primal_gap_pct=TBD " - "mip_gap_reported=%.6g time_s=%.3f cut_gen_time_s=%.3f status=%s\n", - norm.c_str(), - primal, - final_dual, - root_lp_no_cuts, - root_lp_with_cuts, - mip_gap, - solve_time_seconds, - cut_gen_time_sec, - termination_status.c_str()); - } - std::fflush(stdout); -} - -} // namespace cuopt_bench diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 581ef3a056..4514ab6bb6 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2650,18 +2650,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut }; cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); - f_t cut_generation_start_time = tic(); - auto publish_cut_generation_time = [&](bool force_time_limit_value = false) { - if (settings_.benchmark_info_ptr == nullptr) { return; } - f_t cut_generation_time = toc(cut_generation_start_time); - if (force_time_limit_value) { cut_generation_time = settings_.time_limit; } - if (cut_generation_time < static_cast(0.0)) { - cut_generation_time = static_cast(0.0); - 
} - settings_.benchmark_info_ptr->cut_generation_time_sec = - static_cast(cut_generation_time); - }; - i_t cut_pool_size = 0; + f_t cut_generation_start_time = tic(); + i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { // LP relaxation is already integer-feasible — solved at the root diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 7a0befbe9c..0a693ac07e 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -305,24 +305,8 @@ class cut_pool_t { // We expect that the cut is violated by the current relaxation xstar. void add_cut(cut_type_t cut_type, const inequality_t& cut); - // Backward-compatible scoring entry-point. Falls back to the legacy - // geometric-distance / nnz-penalty score when bounds are not provided. void score_cuts(std::vector& x_relax); - // HiGHS-like active-support scoring with adaptive threshold, adaptive - // parallelism rejection, and violation-based aging. Selected - // rows remain in the pool so they can be reconsidered if later removed - // from the LP and violated again. - void score_cuts(const std::vector& x_relax, - const std::vector& lower, - const std::vector& upper, - f_t feastol); - void score_cuts(const std::vector& x_relax, - const std::vector& lower, - const std::vector& upper, - const std::vector& var_types, - f_t feastol); - // We return the cuts in the form best_cuts*x <= best_rhs i_t get_best_cuts(csr_matrix_t& best_cuts, std::vector& best_rhs, @@ -334,52 +318,10 @@ class cut_pool_t { i_t pool_size() const { return cut_storage_.m; } - // Number of nonzeros in the cut at row `row` of the cut pool. - i_t cut_nz(i_t row) const { return cut_storage_.row_length(row); } - void print_cutpool_types() { print_cut_types("In cut pool", cut_type_, settings_); } void check_for_duplicate_cuts(); - // The clique cut family (Bron-Kerbosch + extension) emits cousin - // cliques whose support sets agree in |k-1| of |k| vertices. 
The - // selection-stage orthogonality scan catches them but only after the - // full insert + dedup + score cost has been paid. The cousin filter - // intercepts at insert: we min-hash the cut's column-support set, - // bucket on the first sketch hash, and when an existing pool entry - // collides with estimated Jaccard >= jaccard_tau we keep the - // higher-scoring representative (or, if no score was supplied, the - // earlier-inserted one). - // - // Defaults: jaccard_tau=0.95, k=8, enable=true, size_weight=0.0. - // These match "config 3 / cousin_loose" from the clique-sweep on - // commit 0b04683b — the configuration that won the gap-closed-pct - // comparison and was promoted to be the production default for the - // clique cut family. Callers can still override at runtime via - // set_clique_cousin_* if they want to experiment. - void set_clique_cousin_filter_enable(bool v) { clique_cousin_filter_enable_ = v; } - void set_clique_cousin_jaccard_tau(f_t v) { clique_cousin_jaccard_tau_ = v; } - void set_clique_cousin_minhash_k(i_t v) { clique_cousin_minhash_k_ = v; } - void set_clique_cousin_size_weight(f_t v) { clique_cousin_size_weight_ = v; } - - bool clique_cousin_filter_enable() const { return clique_cousin_filter_enable_; } - f_t clique_cousin_jaccard_tau() const { return clique_cousin_jaccard_tau_; } - i_t clique_cousin_minhash_k() const { return clique_cousin_minhash_k_; } - f_t clique_cousin_size_weight() const { return clique_cousin_size_weight_; } - - // Per-pool tally for log lines (instance-level diagnostic). All three - // counters are reset by reset_cousin_stats() and incremented inside - // add_cut() / cousin replacement. 
- i_t cousin_drops() const { return cousin_drops_; } - i_t cousin_replaces() const { return cousin_replaces_; } - i_t clique_inserts() const { return clique_inserts_; } - void reset_cousin_stats() - { - cousin_drops_ = 0; - cousin_replaces_ = 0; - clique_inserts_ = 0; - } - private: f_t cut_distance(i_t row, const std::vector& x, f_t& cut_violation, f_t& cut_norm); f_t cut_density(i_t row); @@ -390,16 +332,8 @@ class cut_pool_t { csr_matrix_t cut_storage_; std::vector rhs_storage_; - // Age convention: - // age >= 0 : cut is in the pool, available for selection. Newly added - // cuts start at max(0, pool_age_limit_ - 5). Each separation - // round, non-violated cuts have age++ and are deleted once - // age >= effective_age_limit; violated cuts reset to age = 0. std::vector cut_age_; std::vector cut_type_; - // 1 / sqrt(sum a_j^2). 0.0 means the cut is degenerate / removed. - std::vector cut_inv_norm_; - std::vector cut_max_abs_coef_; i_t scored_cuts_; std::vector cut_distances_; @@ -408,24 +342,6 @@ class cut_pool_t { std::vector cut_scores_; std::vector best_cuts_; const f_t min_cut_distance_{1e-4}; - - std::vector> clique_support_minhash_; - std::vector clique_cousin_score_; - std::unordered_map> clique_cousin_buckets_; - f_t clique_cousin_jaccard_tau_{static_cast(0.95)}; - i_t clique_cousin_minhash_k_{8}; - bool clique_cousin_filter_enable_{true}; - // When > 0, the cousin filter's "score" used to pick a winner is - // boosted as: effective_score = base_score * (1 + size_weight * log2(1 + clique_size)). - // This biases cousin replacement toward larger cliques (more variables - // covered, larger integer support). 0 disables the tilt. - f_t clique_cousin_size_weight_{static_cast(0.0)}; - - // Diagnostic counters reset at the start of each cut pass via - // reset_cousin_stats(). 
- i_t cousin_drops_{0}; - i_t cousin_replaces_{0}; - i_t clique_inserts_{0}; }; template @@ -682,42 +598,14 @@ class mixed_integer_rounding_cut_t; template class variable_bounds_t; -// Shared fractional conflict-graph subgraph used by both the clique-cut and -// zero-half cut separators. Built once per cut pass in -// cut_generation_t::generate_cuts and consumed by both routines so neither -// has to rebuild the same vertex/weight/adjacency tables. -// -// Vertex indexing: each fractional binary variable j contributes two CG -// vertices — the original literal `j` and the complement literal -// `j + num_vars`. Local indices are dense in `[0, vertices.size())`. template struct fractional_conflict_subgraph_t { - // Number of variables in the original problem; CG vertex indices are in - // [0, 2 * num_vars). i_t num_vars{0}; - - // Global CG vertex indices (length = 2 * #fractional binary vars). std::vector vertices; - - // LP value of the literal at each local index. weights[k] = x_j for the - // original copy of variable j; 1 - x_j for the complement copy. std::vector weights; - - // Inverse mapping: vertex_to_local[CG_vertex] = local_idx (or -1 if not - // in the subgraph). Sized 2 * num_vars when ready. std::vector vertex_to_local; - - // 1 if CG_vertex is in the subgraph, 0 otherwise. Sized 2 * num_vars when - // ready. std::vector in_subgraph; - - // For each local index l, adj_local[l] is the list of local indices of - // its neighbors (CG neighbors restricted to the subgraph). std::vector> adj_local; - - // True iff a build completed for the current cut pass. May be true with - // an empty subgraph (no fractional binaries), in which case both - // separators have nothing to do but the build itself succeeded. 
bool ready{false}; i_t num_local() const { return static_cast(vertices.size()); } @@ -835,12 +723,6 @@ class cut_generation_t { const std::vector& xstar, f_t start_time); - // Resolve the async clique-table future (if still pending) and build the - // fractional conflict-graph subgraph against the current xstar. Both the - // clique-cut and zero-half cut separators consume the result via sub_cg_. - // Skips cleanly (sub_cg_.ready = false) if the clique table is missing or - // empty, if budgets are exceeded, or if cut routines depending on it are - // disabled. Safe to call multiple times per cut pass. void prepare_fractional_sub_cg(const simplex_solver_settings_t& settings, const std::vector& xstar, f_t start_time); @@ -852,9 +734,6 @@ class cut_generation_t { const probing_implied_bound_t& probing_implied_bound_; std::shared_ptr> clique_table_; omp_atomic_t* signal_extend_{nullptr}; - // Cached fractional sub-CG, rebuilt at the top of each generate_cuts call - // by prepare_fractional_sub_cg. Both clique cuts and zero-half cuts read - // from this and skip if !sub_cg_.ready. fractional_conflict_subgraph_t sub_cg_; }; diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 6c7bd54329..720d55a251 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -427,14 +427,6 @@ solution_t mip_solver_t::run_solver() std::placeholders::_2); } - // Create the branch and bound object. - // - // Clique-table lifecycle: presolve no longer builds an initial clique - // table, so context.problem_ptr->clique_table is expected to be null - // here. B&B's async build (kicked off inside branch_and_bound_t::solve) - // produces the table and, via the publish callback installed below, - // atomically stores it into context.problem_ptr->clique_table so - // heuristic ensure_clique_data() can observe it on its next iteration. 
branch_and_bound = std::make_unique>( branch_and_bound_problem, branch_and_bound_settings, @@ -444,16 +436,6 @@ solution_t mip_solver_t::run_solver() context.symmetry.get()); context.branch_and_bound_ptr = branch_and_bound.get(); - // Publish the async-built clique_table onto context.problem_ptr so - // heuristics pick it up via the atomic snapshot accessor. - // { - // auto* pb = context.problem_ptr; - // branch_and_bound->set_clique_publish_callback( - // [pb](std::shared_ptr> ct) { - // pb->publish_clique_table(std::move(ct)); - // }); - // } - // Convert the best external upper bound from user-space to B&B's internal objective space. // context.problem_ptr is the post-trivial-presolve problem, whose get_solver_obj_from_user_obj // produces values in the same space as B&B node lower bounds. From cfb5f4bb588c252df5710ccb7ac44883183f8deb Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 12 Jun 2026 16:11:12 +0200 Subject: [PATCH 46/47] improve test path --- cpp/src/cuts/cuts.cpp | 7 ------- cpp/tests/mip/cuts_test.cu | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 3bb400a811..50f34c8337 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -1073,13 +1073,10 @@ std::vector> find_violated_odd_cycles_for_test( std::vector> result; std::vector bipartite_path; - std::vector cycle_local; - std::vector already_used(n_vertices, 0); dijkstra_scratch_t dijkstra_scratch; for (int s = 0; s < num_local; ++s) { if (toc(start_time) >= time_limit) { break; } - if (already_used[s]) { continue; } double total_weight = 0; if (!dijkstra_odd_cycle(s, @@ -1093,7 +1090,6 @@ std::vector> find_violated_odd_cycles_for_test( dijkstra_scratch)) { continue; } - cycle_local.clear(); if (bipartite_path.size() < 4) { continue; } std::vector seq; seq.reserve(bipartite_path.size()); @@ -1114,9 +1110,6 @@ std::vector> find_violated_odd_cycles_for_test( } if (!simple) { continue; } 
result.push_back(seq); - for (const auto v : seq) { - already_used[v] = 1; - } } return result; } diff --git a/cpp/tests/mip/cuts_test.cu b/cpp/tests/mip/cuts_test.cu index cbbc360cd8..6c0e0172d1 100644 --- a/cpp/tests/mip/cuts_test.cu +++ b/cpp/tests/mip/cuts_test.cu @@ -1456,6 +1456,28 @@ TEST(cuts, zero_half_unit_separator_disjoint_pentagons) EXPECT_TRUE(found_right); } +TEST(cuts, zero_half_unit_separator_overlapping_pentagons) +{ + std::vector> adj = { + {1, 4, 5, 8}, + {0, 2}, + {1, 3}, + {2, 4}, + {3, 0}, + {0, 6}, + {5, 7}, + {6, 8}, + {7, 0}, + }; + std::vector x_values(9, 0.5); + auto cycles = dual_simplex::find_violated_odd_cycles_for_test( + adj, x_values, 1e-6, std::numeric_limits::infinity()); + cycles = canonicalize_cycles(std::move(cycles)); + + EXPECT_NE(std::find(cycles.begin(), cycles.end(), std::vector{0, 1, 2, 3, 4}), cycles.end()); + EXPECT_NE(std::find(cycles.begin(), cycles.end(), std::vector{0, 5, 6, 7, 8}), cycles.end()); +} + TEST(cuts, zero_half_end_to_end_pentagon_tightens_lp_relaxation) { const raft::handle_t handle{}; From 3edceae4b80163b7411d3f692163fc5f84d63c85 Mon Sep 17 00:00:00 2001 From: akif Date: Fri, 12 Jun 2026 16:44:06 +0200 Subject: [PATCH 47/47] handle compile errors --- cpp/src/cuts/cuts.hpp | 2 -- cpp/tests/mip/cuts_test.cu | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 0a693ac07e..588e05b465 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -295,8 +295,6 @@ class cut_pool_t { rhs_storage_(0), cut_age_(0), cut_type_(0), - cut_inv_norm_(0), - cut_max_abs_coef_(0), scored_cuts_(0) { } diff --git a/cpp/tests/mip/cuts_test.cu b/cpp/tests/mip/cuts_test.cu index 6c0e0172d1..8da9e983a3 100644 --- a/cpp/tests/mip/cuts_test.cu +++ b/cpp/tests/mip/cuts_test.cu @@ -63,6 +63,27 @@ End )LP"); } +io::mps_data_model_t create_pairwise_pentagon_set_packing_problem() +{ + return cuopt::test::parse_inline_lp(R"LP( +Minimize + 
obj: -x0 - x1 - x2 - x3 - x4 +Subject To + c1: x0 + x1 <= 1 + c2: x1 + x2 <= 1 + c3: x2 + x3 <= 1 + c4: x3 + x4 <= 1 + c5: x4 + x0 <= 1 +Binaries + x0 + x1 + x2 + x3 + x4 +End +)LP"); +} + // Same triangle conflicts plus an isolated binary x3 with no conflict rows. io::mps_data_model_t create_pairwise_triangle_with_isolated_variable_problem() {