From 9a1daf07c79b94ea5dca48af8bcd29de2dae720b Mon Sep 17 00:00:00 2001
From: Guy Korland <guy@example.com>
Date: Tue, 5 May 2026 00:16:39 +0300
Subject: [PATCH 1/3] perf(leiden): replace per-row GrB_Col_extract with CSR
 walk

Address Dr. Davis's review feedback that the initial Leiden
implementation 'doesn't exploit much of GraphBLAS' and is therefore
slow.  The dominant cost was per-node GraphBLAS overhead inside the
Phase 1 / Phase 2 hot loops.

Optimizations (algorithm semantics, tie-breaking, self-loop handling
and modularity formula are all unchanged):

* Once per outer aggregation level, materialize A_cur into portable
  CSR arrays (Ap, Aj, Ax) via a single GrB_Matrix_extractTuples_FP64
  + counting-sort scatter.  The Phase 1 / Phase 2 inner loops now walk
  Aj[Ap[i]..Ap[i+1]) directly instead of calling GrB_Col_extract +
  GrB_Vector_nvals + GrB_Vector_extractTuples_FP64 for every node on
  every iteration of both phases.

* Compute k_arr[i] from CSR row sums; remove the per-level
  GrB_Matrix_reduce_Monoid + per-element GrB_Vector_extractElement_FP64
  loop.

* Build S_mat via a single GrB_Matrix_build_FP64 instead of n_cur
  GrB_Matrix_setElement_FP64 calls followed by an explicit GrB_Matrix_wait.
  Existing scratch buffers (nbrs_j, dirty_list, nbrs_v) are reused
  for the build inputs.

* Build the output GrB_Vector via GrB_Vector_build_INT64 instead of
  n GrB_Vector_setElement_INT64 calls (the same change is applied to
  the empty-graph fallback).

* CSR/tuple buffers grow on demand and are released through
  LG_FREE_WORK on every error path.

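Per-level cost in the local-move and refinement phases drops from
O(iters * (n_cur * GrB-call-overhead + nnz)) to
O(iters * (n_cur + nnz)).  The asymptotic edge work is unchanged; what
goes away is the three GraphBLAS calls per node per iteration
(GrB_Col_extract, GrB_Vector_nvals, GrB_Vector_extractTuples_FP64), so
there is no GraphBLAS overhead left in the hot loop.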

test_leiden still passes (Q = 0.4188 on karate, well above the 0.37
acceptance threshold).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 experimental/algorithm/LAGraph_Leiden.c | 175 +++++++++++++++++-------
 1 file changed, 124 insertions(+), 51 deletions(-)

diff --git a/experimental/algorithm/LAGraph_Leiden.c b/experimental/algorithm/LAGraph_Leiden.c
index d8a1eeec11..0ad3f9f6e6 100644
--- a/experimental/algorithm/LAGraph_Leiden.c
+++ b/experimental/algorithm/LAGraph_Leiden.c
@@ -66,7 +66,6 @@
 #define LG_FREE_WORK                                        \
 {                                                           \
     GrB_free (&k_vec) ;                                     \
-    GrB_free (&v) ;                                         \
     GrB_free (&A_agg) ;                                     \
     GrB_free (&A_new) ;                                     \
     GrB_free (&S_mat) ;                                     \
@@ -85,6 +84,14 @@
     LAGraph_Free ((void **) &remap,      NULL) ;            \
     LAGraph_Free ((void **) &o_comm,     NULL) ;            \
     LAGraph_Free ((void **) &init_comm,  NULL) ;            \
+    LAGraph_Free ((void **) &Ap,         NULL) ;            \
+    LAGraph_Free ((void **) &Aj,         NULL) ;            \
+    LAGraph_Free ((void **) &Ax,         NULL) ;            \
+    LAGraph_Free ((void **) &I_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &J_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &X_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &cursor,     NULL) ;            \
+    LAGraph_Free ((void **) &iota,       NULL) ;            \
 }
 
 #undef  LG_FREE_ALL
@@ -119,7 +126,6 @@ int LAGraph_Leiden
     //--------------------------------------------------------------------------
 
     GrB_Vector  k_vec      = NULL ;
-    GrB_Vector  v          = NULL ;
     GrB_Matrix  A_agg      = NULL ;   // owned coarsened graph (Phase 3)
     GrB_Matrix  A_new      = NULL ;   // next-level aggregate before ownership transfer
     GrB_Matrix  S_mat      = NULL ;   // temporary membership matrix (Phase 3)
@@ -139,6 +145,20 @@ int LAGraph_Leiden
     GrB_Index  *o_comm     = NULL ;   // o_comm[i] = community of original node i
     GrB_Index  *init_comm  = NULL ;   // init_comm[r] = initial c_arr for aggregate node r
 
+    // CSR materialization of A_cur, rebuilt once per outer aggregation level.
+    // Walking Ap/Aj/Ax directly avoids one GrB_Col_extract+extractTuples per
+    // node per inner-loop iteration (which dominates runtime).
+    GrB_Index  *Ap         = NULL ;   // row pointers, size n_cur+1
+    GrB_Index  *Aj         = NULL ;   // column indices, size Anz
+    double     *Ax         = NULL ;   // values, size Anz
+    GrB_Index  *I_tup      = NULL ;   // raw row indices from extractTuples
+    GrB_Index  *J_tup      = NULL ;   // raw col indices from extractTuples
+    double     *X_tup      = NULL ;   // raw values from extractTuples
+    GrB_Index  *cursor     = NULL ;   // scatter cursor for CSR build
+    GrB_Index  *iota       = NULL ;   // [0,1,...,n-1] for vector/matrix build
+    GrB_Index   Ap_cap     = 0 ;      // current allocated capacity of Ap (entries)
+    GrB_Index   Anz_cap    = 0 ;      // current allocated capacity of Aj/Ax/tuples
+
     //--------------------------------------------------------------------------
     // check inputs
     //--------------------------------------------------------------------------
@@ -184,6 +204,8 @@ int LAGraph_Leiden
     LG_TRY (LAGraph_Malloc ((void **) &remap,      n, sizeof (GrB_Index), msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &o_comm,     n, sizeof (GrB_Index), msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &init_comm,  n, sizeof (GrB_Index), msg)) ;
+    LG_TRY (LAGraph_Malloc ((void **) &iota,       n, sizeof (GrB_Index), msg)) ;
+    for (GrB_Index i = 0 ; i < n ; i++) iota[i] = i ;
 
     //--------------------------------------------------------------------------
     // compute m = total edge weight / 2 from G->A (invariant under aggregation)
@@ -203,11 +225,11 @@ int LAGraph_Leiden
     // Empty graph: return a singleton partition.
     if (m == 0.0)
     {
+        // c[i] = i for all i, built in one call instead of n setElement calls.
+        for (GrB_Index i = 0 ; i < n ; i++) c_arr[i] = (int64_t) i ;
         GRB_TRY (GrB_Vector_new (c_handle, GrB_INT64, n)) ;
-        for (GrB_Index i = 0 ; i < n ; i++)
-        {
-            GRB_TRY (GrB_Vector_setElement_INT64 (*c_handle, (int64_t) i, i)) ;
-        }
+        GRB_TRY (GrB_Vector_build_INT64 (*c_handle, iota, c_arr, n,
+            GrB_FIRST_INT64)) ;
         LG_FREE_WORK ;
         return (GrB_SUCCESS) ;
     }
@@ -240,22 +262,78 @@ int LAGraph_Leiden
         outer_changed = false ;
 
         //----------------------------------------------------------------------
-        // create GraphBLAS vectors sized for the current level
+        // Materialize A_cur into CSR (Ap, Aj, Ax) once per outer level.
+        //
+        // This replaces the per-node GrB_Col_extract + extractTuples in the
+        // Phase 1 / Phase 2 inner loops (which dominated runtime) with a
+        // single O(nnz) extraction + counting-sort scatter.
         //----------------------------------------------------------------------
 
-        GRB_TRY (GrB_Vector_new (&k_vec, GrB_FP64, n_cur)) ;
-        GRB_TRY (GrB_Vector_new (&v,     GrB_FP64, n_cur)) ;
+        GrB_Index Anz ;
+        GRB_TRY (GrB_Matrix_nvals (&Anz, A_cur)) ;
+
+        // (re)allocate CSR + tuple buffers as needed.
+        if (Ap_cap < n_cur + 1)
+        {
+            LAGraph_Free ((void **) &Ap,     NULL) ;
+            LAGraph_Free ((void **) &cursor, NULL) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Ap,     n_cur + 1,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &cursor, n_cur,
+                sizeof (GrB_Index), msg)) ;
+            Ap_cap = n_cur + 1 ;
+        }
+        if (Anz_cap < Anz)
+        {
+            // Grow with some slack to avoid repeated reallocs; aggregate
+            // graphs typically shrink monotonically but we don't rely on it.
+            GrB_Index newcap = (Anz < 16) ? 16 : Anz ;
+            LAGraph_Free ((void **) &Aj,    NULL) ;
+            LAGraph_Free ((void **) &Ax,    NULL) ;
+            LAGraph_Free ((void **) &I_tup, NULL) ;
+            LAGraph_Free ((void **) &J_tup, NULL) ;
+            LAGraph_Free ((void **) &X_tup, NULL) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Aj,    newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Ax,    newcap,
+                sizeof (double),    msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &I_tup, newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &J_tup, newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &X_tup, newcap,
+                sizeof (double),    msg)) ;
+            Anz_cap = newcap ;
+        }
+
+        // Build CSR: count rows, prefix-sum, scatter.
+        memset (Ap, 0, (n_cur + 1) * sizeof (GrB_Index)) ;
+        if (Anz > 0)
+        {
+            GrB_Index nout = Anz ;
+            GRB_TRY (GrB_Matrix_extractTuples_FP64 (
+                I_tup, J_tup, X_tup, &nout, A_cur)) ;
+            for (GrB_Index t = 0 ; t < Anz ; t++) Ap[I_tup[t] + 1]++ ;
+            for (GrB_Index i = 0 ; i < n_cur ; i++) Ap[i + 1] += Ap[i] ;
+            memcpy (cursor, Ap, n_cur * sizeof (GrB_Index)) ;
+            for (GrB_Index t = 0 ; t < Anz ; t++)
+            {
+                GrB_Index r   = I_tup[t] ;
+                GrB_Index pos = cursor[r]++ ;
+                Aj[pos] = J_tup[t] ;
+                Ax[pos] = X_tup[t] ;
+            }
+        }
 
         //----------------------------------------------------------------------
-        // compute k_arr[0..n_cur-1] from A_cur (includes self-loops in A_agg)
+        // compute k_arr[i] = sum of row i (includes self-loops in A_agg).
         //----------------------------------------------------------------------
 
-        GRB_TRY (GrB_Matrix_reduce_Monoid (k_vec, NULL, NULL,
-            GrB_PLUS_MONOID_FP64, A_cur, NULL)) ;
         for (GrB_Index i = 0 ; i < n_cur ; i++)
         {
-            GrB_Info info = GrB_Vector_extractElement_FP64 (&k_arr[i], k_vec, i) ;
-            if (info == GrB_NO_VALUE) k_arr[i] = 0.0 ;
+            double s = 0.0 ;
+            for (GrB_Index p = Ap[i] ; p < Ap[i + 1] ; p++) s += Ax[p] ;
+            k_arr[i] = s ;
         }
 
         //----------------------------------------------------------------------
@@ -286,23 +364,17 @@ int LAGraph_Leiden
 
                 int64_t ci = c_arr[i] ;
 
-                GRB_TRY (GrB_Col_extract (v, NULL, NULL, A_cur,
-                    GrB_ALL, n_cur, i, GrB_DESC_T0)) ;
-
-                GrB_Index nvals ;
-                GRB_TRY (GrB_Vector_nvals (&nvals, v)) ;
-                if (nvals == 0) continue ;
-
-                GRB_TRY (GrB_Vector_extractTuples_FP64 (
-                    nbrs_j, nbrs_v, &nvals, v)) ;
+                GrB_Index row_begin = Ap[i] ;
+                GrB_Index row_end   = Ap[i + 1] ;
+                if (row_begin == row_end) continue ;
 
                 // Temporarily remove i from community ci.
                 k_comm[ci] -= ki ;
 
                 GrB_Index ndirty = 0 ;
-                for (GrB_Index t = 0 ; t < nvals ; t++)
+                for (GrB_Index t = row_begin ; t < row_end ; t++)
                 {
-                    GrB_Index j = nbrs_j[t] ;
+                    GrB_Index j = Aj[t] ;
                     if (j == i) continue ;          // skip self-loop (in A_agg)
                     int64_t cj = c_arr[j] ;
                     if (!dirty[cj])
@@ -311,7 +383,7 @@ int LAGraph_Leiden
                         dirty_list[ndirty++] = (GrB_Index) cj ;
                         T_local[cj]          = 0.0 ;
                     }
-                    T_local[cj] += nbrs_v[t] ;
+                    T_local[cj] += Ax[t] ;
                 }
 
                 double  T_ci      = dirty[ci] ? T_local[ci] : 0.0 ;
@@ -376,22 +448,16 @@ int LAGraph_Leiden
                 int64_t pi     = c_p1[i] ;
                 int64_t ci_ref = c_ref[i] ;
 
-                GRB_TRY (GrB_Col_extract (v, NULL, NULL, A_cur,
-                    GrB_ALL, n_cur, i, GrB_DESC_T0)) ;
-
-                GrB_Index nvals ;
-                GRB_TRY (GrB_Vector_nvals (&nvals, v)) ;
-                if (nvals == 0) continue ;
-
-                GRB_TRY (GrB_Vector_extractTuples_FP64 (
-                    nbrs_j, nbrs_v, &nvals, v)) ;
+                GrB_Index row_begin = Ap[i] ;
+                GrB_Index row_end   = Ap[i + 1] ;
+                if (row_begin == row_end) continue ;
 
                 k_ref_comm[ci_ref] -= ki ;
 
                 GrB_Index ndirty = 0 ;
-                for (GrB_Index t = 0 ; t < nvals ; t++)
+                for (GrB_Index t = row_begin ; t < row_end ; t++)
                 {
-                    GrB_Index j = nbrs_j[t] ;
+                    GrB_Index j = Aj[t] ;
                     if (j == i) continue ;              // skip self-loop
                     if (c_p1[j] != pi) continue ;       // cross-parent: skip
 
@@ -402,7 +468,7 @@ int LAGraph_Leiden
                         dirty_list[ndirty++] = (GrB_Index) cj_ref ;
                         T_local[cj_ref]      = 0.0 ;
                     }
-                    T_local[cj_ref] += nbrs_v[t] ;
+                    T_local[cj_ref] += Ax[t] ;
                 }
 
                 double  T_ci_ref    = dirty[ci_ref] ? T_local[ci_ref] : 0.0 ;
@@ -487,21 +553,29 @@ int LAGraph_Leiden
         // PHASE 3: Aggregation — build coarsened graph if communities merged
         //----------------------------------------------------------------------
 
-        GrB_free (&k_vec) ;  k_vec = NULL ;
-        GrB_free (&v) ;      v     = NULL ;
-
         if (K_ref < n_cur)
         {
             outer_changed = true ;
 
-            // S_mat: n_cur × K_ref indicator matrix; S[i, c_ref[i]] = 1
-            GRB_TRY (GrB_Matrix_new (&S_mat, GrB_FP64, n_cur, K_ref)) ;
+            // S_mat: n_cur × K_ref indicator matrix; S[i, c_ref[i]] = 1.
+            // Build in one shot (single-pass) instead of n_cur setElement
+            // calls + an implicit GrB_Matrix_wait.  Reuse scratch buffers:
+            //   nbrs_j  -> row indices [0..n_cur)
+            //   dirty_list -> column indices (c_ref cast to GrB_Index)
+            //   nbrs_v  -> values (1.0)
+            // (All sized n >= n_cur; build copies them, no aliasing concern.)
             for (GrB_Index i = 0 ; i < n_cur ; i++)
             {
-                GRB_TRY (GrB_Matrix_setElement_FP64 (
-                    S_mat, 1.0, i, (GrB_Index) c_ref[i])) ;
+                nbrs_j[i]     = i ;
+                dirty_list[i] = (GrB_Index) c_ref[i] ;
+                nbrs_v[i]     = 1.0 ;
             }
-            GRB_TRY (GrB_Matrix_wait (S_mat, GrB_MATERIALIZE)) ;
+            // Each (i, c_ref[i]) row index is unique, so duplicates are
+            // impossible and the dup operator is irrelevant; pass a valid
+            // monoid for portability.
+            GRB_TRY (GrB_Matrix_new (&S_mat, GrB_FP64, n_cur, K_ref)) ;
+            GRB_TRY (GrB_Matrix_build_FP64 (S_mat, nbrs_j, dirty_list,
+                nbrs_v, n_cur, GrB_PLUS_FP64)) ;
 
             // A_temp = A_cur * S  (n_cur × K_ref)
             GRB_TRY (GrB_Matrix_new (&A_temp, GrB_FP64, n_cur, K_ref)) ;
@@ -525,15 +599,14 @@ int LAGraph_Leiden
     }
 
     //--------------------------------------------------------------------------
-    // Build output GrB_Vector from o_comm
+    // Build output GrB_Vector from o_comm in one call (vs n setElement calls).
     // (o_comm values are already relabeled 0..K_final-1 from the last iteration)
     //--------------------------------------------------------------------------
 
+    for (GrB_Index i = 0 ; i < n ; i++) c_arr[i] = (int64_t) o_comm[i] ;
     GRB_TRY (GrB_Vector_new (c_handle, GrB_INT64, n)) ;
-    for (GrB_Index i = 0 ; i < n ; i++)
-    {
-        GRB_TRY (GrB_Vector_setElement_INT64 (*c_handle, (int64_t) o_comm[i], i)) ;
-    }
+    GRB_TRY (GrB_Vector_build_INT64 (*c_handle, iota, c_arr, n,
+        GrB_FIRST_INT64)) ;
 
     LG_FREE_WORK ;
     return (GrB_SUCCESS) ;

From d02f37183ed8f69e09146b949da0e49d571df257 Mon Sep 17 00:00:00 2001
From: Guy Korland <gkorland@gmail.com>
Date: Tue, 5 May 2026 09:15:02 +0300
Subject: [PATCH 2/3] perf(leiden): use SuiteSparse Container API for CSR
 access

Address review feedback from @GomezGab on PR #406:

* Replace GrB_Matrix_extractTuples_FP64 + counting-sort scatter with
  GxB_unload_Matrix_into_Container, giving direct pointer access to the
  matrix's internal CSR arrays (Ap/Aj/Ax) with no copy.  Force
  sparse + row-major + non-iso + 64-bit indices via GrB_Matrix_set_INT32
  hints + GrB_wait(GrB_MATERIALIZE), then unload through the container's
  p/i/x vectors with GxB_Vector_unload.  Reload the container before the
  Phase-3 mxm so A_cur is once again a usable GraphBLAS matrix; null our
  raw-array pointers after reload to avoid double-free.

* Compute degrees via GrB_Matrix_reduce_Monoid with GrB_PLUS_FP64 accum
  on a pre-zero-filled k_vec (handles isolated rows), then GxB_Vector_unload
  to obtain k_arr.  k_vec is recreated at size n_cur each outer level.

* Use GxB_Matrix_build_Scalar with a shared GrB_Scalar (=1.0) for the
  S_mat indicator matrix; reuse iota for row indices and reinterpret-cast
  c_ref (int64_t*) directly as GrB_Index* (values are non-negative
  community labels), removing the per-level value array and the temporary
  c_ref copy.

* Use GxB_Vector_load with move semantics to hand the o_comm buffer
  directly to the output vector (with GxB_FULL sparsity), avoiding the
  final GrB_Vector_build_INT64 copy.

* Duplicate G->A as GrB_FP64 once at start (after the m == 0 early
  return) so A_cur is always owned, FP64-typed, and may be unloaded.
  Required because G->A may be any type (BOOL on pattern-only Matrix
  Market inputs, INT*, FP32, ...); typecasting once removes the need
  for per-edge type handling in the inner loops.

test_leiden on karate.mtx: Q = 0.418803, identical to baseline.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 experimental/algorithm/LAGraph_Leiden.c | 293 +++++++++++++-----------
 1 file changed, 161 insertions(+), 132 deletions(-)

diff --git a/experimental/algorithm/LAGraph_Leiden.c b/experimental/algorithm/LAGraph_Leiden.c
index 0ad3f9f6e6..1d4a9a32a4 100644
--- a/experimental/algorithm/LAGraph_Leiden.c
+++ b/experimental/algorithm/LAGraph_Leiden.c
@@ -70,6 +70,8 @@
     GrB_free (&A_new) ;                                     \
     GrB_free (&S_mat) ;                                     \
     GrB_free (&A_temp) ;                                    \
+    GrB_free (&one_scalar) ;                                \
+    GxB_Container_free (&cont) ;                            \
     LAGraph_Free ((void **) &k_arr,      NULL) ;            \
     LAGraph_Free ((void **) &c_arr,      NULL) ;            \
     LAGraph_Free ((void **) &k_comm,     NULL) ;            \
@@ -79,18 +81,12 @@
     LAGraph_Free ((void **) &T_local,    NULL) ;            \
     LAGraph_Free ((void **) &dirty,      NULL) ;            \
     LAGraph_Free ((void **) &dirty_list, NULL) ;            \
-    LAGraph_Free ((void **) &nbrs_j,     NULL) ;            \
-    LAGraph_Free ((void **) &nbrs_v,     NULL) ;            \
     LAGraph_Free ((void **) &remap,      NULL) ;            \
     LAGraph_Free ((void **) &o_comm,     NULL) ;            \
     LAGraph_Free ((void **) &init_comm,  NULL) ;            \
     LAGraph_Free ((void **) &Ap,         NULL) ;            \
     LAGraph_Free ((void **) &Aj,         NULL) ;            \
     LAGraph_Free ((void **) &Ax,         NULL) ;            \
-    LAGraph_Free ((void **) &I_tup,      NULL) ;            \
-    LAGraph_Free ((void **) &J_tup,      NULL) ;            \
-    LAGraph_Free ((void **) &X_tup,      NULL) ;            \
-    LAGraph_Free ((void **) &cursor,     NULL) ;            \
     LAGraph_Free ((void **) &iota,       NULL) ;            \
 }
 
@@ -125,39 +121,36 @@ int LAGraph_Leiden
     // LG_FREE_ALL can safely free them even on early exit)
     //--------------------------------------------------------------------------
 
-    GrB_Vector  k_vec      = NULL ;
-    GrB_Matrix  A_agg      = NULL ;   // owned coarsened graph (Phase 3)
-    GrB_Matrix  A_new      = NULL ;   // next-level aggregate before ownership transfer
-    GrB_Matrix  S_mat      = NULL ;   // temporary membership matrix (Phase 3)
-    GrB_Matrix  A_temp     = NULL ;   // temporary for mxm (Phase 3)
-    double     *k_arr      = NULL ;   // k_arr[i]      = degree of node i (current level)
-    int64_t    *c_arr      = NULL ;   // c_arr[i]      = Phase-1 community label
-    double     *k_comm     = NULL ;   // k_comm[l]     = total degree of community l
-    int64_t    *c_p1       = NULL ;   // c_p1[i]       = Phase-1 parent community
-    int64_t    *c_ref      = NULL ;   // c_ref[i]      = refined sub-community label
-    double     *k_ref_comm = NULL ;   // k_ref_comm[l] = total degree of sub-community l
-    double     *T_local    = NULL ;   // scratch: edge sums from node i to each community
-    int8_t     *dirty      = NULL ;   // dirty[l] = 1 if T_local[l] was written
-    GrB_Index  *dirty_list = NULL ;   // list of community labels touched this node
-    GrB_Index  *nbrs_j     = NULL ;   // scratch: extracted neighbor indices
-    double     *nbrs_v     = NULL ;   // scratch: extracted neighbor weights
-    GrB_Index  *remap      = NULL ;   // remap[old_label] -> new contiguous label
-    GrB_Index  *o_comm     = NULL ;   // o_comm[i] = community of original node i
-    GrB_Index  *init_comm  = NULL ;   // init_comm[r] = initial c_arr for aggregate node r
-
-    // CSR materialization of A_cur, rebuilt once per outer aggregation level.
-    // Walking Ap/Aj/Ax directly avoids one GrB_Col_extract+extractTuples per
-    // node per inner-loop iteration (which dominates runtime).
+    GrB_Vector    k_vec      = NULL ;
+    GrB_Matrix    A_agg      = NULL ;   // owned coarsened graph (Phase 3)
+    GrB_Matrix    A_new      = NULL ;   // next-level aggregate before ownership transfer
+    GrB_Matrix    S_mat      = NULL ;   // temporary membership matrix (Phase 3)
+    GrB_Matrix    A_temp     = NULL ;   // temporary for mxm (Phase 3)
+    GrB_Scalar    one_scalar = NULL ;   // FP64 scalar with value 1.0 for build_Scalar
+    GxB_Container cont       = NULL ;   // for unloading A_cur into raw CSR arrays
+    double       *k_arr      = NULL ;   // k_arr[i]      = degree of node i (current level)
+    int64_t      *c_arr      = NULL ;   // c_arr[i]      = Phase-1 community label
+    double       *k_comm     = NULL ;   // k_comm[l]     = total degree of community l
+    int64_t      *c_p1       = NULL ;   // c_p1[i]       = Phase-1 parent community
+    int64_t      *c_ref      = NULL ;   // c_ref[i]      = refined sub-community label
+    double       *k_ref_comm = NULL ;   // k_ref_comm[l] = total degree of sub-community l
+    double       *T_local    = NULL ;   // scratch: edge sums from node i to each community
+    int8_t       *dirty      = NULL ;   // dirty[l] = 1 if T_local[l] was written
+    GrB_Index    *dirty_list = NULL ;   // list of community labels touched this node
+    GrB_Index    *remap      = NULL ;   // remap[old_label] -> new contiguous label
+    int64_t      *o_comm     = NULL ;   // o_comm[i] = community of original node i
+    GrB_Index    *init_comm  = NULL ;   // init_comm[r] = initial c_arr for aggregate node r
+
+    // Raw CSR pointers obtained by unloading A_cur once per outer aggregation
+    // level via the SuiteSparse Container API.  Walking Ap/Aj/Ax directly
+    // avoids one GrB_Col_extract+extractTuples per node per inner-loop
+    // iteration (which dominated runtime).  These are owned by us only while
+    // unloaded; ownership returns to GraphBLAS on reload (when we then null
+    // them so LG_FREE_WORK doesn't double-free).
     GrB_Index  *Ap         = NULL ;   // row pointers, size n_cur+1
     GrB_Index  *Aj         = NULL ;   // column indices, size Anz
-    double     *Ax         = NULL ;   // values, size Anz
-    GrB_Index  *I_tup      = NULL ;   // raw row indices from extractTuples
-    GrB_Index  *J_tup      = NULL ;   // raw col indices from extractTuples
-    double     *X_tup      = NULL ;   // raw values from extractTuples
-    GrB_Index  *cursor     = NULL ;   // scatter cursor for CSR build
+    double     *Ax         = NULL ;   // values, size Anz (only if !iso)
     GrB_Index  *iota       = NULL ;   // [0,1,...,n-1] for vector/matrix build
-    GrB_Index   Ap_cap     = 0 ;      // current allocated capacity of Ap (entries)
-    GrB_Index   Anz_cap    = 0 ;      // current allocated capacity of Aj/Ax/tuples
 
     //--------------------------------------------------------------------------
     // check inputs
@@ -199,14 +192,19 @@ int LAGraph_Leiden
     LG_TRY (LAGraph_Malloc ((void **) &T_local,    n, sizeof (double),    msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &dirty,      n, sizeof (int8_t),    msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &dirty_list, n, sizeof (GrB_Index), msg)) ;
-    LG_TRY (LAGraph_Malloc ((void **) &nbrs_j,     n, sizeof (GrB_Index), msg)) ;
-    LG_TRY (LAGraph_Malloc ((void **) &nbrs_v,     n, sizeof (double),    msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &remap,      n, sizeof (GrB_Index), msg)) ;
-    LG_TRY (LAGraph_Malloc ((void **) &o_comm,     n, sizeof (GrB_Index), msg)) ;
+    LG_TRY (LAGraph_Malloc ((void **) &o_comm,     n, sizeof (int64_t),   msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &init_comm,  n, sizeof (GrB_Index), msg)) ;
     LG_TRY (LAGraph_Malloc ((void **) &iota,       n, sizeof (GrB_Index), msg)) ;
     for (GrB_Index i = 0 ; i < n ; i++) iota[i] = i ;
 
+    // Reusable FP64 scalar with value 1.0 for GxB_Matrix_build_Scalar.
+    GRB_TRY (GrB_Scalar_new (&one_scalar, GrB_FP64)) ;
+    GRB_TRY (GrB_Scalar_setElement_FP64 (one_scalar, 1.0)) ;
+
+    // Reusable container for unloading A_cur into raw CSR arrays per level.
+    GRB_TRY (GxB_Container_new (&cont)) ;
+
     //--------------------------------------------------------------------------
     // compute m = total edge weight / 2 from G->A (invariant under aggregation)
     //--------------------------------------------------------------------------
@@ -245,11 +243,19 @@ int LAGraph_Leiden
 
     for (GrB_Index i = 0 ; i < n ; i++)
     {
-        o_comm[i]    = i ;
+        o_comm[i]    = (int64_t) i ;
         init_comm[i] = i ;
     }
 
-    GrB_Matrix A_cur = A ;   // current-level graph (not owned: either G->A or A_agg)
+    // Duplicate G->A as FP64 so we own A_cur and may unload it via the
+    // container API expecting FP64 values.  Done after the m == 0 early
+    // return to avoid an unused copy for empty-edge graphs.  G->A may be
+    // any numeric type (BOOL on pattern-only matrices, INT*, FP32, ...);
+    // we typecast once here so the inner-loop CSR walks always read double.
+    GRB_TRY (GrB_Matrix_new (&A_agg, GrB_FP64, n, n)) ;
+    GRB_TRY (GrB_Matrix_assign (A_agg, NULL, NULL, A,
+        GrB_ALL, n, GrB_ALL, n, NULL)) ;
+    GrB_Matrix A_cur = A_agg ;
     GrB_Index  n_cur = n ;
 
     //==========================================================================
@@ -262,79 +268,89 @@ int LAGraph_Leiden
         outer_changed = false ;
 
         //----------------------------------------------------------------------
-        // Materialize A_cur into CSR (Ap, Aj, Ax) once per outer level.
+        // Compute degrees k_arr[i] = sum of row i (includes self-loops in
+        // A_agg) and unload A_cur into raw CSR arrays Ap/Aj/Ax via the
+        // SuiteSparse Container API.  This replaces O(n_cur) GrB_Col_extract
+        // + extractTuples calls per inner iteration with direct pointer walks.
         //
-        // This replaces the per-node GrB_Col_extract + extractTuples in the
-        // Phase 1 / Phase 2 inner loops (which dominated runtime) with a
-        // single O(nnz) extraction + counting-sort scatter.
+        // Force A_cur into a deterministic state before unloading: dense
+        // reduce target for k, sparse + row-major + non-iso + 64-bit indices
+        // for A.  Container fields are then asserted to match expectations.
         //----------------------------------------------------------------------
 
-        GrB_Index Anz ;
-        GRB_TRY (GrB_Matrix_nvals (&Anz, A_cur)) ;
-
-        // (re)allocate CSR + tuple buffers as needed.
-        if (Ap_cap < n_cur + 1)
+        // 1) Compute degrees with GraphBLAS reduce *before* unloading the
+        //    matrix.  Zero-fill k_vec first so isolated rows produce 0.0
+        //    (otherwise reduce leaves them as missing entries).
+        GrB_free (&k_vec) ;
+        GRB_TRY (GrB_Vector_new (&k_vec, GrB_FP64, n_cur)) ;
+        GRB_TRY (GrB_assign (k_vec, NULL, NULL, (double) 0.0,
+            GrB_ALL, n_cur, NULL)) ;
+        GRB_TRY (GrB_Matrix_reduce_Monoid (k_vec, NULL, GrB_PLUS_FP64,
+            GrB_PLUS_MONOID_FP64, A_cur, NULL)) ;
+
+        // Unload k_vec into k_arr (dense FP64 array of length n_cur).
+        // The previous k_arr workspace allocation is replaced by a pointer
+        // owned by GraphBLAS until LAGraph_Free reclaims it.
+        LAGraph_Free ((void **) &k_arr, NULL) ;
         {
-            LAGraph_Free ((void **) &Ap,     NULL) ;
-            LAGraph_Free ((void **) &cursor, NULL) ;
-            LG_TRY (LAGraph_Malloc ((void **) &Ap,     n_cur + 1,
-                sizeof (GrB_Index), msg)) ;
-            LG_TRY (LAGraph_Malloc ((void **) &cursor, n_cur,
-                sizeof (GrB_Index), msg)) ;
-            Ap_cap = n_cur + 1 ;
-        }
-        if (Anz_cap < Anz)
-        {
-            // Grow with some slack to avoid repeated reallocs; aggregate
-            // graphs typically shrink monotonically but we don't rely on it.
-            GrB_Index newcap = (Anz < 16) ? 16 : Anz ;
-            LAGraph_Free ((void **) &Aj,    NULL) ;
-            LAGraph_Free ((void **) &Ax,    NULL) ;
-            LAGraph_Free ((void **) &I_tup, NULL) ;
-            LAGraph_Free ((void **) &J_tup, NULL) ;
-            LAGraph_Free ((void **) &X_tup, NULL) ;
-            LG_TRY (LAGraph_Malloc ((void **) &Aj,    newcap,
-                sizeof (GrB_Index), msg)) ;
-            LG_TRY (LAGraph_Malloc ((void **) &Ax,    newcap,
-                sizeof (double),    msg)) ;
-            LG_TRY (LAGraph_Malloc ((void **) &I_tup, newcap,
-                sizeof (GrB_Index), msg)) ;
-            LG_TRY (LAGraph_Malloc ((void **) &J_tup, newcap,
-                sizeof (GrB_Index), msg)) ;
-            LG_TRY (LAGraph_Malloc ((void **) &X_tup, newcap,
-                sizeof (double),    msg)) ;
-            Anz_cap = newcap ;
+            GrB_Type k_type = NULL ;
+            uint64_t k_n = 0, k_size = 0 ;
+            int      k_handling = GrB_DEFAULT ;
+            void    *k_void = NULL ;
+            GRB_TRY (GxB_Vector_unload (k_vec, &k_void, &k_type, &k_n,
+                &k_size, &k_handling, NULL)) ;
+            LG_ASSERT_MSG (k_type == GrB_FP64 && k_n == n_cur,
+                GrB_INVALID_VALUE,
+                "k_vec unload: unexpected type or length") ;
+            k_arr = (double *) k_void ;
         }
 
-        // Build CSR: count rows, prefix-sum, scatter.
-        memset (Ap, 0, (n_cur + 1) * sizeof (GrB_Index)) ;
-        if (Anz > 0)
-        {
-            GrB_Index nout = Anz ;
-            GRB_TRY (GrB_Matrix_extractTuples_FP64 (
-                I_tup, J_tup, X_tup, &nout, A_cur)) ;
-            for (GrB_Index t = 0 ; t < Anz ; t++) Ap[I_tup[t] + 1]++ ;
-            for (GrB_Index i = 0 ; i < n_cur ; i++) Ap[i + 1] += Ap[i] ;
-            memcpy (cursor, Ap, n_cur * sizeof (GrB_Index)) ;
-            for (GrB_Index t = 0 ; t < Anz ; t++)
-            {
-                GrB_Index r   = I_tup[t] ;
-                GrB_Index pos = cursor[r]++ ;
-                Aj[pos] = J_tup[t] ;
-                Ax[pos] = X_tup[t] ;
-            }
-        }
-
-        //----------------------------------------------------------------------
-        // compute k_arr[i] = sum of row i (includes self-loops in A_agg).
-        //----------------------------------------------------------------------
-
-        for (GrB_Index i = 0 ; i < n_cur ; i++)
-        {
-            double s = 0.0 ;
-            for (GrB_Index p = Ap[i] ; p < Ap[i + 1] ; p++) s += Ax[p] ;
-            k_arr[i] = s ;
-        }
+        // 2) Force A_cur into the format we want, then unload into container.
+        //    Hints: sparse, row-major, non-iso, 64-bit row pointers/indices.
+        //    GxB_unload_Matrix_into_Container materializes pending work.
+        GRB_TRY (GrB_set (A_cur, GxB_SPARSE, GxB_SPARSITY_CONTROL)) ;
+        GRB_TRY (GrB_set (A_cur, (int32_t) GrB_ROWMAJOR,
+            GrB_STORAGE_ORIENTATION_HINT)) ;
+        GRB_TRY (GrB_Matrix_set_INT32 (A_cur, false, GxB_ISO)) ;
+        GRB_TRY (GrB_Matrix_set_INT32 (A_cur, 64, GxB_OFFSET_INTEGER_HINT)) ;
+        GRB_TRY (GrB_Matrix_set_INT32 (A_cur, 64, GxB_ROWINDEX_INTEGER_HINT)) ;
+        GRB_TRY (GrB_Matrix_set_INT32 (A_cur, 64, GxB_COLINDEX_INTEGER_HINT)) ;
+        GRB_TRY (GrB_wait (A_cur, GrB_MATERIALIZE)) ;
+
+        GRB_TRY (GxB_unload_Matrix_into_Container (A_cur, cont, NULL)) ;
+        LG_ASSERT_MSG (cont->format == GxB_SPARSE,
+            GrB_INVALID_VALUE, "A_cur container is not sparse CSR") ;
+        LG_ASSERT_MSG (cont->orientation == GrB_ROWMAJOR,
+            GrB_INVALID_VALUE, "A_cur container is not row-major") ;
+        LG_ASSERT_MSG (!cont->iso,
+            GrB_INVALID_VALUE, "A_cur container unexpectedly iso") ;
+
+        // Unload row pointers, column indices, and values from the container's
+        // internal vectors into raw arrays for the inner-loop CSR walks.
+        // We hold these arrays as our own until reload below.
+        GrB_Type   pty = NULL,  ity = NULL,  xty = NULL ;
+        uint64_t   pn  = 0,     in_  = 0,    xn  = 0 ;
+        uint64_t   psz = 0,     isz = 0,     xsz = 0 ;
+        int        ph  = GrB_DEFAULT, ih = GrB_DEFAULT, xh = GrB_DEFAULT ;
+        void      *pv  = NULL,  *iv  = NULL, *xv  = NULL ;
+
+        GRB_TRY (GxB_Vector_unload (cont->p, &pv, &pty, &pn, &psz, &ph, NULL));
+        GRB_TRY (GxB_Vector_unload (cont->i, &iv, &ity, &in_, &isz, &ih, NULL));
+        GRB_TRY (GxB_Vector_unload (cont->x, &xv, &xty, &xn, &xsz, &xh, NULL));
+        // Offsets must be 64-bit unsigned (we forced via INTEGER_HINT).
+        // Column indices may come back as either UINT64 or INT64 depending
+        // on SuiteSparse's internal choice; both have identical bit width
+        // and represent non-negative indices, so reinterpret cast is safe.
+        // Values must be FP64 since A_agg was constructed as FP64.
+        LG_ASSERT_MSG (pty == GrB_UINT64,
+            GrB_INVALID_VALUE, "container offsets are not 64-bit unsigned") ;
+        LG_ASSERT_MSG (ity == GrB_UINT64 || ity == GrB_INT64,
+            GrB_INVALID_VALUE, "container indices are not 64-bit") ;
+        LG_ASSERT_MSG (xty == GrB_FP64,
+            GrB_INVALID_VALUE, "container values are not FP64") ;
+        Ap = (GrB_Index *) pv ;
+        Aj = (GrB_Index *) iv ;
+        Ax = (double    *) xv ;
 
         //----------------------------------------------------------------------
         // PHASE 1: Local Move Phase
@@ -546,9 +562,26 @@ int LAGraph_Leiden
 
         for (GrB_Index i = 0 ; i < n ; i++)
         {
-            o_comm[i] = (GrB_Index) c_ref[o_comm[i]] ;
+            o_comm[i] = c_ref[o_comm[i]] ;
         }
 
+        //----------------------------------------------------------------------
+        // Reload A_cur from the container before any further GraphBLAS use
+        // (Phase 3 mxm or next-level unload).  Ownership of Ap/Aj/Ax returns
+        // to GraphBLAS; we null our pointers so LG_FREE_WORK won't double-free.
+        //----------------------------------------------------------------------
+
+        GRB_TRY (GxB_Vector_load (cont->p, (void **) &Ap, pty,
+            pn, psz, ph, NULL)) ;
+        Ap = NULL ;
+        GRB_TRY (GxB_Vector_load (cont->i, (void **) &Aj, ity,
+            in_, isz, ih, NULL)) ;
+        Aj = NULL ;
+        GRB_TRY (GxB_Vector_load (cont->x, (void **) &Ax, xty,
+            xn, xsz, xh, NULL)) ;
+        Ax = NULL ;
+        GRB_TRY (GxB_load_Matrix_from_Container (A_cur, cont, NULL)) ;
+
         //----------------------------------------------------------------------
         // PHASE 3: Aggregation — build coarsened graph if communities merged
         //----------------------------------------------------------------------
@@ -557,25 +590,17 @@ int LAGraph_Leiden
         {
             outer_changed = true ;
 
-            // S_mat: n_cur × K_ref indicator matrix; S[i, c_ref[i]] = 1.
-            // Build in one shot (single-pass) instead of n_cur setElement
-            // calls + an implicit GrB_Matrix_wait.  Reuse scratch buffers:
-            //   nbrs_j  -> row indices [0..n_cur)
-            //   dirty_list -> column indices (c_ref cast to GrB_Index)
-            //   nbrs_v  -> values (1.0)
-            // (All sized n >= n_cur; build copies them, no aliasing concern.)
-            for (GrB_Index i = 0 ; i < n_cur ; i++)
-            {
-                nbrs_j[i]     = i ;
-                dirty_list[i] = (GrB_Index) c_ref[i] ;
-                nbrs_v[i]     = 1.0 ;
-            }
-            // Each (i, c_ref[i]) row index is unique, so duplicates are
-            // impossible and the dup operator is irrelevant; pass a valid
-            // monoid for portability.
+            // S_mat: n_cur × K_ref indicator matrix; S[i, c_ref[i]] = 1 for
+            // every i.  All values are 1.0, so use GxB_Matrix_build_Scalar
+            // and a shared scalar instead of materializing a 1.0-array.
+            //   rows: iota (precomputed [0..n-1], reused)
+            //   cols: c_ref reinterpret-cast to GrB_Index*.  c_ref values
+            //         are non-negative community labels in [0, K_ref); on
+            //         all targeted platforms int64_t and uint64_t share the
+            //         same width and representation for non-negative values.
             GRB_TRY (GrB_Matrix_new (&S_mat, GrB_FP64, n_cur, K_ref)) ;
-            GRB_TRY (GrB_Matrix_build_FP64 (S_mat, nbrs_j, dirty_list,
-                nbrs_v, n_cur, GrB_PLUS_FP64)) ;
+            GRB_TRY (GxB_Matrix_build_Scalar (S_mat, iota,
+                (GrB_Index *) c_ref, one_scalar, n_cur)) ;
 
             // A_temp = A_cur * S  (n_cur × K_ref)
             GRB_TRY (GrB_Matrix_new (&A_temp, GrB_FP64, n_cur, K_ref)) ;
@@ -599,14 +624,18 @@ int LAGraph_Leiden
     }
 
     //--------------------------------------------------------------------------
-    // Build output GrB_Vector from o_comm in one call (vs n setElement calls).
-    // (o_comm values are already relabeled 0..K_final-1 from the last iteration)
+    // Build output GrB_Vector from o_comm with move semantics: hand the
+    // o_comm buffer directly to GraphBLAS (no copy) and null our pointer so
+    // LG_FREE_WORK doesn't double-free.  o_comm values are already relabeled
+    // 0..K_final-1 from the last iteration; the loaded vector is "full"
+    // (every index has a value), so set sparsity hint accordingly.
     //--------------------------------------------------------------------------
 
-    for (GrB_Index i = 0 ; i < n ; i++) c_arr[i] = (int64_t) o_comm[i] ;
     GRB_TRY (GrB_Vector_new (c_handle, GrB_INT64, n)) ;
-    GRB_TRY (GrB_Vector_build_INT64 (*c_handle, iota, c_arr, n,
-        GrB_FIRST_INT64)) ;
+    GRB_TRY (GrB_set (*c_handle, GxB_FULL, GxB_SPARSITY_CONTROL)) ;
+    GRB_TRY (GxB_Vector_load (*c_handle, (void **) &o_comm, GrB_INT64,
+        n, n * sizeof (int64_t), GrB_DEFAULT, NULL)) ;
+    o_comm = NULL ;     // ownership transferred to *c_handle
 
     LG_FREE_WORK ;
     return (GrB_SUCCESS) ;

From a76bbb3a8645eb1e9d519d13b0174c15cbfc4743 Mon Sep 17 00:00:00 2001
From: Guy Korland <gkorland@gmail.com>
Date: Tue, 5 May 2026 09:27:06 +0300
Subject: [PATCH 3/3] fix(leiden): guard Container API behind GxB v10 version
 check

The previous commit unconditionally used GxB_Container, GxB_Vector_load,
GxB_Vector_unload, and GxB_load_Matrix_from_Container, which are only
available in SuiteSparse:GraphBLAS v10.0.0 and newer.  CI tests against
both v9.0.0 and v10.2.0, so the v9 build failed with implicit-declaration
errors.

Wrap the Container-based fast path in
'#if LAGR_LEIDEN_USE_CONTAINER' (defined to 1 when
GxB_IMPLEMENTATION >= GxB_VERSION(10,0,0), else 0) and provide a v9
fallback that materializes CSR via GrB_Matrix_extractTuples_FP64 +
counting-sort scatter (the same approach as commit 9a1daf07).

Both code paths share the FP64 typecast of G->A into A_agg, the
GrB_Matrix_reduce_Monoid call that computes degrees,
GxB_Matrix_build_Scalar for the indicator matrix S_mat (build_Scalar
exists in v9), and the rest of the algorithm.

The macro can be overridden with -DLAGR_LEIDEN_USE_CONTAINER=0/1 to
exercise either path on the same SuiteSparse install (used to verify
the fallback compiles on the v10 dev machine).

test_leiden on karate.mtx: Q = 0.418803 (unchanged from baseline).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 experimental/algorithm/LAGraph_Leiden.c | 143 +++++++++++++++++++++---
 1 file changed, 127 insertions(+), 16 deletions(-)

diff --git a/experimental/algorithm/LAGraph_Leiden.c b/experimental/algorithm/LAGraph_Leiden.c
index 1d4a9a32a4..a810fc86f4 100644
--- a/experimental/algorithm/LAGraph_Leiden.c
+++ b/experimental/algorithm/LAGraph_Leiden.c
@@ -11,6 +11,27 @@
 // funding and support from the U.S. Government (see Acknowledgments.txt file).
 // DM22-0790
 
+#include "LG_internal.h"
+#include <LAGraphX.h>
+#include <LAGraph.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define LEIDEN_MAX_ITER 100
+
+// The CSR fast path uses the SuiteSparse:GraphBLAS Container API
+// (GxB_Container, GxB_unload_Matrix_into_Container, GxB_Vector_load /
+// GxB_Vector_unload, GxB_load_Matrix_from_Container), introduced in
+// SuiteSparse:GraphBLAS v10.0.0.  On older versions we fall back to a
+// CSR materialization via GrB_Matrix_extractTuples + counting-sort scatter.
+#ifndef LAGR_LEIDEN_USE_CONTAINER
+#if defined(GxB_IMPLEMENTATION) && (GxB_IMPLEMENTATION >= GxB_VERSION(10,0,0))
+#define LAGR_LEIDEN_USE_CONTAINER 1
+#else
+#define LAGR_LEIDEN_USE_CONTAINER 0
+#endif
+#endif
+
 //------------------------------------------------------------------------------
 // The Leiden algorithm is a modularity-based community detection method that
 // guarantees well-connected communities by introducing a Refinement phase
@@ -71,7 +92,7 @@
     GrB_free (&S_mat) ;                                     \
     GrB_free (&A_temp) ;                                    \
     GrB_free (&one_scalar) ;                                \
-    GxB_Container_free (&cont) ;                            \
+    LAGR_LEIDEN_FREE_CONTAINER ;                            \
     LAGraph_Free ((void **) &k_arr,      NULL) ;            \
     LAGraph_Free ((void **) &c_arr,      NULL) ;            \
     LAGraph_Free ((void **) &k_comm,     NULL) ;            \
@@ -87,6 +108,10 @@
     LAGraph_Free ((void **) &Ap,         NULL) ;            \
     LAGraph_Free ((void **) &Aj,         NULL) ;            \
     LAGraph_Free ((void **) &Ax,         NULL) ;            \
+    LAGraph_Free ((void **) &I_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &J_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &X_tup,      NULL) ;            \
+    LAGraph_Free ((void **) &cursor,     NULL) ;            \
     LAGraph_Free ((void **) &iota,       NULL) ;            \
 }
 
@@ -97,13 +122,11 @@
     if (c_handle != NULL) GrB_free (c_handle) ;             \
 }
 
-#include "LG_internal.h"
-#include <LAGraphX.h>
-#include <LAGraph.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define LEIDEN_MAX_ITER 100
+#if LAGR_LEIDEN_USE_CONTAINER
+#define LAGR_LEIDEN_FREE_CONTAINER GxB_Container_free (&cont)
+#else
+#define LAGR_LEIDEN_FREE_CONTAINER ((void) 0)
+#endif
 
 int LAGraph_Leiden
 (
@@ -127,7 +150,9 @@ int LAGraph_Leiden
     GrB_Matrix    S_mat      = NULL ;   // temporary membership matrix (Phase 3)
     GrB_Matrix    A_temp     = NULL ;   // temporary for mxm (Phase 3)
     GrB_Scalar    one_scalar = NULL ;   // FP64 scalar with value 1.0 for build_Scalar
+#if LAGR_LEIDEN_USE_CONTAINER
     GxB_Container cont       = NULL ;   // for unloading A_cur into raw CSR arrays
+#endif
     double       *k_arr      = NULL ;   // k_arr[i]      = degree of node i (current level)
     int64_t      *c_arr      = NULL ;   // c_arr[i]      = Phase-1 community label
     double       *k_comm     = NULL ;   // k_comm[l]     = total degree of community l
@@ -141,16 +166,21 @@ int LAGraph_Leiden
     int64_t      *o_comm     = NULL ;   // o_comm[i] = community of original node i
     GrB_Index    *init_comm  = NULL ;   // init_comm[r] = initial c_arr for aggregate node r
 
-    // Raw CSR pointers obtained by unloading A_cur once per outer aggregation
-    // level via the SuiteSparse Container API.  Walking Ap/Aj/Ax directly
-    // avoids one GrB_Col_extract+extractTuples per node per inner-loop
-    // iteration (which dominated runtime).  These are owned by us only while
-    // unloaded; ownership returns to GraphBLAS on reload (when we then null
-    // them so LG_FREE_WORK doesn't double-free).
+    // Raw CSR pointers for inner-loop walks.  On v10+ they are obtained by
+    // unloading A_cur into the SuiteSparse Container (zero-copy); ownership
+    // returns to GraphBLAS on reload (we then null them so LG_FREE_WORK
+    // doesn't double-free).  On older versions they are allocated by us
+    // and (re)filled per level via GrB_Matrix_extractTuples + counting-sort.
     GrB_Index  *Ap         = NULL ;   // row pointers, size n_cur+1
     GrB_Index  *Aj         = NULL ;   // column indices, size Anz
-    double     *Ax         = NULL ;   // values, size Anz (only if !iso)
+    double     *Ax         = NULL ;   // values, size Anz (only if !iso on v10)
+    GrB_Index  *I_tup      = NULL ;   // raw row indices from extractTuples (v9 fallback)
+    GrB_Index  *J_tup      = NULL ;   // raw col indices from extractTuples (v9 fallback)
+    double     *X_tup      = NULL ;   // raw values from extractTuples (v9 fallback)
+    GrB_Index  *cursor     = NULL ;   // scatter cursor for CSR build (v9 fallback)
     GrB_Index  *iota       = NULL ;   // [0,1,...,n-1] for vector/matrix build
+    GrB_Index   Ap_cap     = 0 ;      // current allocated capacity of Ap (v9 fallback)
+    GrB_Index   Anz_cap    = 0 ;      // current allocated capacity of Aj/Ax/tuples (v9 fallback)
 
     //--------------------------------------------------------------------------
     // check inputs
@@ -202,8 +232,11 @@ int LAGraph_Leiden
     GRB_TRY (GrB_Scalar_new (&one_scalar, GrB_FP64)) ;
     GRB_TRY (GrB_Scalar_setElement_FP64 (one_scalar, 1.0)) ;
 
-    // Reusable container for unloading A_cur into raw CSR arrays per level.
+    // Reusable container for unloading A_cur into raw CSR arrays per level
+    // (only available with the SuiteSparse v10+ Container API).
+#if LAGR_LEIDEN_USE_CONTAINER
     GRB_TRY (GxB_Container_new (&cont)) ;
+#endif
 
     //--------------------------------------------------------------------------
     // compute m = total edge weight / 2 from G->A (invariant under aggregation)
@@ -288,6 +321,7 @@ int LAGraph_Leiden
         GRB_TRY (GrB_Matrix_reduce_Monoid (k_vec, NULL, GrB_PLUS_FP64,
             GrB_PLUS_MONOID_FP64, A_cur, NULL)) ;
 
+#if LAGR_LEIDEN_USE_CONTAINER
         // Unload k_vec into k_arr (dense FP64 array of length n_cur).
         // The previous k_arr workspace allocation is replaced by a pointer
         // owned by GraphBLAS until LAGraph_Free reclaims it.
@@ -351,6 +385,75 @@ int LAGraph_Leiden
         Ap = (GrB_Index *) pv ;
         Aj = (GrB_Index *) iv ;
         Ax = (double    *) xv ;
+#else
+        // Fallback for SuiteSparse:GraphBLAS < v10.0.0 (no Container API):
+        // zero-fill k_arr and scatter the k_vec tuples into it (rows absent from
+        // k_vec stay 0); CSR is then built by extractTuples + counting sort.
+        for (GrB_Index i = 0 ; i < n_cur ; i++) k_arr[i] = 0.0 ;
+        {   // Ik/Xk are block-local and unknown to LG_FREE_WORK: free before bailing.
+            GrB_Index nvk = n_cur ;
+            GrB_Index *Ik = NULL ;
+            double    *Xk = NULL ;
+            int st = LAGraph_Malloc ((void **) &Ik, n_cur, sizeof (GrB_Index), msg) ;
+            if (st >= GrB_SUCCESS) st = LAGraph_Malloc ((void **) &Xk, n_cur, sizeof (double), msg) ;
+            if (st >= GrB_SUCCESS) st = GrB_Vector_extractTuples_FP64 (Ik, Xk, &nvk, k_vec) ;
+            for (GrB_Index t = 0 ; st >= GrB_SUCCESS && t < nvk ; t++) k_arr[Ik[t]] = Xk[t] ;
+            LAGraph_Free ((void **) &Ik, NULL) ;
+            LAGraph_Free ((void **) &Xk, NULL) ;
+            LG_TRY (st) ;   // report the first failure only after the scratch is freed
+        }
+
+        GrB_Index Anz ;
+        GRB_TRY (GrB_Matrix_nvals (&Anz, A_cur)) ;
+        if (Ap_cap < n_cur + 1)
+        {
+            LAGraph_Free ((void **) &Ap,     NULL) ;
+            LAGraph_Free ((void **) &cursor, NULL) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Ap,     n_cur + 1,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &cursor, n_cur,
+                sizeof (GrB_Index), msg)) ;
+            Ap_cap = n_cur + 1 ;
+        }
+        if (Anz_cap < Anz)
+        {
+            GrB_Index newcap = (Anz < 16) ? 16 : Anz ;
+            LAGraph_Free ((void **) &Aj,    NULL) ;
+            LAGraph_Free ((void **) &Ax,    NULL) ;
+            LAGraph_Free ((void **) &I_tup, NULL) ;
+            LAGraph_Free ((void **) &J_tup, NULL) ;
+            LAGraph_Free ((void **) &X_tup, NULL) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Aj,    newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &Ax,    newcap,
+                sizeof (double),    msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &I_tup, newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &J_tup, newcap,
+                sizeof (GrB_Index), msg)) ;
+            LG_TRY (LAGraph_Malloc ((void **) &X_tup, newcap,
+                sizeof (double),    msg)) ;
+            Anz_cap = newcap ;
+        }
+
+        memset (Ap, 0, (n_cur + 1) * sizeof (GrB_Index)) ;
+        if (Anz > 0)
+        {
+            GrB_Index nout = Anz ;
+            GRB_TRY (GrB_Matrix_extractTuples_FP64 (I_tup, J_tup, X_tup,
+                &nout, A_cur)) ;
+            for (GrB_Index t = 0 ; t < Anz ; t++) Ap[I_tup[t] + 1]++ ;
+            for (GrB_Index r = 0 ; r < n_cur ; r++) Ap[r + 1] += Ap[r] ;
+            memcpy (cursor, Ap, n_cur * sizeof (GrB_Index)) ;
+            for (GrB_Index t = 0 ; t < Anz ; t++)
+            {
+                GrB_Index r = I_tup[t] ;
+                GrB_Index dst = cursor[r]++ ;
+                Aj[dst] = J_tup[t] ;
+                Ax[dst] = X_tup[t] ;
+            }
+        }
+#endif
 
         //----------------------------------------------------------------------
         // PHASE 1: Local Move Phase
@@ -569,8 +672,10 @@ int LAGraph_Leiden
         // Reload A_cur from the container before any further GraphBLAS use
         // (Phase 3 mxm or next-level unload).  Ownership of Ap/Aj/Ax returns
         // to GraphBLAS; we null our pointers so LG_FREE_WORK won't double-free.
+        // No-op on the v9 fallback (A_cur was never unloaded).
         //----------------------------------------------------------------------
 
+#if LAGR_LEIDEN_USE_CONTAINER
         GRB_TRY (GxB_Vector_load (cont->p, (void **) &Ap, pty,
             pn, psz, ph, NULL)) ;
         Ap = NULL ;
@@ -581,6 +686,7 @@ int LAGraph_Leiden
             xn, xsz, xh, NULL)) ;
         Ax = NULL ;
         GRB_TRY (GxB_load_Matrix_from_Container (A_cur, cont, NULL)) ;
+#endif
 
         //----------------------------------------------------------------------
         // PHASE 3: Aggregation — build coarsened graph if communities merged
@@ -632,10 +738,15 @@ int LAGraph_Leiden
     //--------------------------------------------------------------------------
 
     GRB_TRY (GrB_Vector_new (c_handle, GrB_INT64, n)) ;
+#if LAGR_LEIDEN_USE_CONTAINER
     GRB_TRY (GrB_set (*c_handle, GxB_FULL, GxB_SPARSITY_CONTROL)) ;
     GRB_TRY (GxB_Vector_load (*c_handle, (void **) &o_comm, GrB_INT64,
         n, n * sizeof (int64_t), GrB_DEFAULT, NULL)) ;
     o_comm = NULL ;     // ownership transferred to *c_handle
+#else
+    GRB_TRY (GrB_Vector_build_INT64 (*c_handle, iota, o_comm, n,
+        GrB_FIRST_INT64)) ;
+#endif
 
     LG_FREE_WORK ;
     return (GrB_SUCCESS) ;