|
| 1 | +//------------------------------------------------------------------------------ |
| 2 | +// LAGraph_Jaccard - parallel jaccard similarity |
| 3 | +//------------------------------------------------------------------------------ |
| 4 | + |
| 5 | +// LAGraph, (c) 2019-2025 by The LAGraph Contributors, All Rights Reserved. |
| 6 | +// SPDX-License-Identifier: BSD-2-Clause |
| 7 | +// |
| 8 | +// For additional details (including references to third party source code and |
| 9 | +// other files) see the LICENSE file or contact permission@sei.cmu.edu. See |
| 10 | +// Contributors.txt for a full list of contributors. Created, in part, with |
| 11 | +// funding and support from the U.S. Government (see Acknowledgments.txt file). |
| 12 | +// DM22-0790 |
| 13 | + |
| 14 | +// Contributed by Elaheh Hassani and Tim Davis, Texas A&M University |
| 15 | + |
| 16 | +//------------------------------------------------------------------------------ |
| 17 | +// LAGr_Jaccard: compute Jaccard similarity (weight) coefficients for an undirected graph. |
| 18 | +// |
| 19 | +// Inputs: |
| 20 | +// G - a valid LAGraph_Graph with: |
| 21 | +// * G->A structurally symmetric (undirected) |
| 22 | +// * no self-edges |
| 23 | +// * G->out_degree cached (no explicit zeros) |
| 24 | +// all_pairs - if true, compute Jaccard for all pairs of nodes in the graph; |
| 25 | +// if false, compute only for neighboring nodes (current edges). |
| 26 | +// |
| 27 | +// Output: |
| 28 | +// JC - an n-by-n matrix of Jaccard coefficients. JC is always |
| 29 | +// returned as an upper-triangular matrix. When all_pairs is |
| 30 | +// true, the diagonal entries are equal to 1. When all_pairs is |
| 31 | +// false, the diagonal is structurally empty. |
| 32 | +// |
| 33 | +// Jaccard similarity measures the overlap of neighbor sets. |
| 34 | +// For a pair of nodes i and j in the graph G: |
| 35 | +// JC(i,j) = |N(i) ∩ N(j)| / |N(i) ∪ N(j)| |
| 36 | +// where N(i) is the set of neighbors of i in G. |
| 37 | +// |
| 38 | +// The computation proceeds in two stages. First, an “intersection matrix” B |
| 39 | +// is formed where B(i,j) = |N(i) ∩ N(j)|. |
| 40 | +// Second, a degree-based denominator matrix R is formed where |
| 41 | +// R(i,j) = deg(i) + deg(j) − B(i,j). |
| 42 | +// Lastly, the output matrix JC is computed as the element-wise ratio |
| 43 | +// JC = B ./ R. |
| 44 | +// |
| 45 | + |
| 46 | +//------------------------------------------------------------------------------ |
| 47 | +// References: |
| 48 | +// (1) "Parallel Algorithms for Computing Jaccard Weights on Graphs using Linear Algebra," |
| 49 | +// in Proc. IEEE High Performance Extreme Computing Conference (HPEC), 2023. |
| 50 | +// https://doi.org/10.1109/HPEC58863.2023.10363558 |
| 51 | +// |
| 52 | +// (2) https://en.wikipedia.org/wiki/Jaccard_index |
| 53 | + |
| 54 | + |
| 55 | +#define LG_FREE_WORK \ |
| 56 | +{ \ |
| 57 | + GrB_free(&R); \ |
| 58 | + GrB_free(&B); \ |
| 59 | + GrB_free(&D); \ |
| 60 | + GrB_free (&M) ; \ |
| 61 | +} |
| 62 | + |
| 63 | +#define LG_FREE_ALL \ |
| 64 | +{ \ |
| 65 | + LG_FREE_WORK ; \ |
| 66 | +} |
| 67 | + |
| 68 | +#include "LG_internal.h" |
| 69 | + |
| 70 | +int LAGr_Jaccard |
| 71 | +( |
| 72 | + // output |
| 73 | + GrB_Matrix *JC, |
| 74 | + // input: |
| 75 | + LAGraph_Graph G, |
| 76 | + bool all_pairs, |
| 77 | + char *msg |
| 78 | + ) |
| 79 | +{ |
| 80 | + GrB_Matrix B = NULL, R = NULL, D = NULL, M = NULL ; |
| 81 | + GrB_Index n; |
| 82 | + |
| 83 | + //-------------------------------------------------------------------------- |
| 84 | + // check inputs |
| 85 | + //-------------------------------------------------------------------------- |
| 86 | + LG_CLEAR_MSG ; |
| 87 | + // error if G is directed (OK if G is directed but G->A is symmetric in structure |
| 88 | + // error if G has self edges, or unknown |
| 89 | + LG_ASSERT (JC != NULL, GrB_NULL_POINTER) ; |
| 90 | + (*JC) = NULL ; |
| 91 | + LG_TRY (LAGraph_CheckGraph (G, msg)) ; |
| 92 | + LG_ASSERT_MSG (G->nself_edges == 0, LAGRAPH_NO_SELF_EDGES_ALLOWED, "G->nself_edges must be zero") ; |
| 93 | + LG_ASSERT_MSG ((G->kind == LAGraph_ADJACENCY_UNDIRECTED || (G->kind == LAGraph_ADJACENCY_DIRECTED && |
| 94 | + G->is_symmetric_structure == LAGraph_TRUE)), |
| 95 | + LAGRAPH_SYMMETRIC_STRUCTURE_REQUIRED, |
| 96 | + "G->A must be known to be symmetric") ; |
| 97 | + //-------------------------------------------------------------------------- |
| 98 | + // degree vector deg |
| 99 | + //-------------------------------------------------------------------------- |
| 100 | + GrB_Vector deg = G->out_degree ; |
| 101 | + LG_ASSERT_MSG (deg != NULL, LAGRAPH_NOT_CACHED, "G->out_degree is required") ; |
| 102 | + GrB_Matrix A = G->A ; |
| 103 | + GRB_TRY( GrB_Matrix_nrows (&n, A) ); |
| 104 | + GrB_Type int_type = (n > INT32_MAX) ? GrB_INT64 : GrB_INT32 ; |
| 105 | + |
| 106 | + //-------------------------------------------------------------------------- |
| 107 | + // B is intersection matrix |
| 108 | + //-------------------------------------------------------------------------- |
| 109 | + |
| 110 | + // B(i,j) is the size of the intersection of the pattern of A(i,:) and |
| 111 | + // A(:,j). If all_pairs is true, B is computed for all entries in A^2. |
| 112 | + // Otherwise, it is computed just for entries in triu(A). |
| 113 | + |
| 114 | + // The final output matrix (B) is always upper triangular, in both cases. |
| 115 | + |
| 116 | + GRB_TRY(GrB_Matrix_new(&B, GrB_FP64, n, n)); |
| 117 | + |
| 118 | + if (all_pairs) |
| 119 | + { |
| 120 | + // B = triu (A*A) |
| 121 | + GRB_TRY(GrB_mxm(B, NULL, NULL, LAGraph_plus_one_fp64, A, A, NULL)); |
| 122 | + GRB_TRY( GrB_select(B, NULL, NULL, GrB_TRIU, B, (int64_t)0, NULL)); |
| 123 | + } |
| 124 | + else |
| 125 | + { |
| 126 | + // B<triu(A)> = A*A' |
| 127 | + GRB_TRY(GrB_Matrix_new(&M, GrB_BOOL, n, n)); |
| 128 | + GRB_TRY( GrB_select(M, NULL, NULL, GrB_TRIU, A, (int64_t)0, NULL)); |
| 129 | + GRB_TRY(GrB_mxm(B, M, NULL, LAGraph_plus_one_fp64, A, A, GrB_DESC_ST1)); |
| 130 | + GrB_free (&M) ; |
| 131 | + } |
| 132 | + |
| 133 | + //-------------------------------------------------------------------------- |
| 134 | + // R has summation of degree of corresponding nodes |
| 135 | + // B is jaccard index B <- B / (R-B) |
| 136 | + //-------------------------------------------------------------------------- |
| 137 | + |
| 138 | + // If deg vectors is sparse, make it dense |
| 139 | + GrB_Index deg_nnz = 0; |
| 140 | + GrB_Vector d = NULL; |
| 141 | + GRB_TRY(GrB_Vector_nvals(°_nnz, deg)); |
| 142 | + if (deg_nnz < n ) { |
| 143 | + GrB_Vector t = NULL; |
| 144 | + GRB_TRY (GrB_Vector_dup(&t, deg)); |
| 145 | + GRB_TRY (GrB_assign (t, t, NULL, 0, GrB_ALL, n, GrB_DESC_SC)) ; |
| 146 | + GRB_TRY (GrB_Vector_dup(&d, t)); |
| 147 | + } |
| 148 | + else { |
| 149 | + GRB_TRY (GrB_Vector_dup(&d, deg)); |
| 150 | + } |
| 151 | + |
| 152 | + // D is degree matrix |
| 153 | + GRB_TRY(GrB_Matrix_diag(&D, d, 0)); // use d here |
| 154 | + GrB_Matrix_new(&R, int_type, n, n); |
| 155 | + |
| 156 | + // R = B*D -> R_ij = deg_j - b_ij |
| 157 | + GRB_TRY(GrB_mxm(R, NULL, NULL, (int_type == GrB_INT64) ? GxB_PLUS_RMINUS_INT64 : GxB_PLUS_RMINUS_INT32, B, D, GrB_DESC_S)); |
| 158 | + // R = D*R -> R_ij = deg_i + r_ij |
| 159 | + GRB_TRY(GrB_mxm(R, NULL, NULL, (int_type == GrB_INT64) ? GxB_PLUS_PLUS_INT64 : GxB_PLUS_PLUS_INT32, D, R, GrB_DESC_S)); |
| 160 | + |
| 161 | + GRB_TRY( GrB_eWiseMult(B, NULL, NULL, GrB_DIV_FP64, B, R, NULL) ); |
| 162 | + (*JC) = B; |
| 163 | + B = NULL; |
| 164 | + LG_FREE_WORK; |
| 165 | + |
| 166 | + return (GrB_SUCCESS) ; |
| 167 | +} |
0 commit comments