griptomo.core.gpu_extract_features
1from __future__ import annotations 2 3import time 4from typing import TYPE_CHECKING 5 6import igraph as ig 7 8if TYPE_CHECKING: # pragma: no cover - hints only evaluated by type-checkers. 9 import cudf # type: ignore 10 import cugraph # type: ignore 11 12try: # pragma: no cover - GPU-only dependencies may be absent in CPU environments. 13 import cugraph as cg # type: ignore 14 import cudf # type: ignore 15 import cupy as cp # type: ignore 16except ImportError as exc: # pragma: no cover - exercised when RAPIDS is unavailable. 17 cg = None # type: ignore 18 cudf = None # type: ignore 19 cp = None # type: ignore 20 _GPU_IMPORT_ERROR: Exception | None = exc 21else: 22 _GPU_IMPORT_ERROR = None 23 24 25def cugraph_calc_graph_features( 26 G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False 27): 28 """ 29 Calculate various graph features for a cugraph.Graph object. 30 31 Parameters 32 ---------- 33 G_ig : igraph.Graph 34 The input graph for which features are to be calculated. 35 G_cg : cugraph.Graph 36 The input graph for which features are to be calculated. 37 skip_clique_num : bool, optional 38 If True, the calculation of the largest clique number will be skipped. Default is False. 39 40 Returns 41 ------- 42 dict 43 A dictionary containing the calculated graph features: 44 - "total n nodes": Total number of nodes in the graph. 45 - "total n edges": Total number of edges in the graph. 46 - "n nodes": Number of nodes in the largest connected component. 47 - "n edges": Number of edges in the largest connected component. 48 - "density": Density of the graph. 49 - "diameter": Diameter of the graph. 50 - "avg clustering": Average clustering coefficient of the graph. 51 - "max closeness centrality": Maximum closeness centrality in the graph. 52 - "max eigenvector centrality": Maximum eigenvector centrality in the graph. 53 - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]). 54 - "max clique number": Size of the largest clique in the graph. 55 - "n communities": Number of communities detected using the louvian algorithm. 56 - "avg path length": Average path length in the graph. 57 - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs. 58 59 Raises 60 ------ 61 ValueError 62 If the input is not a cugraph.Graph object. 63 64 Notes 65 ----- 66 - The graph is assumed to be undirected and unweighted. 67 - If the graph is not connected, the largest connected component is used for feature calculation. 68 - The diameter is calculated using a 2-step BFS. 69 - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU. 70 - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph. 71 """ 72 if cg is None or cudf is None or cp is None: 73 raise RuntimeError( 74 "cugraph, cudf, and cupy must be installed to use GPU feature extractors" 75 ) from _GPU_IMPORT_ERROR 76 77 if not isinstance(G_cg, cg.Graph): 78 raise ValueError("Input must be a cugraph.Graph object") 79 80 if not isinstance(G_ig, ig.Graph): 81 raise ValueError("Input must be an igraph.Graph object") 82 83 G_feat = {} 84 G_feat["total n nodes"] = G_cg.number_of_nodes() 85 print("total n nodes", G_feat["total n nodes"]) 86 G_feat["total n edges"] = G_cg.number_of_edges() 87 print("total n edges", G_feat["total n edges"]) 88 89 connected_components_df = cg.weakly_connected_components(G_cg) 90 if connected_components_df["labels"].nunique() > 1: 91 largest_cc = connected_components_df["labels"].value_counts().idxmax() 92 G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc) 93 G_ig = G_ig.clusters().giant() 94 95 G_feat["n nodes"] = float(G_cg.number_of_nodes()) 96 print("n nodes", G_feat["n nodes"]) 97 G_feat["n edges"] = float(G_cg.number_of_edges()) 98 print("n edges", G_feat["n edges"]) 99 G_feat["density"] = ( 100 2 101 * G_cg.number_of_edges() 102 / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)) 103 ) 104 print("density", G_feat["density"]) 105 106 # Perform 2-step BFS to calculate diameter 107 # Assumed graph is unweighted and undirected 108 first_vertex_id = G_cg.nodes().iloc[0] 109 print("first_vertex_id", first_vertex_id) 110 bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False) 111 furthest_node = bfs_result["vertex"].iloc[ 112 cp.argmax(cp.array(bfs_result["distance"])) 113 ] 114 bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False) 115 G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max()) 116 117 # Avg clustering coefficient 118 edges_cudf = G_cg.edges() 119 src_indices = cp.array(edges_cudf["source"]) 120 dst_indices = cp.array(edges_cudf["destination"]) 121 # Don't count selfloops 122 not_selfloops = src_indices != dst_indices 123 src_indices = src_indices[not_selfloops] 124 dst_indices = dst_indices[not_selfloops] 125 if src_indices.size == 0: 126 G_feat["avg clustering"] = 0 127 else: 128 degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes()) 129 degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes()) 130 triangles = cg.triangle_count(G_cg) 131 degrees = degrees[triangles["vertex"]] 132 counts = cp.array(triangles["counts"]) 133 denominators = degrees * (degrees - 1) 134 clustering = 2 * counts / denominators 135 clustering = cp.where(denominators, clustering, 0) 136 assert clustering.ndim == 1 137 G_feat["avg clustering"] = float(clustering.mean()) 138 139 # Max closeness centrality 140 max_centrality = 0 141 avg_shortest_path = 0 142 for vertex_id in G_cg.nodes().values_host: 143 shortest_paths = cg.bfs(G_cg, vertex_id)["distance"] 144 shortest_path_sum = cp.sum(shortest_paths) 145 avg_shortest_path += shortest_path_sum / ( 146 G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1) 147 ) 148 closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum 149 if closeness > max_centrality: 150 max_centrality = closeness 151 G_feat["max closeness centrality"] = max_centrality 152 G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[ 153 "eigenvector_centrality" 154 ].max() 155 156 # Degree assortativity 157 edges_cudf = G_cg.edges() 158 # print(edges_cudf) 159 degrees = G_cg.degree() 160 degrees_cudf = cudf.merge( 161 edges_cudf, degrees, left_on="source", right_on="vertex", how="left" 162 ) 163 degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True) 164 degrees_cudf = cudf.merge( 165 degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left" 166 ) 167 degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True) 168 print(edges_cudf) 169 print(degrees_cudf) 170 degree_assortativity = cp.corrcoef( 171 degrees_cudf["degree_source"], degrees_cudf["degree_destination"] 172 )[0, 1] 173 G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2) 174 175 if not skip_clique_num: 176 raise NotImplementedError( 177 "Calculating the largest clique number is not supported on GPU" 178 ) 179 # G_feat["max clique number"] = G_ig.clique_number() 180 181 G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique() 182 183 G_feat["avg path length"] = avg_shortest_path 184 G_feat["max betweenness centrality"] = round( 185 cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15 186 ) 187 188 return G_feat 189 190 191def cugraph_calc_graph_features_timed( 192 G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False 193): 194 """ 195 Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object. 196 197 Parameters 198 ---------- 199 G_ig : igraph.Graph 200 The input graph for which features are to be calculated. 201 G_cg : cugraph.Graph 202 The input graph for which features are to be calculated. 203 skip_clique_num : bool, optional 204 If True, the calculation of the largest clique number will be skipped. Default is False. 205 206 Returns 207 ------- 208 tuple 209 A tuple containing two dictionaries: 210 - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values. 211 Features include: 212 - "total n nodes": Total number of nodes in the graph. 213 - "total n edges": Total number of edges in the graph. 214 - "n nodes": Number of nodes in the largest connected component. 215 - "n edges": Number of edges in the largest connected component. 216 - "density": Density of the graph. 217 - "diameter": Diameter of the graph. 218 - "avg clustering": Average clustering coefficient of the graph. 219 - "max closeness centrality": Maximum closeness centrality in the graph. 220 - "max eigenvector centrality": Maximum eigenvector centrality in the graph. 221 - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]). 222 - "max clique number": Size of the largest clique in the graph. 223 - "n communities": Number of communities detected using the louvian algorithm. 224 - "avg path length": Average path length in the graph. 225 - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs. 226 227 - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature. 228 229 Raises 230 ------ 231 ValueError 232 If the input is not a cugraph.Graph object. 233 234 Notes 235 ----- 236 - The graph is assumed to be undirected and unweighted. 237 - If the graph is not connected, the largest connected component is used for feature calculation. 238 - The diameter is calculated using a 2-step BFS. 239 - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU. 240 - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph. 241 """ 242 if cg is None or cudf is None or cp is None: 243 raise RuntimeError( 244 "cugraph, cudf, and cupy must be installed to use GPU feature extractors" 245 ) from _GPU_IMPORT_ERROR 246 247 if not isinstance(G_cg, cg.Graph): 248 raise ValueError("Input must be a cugraph.Graph object") 249 250 if not isinstance(G_ig, ig.Graph): 251 raise ValueError("Input must be an igraph.Graph object") 252 253 G_feat = {} 254 G_time = {} 255 start_time = time.perf_counter() 256 G_feat["total n nodes"] = G_cg.number_of_nodes() 257 G_time["total n nodes"] = time.perf_counter() - start_time 258 259 start_time = time.perf_counter() 260 G_feat["total n edges"] = G_cg.number_of_edges() 261 G_time["total n edges"] = time.perf_counter() - start_time 262 263 connected_components_df = cg.weakly_connected_components(G_cg) 264 if connected_components_df["labels"].nunique() > 1: 265 start_time = time.perf_counter() 266 largest_cc = connected_components_df["labels"].value_counts().idxmax() 267 G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc) 268 G_time["connected component"] = time.perf_counter() - start_time 269 G_ig = G_ig.clusters().giant() 270 else: 271 G_time["connected component"] = 0 272 273 start_time = time.perf_counter() 274 G_feat["n nodes"] = float(G_cg.number_of_nodes()) 275 G_time["n nodes"] = time.perf_counter() - start_time 276 277 start_time = time.perf_counter() 278 G_feat["n edges"] = float(G_cg.number_of_edges()) 279 G_time["n edges"] = time.perf_counter() - start_time 280 281 start_time = time.perf_counter() 282 G_feat["density"] = ( 283 2 284 * G_cg.number_of_edges() 285 / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)) 286 ) 287 G_time["density"] = time.perf_counter() - start_time 288 289 start_time = time.perf_counter() 290 first_vertex_id = G_cg.nodes().iloc[0] 291 bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False) 292 furthest_node = bfs_result["vertex"].iloc[ 293 cp.argmax(cp.array(bfs_result["distance"])) 294 ] 295 bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False) 296 G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max()) 297 G_time["diameter"] = time.perf_counter() - start_time 298 299 start_time = time.perf_counter() 300 edges_cudf = G_cg.edges() 301 src_indices = cp.array(edges_cudf["source"]) 302 dst_indices = cp.array(edges_cudf["destination"]) 303 # Don't count selfloops 304 not_selfloops = src_indices != dst_indices 305 src_indices = src_indices[not_selfloops] 306 dst_indices = dst_indices[not_selfloops] 307 if src_indices.size == 0: 308 G_feat["avg clustering"] = 0 309 else: 310 degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes()) 311 degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes()) 312 triangles = cg.triangle_count(G_cg) 313 degrees = degrees[triangles["vertex"]] 314 counts = cp.array(triangles["counts"]) 315 denominators = degrees * (degrees - 1) 316 clustering = 2 * counts / denominators 317 clustering = cp.where(denominators, clustering, 0) 318 assert clustering.ndim == 1 319 G_feat["avg clustering"] = float(clustering.mean()) 320 G_time["avg clustering"] = time.perf_counter() - start_time 321 322 start_time = time.perf_counter() 323 max_centrality = 0 324 avg_shortest_path = 0 325 for vertex_id in G_cg.nodes().values_host: 326 shortest_paths = cg.bfs(G_cg, vertex_id)["distance"] 327 shortest_path_sum = cp.sum(shortest_paths) 328 avg_shortest_path += shortest_path_sum / ( 329 G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1) 330 ) 331 closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum 332 if closeness > max_centrality: 333 max_centrality = closeness 334 G_feat["max closeness centrality"] = max_centrality 335 G_time["max closeness centrality"] = time.perf_counter() - start_time 336 337 start_time = time.perf_counter() 338 G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[ 339 "eigenvector_centrality" 340 ].max() 341 G_time["max eigenvector centrality"] = time.perf_counter() - start_time 342 343 start_time = time.perf_counter() 344 edges_cudf = G_cg.edges() 345 degrees = G_cg.degree() 346 degrees_cudf = cudf.merge( 347 edges_cudf, degrees, left_on="source", right_on="vertex", how="left" 348 ) 349 degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True) 350 degrees_cudf = cudf.merge( 351 degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left" 352 ) 353 degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True) 354 degree_assortativity = cp.corrcoef( 355 degrees_cudf["degree_source"], degrees_cudf["degree_destination"] 356 )[0, 1] 357 G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2) 358 G_time["degree assortativity"] = time.perf_counter() - start_time 359 360 if not skip_clique_num: 361 raise NotImplementedError( 362 "Calculating the largest clique number is not supported on GPU" 363 ) 364 # G_feat["max clique number"] = G_ig.clique_number() 365 # G_time["max clique number"] = time.perf_counter() - start_time 366 367 start_time = time.perf_counter() 368 G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique() 369 G_time["n communities"] = time.perf_counter() - start_time 370 371 start_time = time.perf_counter() 372 G_feat["avg path length"] = avg_shortest_path 373 G_time["avg path length"] = time.perf_counter() - start_time 374 375 start_time = time.perf_counter() 376 G_feat["max betweenness centrality"] = round( 377 cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15 378 ) 379 G_time["max betweenness centrality"] = time.perf_counter() - start_time 380 381 return G_feat, G_time
def
cugraph_calc_graph_features( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False):
26def cugraph_calc_graph_features( 27 G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False 28): 29 """ 30 Calculate various graph features for a cugraph.Graph object. 31 32 Parameters 33 ---------- 34 G_ig : igraph.Graph 35 The input graph for which features are to be calculated. 36 G_cg : cugraph.Graph 37 The input graph for which features are to be calculated. 38 skip_clique_num : bool, optional 39 If True, the calculation of the largest clique number will be skipped. Default is False. 40 41 Returns 42 ------- 43 dict 44 A dictionary containing the calculated graph features: 45 - "total n nodes": Total number of nodes in the graph. 46 - "total n edges": Total number of edges in the graph. 47 - "n nodes": Number of nodes in the largest connected component. 48 - "n edges": Number of edges in the largest connected component. 49 - "density": Density of the graph. 50 - "diameter": Diameter of the graph. 51 - "avg clustering": Average clustering coefficient of the graph. 52 - "max closeness centrality": Maximum closeness centrality in the graph. 53 - "max eigenvector centrality": Maximum eigenvector centrality in the graph. 54 - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]). 55 - "max clique number": Size of the largest clique in the graph. 56 - "n communities": Number of communities detected using the louvian algorithm. 57 - "avg path length": Average path length in the graph. 58 - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs. 59 60 Raises 61 ------ 62 ValueError 63 If the input is not a cugraph.Graph object. 64 65 Notes 66 ----- 67 - The graph is assumed to be undirected and unweighted. 68 - If the graph is not connected, the largest connected component is used for feature calculation. 69 - The diameter is calculated using a 2-step BFS. 70 - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU. 71 - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph. 72 """ 73 if cg is None or cudf is None or cp is None: 74 raise RuntimeError( 75 "cugraph, cudf, and cupy must be installed to use GPU feature extractors" 76 ) from _GPU_IMPORT_ERROR 77 78 if not isinstance(G_cg, cg.Graph): 79 raise ValueError("Input must be a cugraph.Graph object") 80 81 if not isinstance(G_ig, ig.Graph): 82 raise ValueError("Input must be an igraph.Graph object") 83 84 G_feat = {} 85 G_feat["total n nodes"] = G_cg.number_of_nodes() 86 print("total n nodes", G_feat["total n nodes"]) 87 G_feat["total n edges"] = G_cg.number_of_edges() 88 print("total n edges", G_feat["total n edges"]) 89 90 connected_components_df = cg.weakly_connected_components(G_cg) 91 if connected_components_df["labels"].nunique() > 1: 92 largest_cc = connected_components_df["labels"].value_counts().idxmax() 93 G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc) 94 G_ig = G_ig.clusters().giant() 95 96 G_feat["n nodes"] = float(G_cg.number_of_nodes()) 97 print("n nodes", G_feat["n nodes"]) 98 G_feat["n edges"] = float(G_cg.number_of_edges()) 99 print("n edges", G_feat["n edges"]) 100 G_feat["density"] = ( 101 2 102 * G_cg.number_of_edges() 103 / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)) 104 ) 105 print("density", G_feat["density"]) 106 107 # Perform 2-step BFS to calculate diameter 108 # Assumed graph is unweighted and undirected 109 first_vertex_id = G_cg.nodes().iloc[0] 110 print("first_vertex_id", first_vertex_id) 111 bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False) 112 furthest_node = bfs_result["vertex"].iloc[ 113 cp.argmax(cp.array(bfs_result["distance"])) 114 ] 115 bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False) 116 G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max()) 117 118 # Avg clustering coefficient 119 edges_cudf = G_cg.edges() 120 src_indices = cp.array(edges_cudf["source"]) 121 dst_indices = cp.array(edges_cudf["destination"]) 122 # Don't count selfloops 123 not_selfloops = src_indices != dst_indices 124 src_indices = src_indices[not_selfloops] 125 dst_indices = dst_indices[not_selfloops] 126 if src_indices.size == 0: 127 G_feat["avg clustering"] = 0 128 else: 129 degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes()) 130 degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes()) 131 triangles = cg.triangle_count(G_cg) 132 degrees = degrees[triangles["vertex"]] 133 counts = cp.array(triangles["counts"]) 134 denominators = degrees * (degrees - 1) 135 clustering = 2 * counts / denominators 136 clustering = cp.where(denominators, clustering, 0) 137 assert clustering.ndim == 1 138 G_feat["avg clustering"] = float(clustering.mean()) 139 140 # Max closeness centrality 141 max_centrality = 0 142 avg_shortest_path = 0 143 for vertex_id in G_cg.nodes().values_host: 144 shortest_paths = cg.bfs(G_cg, vertex_id)["distance"] 145 shortest_path_sum = cp.sum(shortest_paths) 146 avg_shortest_path += shortest_path_sum / ( 147 G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1) 148 ) 149 closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum 150 if closeness > max_centrality: 151 max_centrality = closeness 152 G_feat["max closeness centrality"] = max_centrality 153 G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[ 154 "eigenvector_centrality" 155 ].max() 156 157 # Degree assortativity 158 edges_cudf = G_cg.edges() 159 # print(edges_cudf) 160 degrees = G_cg.degree() 161 degrees_cudf = cudf.merge( 162 edges_cudf, degrees, left_on="source", right_on="vertex", how="left" 163 ) 164 degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True) 165 degrees_cudf = cudf.merge( 166 degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left" 167 ) 168 degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True) 169 print(edges_cudf) 170 print(degrees_cudf) 171 degree_assortativity = cp.corrcoef( 172 degrees_cudf["degree_source"], degrees_cudf["degree_destination"] 173 )[0, 1] 174 G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2) 175 176 if not skip_clique_num: 177 raise NotImplementedError( 178 "Calculating the largest clique number is not supported on GPU" 179 ) 180 # G_feat["max clique number"] = G_ig.clique_number() 181 182 G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique() 183 184 G_feat["avg path length"] = avg_shortest_path 185 G_feat["max betweenness centrality"] = round( 186 cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15 187 ) 188 189 return G_feat
Calculate various graph features for a cugraph.Graph object.
Parameters
- G_ig (igraph.Graph): The input graph for which features are to be calculated.
- G_cg (cugraph.Graph): The input graph for which features are to be calculated.
- skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.
Returns
- dict: A dictionary containing the calculated graph features:
- "total n nodes": Total number of nodes in the graph.
- "total n edges": Total number of edges in the graph.
- "n nodes": Number of nodes in the largest connected component.
- "n edges": Number of edges in the largest connected component.
- "density": Density of the graph.
- "diameter": Diameter of the graph.
- "avg clustering": Average clustering coefficient of the graph.
- "max closeness centrality": Maximum closeness centrality in the graph.
- "max eigenvector centrality": Maximum eigenvector centrality in the graph.
- "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
- "max clique number": Size of the largest clique in the graph.
- "n communities": Number of communities detected using the louvian algorithm.
- "avg path length": Average path length in the graph.
- "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
Raises
- ValueError: If the input is not a cugraph.Graph object.
Notes
- The graph is assumed to be undirected and unweighted.
- If the graph is not connected, the largest connected component is used for feature calculation.
- The diameter is calculated using a 2-step BFS.
- GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
- n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
def
cugraph_calc_graph_features_timed( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False):
192def cugraph_calc_graph_features_timed( 193 G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False 194): 195 """ 196 Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object. 197 198 Parameters 199 ---------- 200 G_ig : igraph.Graph 201 The input graph for which features are to be calculated. 202 G_cg : cugraph.Graph 203 The input graph for which features are to be calculated. 204 skip_clique_num : bool, optional 205 If True, the calculation of the largest clique number will be skipped. Default is False. 206 207 Returns 208 ------- 209 tuple 210 A tuple containing two dictionaries: 211 - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values. 212 Features include: 213 - "total n nodes": Total number of nodes in the graph. 214 - "total n edges": Total number of edges in the graph. 215 - "n nodes": Number of nodes in the largest connected component. 216 - "n edges": Number of edges in the largest connected component. 217 - "density": Density of the graph. 218 - "diameter": Diameter of the graph. 219 - "avg clustering": Average clustering coefficient of the graph. 220 - "max closeness centrality": Maximum closeness centrality in the graph. 221 - "max eigenvector centrality": Maximum eigenvector centrality in the graph. 222 - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]). 223 - "max clique number": Size of the largest clique in the graph. 224 - "n communities": Number of communities detected using the louvian algorithm. 225 - "avg path length": Average path length in the graph. 226 - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs. 227 228 - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature. 229 230 Raises 231 ------ 232 ValueError 233 If the input is not a cugraph.Graph object. 234 235 Notes 236 ----- 237 - The graph is assumed to be undirected and unweighted. 238 - If the graph is not connected, the largest connected component is used for feature calculation. 239 - The diameter is calculated using a 2-step BFS. 240 - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU. 241 - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph. 242 """ 243 if cg is None or cudf is None or cp is None: 244 raise RuntimeError( 245 "cugraph, cudf, and cupy must be installed to use GPU feature extractors" 246 ) from _GPU_IMPORT_ERROR 247 248 if not isinstance(G_cg, cg.Graph): 249 raise ValueError("Input must be a cugraph.Graph object") 250 251 if not isinstance(G_ig, ig.Graph): 252 raise ValueError("Input must be an igraph.Graph object") 253 254 G_feat = {} 255 G_time = {} 256 start_time = time.perf_counter() 257 G_feat["total n nodes"] = G_cg.number_of_nodes() 258 G_time["total n nodes"] = time.perf_counter() - start_time 259 260 start_time = time.perf_counter() 261 G_feat["total n edges"] = G_cg.number_of_edges() 262 G_time["total n edges"] = time.perf_counter() - start_time 263 264 connected_components_df = cg.weakly_connected_components(G_cg) 265 if connected_components_df["labels"].nunique() > 1: 266 start_time = time.perf_counter() 267 largest_cc = connected_components_df["labels"].value_counts().idxmax() 268 G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc) 269 G_time["connected component"] = time.perf_counter() - start_time 270 G_ig = G_ig.clusters().giant() 271 else: 272 G_time["connected component"] = 0 273 274 start_time = time.perf_counter() 275 G_feat["n nodes"] = float(G_cg.number_of_nodes()) 276 G_time["n nodes"] = time.perf_counter() - start_time 277 278 start_time = time.perf_counter() 279 G_feat["n edges"] = float(G_cg.number_of_edges()) 280 G_time["n edges"] = time.perf_counter() - start_time 281 282 start_time = time.perf_counter() 283 G_feat["density"] = ( 284 2 285 * G_cg.number_of_edges() 286 / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)) 287 ) 288 G_time["density"] = time.perf_counter() - start_time 289 290 start_time = time.perf_counter() 291 first_vertex_id = G_cg.nodes().iloc[0] 292 bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False) 293 furthest_node = bfs_result["vertex"].iloc[ 294 cp.argmax(cp.array(bfs_result["distance"])) 295 ] 296 bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False) 297 G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max()) 298 G_time["diameter"] = time.perf_counter() - start_time 299 300 start_time = time.perf_counter() 301 edges_cudf = G_cg.edges() 302 src_indices = cp.array(edges_cudf["source"]) 303 dst_indices = cp.array(edges_cudf["destination"]) 304 # Don't count selfloops 305 not_selfloops = src_indices != dst_indices 306 src_indices = src_indices[not_selfloops] 307 dst_indices = dst_indices[not_selfloops] 308 if src_indices.size == 0: 309 G_feat["avg clustering"] = 0 310 else: 311 degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes()) 312 degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes()) 313 triangles = cg.triangle_count(G_cg) 314 degrees = degrees[triangles["vertex"]] 315 counts = cp.array(triangles["counts"]) 316 denominators = degrees * (degrees - 1) 317 clustering = 2 * counts / denominators 318 clustering = cp.where(denominators, clustering, 0) 319 assert clustering.ndim == 1 320 G_feat["avg clustering"] = float(clustering.mean()) 321 G_time["avg clustering"] = time.perf_counter() - start_time 322 323 start_time = time.perf_counter() 324 max_centrality = 0 325 avg_shortest_path = 0 326 for vertex_id in G_cg.nodes().values_host: 327 shortest_paths = cg.bfs(G_cg, vertex_id)["distance"] 328 shortest_path_sum = cp.sum(shortest_paths) 329 avg_shortest_path += shortest_path_sum / ( 330 G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1) 331 ) 332 closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum 333 if closeness > max_centrality: 334 max_centrality = closeness 335 G_feat["max closeness centrality"] = max_centrality 336 G_time["max closeness centrality"] = time.perf_counter() - start_time 337 338 start_time = time.perf_counter() 339 G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[ 340 "eigenvector_centrality" 341 ].max() 342 G_time["max eigenvector centrality"] = time.perf_counter() - start_time 343 344 start_time = time.perf_counter() 345 edges_cudf = G_cg.edges() 346 degrees = G_cg.degree() 347 degrees_cudf = cudf.merge( 348 edges_cudf, degrees, left_on="source", right_on="vertex", how="left" 349 ) 350 degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True) 351 degrees_cudf = cudf.merge( 352 degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left" 353 ) 354 degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True) 355 degree_assortativity = cp.corrcoef( 356 degrees_cudf["degree_source"], degrees_cudf["degree_destination"] 357 )[0, 1] 358 G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2) 359 G_time["degree assortativity"] = time.perf_counter() - start_time 360 361 if not skip_clique_num: 362 raise NotImplementedError( 363 "Calculating the largest clique number is not supported on GPU" 364 ) 365 # G_feat["max clique number"] = G_ig.clique_number() 366 # G_time["max clique number"] = time.perf_counter() - start_time 367 368 start_time = time.perf_counter() 369 G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique() 370 G_time["n communities"] = time.perf_counter() - start_time 371 372 start_time = time.perf_counter() 373 G_feat["avg path length"] = avg_shortest_path 374 G_time["avg path length"] = time.perf_counter() - start_time 375 376 start_time = time.perf_counter() 377 G_feat["max betweenness centrality"] = round( 378 cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15 379 ) 380 G_time["max betweenness centrality"] = time.perf_counter() - start_time 381 382 return G_feat, G_time
Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.
Parameters
- G_ig (igraph.Graph): The input graph for which features are to be calculated.
- G_cg (cugraph.Graph): The input graph for which features are to be calculated.
- skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.
Returns
tuple: A tuple containing two dictionaries:
- G_feat (dict): A dictionary with keys as feature names and values as the computed feature values. Features include:
- "total n nodes": Total number of nodes in the graph.
- "total n edges": Total number of edges in the graph.
- "n nodes": Number of nodes in the largest connected component.
- "n edges": Number of edges in the largest connected component.
- "density": Density of the graph.
- "diameter": Diameter of the graph.
- "avg clustering": Average clustering coefficient of the graph.
- "max closeness centrality": Maximum closeness centrality in the graph.
- "max eigenvector centrality": Maximum eigenvector centrality in the graph.
- "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
- "max clique number": Size of the largest clique in the graph.
- "n communities": Number of communities detected using the louvian algorithm.
- "avg path length": Average path length in the graph.
"max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.
Raises
- ValueError: If the input is not a cugraph.Graph object.
Notes
- The graph is assumed to be undirected and unweighted.
- If the graph is not connected, the largest connected component is used for feature calculation.
- The diameter is calculated using a 2-step BFS.
- GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
- n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.