griptomo.core.gpu_extract_features

  1from __future__ import annotations
  2
  3import time
  4from typing import TYPE_CHECKING
  5
  6import igraph as ig
  7
  8if TYPE_CHECKING:  # pragma: no cover - hints only evaluated by type-checkers.
  9    import cudf  # type: ignore
 10    import cugraph  # type: ignore
 11
 12try:  # pragma: no cover - GPU-only dependencies may be absent in CPU environments.
 13    import cugraph as cg  # type: ignore
 14    import cudf  # type: ignore
 15    import cupy as cp  # type: ignore
 16except ImportError as exc:  # pragma: no cover - exercised when RAPIDS is unavailable.
 17    cg = None  # type: ignore
 18    cudf = None  # type: ignore
 19    cp = None  # type: ignore
 20    _GPU_IMPORT_ERROR: Exception | None = exc
 21else:
 22    _GPU_IMPORT_ERROR = None
 23
 24
 25def cugraph_calc_graph_features(
 26    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
 27):
 28    """
 29    Calculate various graph features for a cugraph.Graph object.
 30
 31    Parameters
 32    ----------
 33    G_ig : igraph.Graph
 34        The input graph for which features are to be calculated.
 35    G_cg : cugraph.Graph
 36        The input graph for which features are to be calculated.
 37    skip_clique_num : bool, optional
 38        If True, the calculation of the largest clique number will be skipped. Default is False.
 39
 40    Returns
 41    -------
 42    dict
 43        A dictionary containing the calculated graph features:
 44        - "total n nodes": Total number of nodes in the graph.
 45        - "total n edges": Total number of edges in the graph.
 46        - "n nodes": Number of nodes in the largest connected component.
 47        - "n edges": Number of edges in the largest connected component.
 48        - "density": Density of the graph.
 49        - "diameter": Diameter of the graph.
 50        - "avg clustering": Average clustering coefficient of the graph.
 51        - "max closeness centrality": Maximum closeness centrality in the graph.
 52        - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
 53        - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
 54        - "max clique number": Size of the largest clique in the graph.
 55        - "n communities": Number of communities detected using the louvian algorithm.
 56        - "avg path length": Average path length in the graph.
 57        - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
 58
 59    Raises
 60    ------
 61    ValueError
 62        If the input is not a cugraph.Graph object.
 63
 64    Notes
 65    -----
 66    - The graph is assumed to be undirected and unweighted.
 67    - If the graph is not connected, the largest connected component is used for feature calculation.
 68    - The diameter is calculated using a 2-step BFS.
 69    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
 70    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
 71    """
 72    if cg is None or cudf is None or cp is None:
 73        raise RuntimeError(
 74            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
 75        ) from _GPU_IMPORT_ERROR
 76
 77    if not isinstance(G_cg, cg.Graph):
 78        raise ValueError("Input must be a cugraph.Graph object")
 79
 80    if not isinstance(G_ig, ig.Graph):
 81        raise ValueError("Input must be an igraph.Graph object")
 82
 83    G_feat = {}
 84    G_feat["total n nodes"] = G_cg.number_of_nodes()
 85    print("total n nodes", G_feat["total n nodes"])
 86    G_feat["total n edges"] = G_cg.number_of_edges()
 87    print("total n edges", G_feat["total n edges"])
 88
 89    connected_components_df = cg.weakly_connected_components(G_cg)
 90    if connected_components_df["labels"].nunique() > 1:
 91        largest_cc = connected_components_df["labels"].value_counts().idxmax()
 92        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
 93        G_ig = G_ig.clusters().giant()
 94
 95    G_feat["n nodes"] = float(G_cg.number_of_nodes())
 96    print("n nodes", G_feat["n nodes"])
 97    G_feat["n edges"] = float(G_cg.number_of_edges())
 98    print("n edges", G_feat["n edges"])
 99    G_feat["density"] = (
100        2
101        * G_cg.number_of_edges()
102        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
103    )
104    print("density", G_feat["density"])
105
106    # Perform 2-step BFS to calculate diameter
107    # Assumed graph is unweighted and undirected
108    first_vertex_id = G_cg.nodes().iloc[0]
109    print("first_vertex_id", first_vertex_id)
110    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
111    furthest_node = bfs_result["vertex"].iloc[
112        cp.argmax(cp.array(bfs_result["distance"]))
113    ]
114    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
115    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
116
117    # Avg clustering coefficient
118    edges_cudf = G_cg.edges()
119    src_indices = cp.array(edges_cudf["source"])
120    dst_indices = cp.array(edges_cudf["destination"])
121    # Don't count selfloops
122    not_selfloops = src_indices != dst_indices
123    src_indices = src_indices[not_selfloops]
124    dst_indices = dst_indices[not_selfloops]
125    if src_indices.size == 0:
126        G_feat["avg clustering"] = 0
127    else:
128        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
129        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
130        triangles = cg.triangle_count(G_cg)
131        degrees = degrees[triangles["vertex"]]
132        counts = cp.array(triangles["counts"])
133        denominators = degrees * (degrees - 1)
134        clustering = 2 * counts / denominators
135        clustering = cp.where(denominators, clustering, 0)
136        assert clustering.ndim == 1
137        G_feat["avg clustering"] = float(clustering.mean())
138
139    # Max closeness centrality
140    max_centrality = 0
141    avg_shortest_path = 0
142    for vertex_id in G_cg.nodes().values_host:
143        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
144        shortest_path_sum = cp.sum(shortest_paths)
145        avg_shortest_path += shortest_path_sum / (
146            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
147        )
148        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
149        if closeness > max_centrality:
150            max_centrality = closeness
151    G_feat["max closeness centrality"] = max_centrality
152    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
153        "eigenvector_centrality"
154    ].max()
155
156    # Degree assortativity
157    edges_cudf = G_cg.edges()
158    # print(edges_cudf)
159    degrees = G_cg.degree()
160    degrees_cudf = cudf.merge(
161        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
162    )
163    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
164    degrees_cudf = cudf.merge(
165        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
166    )
167    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
168    print(edges_cudf)
169    print(degrees_cudf)
170    degree_assortativity = cp.corrcoef(
171        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
172    )[0, 1]
173    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
174
175    if not skip_clique_num:
176        raise NotImplementedError(
177            "Calculating the largest clique number is not supported on GPU"
178        )
179        # G_feat["max clique number"] = G_ig.clique_number()
180
181    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
182
183    G_feat["avg path length"] = avg_shortest_path
184    G_feat["max betweenness centrality"] = round(
185        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
186    )
187
188    return G_feat
189
190
191def cugraph_calc_graph_features_timed(
192    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
193):
194    """
195    Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.
196
197    Parameters
198    ----------
199    G_ig : igraph.Graph
200        The input graph for which features are to be calculated.
201    G_cg : cugraph.Graph
202        The input graph for which features are to be calculated.
203    skip_clique_num : bool, optional
204        If True, the calculation of the largest clique number will be skipped. Default is False.
205
206    Returns
207    -------
208    tuple
209        A tuple containing two dictionaries:
210        - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values.
211            Features include:
212            - "total n nodes": Total number of nodes in the graph.
213            - "total n edges": Total number of edges in the graph.
214            - "n nodes": Number of nodes in the largest connected component.
215            - "n edges": Number of edges in the largest connected component.
216            - "density": Density of the graph.
217            - "diameter": Diameter of the graph.
218            - "avg clustering": Average clustering coefficient of the graph.
219            - "max closeness centrality": Maximum closeness centrality in the graph.
220            - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
221            - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
222            - "max clique number": Size of the largest clique in the graph.
223            - "n communities": Number of communities detected using the louvian algorithm.
224            - "avg path length": Average path length in the graph.
225            - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
226
227        - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.
228
229    Raises
230    ------
231    ValueError
232        If the input is not a cugraph.Graph object.
233
234    Notes
235    -----
236    - The graph is assumed to be undirected and unweighted.
237    - If the graph is not connected, the largest connected component is used for feature calculation.
238    - The diameter is calculated using a 2-step BFS.
239    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
240    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
241    """
242    if cg is None or cudf is None or cp is None:
243        raise RuntimeError(
244            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
245        ) from _GPU_IMPORT_ERROR
246
247    if not isinstance(G_cg, cg.Graph):
248        raise ValueError("Input must be a cugraph.Graph object")
249
250    if not isinstance(G_ig, ig.Graph):
251        raise ValueError("Input must be an igraph.Graph object")
252
253    G_feat = {}
254    G_time = {}
255    start_time = time.perf_counter()
256    G_feat["total n nodes"] = G_cg.number_of_nodes()
257    G_time["total n nodes"] = time.perf_counter() - start_time
258
259    start_time = time.perf_counter()
260    G_feat["total n edges"] = G_cg.number_of_edges()
261    G_time["total n edges"] = time.perf_counter() - start_time
262
263    connected_components_df = cg.weakly_connected_components(G_cg)
264    if connected_components_df["labels"].nunique() > 1:
265        start_time = time.perf_counter()
266        largest_cc = connected_components_df["labels"].value_counts().idxmax()
267        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
268        G_time["connected component"] = time.perf_counter() - start_time
269        G_ig = G_ig.clusters().giant()
270    else:
271        G_time["connected component"] = 0
272
273    start_time = time.perf_counter()
274    G_feat["n nodes"] = float(G_cg.number_of_nodes())
275    G_time["n nodes"] = time.perf_counter() - start_time
276
277    start_time = time.perf_counter()
278    G_feat["n edges"] = float(G_cg.number_of_edges())
279    G_time["n edges"] = time.perf_counter() - start_time
280
281    start_time = time.perf_counter()
282    G_feat["density"] = (
283        2
284        * G_cg.number_of_edges()
285        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
286    )
287    G_time["density"] = time.perf_counter() - start_time
288
289    start_time = time.perf_counter()
290    first_vertex_id = G_cg.nodes().iloc[0]
291    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
292    furthest_node = bfs_result["vertex"].iloc[
293        cp.argmax(cp.array(bfs_result["distance"]))
294    ]
295    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
296    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
297    G_time["diameter"] = time.perf_counter() - start_time
298
299    start_time = time.perf_counter()
300    edges_cudf = G_cg.edges()
301    src_indices = cp.array(edges_cudf["source"])
302    dst_indices = cp.array(edges_cudf["destination"])
303    # Don't count selfloops
304    not_selfloops = src_indices != dst_indices
305    src_indices = src_indices[not_selfloops]
306    dst_indices = dst_indices[not_selfloops]
307    if src_indices.size == 0:
308        G_feat["avg clustering"] = 0
309    else:
310        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
311        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
312        triangles = cg.triangle_count(G_cg)
313        degrees = degrees[triangles["vertex"]]
314        counts = cp.array(triangles["counts"])
315        denominators = degrees * (degrees - 1)
316        clustering = 2 * counts / denominators
317        clustering = cp.where(denominators, clustering, 0)
318        assert clustering.ndim == 1
319        G_feat["avg clustering"] = float(clustering.mean())
320    G_time["avg clustering"] = time.perf_counter() - start_time
321
322    start_time = time.perf_counter()
323    max_centrality = 0
324    avg_shortest_path = 0
325    for vertex_id in G_cg.nodes().values_host:
326        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
327        shortest_path_sum = cp.sum(shortest_paths)
328        avg_shortest_path += shortest_path_sum / (
329            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
330        )
331        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
332        if closeness > max_centrality:
333            max_centrality = closeness
334    G_feat["max closeness centrality"] = max_centrality
335    G_time["max closeness centrality"] = time.perf_counter() - start_time
336
337    start_time = time.perf_counter()
338    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
339        "eigenvector_centrality"
340    ].max()
341    G_time["max eigenvector centrality"] = time.perf_counter() - start_time
342
343    start_time = time.perf_counter()
344    edges_cudf = G_cg.edges()
345    degrees = G_cg.degree()
346    degrees_cudf = cudf.merge(
347        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
348    )
349    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
350    degrees_cudf = cudf.merge(
351        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
352    )
353    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
354    degree_assortativity = cp.corrcoef(
355        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
356    )[0, 1]
357    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
358    G_time["degree assortativity"] = time.perf_counter() - start_time
359
360    if not skip_clique_num:
361        raise NotImplementedError(
362            "Calculating the largest clique number is not supported on GPU"
363        )
364        # G_feat["max clique number"] = G_ig.clique_number()
365        # G_time["max clique number"] = time.perf_counter() - start_time
366
367    start_time = time.perf_counter()
368    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
369    G_time["n communities"] = time.perf_counter() - start_time
370
371    start_time = time.perf_counter()
372    G_feat["avg path length"] = avg_shortest_path
373    G_time["avg path length"] = time.perf_counter() - start_time
374
375    start_time = time.perf_counter()
376    G_feat["max betweenness centrality"] = round(
377        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
378    )
379    G_time["max betweenness centrality"] = time.perf_counter() - start_time
380
381    return G_feat, G_time
def cugraph_calc_graph_features( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False):
 26def cugraph_calc_graph_features(
 27    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
 28):
 29    """
 30    Calculate various graph features for a cugraph.Graph object.
 31
 32    Parameters
 33    ----------
 34    G_ig : igraph.Graph
 35        The input graph for which features are to be calculated.
 36    G_cg : cugraph.Graph
 37        The input graph for which features are to be calculated.
 38    skip_clique_num : bool, optional
 39        If True, the calculation of the largest clique number will be skipped. Default is False.
 40
 41    Returns
 42    -------
 43    dict
 44        A dictionary containing the calculated graph features:
 45        - "total n nodes": Total number of nodes in the graph.
 46        - "total n edges": Total number of edges in the graph.
 47        - "n nodes": Number of nodes in the largest connected component.
 48        - "n edges": Number of edges in the largest connected component.
 49        - "density": Density of the graph.
 50        - "diameter": Diameter of the graph.
 51        - "avg clustering": Average clustering coefficient of the graph.
 52        - "max closeness centrality": Maximum closeness centrality in the graph.
 53        - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
 54        - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
 55        - "max clique number": Size of the largest clique in the graph.
 56        - "n communities": Number of communities detected using the louvian algorithm.
 57        - "avg path length": Average path length in the graph.
 58        - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
 59
 60    Raises
 61    ------
 62    ValueError
 63        If the input is not a cugraph.Graph object.
 64
 65    Notes
 66    -----
 67    - The graph is assumed to be undirected and unweighted.
 68    - If the graph is not connected, the largest connected component is used for feature calculation.
 69    - The diameter is calculated using a 2-step BFS.
 70    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
 71    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
 72    """
 73    if cg is None or cudf is None or cp is None:
 74        raise RuntimeError(
 75            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
 76        ) from _GPU_IMPORT_ERROR
 77
 78    if not isinstance(G_cg, cg.Graph):
 79        raise ValueError("Input must be a cugraph.Graph object")
 80
 81    if not isinstance(G_ig, ig.Graph):
 82        raise ValueError("Input must be an igraph.Graph object")
 83
 84    G_feat = {}
 85    G_feat["total n nodes"] = G_cg.number_of_nodes()
 86    print("total n nodes", G_feat["total n nodes"])
 87    G_feat["total n edges"] = G_cg.number_of_edges()
 88    print("total n edges", G_feat["total n edges"])
 89
 90    connected_components_df = cg.weakly_connected_components(G_cg)
 91    if connected_components_df["labels"].nunique() > 1:
 92        largest_cc = connected_components_df["labels"].value_counts().idxmax()
 93        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
 94        G_ig = G_ig.clusters().giant()
 95
 96    G_feat["n nodes"] = float(G_cg.number_of_nodes())
 97    print("n nodes", G_feat["n nodes"])
 98    G_feat["n edges"] = float(G_cg.number_of_edges())
 99    print("n edges", G_feat["n edges"])
100    G_feat["density"] = (
101        2
102        * G_cg.number_of_edges()
103        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
104    )
105    print("density", G_feat["density"])
106
107    # Perform 2-step BFS to calculate diameter
108    # Assumed graph is unweighted and undirected
109    first_vertex_id = G_cg.nodes().iloc[0]
110    print("first_vertex_id", first_vertex_id)
111    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
112    furthest_node = bfs_result["vertex"].iloc[
113        cp.argmax(cp.array(bfs_result["distance"]))
114    ]
115    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
116    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
117
118    # Avg clustering coefficient
119    edges_cudf = G_cg.edges()
120    src_indices = cp.array(edges_cudf["source"])
121    dst_indices = cp.array(edges_cudf["destination"])
122    # Don't count selfloops
123    not_selfloops = src_indices != dst_indices
124    src_indices = src_indices[not_selfloops]
125    dst_indices = dst_indices[not_selfloops]
126    if src_indices.size == 0:
127        G_feat["avg clustering"] = 0
128    else:
129        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
130        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
131        triangles = cg.triangle_count(G_cg)
132        degrees = degrees[triangles["vertex"]]
133        counts = cp.array(triangles["counts"])
134        denominators = degrees * (degrees - 1)
135        clustering = 2 * counts / denominators
136        clustering = cp.where(denominators, clustering, 0)
137        assert clustering.ndim == 1
138        G_feat["avg clustering"] = float(clustering.mean())
139
140    # Max closeness centrality
141    max_centrality = 0
142    avg_shortest_path = 0
143    for vertex_id in G_cg.nodes().values_host:
144        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
145        shortest_path_sum = cp.sum(shortest_paths)
146        avg_shortest_path += shortest_path_sum / (
147            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
148        )
149        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
150        if closeness > max_centrality:
151            max_centrality = closeness
152    G_feat["max closeness centrality"] = max_centrality
153    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
154        "eigenvector_centrality"
155    ].max()
156
157    # Degree assortativity
158    edges_cudf = G_cg.edges()
159    # print(edges_cudf)
160    degrees = G_cg.degree()
161    degrees_cudf = cudf.merge(
162        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
163    )
164    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
165    degrees_cudf = cudf.merge(
166        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
167    )
168    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
169    print(edges_cudf)
170    print(degrees_cudf)
171    degree_assortativity = cp.corrcoef(
172        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
173    )[0, 1]
174    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
175
176    if not skip_clique_num:
177        raise NotImplementedError(
178            "Calculating the largest clique number is not supported on GPU"
179        )
180        # G_feat["max clique number"] = G_ig.clique_number()
181
182    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
183
184    G_feat["avg path length"] = avg_shortest_path
185    G_feat["max betweenness centrality"] = round(
186        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
187    )
188
189    return G_feat

Calculate various graph features for a cugraph.Graph object.

Parameters
  • G_ig (igraph.Graph): The input graph for which features are to be calculated.
  • G_cg (cugraph.Graph): The input graph for which features are to be calculated.
  • skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.
Returns
  • dict: A dictionary containing the calculated graph features:
    • "total n nodes": Total number of nodes in the graph.
    • "total n edges": Total number of edges in the graph.
    • "n nodes": Number of nodes in the largest connected component.
    • "n edges": Number of edges in the largest connected component.
    • "density": Density of the graph.
    • "diameter": Diameter of the graph.
    • "avg clustering": Average clustering coefficient of the graph.
    • "max closeness centrality": Maximum closeness centrality in the graph.
    • "max eigenvector centrality": Maximum eigenvector centrality in the graph.
    • "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
    • "max clique number": Size of the largest clique in the graph.
    • "n communities": Number of communities detected using the louvian algorithm.
    • "avg path length": Average path length in the graph.
    • "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
Raises
  • ValueError: If the input is not a cugraph.Graph object.
Notes
  • The graph is assumed to be undirected and unweighted.
  • If the graph is not connected, the largest connected component is used for feature calculation.
  • The diameter is calculated using a 2-step BFS.
  • GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
  • n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
def cugraph_calc_graph_features_timed( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False):
192def cugraph_calc_graph_features_timed(
193    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
194):
195    """
196    Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.
197
198    Parameters
199    ----------
200    G_ig : igraph.Graph
201        The input graph for which features are to be calculated.
202    G_cg : cugraph.Graph
203        The input graph for which features are to be calculated.
204    skip_clique_num : bool, optional
205        If True, the calculation of the largest clique number will be skipped. Default is False.
206
207    Returns
208    -------
209    tuple
210        A tuple containing two dictionaries:
211        - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values.
212            Features include:
213            - "total n nodes": Total number of nodes in the graph.
214            - "total n edges": Total number of edges in the graph.
215            - "n nodes": Number of nodes in the largest connected component.
216            - "n edges": Number of edges in the largest connected component.
217            - "density": Density of the graph.
218            - "diameter": Diameter of the graph.
219            - "avg clustering": Average clustering coefficient of the graph.
220            - "max closeness centrality": Maximum closeness centrality in the graph.
221            - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
222            - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
223            - "max clique number": Size of the largest clique in the graph.
224            - "n communities": Number of communities detected using the louvian algorithm.
225            - "avg path length": Average path length in the graph.
226            - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
227
228        - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.
229
230    Raises
231    ------
232    ValueError
233        If the input is not a cugraph.Graph object.
234
235    Notes
236    -----
237    - The graph is assumed to be undirected and unweighted.
238    - If the graph is not connected, the largest connected component is used for feature calculation.
239    - The diameter is calculated using a 2-step BFS.
240    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
241    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
242    """
243    if cg is None or cudf is None or cp is None:
244        raise RuntimeError(
245            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
246        ) from _GPU_IMPORT_ERROR
247
248    if not isinstance(G_cg, cg.Graph):
249        raise ValueError("Input must be a cugraph.Graph object")
250
251    if not isinstance(G_ig, ig.Graph):
252        raise ValueError("Input must be an igraph.Graph object")
253
254    G_feat = {}
255    G_time = {}
256    start_time = time.perf_counter()
257    G_feat["total n nodes"] = G_cg.number_of_nodes()
258    G_time["total n nodes"] = time.perf_counter() - start_time
259
260    start_time = time.perf_counter()
261    G_feat["total n edges"] = G_cg.number_of_edges()
262    G_time["total n edges"] = time.perf_counter() - start_time
263
264    connected_components_df = cg.weakly_connected_components(G_cg)
265    if connected_components_df["labels"].nunique() > 1:
266        start_time = time.perf_counter()
267        largest_cc = connected_components_df["labels"].value_counts().idxmax()
268        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
269        G_time["connected component"] = time.perf_counter() - start_time
270        G_ig = G_ig.clusters().giant()
271    else:
272        G_time["connected component"] = 0
273
274    start_time = time.perf_counter()
275    G_feat["n nodes"] = float(G_cg.number_of_nodes())
276    G_time["n nodes"] = time.perf_counter() - start_time
277
278    start_time = time.perf_counter()
279    G_feat["n edges"] = float(G_cg.number_of_edges())
280    G_time["n edges"] = time.perf_counter() - start_time
281
282    start_time = time.perf_counter()
283    G_feat["density"] = (
284        2
285        * G_cg.number_of_edges()
286        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
287    )
288    G_time["density"] = time.perf_counter() - start_time
289
290    start_time = time.perf_counter()
291    first_vertex_id = G_cg.nodes().iloc[0]
292    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
293    furthest_node = bfs_result["vertex"].iloc[
294        cp.argmax(cp.array(bfs_result["distance"]))
295    ]
296    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
297    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
298    G_time["diameter"] = time.perf_counter() - start_time
299
300    start_time = time.perf_counter()
301    edges_cudf = G_cg.edges()
302    src_indices = cp.array(edges_cudf["source"])
303    dst_indices = cp.array(edges_cudf["destination"])
304    # Don't count selfloops
305    not_selfloops = src_indices != dst_indices
306    src_indices = src_indices[not_selfloops]
307    dst_indices = dst_indices[not_selfloops]
308    if src_indices.size == 0:
309        G_feat["avg clustering"] = 0
310    else:
311        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
312        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
313        triangles = cg.triangle_count(G_cg)
314        degrees = degrees[triangles["vertex"]]
315        counts = cp.array(triangles["counts"])
316        denominators = degrees * (degrees - 1)
317        clustering = 2 * counts / denominators
318        clustering = cp.where(denominators, clustering, 0)
319        assert clustering.ndim == 1
320        G_feat["avg clustering"] = float(clustering.mean())
321    G_time["avg clustering"] = time.perf_counter() - start_time
322
323    start_time = time.perf_counter()
324    max_centrality = 0
325    avg_shortest_path = 0
326    for vertex_id in G_cg.nodes().values_host:
327        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
328        shortest_path_sum = cp.sum(shortest_paths)
329        avg_shortest_path += shortest_path_sum / (
330            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
331        )
332        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
333        if closeness > max_centrality:
334            max_centrality = closeness
335    G_feat["max closeness centrality"] = max_centrality
336    G_time["max closeness centrality"] = time.perf_counter() - start_time
337
338    start_time = time.perf_counter()
339    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
340        "eigenvector_centrality"
341    ].max()
342    G_time["max eigenvector centrality"] = time.perf_counter() - start_time
343
344    start_time = time.perf_counter()
345    edges_cudf = G_cg.edges()
346    degrees = G_cg.degree()
347    degrees_cudf = cudf.merge(
348        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
349    )
350    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
351    degrees_cudf = cudf.merge(
352        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
353    )
354    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
355    degree_assortativity = cp.corrcoef(
356        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
357    )[0, 1]
358    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
359    G_time["degree assortativity"] = time.perf_counter() - start_time
360
361    if not skip_clique_num:
362        raise NotImplementedError(
363            "Calculating the largest clique number is not supported on GPU"
364        )
365        # G_feat["max clique number"] = G_ig.clique_number()
366        # G_time["max clique number"] = time.perf_counter() - start_time
367
368    start_time = time.perf_counter()
369    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
370    G_time["n communities"] = time.perf_counter() - start_time
371
372    start_time = time.perf_counter()
373    G_feat["avg path length"] = avg_shortest_path
374    G_time["avg path length"] = time.perf_counter() - start_time
375
376    start_time = time.perf_counter()
377    G_feat["max betweenness centrality"] = round(
378        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
379    )
380    G_time["max betweenness centrality"] = time.perf_counter() - start_time
381
382    return G_feat, G_time

Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.

Parameters
  • G_ig (igraph.Graph): The input graph for which features are to be calculated.
  • G_cg (cugraph.Graph): The input graph for which features are to be calculated.
  • skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.
Returns
  • tuple: A tuple containing two dictionaries:

    • G_feat (dict): A dictionary with keys as feature names and values as the computed feature values. Features include:
    • "total n nodes": Total number of nodes in the graph.
    • "total n edges": Total number of edges in the graph.
    • "n nodes": Number of nodes in the largest connected component.
    • "n edges": Number of edges in the largest connected component.
    • "density": Density of the graph.
    • "diameter": Diameter of the graph.
    • "avg clustering": Average clustering coefficient of the graph.
    • "max closeness centrality": Maximum closeness centrality in the graph.
    • "max eigenvector centrality": Maximum eigenvector centrality in the graph.
    • "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
    • "max clique number": Size of the largest clique in the graph.
    • "n communities": Number of communities detected using the louvian algorithm.
    • "avg path length": Average path length in the graph.
    • "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.

    • G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.

Raises
  • ValueError: If the input is not a cugraph.Graph object.
Notes
  • The graph is assumed to be undirected and unweighted.
  • If the graph is not connected, the largest connected component is used for feature calculation.
  • The diameter is calculated using a 2-step BFS.
  • GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
  • n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.