griptomo.core.gpu_extract_features

View Source

  1from __future__ import annotations
  2
  3import time
  4from typing import TYPE_CHECKING
  5
  6import igraph as ig
  7
  8if TYPE_CHECKING:  # pragma: no cover - hints only evaluated by type-checkers.
  9    import cudf  # type: ignore
 10    import cugraph  # type: ignore
 11
 12try:  # pragma: no cover - GPU-only dependencies may be absent in CPU environments.
 13    import cugraph as cg  # type: ignore
 14    import cudf  # type: ignore
 15    import cupy as cp  # type: ignore
 16except ImportError as exc:  # pragma: no cover - exercised when RAPIDS is unavailable.
 17    cg = None  # type: ignore
 18    cudf = None  # type: ignore
 19    cp = None  # type: ignore
 20    _GPU_IMPORT_ERROR: Exception | None = exc
 21else:
 22    _GPU_IMPORT_ERROR = None
 23
 24
 25def cugraph_calc_graph_features(
 26    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
 27):
 28    """
 29    Calculate various graph features for a cugraph.Graph object.
 30
 31    Parameters
 32    ----------
 33    G_ig : igraph.Graph
 34        The input graph for which features are to be calculated.
 35    G_cg : cugraph.Graph
 36        The input graph for which features are to be calculated.
 37    skip_clique_num : bool, optional
 38        If True, the calculation of the largest clique number will be skipped. Default is False.
 39
 40    Returns
 41    -------
 42    dict
 43        A dictionary containing the calculated graph features:
 44        - "total n nodes": Total number of nodes in the graph.
 45        - "total n edges": Total number of edges in the graph.
 46        - "n nodes": Number of nodes in the largest connected component.
 47        - "n edges": Number of edges in the largest connected component.
 48        - "density": Density of the graph.
 49        - "diameter": Diameter of the graph.
 50        - "avg clustering": Average clustering coefficient of the graph.
 51        - "max closeness centrality": Maximum closeness centrality in the graph.
 52        - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
 53        - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
 54        - "max clique number": Size of the largest clique in the graph.
 55        - "n communities": Number of communities detected using the louvian algorithm.
 56        - "avg path length": Average path length in the graph.
 57        - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
 58
 59    Raises
 60    ------
 61    ValueError
 62        If the input is not a cugraph.Graph object.
 63
 64    Notes
 65    -----
 66    - The graph is assumed to be undirected and unweighted.
 67    - If the graph is not connected, the largest connected component is used for feature calculation.
 68    - The diameter is calculated using a 2-step BFS.
 69    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
 70    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
 71    """
 72    if cg is None or cudf is None or cp is None:
 73        raise RuntimeError(
 74            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
 75        ) from _GPU_IMPORT_ERROR
 76
 77    if not isinstance(G_cg, cg.Graph):
 78        raise ValueError("Input must be a cugraph.Graph object")
 79
 80    if not isinstance(G_ig, ig.Graph):
 81        raise ValueError("Input must be an igraph.Graph object")
 82
 83    G_feat = {}
 84    G_feat["total n nodes"] = G_cg.number_of_nodes()
 85    print("total n nodes", G_feat["total n nodes"])
 86    G_feat["total n edges"] = G_cg.number_of_edges()
 87    print("total n edges", G_feat["total n edges"])
 88
 89    connected_components_df = cg.weakly_connected_components(G_cg)
 90    if connected_components_df["labels"].nunique() > 1:
 91        largest_cc = connected_components_df["labels"].value_counts().idxmax()
 92        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
 93        G_ig = G_ig.clusters().giant()
 94
 95    G_feat["n nodes"] = float(G_cg.number_of_nodes())
 96    print("n nodes", G_feat["n nodes"])
 97    G_feat["n edges"] = float(G_cg.number_of_edges())
 98    print("n edges", G_feat["n edges"])
 99    G_feat["density"] = (
100        2
101        * G_cg.number_of_edges()
102        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
103    )
104    print("density", G_feat["density"])
105
106    # Perform 2-step BFS to calculate diameter
107    # Assumed graph is unweighted and undirected
108    first_vertex_id = G_cg.nodes().iloc[0]
109    print("first_vertex_id", first_vertex_id)
110    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
111    furthest_node = bfs_result["vertex"].iloc[
112        cp.argmax(cp.array(bfs_result["distance"]))
113    ]
114    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
115    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
116
117    # Avg clustering coefficient
118    edges_cudf = G_cg.edges()
119    src_indices = cp.array(edges_cudf["source"])
120    dst_indices = cp.array(edges_cudf["destination"])
121    # Don't count selfloops
122    not_selfloops = src_indices != dst_indices
123    src_indices = src_indices[not_selfloops]
124    dst_indices = dst_indices[not_selfloops]
125    if src_indices.size == 0:
126        G_feat["avg clustering"] = 0
127    else:
128        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
129        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
130        triangles = cg.triangle_count(G_cg)
131        degrees = degrees[triangles["vertex"]]
132        counts = cp.array(triangles["counts"])
133        denominators = degrees * (degrees - 1)
134        clustering = 2 * counts / denominators
135        clustering = cp.where(denominators, clustering, 0)
136        assert clustering.ndim == 1
137        G_feat["avg clustering"] = float(clustering.mean())
138
139    # Max closeness centrality
140    max_centrality = 0
141    avg_shortest_path = 0
142    for vertex_id in G_cg.nodes().values_host:
143        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
144        shortest_path_sum = cp.sum(shortest_paths)
145        avg_shortest_path += shortest_path_sum / (
146            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
147        )
148        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
149        if closeness > max_centrality:
150            max_centrality = closeness
151    G_feat["max closeness centrality"] = max_centrality
152    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
153        "eigenvector_centrality"
154    ].max()
155
156    # Degree assortativity
157    edges_cudf = G_cg.edges()
158    # print(edges_cudf)
159    degrees = G_cg.degree()
160    degrees_cudf = cudf.merge(
161        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
162    )
163    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
164    degrees_cudf = cudf.merge(
165        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
166    )
167    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
168    print(edges_cudf)
169    print(degrees_cudf)
170    degree_assortativity = cp.corrcoef(
171        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
172    )[0, 1]
173    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
174
175    if not skip_clique_num:
176        raise NotImplementedError(
177            "Calculating the largest clique number is not supported on GPU"
178        )
179        # G_feat["max clique number"] = G_ig.clique_number()
180
181    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
182
183    G_feat["avg path length"] = avg_shortest_path
184    G_feat["max betweenness centrality"] = round(
185        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
186    )
187
188    return G_feat
189
190
191def cugraph_calc_graph_features_timed(
192    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
193):
194    """
195    Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.
196
197    Parameters
198    ----------
199    G_ig : igraph.Graph
200        The input graph for which features are to be calculated.
201    G_cg : cugraph.Graph
202        The input graph for which features are to be calculated.
203    skip_clique_num : bool, optional
204        If True, the calculation of the largest clique number will be skipped. Default is False.
205
206    Returns
207    -------
208    tuple
209        A tuple containing two dictionaries:
210        - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values.
211            Features include:
212            - "total n nodes": Total number of nodes in the graph.
213            - "total n edges": Total number of edges in the graph.
214            - "n nodes": Number of nodes in the largest connected component.
215            - "n edges": Number of edges in the largest connected component.
216            - "density": Density of the graph.
217            - "diameter": Diameter of the graph.
218            - "avg clustering": Average clustering coefficient of the graph.
219            - "max closeness centrality": Maximum closeness centrality in the graph.
220            - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
221            - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
222            - "max clique number": Size of the largest clique in the graph.
223            - "n communities": Number of communities detected using the louvian algorithm.
224            - "avg path length": Average path length in the graph.
225            - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
226
227        - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.
228
229    Raises
230    ------
231    ValueError
232        If the input is not a cugraph.Graph object.
233
234    Notes
235    -----
236    - The graph is assumed to be undirected and unweighted.
237    - If the graph is not connected, the largest connected component is used for feature calculation.
238    - The diameter is calculated using a 2-step BFS.
239    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
240    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
241    """
242    if cg is None or cudf is None or cp is None:
243        raise RuntimeError(
244            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
245        ) from _GPU_IMPORT_ERROR
246
247    if not isinstance(G_cg, cg.Graph):
248        raise ValueError("Input must be a cugraph.Graph object")
249
250    if not isinstance(G_ig, ig.Graph):
251        raise ValueError("Input must be an igraph.Graph object")
252
253    G_feat = {}
254    G_time = {}
255    start_time = time.perf_counter()
256    G_feat["total n nodes"] = G_cg.number_of_nodes()
257    G_time["total n nodes"] = time.perf_counter() - start_time
258
259    start_time = time.perf_counter()
260    G_feat["total n edges"] = G_cg.number_of_edges()
261    G_time["total n edges"] = time.perf_counter() - start_time
262
263    connected_components_df = cg.weakly_connected_components(G_cg)
264    if connected_components_df["labels"].nunique() > 1:
265        start_time = time.perf_counter()
266        largest_cc = connected_components_df["labels"].value_counts().idxmax()
267        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
268        G_time["connected component"] = time.perf_counter() - start_time
269        G_ig = G_ig.clusters().giant()
270    else:
271        G_time["connected component"] = 0
272
273    start_time = time.perf_counter()
274    G_feat["n nodes"] = float(G_cg.number_of_nodes())
275    G_time["n nodes"] = time.perf_counter() - start_time
276
277    start_time = time.perf_counter()
278    G_feat["n edges"] = float(G_cg.number_of_edges())
279    G_time["n edges"] = time.perf_counter() - start_time
280
281    start_time = time.perf_counter()
282    G_feat["density"] = (
283        2
284        * G_cg.number_of_edges()
285        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
286    )
287    G_time["density"] = time.perf_counter() - start_time
288
289    start_time = time.perf_counter()
290    first_vertex_id = G_cg.nodes().iloc[0]
291    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
292    furthest_node = bfs_result["vertex"].iloc[
293        cp.argmax(cp.array(bfs_result["distance"]))
294    ]
295    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
296    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
297    G_time["diameter"] = time.perf_counter() - start_time
298
299    start_time = time.perf_counter()
300    edges_cudf = G_cg.edges()
301    src_indices = cp.array(edges_cudf["source"])
302    dst_indices = cp.array(edges_cudf["destination"])
303    # Don't count selfloops
304    not_selfloops = src_indices != dst_indices
305    src_indices = src_indices[not_selfloops]
306    dst_indices = dst_indices[not_selfloops]
307    if src_indices.size == 0:
308        G_feat["avg clustering"] = 0
309    else:
310        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
311        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
312        triangles = cg.triangle_count(G_cg)
313        degrees = degrees[triangles["vertex"]]
314        counts = cp.array(triangles["counts"])
315        denominators = degrees * (degrees - 1)
316        clustering = 2 * counts / denominators
317        clustering = cp.where(denominators, clustering, 0)
318        assert clustering.ndim == 1
319        G_feat["avg clustering"] = float(clustering.mean())
320    G_time["avg clustering"] = time.perf_counter() - start_time
321
322    start_time = time.perf_counter()
323    max_centrality = 0
324    avg_shortest_path = 0
325    for vertex_id in G_cg.nodes().values_host:
326        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
327        shortest_path_sum = cp.sum(shortest_paths)
328        avg_shortest_path += shortest_path_sum / (
329            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
330        )
331        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
332        if closeness > max_centrality:
333            max_centrality = closeness
334    G_feat["max closeness centrality"] = max_centrality
335    G_time["max closeness centrality"] = time.perf_counter() - start_time
336
337    start_time = time.perf_counter()
338    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
339        "eigenvector_centrality"
340    ].max()
341    G_time["max eigenvector centrality"] = time.perf_counter() - start_time
342
343    start_time = time.perf_counter()
344    edges_cudf = G_cg.edges()
345    degrees = G_cg.degree()
346    degrees_cudf = cudf.merge(
347        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
348    )
349    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
350    degrees_cudf = cudf.merge(
351        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
352    )
353    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
354    degree_assortativity = cp.corrcoef(
355        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
356    )[0, 1]
357    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
358    G_time["degree assortativity"] = time.perf_counter() - start_time
359
360    if not skip_clique_num:
361        raise NotImplementedError(
362            "Calculating the largest clique number is not supported on GPU"
363        )
364        # G_feat["max clique number"] = G_ig.clique_number()
365        # G_time["max clique number"] = time.perf_counter() - start_time
366
367    start_time = time.perf_counter()
368    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
369    G_time["n communities"] = time.perf_counter() - start_time
370
371    start_time = time.perf_counter()
372    G_feat["avg path length"] = avg_shortest_path
373    G_time["avg path length"] = time.perf_counter() - start_time
374
375    start_time = time.perf_counter()
376    G_feat["max betweenness centrality"] = round(
377        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
378    )
379    G_time["max betweenness centrality"] = time.perf_counter() - start_time
380
381    return G_feat, G_time

def cugraph_calc_graph_features( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False): View Source

 26def cugraph_calc_graph_features(
 27    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
 28):
 29    """
 30    Calculate various graph features for a cugraph.Graph object.
 31
 32    Parameters
 33    ----------
 34    G_ig : igraph.Graph
 35        The input graph for which features are to be calculated.
 36    G_cg : cugraph.Graph
 37        The input graph for which features are to be calculated.
 38    skip_clique_num : bool, optional
 39        If True, the calculation of the largest clique number will be skipped. Default is False.
 40
 41    Returns
 42    -------
 43    dict
 44        A dictionary containing the calculated graph features:
 45        - "total n nodes": Total number of nodes in the graph.
 46        - "total n edges": Total number of edges in the graph.
 47        - "n nodes": Number of nodes in the largest connected component.
 48        - "n edges": Number of edges in the largest connected component.
 49        - "density": Density of the graph.
 50        - "diameter": Diameter of the graph.
 51        - "avg clustering": Average clustering coefficient of the graph.
 52        - "max closeness centrality": Maximum closeness centrality in the graph.
 53        - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
 54        - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
 55        - "max clique number": Size of the largest clique in the graph.
 56        - "n communities": Number of communities detected using the louvian algorithm.
 57        - "avg path length": Average path length in the graph.
 58        - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
 59
 60    Raises
 61    ------
 62    ValueError
 63        If the input is not a cugraph.Graph object.
 64
 65    Notes
 66    -----
 67    - The graph is assumed to be undirected and unweighted.
 68    - If the graph is not connected, the largest connected component is used for feature calculation.
 69    - The diameter is calculated using a 2-step BFS.
 70    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
 71    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
 72    """
 73    if cg is None or cudf is None or cp is None:
 74        raise RuntimeError(
 75            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
 76        ) from _GPU_IMPORT_ERROR
 77
 78    if not isinstance(G_cg, cg.Graph):
 79        raise ValueError("Input must be a cugraph.Graph object")
 80
 81    if not isinstance(G_ig, ig.Graph):
 82        raise ValueError("Input must be an igraph.Graph object")
 83
 84    G_feat = {}
 85    G_feat["total n nodes"] = G_cg.number_of_nodes()
 86    print("total n nodes", G_feat["total n nodes"])
 87    G_feat["total n edges"] = G_cg.number_of_edges()
 88    print("total n edges", G_feat["total n edges"])
 89
 90    connected_components_df = cg.weakly_connected_components(G_cg)
 91    if connected_components_df["labels"].nunique() > 1:
 92        largest_cc = connected_components_df["labels"].value_counts().idxmax()
 93        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
 94        G_ig = G_ig.clusters().giant()
 95
 96    G_feat["n nodes"] = float(G_cg.number_of_nodes())
 97    print("n nodes", G_feat["n nodes"])
 98    G_feat["n edges"] = float(G_cg.number_of_edges())
 99    print("n edges", G_feat["n edges"])
100    G_feat["density"] = (
101        2
102        * G_cg.number_of_edges()
103        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
104    )
105    print("density", G_feat["density"])
106
107    # Perform 2-step BFS to calculate diameter
108    # Assumed graph is unweighted and undirected
109    first_vertex_id = G_cg.nodes().iloc[0]
110    print("first_vertex_id", first_vertex_id)
111    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
112    furthest_node = bfs_result["vertex"].iloc[
113        cp.argmax(cp.array(bfs_result["distance"]))
114    ]
115    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
116    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
117
118    # Avg clustering coefficient
119    edges_cudf = G_cg.edges()
120    src_indices = cp.array(edges_cudf["source"])
121    dst_indices = cp.array(edges_cudf["destination"])
122    # Don't count selfloops
123    not_selfloops = src_indices != dst_indices
124    src_indices = src_indices[not_selfloops]
125    dst_indices = dst_indices[not_selfloops]
126    if src_indices.size == 0:
127        G_feat["avg clustering"] = 0
128    else:
129        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
130        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
131        triangles = cg.triangle_count(G_cg)
132        degrees = degrees[triangles["vertex"]]
133        counts = cp.array(triangles["counts"])
134        denominators = degrees * (degrees - 1)
135        clustering = 2 * counts / denominators
136        clustering = cp.where(denominators, clustering, 0)
137        assert clustering.ndim == 1
138        G_feat["avg clustering"] = float(clustering.mean())
139
140    # Max closeness centrality
141    max_centrality = 0
142    avg_shortest_path = 0
143    for vertex_id in G_cg.nodes().values_host:
144        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
145        shortest_path_sum = cp.sum(shortest_paths)
146        avg_shortest_path += shortest_path_sum / (
147            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
148        )
149        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
150        if closeness > max_centrality:
151            max_centrality = closeness
152    G_feat["max closeness centrality"] = max_centrality
153    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
154        "eigenvector_centrality"
155    ].max()
156
157    # Degree assortativity
158    edges_cudf = G_cg.edges()
159    # print(edges_cudf)
160    degrees = G_cg.degree()
161    degrees_cudf = cudf.merge(
162        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
163    )
164    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
165    degrees_cudf = cudf.merge(
166        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
167    )
168    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
169    print(edges_cudf)
170    print(degrees_cudf)
171    degree_assortativity = cp.corrcoef(
172        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
173    )[0, 1]
174    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
175
176    if not skip_clique_num:
177        raise NotImplementedError(
178            "Calculating the largest clique number is not supported on GPU"
179        )
180        # G_feat["max clique number"] = G_ig.clique_number()
181
182    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
183
184    G_feat["avg path length"] = avg_shortest_path
185    G_feat["max betweenness centrality"] = round(
186        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
187    )
188
189    return G_feat

Calculate various graph features for a cugraph.Graph object.

Parameters

G_ig (igraph.Graph): The input graph for which features are to be calculated.
G_cg (cugraph.Graph): The input graph for which features are to be calculated.
skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.

Returns

dict: A dictionary containing the calculated graph features:
- "total n nodes": Total number of nodes in the graph.
- "total n edges": Total number of edges in the graph.
- "n nodes": Number of nodes in the largest connected component.
- "n edges": Number of edges in the largest connected component.
- "density": Density of the graph.
- "diameter": Diameter of the graph.
- "avg clustering": Average clustering coefficient of the graph.
- "max closeness centrality": Maximum closeness centrality in the graph.
- "max eigenvector centrality": Maximum eigenvector centrality in the graph.
- "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
- "max clique number": Size of the largest clique in the graph.
- "n communities": Number of communities detected using the louvian algorithm.
- "avg path length": Average path length in the graph.
- "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.

Raises

ValueError: If the input is not a cugraph.Graph object.

Notes

The graph is assumed to be undirected and unweighted.
If the graph is not connected, the largest connected component is used for feature calculation.
The diameter is calculated using a 2-step BFS.
GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.

def cugraph_calc_graph_features_timed( G_ig: igraph.Graph, G_cg: "'cugraph.Graph'", skip_clique_num: bool = False): View Source

192def cugraph_calc_graph_features_timed(
193    G_ig: ig.Graph, G_cg: "cugraph.Graph", skip_clique_num: bool = False
194):
195    """
196    Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.
197
198    Parameters
199    ----------
200    G_ig : igraph.Graph
201        The input graph for which features are to be calculated.
202    G_cg : cugraph.Graph
203        The input graph for which features are to be calculated.
204    skip_clique_num : bool, optional
205        If True, the calculation of the largest clique number will be skipped. Default is False.
206
207    Returns
208    -------
209    tuple
210        A tuple containing two dictionaries:
211        - G_feat (dict): A dictionary with keys as feature names and values as the computed feature values.
212            Features include:
213            - "total n nodes": Total number of nodes in the graph.
214            - "total n edges": Total number of edges in the graph.
215            - "n nodes": Number of nodes in the largest connected component.
216            - "n edges": Number of edges in the largest connected component.
217            - "density": Density of the graph.
218            - "diameter": Diameter of the graph.
219            - "avg clustering": Average clustering coefficient of the graph.
220            - "max closeness centrality": Maximum closeness centrality in the graph.
221            - "max eigenvector centrality": Maximum eigenvector centrality in the graph.
222            - "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
223            - "max clique number": Size of the largest clique in the graph.
224            - "n communities": Number of communities detected using the louvian algorithm.
225            - "avg path length": Average path length in the graph.
226            - "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
227
228        - G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.
229
230    Raises
231    ------
232    ValueError
233        If the input is not a cugraph.Graph object.
234
235    Notes
236    -----
237    - The graph is assumed to be undirected and unweighted.
238    - If the graph is not connected, the largest connected component is used for feature calculation.
239    - The diameter is calculated using a 2-step BFS.
240    - GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
241    - n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.
242    """
243    if cg is None or cudf is None or cp is None:
244        raise RuntimeError(
245            "cugraph, cudf, and cupy must be installed to use GPU feature extractors"
246        ) from _GPU_IMPORT_ERROR
247
248    if not isinstance(G_cg, cg.Graph):
249        raise ValueError("Input must be a cugraph.Graph object")
250
251    if not isinstance(G_ig, ig.Graph):
252        raise ValueError("Input must be an igraph.Graph object")
253
254    G_feat = {}
255    G_time = {}
256    start_time = time.perf_counter()
257    G_feat["total n nodes"] = G_cg.number_of_nodes()
258    G_time["total n nodes"] = time.perf_counter() - start_time
259
260    start_time = time.perf_counter()
261    G_feat["total n edges"] = G_cg.number_of_edges()
262    G_time["total n edges"] = time.perf_counter() - start_time
263
264    connected_components_df = cg.weakly_connected_components(G_cg)
265    if connected_components_df["labels"].nunique() > 1:
266        start_time = time.perf_counter()
267        largest_cc = connected_components_df["labels"].value_counts().idxmax()
268        G_cg = cg.subgraph(G_cg, connected_components_df["labels"] == largest_cc)
269        G_time["connected component"] = time.perf_counter() - start_time
270        G_ig = G_ig.clusters().giant()
271    else:
272        G_time["connected component"] = 0
273
274    start_time = time.perf_counter()
275    G_feat["n nodes"] = float(G_cg.number_of_nodes())
276    G_time["n nodes"] = time.perf_counter() - start_time
277
278    start_time = time.perf_counter()
279    G_feat["n edges"] = float(G_cg.number_of_edges())
280    G_time["n edges"] = time.perf_counter() - start_time
281
282    start_time = time.perf_counter()
283    G_feat["density"] = (
284        2
285        * G_cg.number_of_edges()
286        / (G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1))
287    )
288    G_time["density"] = time.perf_counter() - start_time
289
290    start_time = time.perf_counter()
291    first_vertex_id = G_cg.nodes().iloc[0]
292    bfs_result = cg.bfs(G_cg, first_vertex_id, return_predecessors=False)
293    furthest_node = bfs_result["vertex"].iloc[
294        cp.argmax(cp.array(bfs_result["distance"]))
295    ]
296    bfs_result_from_furthest = cg.bfs(G_cg, furthest_node, return_predecessors=False)
297    G_feat["diameter"] = float(bfs_result_from_furthest["distance"].max())
298    G_time["diameter"] = time.perf_counter() - start_time
299
300    start_time = time.perf_counter()
301    edges_cudf = G_cg.edges()
302    src_indices = cp.array(edges_cudf["source"])
303    dst_indices = cp.array(edges_cudf["destination"])
304    # Don't count selfloops
305    not_selfloops = src_indices != dst_indices
306    src_indices = src_indices[not_selfloops]
307    dst_indices = dst_indices[not_selfloops]
308    if src_indices.size == 0:
309        G_feat["avg clustering"] = 0
310    else:
311        degrees = cp.bincount(src_indices, minlength=G_cg.number_of_nodes())
312        degrees += cp.bincount(dst_indices, minlength=G_cg.number_of_nodes())
313        triangles = cg.triangle_count(G_cg)
314        degrees = degrees[triangles["vertex"]]
315        counts = cp.array(triangles["counts"])
316        denominators = degrees * (degrees - 1)
317        clustering = 2 * counts / denominators
318        clustering = cp.where(denominators, clustering, 0)
319        assert clustering.ndim == 1
320        G_feat["avg clustering"] = float(clustering.mean())
321    G_time["avg clustering"] = time.perf_counter() - start_time
322
323    start_time = time.perf_counter()
324    max_centrality = 0
325    avg_shortest_path = 0
326    for vertex_id in G_cg.nodes().values_host:
327        shortest_paths = cg.bfs(G_cg, vertex_id)["distance"]
328        shortest_path_sum = cp.sum(shortest_paths)
329        avg_shortest_path += shortest_path_sum / (
330            G_cg.number_of_nodes() * (G_cg.number_of_nodes() - 1)
331        )
332        closeness = (G_cg.number_of_nodes() - 1) / shortest_path_sum
333        if closeness > max_centrality:
334            max_centrality = closeness
335    G_feat["max closeness centrality"] = max_centrality
336    G_time["max closeness centrality"] = time.perf_counter() - start_time
337
338    start_time = time.perf_counter()
339    G_feat["max eigenvector centrality"] = cg.eigenvector_centrality(G_cg)[
340        "eigenvector_centrality"
341    ].max()
342    G_time["max eigenvector centrality"] = time.perf_counter() - start_time
343
344    start_time = time.perf_counter()
345    edges_cudf = G_cg.edges()
346    degrees = G_cg.degree()
347    degrees_cudf = cudf.merge(
348        edges_cudf, degrees, left_on="source", right_on="vertex", how="left"
349    )
350    degrees_cudf.rename(columns={"degree": "degree_source"}, inplace=True)
351    degrees_cudf = cudf.merge(
352        degrees_cudf, degrees, left_on="destination", right_on="vertex", how="left"
353    )
354    degrees_cudf.rename(columns={"degree": "degree_destination"}, inplace=True)
355    degree_assortativity = cp.corrcoef(
356        degrees_cudf["degree_source"], degrees_cudf["degree_destination"]
357    )[0, 1]
358    G_feat["degree assortativity"] = float((degree_assortativity + 1) / 2)
359    G_time["degree assortativity"] = time.perf_counter() - start_time
360
361    if not skip_clique_num:
362        raise NotImplementedError(
363            "Calculating the largest clique number is not supported on GPU"
364        )
365        # G_feat["max clique number"] = G_ig.clique_number()
366        # G_time["max clique number"] = time.perf_counter() - start_time
367
368    start_time = time.perf_counter()
369    G_feat["n communities"] = cg.louvain(G_cg)[0]["partition"].nunique()
370    G_time["n communities"] = time.perf_counter() - start_time
371
372    start_time = time.perf_counter()
373    G_feat["avg path length"] = avg_shortest_path
374    G_time["avg path length"] = time.perf_counter() - start_time
375
376    start_time = time.perf_counter()
377    G_feat["max betweenness centrality"] = round(
378        cg.betweenness_centrality(G_cg)["betweenness_centrality"].max(), 15
379    )
380    G_time["max betweenness centrality"] = time.perf_counter() - start_time
381
382    return G_feat, G_time

Calculate various graph features and the time taken to compute each feature for a given cugraph.Graph object.

Parameters

G_ig (igraph.Graph): The input graph for which features are to be calculated.
G_cg (cugraph.Graph): The input graph for which features are to be calculated.
skip_clique_num (bool, optional): If True, the calculation of the largest clique number will be skipped. Default is False.

Returns

tuple: A tuple containing two dictionaries:
- G_feat (dict): A dictionary with keys as feature names and values as the computed feature values. Features include:
- "total n nodes": Total number of nodes in the graph.
- "total n edges": Total number of edges in the graph.
- "n nodes": Number of nodes in the largest connected component.
- "n edges": Number of edges in the largest connected component.
- "density": Density of the graph.
- "diameter": Diameter of the graph.
- "avg clustering": Average clustering coefficient of the graph.
- "max closeness centrality": Maximum closeness centrality in the graph.
- "max eigenvector centrality": Maximum eigenvector centrality in the graph.
- "degree assortativity": Degree assortativity of the graph (normalized to [0, 1]).
- "max clique number": Size of the largest clique in the graph.
- "n communities": Number of communities detected using the louvian algorithm.
- "avg path length": Average path length in the graph.
- "max betweenness centrality": Maximum betweenness centrality in the graph, normalized by the number of possible node pairs.
- G_time (dict): A dictionary with keys as feature names and values as the time, in seconds, taken to compute each feature.

Raises

ValueError: If the input is not a cugraph.Graph object.

Notes

The graph is assumed to be undirected and unweighted.
If the graph is not connected, the largest connected component is used for feature calculation.
The diameter is calculated using a 2-step BFS.
GPU implementations are used for all calculations except for clique_num, which is not supported on GPU.
n_communities is calculated using the Louvain algorithm instead of the fast greedy algorithm in igraph.