griptomo.core.graph2class

  1import time
  2import networkx as nx
  3import numpy as np
  4import pandas as pd
  5from pathlib import Path
  6import multiprocessing
  7import igraph as ig
  8
  9
 10import warnings
 11import time
 12
 13warnings.filterwarnings("ignore")
 14
 15
 16def calc_bc(G, return_dict):
 17    """
 18    Parallel subprocess function to calculate the betweenness centrality.
 19
 20    Parameters
 21    ----------
 22    G : networkx.Graph
 23        The input graph.
 24    return_dict : dict
 25        Dictionary to store the result.
 26
 27    Returns
 28    -------
 29    dict
 30        Betweenness centrality dictionary from multiple processes.
 31    """
 32    return_dict[1] = np.max(list(nx.betweenness_centrality(G).values()))
 33
 34
 35def calc_shortest_pthlen(G, return_dict):
 36    """
 37    Parallel subprocess function to calculate the average shortest path length.
 38
 39    Parameters
 40    ----------
 41    G : networkx.Graph
 42        The input graph.
 43    return_dict : dict
 44        Dictionary to store the result.
 45
 46    Returns
 47    -------
 48    dict
 49        Average shortest path length dictionary from multiple processes.
 50    """
 51    return_dict[2] = nx.average_shortest_path_length(G)
 52
 53
 54def benchmark_igraph_graph_features(G):
 55    """
 56    Benchmarks several graph network features using igraph. If not connected, largest subgraph is used.
 57
 58    Parameters
 59    ----------
 60    G : networkx.Graph
 61        The input graph.
 62
 63    Returns
 64    -------
 65    dict
 66        Dictionary containing the benchmarked features and their computation times.
 67    """
 68
 69    # G = ig.Graph.Load(G, format='gexf')
 70    if type(G) != nx.classes.graph.Graph:
 71        G = nx.read_gexf(G)
 72    try:
 73        assert nx.is_connected(G)
 74    except:
 75        # graph isn't connect --> use largest connected component
 76        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
 77        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
 78        G = G.subgraph(G_cc[0])
 79    """ 
 80    Maybe rewrite the gexf files to a file format just in this case 
 81    so igraph can read it? To remove the step of having to load from networkx?
 82    """
 83    G_time = {}
 84    start_time = time.process_time()
 85    G = ig.Graph.from_networkx(G)
 86    G_time["conversion"] = time.process_time() - start_time
 87    G_time["backend"] = "igraph"
 88    G_feat = {}
 89    start_time = time.process_time()
 90
 91    G_feat["n nodes"] = float(ig.Graph.vcount(G))  # number of nodes
 92    G_time["n nodes"] = time.process_time() - start_time
 93    G_time["n nodes backend"] = "igraph"
 94
 95    start_time = time.process_time()
 96    G_feat["n edges"] = float(ig.Graph.ecount(G))  # number of edges
 97    G_time["n edges"] = time.process_time() - start_time
 98    G_time["n edges backend"] = "igraph"
 99
100    start_time = time.process_time()
101    G_feat["density"] = ig.Graph.density(
102        G
103    )  # how close the network is to a 'complete graph' where each node is connected
104    G_time["density"] = time.process_time() - start_time
105    G_time["density backend"] = "igraph"
106
107    start_time = time.process_time()
108    G_feat["diameter"] = ig.Graph.diameter(
109        G
110    )  # the farthest distance (e.g. number of edges) between two nodes in the graph
111    G_time["diameter"] = time.process_time() - start_time
112    G_time["diameter backend"] = "igraph"
113
114    start_time = time.process_time()
115    clustering = ig.Graph.transitivity_local_undirected(G, mode="zero")
116    if sum(clustering) == 0 or len(clustering) == 0:
117        G_feat["avg clustering"] = sum(clustering)
118    else:
119        G_feat["avg clustering"] = sum(clustering) / len(clustering)
120    G_time["avg clustering"] = time.process_time() - start_time
121    G_time["avg clustering backend"] = "igraph"
122
123    start_time = time.process_time()
124    G_feat["max closeness centrality"] = np.max(
125        list(ig.Graph.closeness(G))
126    )  # max closeness centraility. high closeness --> short distance to all other nodes
127    G_time["max closeness centrality"] = time.process_time() - start_time
128    G_time["max closeness centrality backend"] = "igraph"
129
130    start_time = time.process_time()
131    G_feat["max eigenvector centrality"] = np.max(
132        ig.Graph.eigenvector_centrality(G, scale=False)
133    )  # eigenvector centraility. how 'important'/'influential' a node is
134    G_time["max eigenvector centrality"] = time.process_time() - start_time
135    G_time["max eigenvector centrality backend"] = "igraph"
136
137    start_time = time.process_time()
138    G_feat["degree assortativity"] = (
139        ig.Graph.assortativity_degree(G) + 1
140    ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
141    G_time["degree assortativity"] = time.process_time() - start_time
142    G_time["degree assortativity backend"] = "igraph"
143
144    start_time = time.process_time()
145    G_feat["max clique number"] = len(
146        max(ig.Graph.largest_cliques(G), key=len)
147    )  # largest clique (i.e. an induced subgraph that is complete) size
148    G_time["max clique number"] = time.process_time() - start_time
149    G_time["max clique number backend"] = "igraph"
150
151    start_time = time.process_time()
152    G_feat["n communities"] = ig.Graph.community_fastgreedy(
153        G
154    ).optimal_count  # number of communities
155    G_time["n communities"] = time.process_time() - start_time
156    G_time["n communities backend"] = "igraph"
157
158    start_time = time.process_time()
159    G_feat["avg path length"] = float(ig.Graph.average_path_length(G))
160    G_time["avg path length"] = time.process_time() - start_time
161    G_time["avg path length backend"] = "igraph"
162
163    start_time = time.process_time()
164    G_feat["max betweenness centrality"] = round(
165        np.max(ig.Graph.betweenness(G))
166        / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2),
167        15,
168    )
169    G_time["max betweenness centrality"] = time.process_time() - start_time
170    G_time["max betweenness centrality backend"] = "igraph"
171
172    return G_time
173
174
175def deduce_backend(backends_list, backend=None):
176    """
177    Deduce the backend used for the graph features calculation.
178
179    Parameters
180    ----------
181    backends_list : list
182        List of available backends.
183    backend : str, optional
184        Preferred backend to use. Default is None.
185
186    Returns
187    -------
188    str
189        Backend to use.
190    """
191    if backend in backends_list:
192        return backend
193    else:
194        return "serial"
195
196
197# import nx_cugraph as nxcg
198# import nx_parallel as nxp
199
200
201def benchmark_nx_graph_features(G, backend="serial"):
202    """
203    Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
204
205    Parameters
206    ----------
207    G : networkx.Graph
208        The input graph.
209    backend : str, optional
210        Backend to use for computation. Default is 'serial'.
211
212    Returns
213    -------
214    dict
215        Dictionary containing the benchmarked features and their computation times.
216    """
217
218    try:
219        assert nx.is_connected(G)
220    except:
221        # graph isn't connect --> use largest connected component
222        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
223        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
224        G = G.subgraph(G_cc[0])
225
226    G_time = {}  # graph features time dictionary
227
228    if backend == "cugraph":
229        start_time = time.process_time()
230        G_conv = nxcg.from_networkx(G)
231        G_time["conversion"] = time.process_time() - start_time
232    elif backend == "parallel":
233        start_time = time.process_time()
234        G_conv = nxp.ParallelGraph(G)
235        G_time["conversion"] = time.process_time() - start_time
236    elif backend == "serial":
237        G_conv = G
238        G_time["conversion"] = 0
239    else:
240        raise ValueError("backend must be 'cugraph', 'parallel', or 'serial'")
241
242    G_time["backend"] = backend
243
244    G_feat = {}  # graph features dictionary
245    start_time = time.process_time()
246
247    G_feat["n nodes"] = float(G.number_of_nodes())  # number of nodes
248    G_time["n nodes"] = time.process_time() - start_time
249    print(
250        f"Execution time for {backend} : n nodes {G_time['n nodes']} : {G_feat['n nodes']}"
251    )
252    G_time["n nodes backend"] = "serial"
253
254    start_time = time.process_time()
255    G_feat["n edges"] = float(G.number_of_edges())  # number of edges
256    G_time["n edges"] = time.process_time() - start_time
257    print(
258        f"Execution time for {backend} : n edges {G_time['n edges']} : {G_feat['n edges']}"
259    )
260    G_time["n edges backend"] = "serial"
261
262    start_time = time.process_time()
263    G_feat["density"] = nx.density(
264        G
265    )  # how close the network is to a 'complete graph' where each node is connected
266    G_time["density"] = time.process_time() - start_time
267    print(
268        f"Execution time for {backend} : density {G_time['density']} : {G_feat['density']}"
269    )
270    G_time["density backend"] = "serial"
271
272    start_time = time.process_time()
273    if deduce_backend(nx.diameter.backends, backend) == backend:
274        G_feat["diameter"] = nx.diameter(
275            G_conv
276        )  # the farthest distance (e.g. number of edges) between two nodes in the graph
277        G_time["diameter"] = time.process_time() - start_time
278        print(
279            f"Execution time for {backend} : diameter {G_time['diameter']} : {G_feat['diameter']}"
280        )
281        G_time["diameter backend"] = backend
282    elif backend == "cugraph":
283        # Calculate shortest path length, find max
284        shortest_paths = nxcg.shortest_path(G_conv)
285        G_feat["diameter"] = np.max(shortest_paths)
286        G_time["diameter"] = time.process_time() - start_time
287    else:
288        G_feat["diameter"] = 0
289        # G_feat['diameter'] = nx.diameter(G)  # the farthest distance (e.g. number of edges) between two nodes in the graph
290        G_time["diameter"] = 0
291        G_time["diameter backend"] = "N/A"
292
293    start_time = time.process_time()
294    if deduce_backend(nx.average_clustering.backends, backend) == backend:
295        G_feat["avg clustering"] = nx.average_clustering(
296            G_conv
297        )  # the (averaged) fraction of possible triangles through a node.
298        G_time["avg clustering"] = time.process_time() - start_time
299        print(
300            f"Execution time for {backend} : avg clustering {G_time['avg clustering']} : {G_feat['avg clustering']}"
301        )
302        G_time["avg clustering backend"] = backend
303    else:
304        G_feat["avg clustering"] = 0
305        # G_feat['avg clustering'] = nx.average_clustering(G)  # the (averaged) fraction of possible triangles through a node.
306        G_time["avg clustering"] = 0
307        G_time["avg clustering backend"] = "N/A"
308
309    start_time = time.process_time()
310    if deduce_backend(nx.closeness_centrality.backends, backend) == backend:
311        G_feat["max closeness centrality"] = np.max(
312            list(nx.closeness_centrality(G_conv).values())
313        )  # max closeness centraility. high closeness --> short distance to all other nodes
314        G_time["max closeness centrality"] = time.process_time() - start_time
315        print(
316            f"Execution time for {backend} : max closeness centrality {G_time['max closeness centrality']} : {G_feat['max closeness centrality']}"
317        )
318        G_time["max closeness centrality backend"] = backend
319    else:
320        G_feat["max closeness centrality"] = 0
321        # G_feat['max closeness centrality'] = np.max(list(nx.closeness_centrality(G).values()))  # max closeness centraility. high closeness --> short distance to all other nodes
322        G_time["max closeness centrality"] = 0
323        G_time["max closeness centrality backend"] = "N/A"
324
325    start_time = time.process_time()
326    if deduce_backend(nx.eigenvector_centrality.backends, backend) == backend:
327        G_feat["max eigenvector centrality"] = np.max(
328            list(nx.eigenvector_centrality(G_conv, max_iter=10000).values())
329        )  # eigenvector centraility. how 'important'/'influential' a node is
330        G_time["max eigenvector centrality"] = time.process_time() - start_time
331        print(
332            f"Execution time for {backend} : max eigenvector centrality {G_time['max eigenvector centrality']} : {G_feat['max eigenvector centrality']}"
333        )
334        G_time["max eigenvector centrality backend"] = backend
335    else:
336        G_feat["max eigenvector centrality"] = 0
337        # G_feat['max eigenvector centrality'] = np.max(list(nx.eigenvector_centrality(G, max_iter=10000).values()))  # eigenvector centraility. how 'important'/'influential' a node is
338        G_time["max eigenvector centrality"] = 0
339        G_time["max eigenvector centrality backend"] = "N/A"
340
341    start_time = time.process_time()
342    if (
343        deduce_backend(nx.degree_pearson_correlation_coefficient.backends, backend)
344        == backend
345    ):
346        G_feat["degree assortativity"] = (
347            nx.degree_pearson_correlation_coefficient(G_conv) + 1
348        ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
349        G_time["degree assortativity"] = time.process_time() - start_time
350        print(
351            f"Execution time for {backend} : degree assortativity {G_time['degree assortativity']} : {G_feat['degree assortativity']}"
352        )
353        G_time["degree assortativity backend"] = backend
354    else:
355        G_feat["degree assortativity"] = 0
356        # G_feat['degree assortativity'] = (nx.degree_pearson_correlation_coefficient(G) + 1)/2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
357        G_time["degree assortativity"] = 0
358        G_time["degree assortativity backend"] = "N/A"
359
360    start_time = time.process_time()
361    if deduce_backend(nx.find_cliques.backends, backend) == backend:
362        G_feat["max clique number"] = len(
363            max(nx.find_cliques(G_conv), key=len)
364        )  # largest clique (i.e. an induced subgraph that is complete) size
365        G_time["max clique number"] = time.process_time() - start_time
366        print(
367            f"Execution time for {backend} : max clique number {G_time['max clique number']} : {G_feat['max clique number']}"
368        )
369        G_time["max clique number backend"] = backend
370    else:
371        G_feat["max clique number"] = 0
372        # G_feat['max clique number'] = len(max(nx.find_cliques(G),key=len))  # largest clique (i.e. an induced subgraph that is complete) size
373        G_time["max clique number"] = 0
374        G_time["max clique number backend"] = "N/A"
375
376    start_time = time.process_time()
377    if (
378        deduce_backend(
379            nx.algorithms.community.modularity_max.greedy_modularity_communities.backends,
380            backend,
381        )
382        == backend
383    ):
384        G_feat["n communities"] = len(
385            nx.algorithms.community.modularity_max.greedy_modularity_communities(G_conv)
386        )  # number of communities
387        G_time["n communities"] = time.process_time() - start_time
388        print(
389            f"Execution time for {backend} : n communities {G_time['n communities']} : {G_feat['n communities']}"
390        )
391        G_time["n communities backend"] = backend
392    else:
393        G_feat["n communities"] = 0
394        # G_feat['n communities'] = len(nx.algorithms.community.modularity_max.greedy_modularity_communities(G))  # number of communities
395        G_time["n communities"] = 0
396        G_time["n communities backend"] = "N/A"
397
398    start_time = time.process_time()
399    if deduce_backend(nx.average_shortest_path_length.backends, backend) == backend:
400        G_feat["avg path length"] = nx.average_shortest_path_length(G_conv)
401        G_time["avg path length"] = time.process_time() - start_time
402        print(
403            f"Execution time for {backend} : avg path length {G_time['avg path length']} : {G_feat['avg path length']}"
404        )
405        G_time["avg path length backend"] = backend
406    else:
407        G_feat["avg path length"] = 0
408        # G_feat['avg path length'] = nx.average_shortest_path_length(G)
409        G_time["avg path length"] = 0
410        G_time["avg path length backend"] = "N/A"
411
412    start_time = time.process_time()
413    if deduce_backend(nx.betweenness_centrality.backends, backend) == backend:
414        G_feat["max betweenness centrality"] = np.max(
415            list(nx.betweenness_centrality(G_conv).values())
416        )
417        G_time["max betweenness centrality"] = time.process_time() - start_time
418        print(
419            f"Execution time for {backend} : max betweenness centrality {G_time['max betweenness centrality']} : {G_feat['max betweenness centrality']}"
420        )
421        G_time["max betweenness centrality backend"] = backend
422    else:
423        G_feat["max betweenness centrality"] = 0
424        # G_feat['max betweenness centrality'] = (np.max(list(nx.betweenness_centrality(G).values())))
425        G_time["max betweenness centrality"] = 0
426        G_time["max betweenness centrality backend"] = "N/A"
427    return G_time
428
429
430def calc_graph_features(G, backend=None):
431    """
432    Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
433
434    Parameters
435    ----------
436    G : networkx.Graph
437        The input graph.
438    backend : str, optional
439        Backend to use for computation. Default is None.
440
441    Returns
442    -------
443    dict
444        Dictionary containing the calculated features.
445    """
446    try:
447        assert nx.is_connected(G)
448    except:
449        # graph isn't connect --> use largest connected component
450        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
451        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
452        G = G.subgraph(G_cc[0])
453
454    G_feat = {}  # graph features dictionary
455    start_time = time.process_time()
456
457    G_feat["n nodes"] = float(G.number_of_nodes())  # number of nodes
458    print("Execution time for 'n nodes':", time.process_time() - start_time)
459
460    start_time = time.process_time()
461    G_feat["n edges"] = float(G.number_of_edges())  # number of edges
462    print("Execution time for 'n edges':", time.process_time() - start_time)
463
464    start_time = time.process_time()
465    G_feat["density"] = nx.density(
466        G
467    )  # how close the network is to a 'complete graph' where each node is connected
468    print("Execution time for 'density':", time.process_time() - start_time)
469
470    start_time = time.process_time()
471    G_feat["diameter"] = nx.diameter(
472        G
473    )  # the farthest distance (e.g. number of edges) between two nodes in the graph
474    print("Execution time for 'diameter':", time.process_time() - start_time)
475
476    start_time = time.process_time()
477    G_feat["avg clustering"] = nx.average_clustering(
478        G
479    )  # the (averaged) fraction of possible triangles through a node.
480    print("Execution time for 'avg clustering':", time.process_time() - start_time)
481
482    start_time = time.process_time()
483    G_feat["max closeness centrality"] = np.max(
484        list(nx.closeness_centrality(G).values())
485    )  # max closeness centraility. high closeness --> short distance to all other nodes
486    print(
487        "Execution time for 'max closeness centrality':",
488        time.process_time() - start_time,
489    )
490
491    start_time = time.process_time()
492    G_feat["max eigenvector centrality"] = np.max(
493        list(nx.eigenvector_centrality(G, max_iter=10000).values())
494    )  # eigenvector centraility. how 'important'/'influential' a node is
495    print(
496        "Execution time for 'max eigenvector centrality':",
497        time.process_time() - start_time,
498    )
499
500    start_time = time.process_time()
501    G_feat["degree assortativity"] = (
502        nx.degree_pearson_correlation_coefficient(G) + 1
503    ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
504    print(
505        "Execution time for 'degree assortativity':", time.process_time() - start_time
506    )
507
508    start_time = time.process_time()
509    G_feat["max clique number"] = len(
510        max(nx.find_cliques(G), key=len)
511    )  # largest clique (i.e. an induced subgraph that is complete) size
512    print("Execution time for 'max clique number':", time.process_time() - start_time)
513
514    start_time = time.process_time()
515    G_feat["n communities"] = len(
516        nx.algorithms.community.modularity_max.greedy_modularity_communities(G)
517    )  # number of communities
518    print("Execution time for 'n communities':", time.process_time() - start_time)
519
520    start_time = time.process_time()
521    G_feat["avg path length"] = nx.average_shortest_path_length(G)
522    print("Execution time for 'avg path length':", time.process_time() - start_time)
523
524    start_time = time.process_time()
525    G_feat["max betweenness centrality"] = np.max(
526        list(nx.betweenness_centrality(G).values())
527    )
528    print(
529        "Execution time for 'max betweenness centrality':",
530        time.process_time() - start_time,
531    )
532    return G_feat
533
534
535def igraph_calc_graph_features(G):
536    """
537    Calculates several graph network features using igraph. If not connected, largest subgraph is used.
538
539    Parameters
540    ----------
541    G : networkx.Graph
542        The input graph.
543
544    Returns
545    -------
546    dict
547        Dictionary containing the calculated features.
548    """
549    # G = ig.Graph.Load(G, format='gexf')
550    if type(G) != nx.classes.graph.Graph:
551        G = nx.read_gexf(G)
552    try:
553        assert nx.is_connected(G)
554    except:
555        # graph isn't connect --> use largest connected component
556        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
557        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
558        G = G.subgraph(G_cc[0])
559    """ 
560    Maybe rewrite the gexf files to a file format just in this case 
561    so igraph can read it? To remove the step of having to load from networkx?
562    """
563    G = ig.Graph.from_networkx(G)
564    G_feat = {}
565    G_feat["n nodes"] = float(ig.Graph.vcount(G))
566    G_feat["n edges"] = float(ig.Graph.ecount(G))
567    G_feat["density"] = ig.Graph.density(G)
568    G_feat["diameter"] = ig.Graph.diameter(G)
569    clustering = ig.Graph.transitivity_local_undirected(G, mode="zero")
570    if sum(clustering) == 0 or len(clustering) == 0:
571        G_feat["avg clustering"] = sum(clustering)
572    else:
573        G_feat["avg clustering"] = sum(clustering) / len(clustering)
574    G_feat["max closeness centrality"] = np.max(list(ig.Graph.closeness(G)))
575    G_feat["max eigenvector centrality"] = np.max(
576        ig.Graph.eigenvector_centrality(G, scale=False)
577    )
578    G_feat["degree assortativity"] = (
579        ig.Graph.assortativity_degree(G) + 1
580    ) / 2  # normalize to 0-1
581    G_feat["max clique number"] = len(max(ig.Graph.largest_cliques(G), key=len))
582    G_feat["n communities"] = ig.Graph.community_fastgreedy(G).optimal_count
583    G_feat["avg path length"] = float(ig.Graph.average_path_length(G))
584    G_feat["max betweenness centrality"] = round(
585        np.max(ig.Graph.betweenness(G))
586        / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2),
587        15,
588    )
589
590    return G_feat
591
592
593def similarity_measure(x1, x2):
594    """
595    Calculates the similarity between two feature values.
596    Similarity is defined as 1 minus the relative distance between features (x1 and x2).
597
598    Parameters
599    ----------
600    x1 : float
601        Feature value from graph 1 (must range between 0 and 1).
602    x2 : float
603        Feature value from graph 2 (must range between 0 and 1).
604
605    Returns
606    -------
607    float
608        Relative similarity between the two features.
609    """
610
611    return 1 - (np.abs(x1 - x2) / max(x1, x2))
612
613
614def calc_similarity_score(G1_dict, G2_dict, feature_list):
615    """
616    Calculates the similarity score of two graphs.
617
618    Parameters
619    ----------
620    G1_dict : dict or pandas.DataFrame
621        Graph 1 features dictionary or dataframe. Must be able to use a key to access values.
622    G2_dict : dict or pandas.DataFrame
623        Graph 2 features dictionary or dataframe. Must be able to use a key to access values.
624    feature_list : list
625        List of graph features to compare. Must be keys in the graph features dictionary.
626
627    Returns
628    -------
629    float
630        Similarity score (0 to 1) where 1 indicates identical graphs.
631    """
632    s_list = []
633    for feat in feature_list:
634        f1 = G1_dict[feat]
635        f2 = G2_dict[feat]
636        s_tmp = similarity_measure(f1, f2)
637        s_list.append(s_tmp)
638    s = np.sum(s_list) / len(s_list)
639    return s
640
641
642def process_graphs(graph_fnames, package=2):
643    """
644    Takes a list of graph files, calculates their features, and returns them as a dataframe.
645
646    Parameters
647    ----------
648    graph_fnames : list
649        List of graph filenames to process.
650    package : int, optional
651        Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
652
653    Returns
654    -------
655    pandas.DataFrame
656        Dataframe containing graph features for each graph in the filename list.
657    """
658    pool = multiprocessing.Pool()
659    tmp_feat_list = []
660    tmp_df = ""
661    if package == 1:  # networkx
662        for fname in graph_fnames:
663            G_tmp = nx.read_gexf(fname)
664            tmp_feat_list.append(calc_graph_features(G_tmp))
665        tmp_df = pd.DataFrame(tmp_feat_list)
666    elif package == 2:  # igraph
667        result = pool.map_async(igraph_calc_graph_features, graph_fnames)
668        pool.close()
669        pool.join()
670        # for fname in graph_fnames:
671        # tmp_feat_list.append(igraph_calc_graph_features(fname))
672        # print (f"type(result.get()):{type(result.get())}")
673        # <class 'list'>
674        tmp_df = pd.DataFrame(result.get())
675    tmp_df["name"] = [Path(i).stem for i in graph_fnames]
676    assert not tmp_df.empty
677    return tmp_df
678
679
680def classify_graphs(class_file_list, sample_file_list, feature_list, package=2):
681    """
682    Classifies a similarity score from a list of Class and Sample graphs.
683
684    Parameters
685    ----------
686    class_file_list : list
687        List of control/reference graph files (classes).
688    sample_file_list : list
689        List of non-control/non-reference graph files (samples).
690    feature_list : list
691        List of features to use for similarity score. Must be valid keys in the graph features dictionary.
692    package : int, optional
693        Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
694
695    Returns
696    -------
697    pandas.DataFrame
698        Dataframe where each column is a class and each row is the similarity score of the sampled graph.
699    """
700    if package == 1:
701        class_feat_df = process_graphs(class_file_list, 1)
702        sample_feat_df = process_graphs(sample_file_list, 1)
703    elif package == 2:
704        class_feat_df = process_graphs(class_file_list, 2)
705        sample_feat_df = process_graphs(sample_file_list, 2)
706    class_list = []
707    for index, row in sample_feat_df.iterrows():  # for each sample graph
708        tmp_graph_feat = row
709        class_dict = {}
710        class_dict["name"] = tmp_graph_feat["name"]
711        for index2, row2 in class_feat_df.iterrows():  # for each class graph
712            tmp_class_feat = row2
713            s_tmp = calc_similarity_score(tmp_graph_feat, tmp_class_feat, feature_list)
714            class_dict[tmp_class_feat["name"]] = s_tmp
715        class_list.append(class_dict)
716    class_similarity_df = pd.DataFrame(class_list)
717    return class_similarity_df
718
719
720def process_similarity_df(class_similarity_df):
721    """
722    Generates y_true and y_pred based on the similarity score dataframe.
723
724    y_true is a list where each index is a class and each value is the class value.
725    y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample.
726
727    Note
728    ----
729    This assumes the correct classification is along the diagonal of the similarity matrix/dataframe.
730
731    Parameters
732    ----------
733    class_similarity_df : pandas.DataFrame
734        Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row.
735
736    Returns
737    -------
738    tuple of lists
739        y_true, y_pred
740    """
741    num_df = class_similarity_df.drop(columns="name")
742    y_true = [i for i in range(len(class_similarity_df.columns) - 1)]
743    y_pred = list(num_df.idxmax(axis=0))
744    return y_true, y_pred
def calc_bc(G, return_dict):
17def calc_bc(G, return_dict):
18    """
19    Parallel subprocess function to calculate the betweenness centrality.
20
21    Parameters
22    ----------
23    G : networkx.Graph
24        The input graph.
25    return_dict : dict
26        Dictionary to store the result.
27
28    Returns
29    -------
30    dict
31        Betweenness centrality dictionary from multiple processes.
32    """
33    return_dict[1] = np.max(list(nx.betweenness_centrality(G).values()))

Parallel subprocess function to calculate the betweenness centrality.

Parameters
  • G (networkx.Graph): The input graph.
  • return_dict (dict): Dictionary to store the result.
Returns
  • dict: Betweenness centrality dictionary from multiple processes.
def calc_shortest_pthlen(G, return_dict):
36def calc_shortest_pthlen(G, return_dict):
37    """
38    Parallel subprocess function to calculate the average shortest path length.
39
40    Parameters
41    ----------
42    G : networkx.Graph
43        The input graph.
44    return_dict : dict
45        Dictionary to store the result.
46
47    Returns
48    -------
49    dict
50        Average shortest path length dictionary from multiple processes.
51    """
52    return_dict[2] = nx.average_shortest_path_length(G)

Parallel subprocess function to calculate the average shortest path length.

Parameters
  • G (networkx.Graph): The input graph.
  • return_dict (dict): Dictionary to store the result.
Returns
  • dict: Average shortest path length dictionary from multiple processes.
def benchmark_igraph_graph_features(G):
 55def benchmark_igraph_graph_features(G):
 56    """
 57    Benchmarks several graph network features using igraph. If not connected, largest subgraph is used.
 58
 59    Parameters
 60    ----------
 61    G : networkx.Graph
 62        The input graph.
 63
 64    Returns
 65    -------
 66    dict
 67        Dictionary containing the benchmarked features and their computation times.
 68    """
 69
 70    # G = ig.Graph.Load(G, format='gexf')
 71    if type(G) != nx.classes.graph.Graph:
 72        G = nx.read_gexf(G)
 73    try:
 74        assert nx.is_connected(G)
 75    except:
 76        # graph isn't connect --> use largest connected component
 77        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
 78        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
 79        G = G.subgraph(G_cc[0])
 80    """ 
 81    Maybe rewrite the gexf files to a file format just in this case 
 82    so igraph can read it? To remove the step of having to load from networkx?
 83    """
 84    G_time = {}
 85    start_time = time.process_time()
 86    G = ig.Graph.from_networkx(G)
 87    G_time["conversion"] = time.process_time() - start_time
 88    G_time["backend"] = "igraph"
 89    G_feat = {}
 90    start_time = time.process_time()
 91
 92    G_feat["n nodes"] = float(ig.Graph.vcount(G))  # number of nodes
 93    G_time["n nodes"] = time.process_time() - start_time
 94    G_time["n nodes backend"] = "igraph"
 95
 96    start_time = time.process_time()
 97    G_feat["n edges"] = float(ig.Graph.ecount(G))  # number of edges
 98    G_time["n edges"] = time.process_time() - start_time
 99    G_time["n edges backend"] = "igraph"
100
101    start_time = time.process_time()
102    G_feat["density"] = ig.Graph.density(
103        G
104    )  # how close the network is to a 'complete graph' where each node is connected
105    G_time["density"] = time.process_time() - start_time
106    G_time["density backend"] = "igraph"
107
108    start_time = time.process_time()
109    G_feat["diameter"] = ig.Graph.diameter(
110        G
111    )  # the farthest distance (e.g. number of edges) between two nodes in the graph
112    G_time["diameter"] = time.process_time() - start_time
113    G_time["diameter backend"] = "igraph"
114
115    start_time = time.process_time()
116    clustering = ig.Graph.transitivity_local_undirected(G, mode="zero")
117    if sum(clustering) == 0 or len(clustering) == 0:
118        G_feat["avg clustering"] = sum(clustering)
119    else:
120        G_feat["avg clustering"] = sum(clustering) / len(clustering)
121    G_time["avg clustering"] = time.process_time() - start_time
122    G_time["avg clustering backend"] = "igraph"
123
124    start_time = time.process_time()
125    G_feat["max closeness centrality"] = np.max(
126        list(ig.Graph.closeness(G))
127    )  # max closeness centraility. high closeness --> short distance to all other nodes
128    G_time["max closeness centrality"] = time.process_time() - start_time
129    G_time["max closeness centrality backend"] = "igraph"
130
131    start_time = time.process_time()
132    G_feat["max eigenvector centrality"] = np.max(
133        ig.Graph.eigenvector_centrality(G, scale=False)
134    )  # eigenvector centraility. how 'important'/'influential' a node is
135    G_time["max eigenvector centrality"] = time.process_time() - start_time
136    G_time["max eigenvector centrality backend"] = "igraph"
137
138    start_time = time.process_time()
139    G_feat["degree assortativity"] = (
140        ig.Graph.assortativity_degree(G) + 1
141    ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
142    G_time["degree assortativity"] = time.process_time() - start_time
143    G_time["degree assortativity backend"] = "igraph"
144
145    start_time = time.process_time()
146    G_feat["max clique number"] = len(
147        max(ig.Graph.largest_cliques(G), key=len)
148    )  # largest clique (i.e. an induced subgraph that is complete) size
149    G_time["max clique number"] = time.process_time() - start_time
150    G_time["max clique number backend"] = "igraph"
151
152    start_time = time.process_time()
153    G_feat["n communities"] = ig.Graph.community_fastgreedy(
154        G
155    ).optimal_count  # number of communities
156    G_time["n communities"] = time.process_time() - start_time
157    G_time["n communities backend"] = "igraph"
158
159    start_time = time.process_time()
160    G_feat["avg path length"] = float(ig.Graph.average_path_length(G))
161    G_time["avg path length"] = time.process_time() - start_time
162    G_time["avg path length backend"] = "igraph"
163
164    start_time = time.process_time()
165    G_feat["max betweenness centrality"] = round(
166        np.max(ig.Graph.betweenness(G))
167        / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2),
168        15,
169    )
170    G_time["max betweenness centrality"] = time.process_time() - start_time
171    G_time["max betweenness centrality backend"] = "igraph"
172
173    return G_time

Benchmarks several graph network features using igraph. If not connected, largest subgraph is used.

Parameters
  • G (networkx.Graph): The input graph.
Returns
  • dict: Dictionary containing the benchmarked features and their computation times.
def deduce_backend(backends_list, backend=None):
176def deduce_backend(backends_list, backend=None):
177    """
178    Deduce the backend used for the graph features calculation.
179
180    Parameters
181    ----------
182    backends_list : list
183        List of available backends.
184    backend : str, optional
185        Preferred backend to use. Default is None.
186
187    Returns
188    -------
189    str
190        Backend to use.
191    """
192    if backend in backends_list:
193        return backend
194    else:
195        return "serial"

Deduce the backend used for the graph features calculation.

Parameters
  • backends_list (list): List of available backends.
  • backend (str, optional): Preferred backend to use. Default is None.
Returns
  • str: Backend to use.
def benchmark_nx_graph_features(G, backend='serial'):
202def benchmark_nx_graph_features(G, backend="serial"):
203    """
204    Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
205
206    Parameters
207    ----------
208    G : networkx.Graph
209        The input graph.
210    backend : str, optional
211        Backend to use for computation. Default is 'serial'.
212
213    Returns
214    -------
215    dict
216        Dictionary containing the benchmarked features and their computation times.
217    """
218
219    try:
220        assert nx.is_connected(G)
221    except:
222        # graph isn't connect --> use largest connected component
223        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
224        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
225        G = G.subgraph(G_cc[0])
226
227    G_time = {}  # graph features time dictionary
228
229    if backend == "cugraph":
230        start_time = time.process_time()
231        G_conv = nxcg.from_networkx(G)
232        G_time["conversion"] = time.process_time() - start_time
233    elif backend == "parallel":
234        start_time = time.process_time()
235        G_conv = nxp.ParallelGraph(G)
236        G_time["conversion"] = time.process_time() - start_time
237    elif backend == "serial":
238        G_conv = G
239        G_time["conversion"] = 0
240    else:
241        raise ValueError("backend must be 'cugraph', 'parallel', or 'serial'")
242
243    G_time["backend"] = backend
244
245    G_feat = {}  # graph features dictionary
246    start_time = time.process_time()
247
248    G_feat["n nodes"] = float(G.number_of_nodes())  # number of nodes
249    G_time["n nodes"] = time.process_time() - start_time
250    print(
251        f"Execution time for {backend} : n nodes {G_time['n nodes']} : {G_feat['n nodes']}"
252    )
253    G_time["n nodes backend"] = "serial"
254
255    start_time = time.process_time()
256    G_feat["n edges"] = float(G.number_of_edges())  # number of edges
257    G_time["n edges"] = time.process_time() - start_time
258    print(
259        f"Execution time for {backend} : n edges {G_time['n edges']} : {G_feat['n edges']}"
260    )
261    G_time["n edges backend"] = "serial"
262
263    start_time = time.process_time()
264    G_feat["density"] = nx.density(
265        G
266    )  # how close the network is to a 'complete graph' where each node is connected
267    G_time["density"] = time.process_time() - start_time
268    print(
269        f"Execution time for {backend} : density {G_time['density']} : {G_feat['density']}"
270    )
271    G_time["density backend"] = "serial"
272
273    start_time = time.process_time()
274    if deduce_backend(nx.diameter.backends, backend) == backend:
275        G_feat["diameter"] = nx.diameter(
276            G_conv
277        )  # the farthest distance (e.g. number of edges) between two nodes in the graph
278        G_time["diameter"] = time.process_time() - start_time
279        print(
280            f"Execution time for {backend} : diameter {G_time['diameter']} : {G_feat['diameter']}"
281        )
282        G_time["diameter backend"] = backend
283    elif backend == "cugraph":
284        # Calculate shortest path length, find max
285        shortest_paths = nxcg.shortest_path(G_conv)
286        G_feat["diameter"] = np.max(shortest_paths)
287        G_time["diameter"] = time.process_time() - start_time
288    else:
289        G_feat["diameter"] = 0
290        # G_feat['diameter'] = nx.diameter(G)  # the farthest distance (e.g. number of edges) between two nodes in the graph
291        G_time["diameter"] = 0
292        G_time["diameter backend"] = "N/A"
293
294    start_time = time.process_time()
295    if deduce_backend(nx.average_clustering.backends, backend) == backend:
296        G_feat["avg clustering"] = nx.average_clustering(
297            G_conv
298        )  # the (averaged) fraction of possible triangles through a node.
299        G_time["avg clustering"] = time.process_time() - start_time
300        print(
301            f"Execution time for {backend} : avg clustering {G_time['avg clustering']} : {G_feat['avg clustering']}"
302        )
303        G_time["avg clustering backend"] = backend
304    else:
305        G_feat["avg clustering"] = 0
306        # G_feat['avg clustering'] = nx.average_clustering(G)  # the (averaged) fraction of possible triangles through a node.
307        G_time["avg clustering"] = 0
308        G_time["avg clustering backend"] = "N/A"
309
310    start_time = time.process_time()
311    if deduce_backend(nx.closeness_centrality.backends, backend) == backend:
312        G_feat["max closeness centrality"] = np.max(
313            list(nx.closeness_centrality(G_conv).values())
314        )  # max closeness centraility. high closeness --> short distance to all other nodes
315        G_time["max closeness centrality"] = time.process_time() - start_time
316        print(
317            f"Execution time for {backend} : max closeness centrality {G_time['max closeness centrality']} : {G_feat['max closeness centrality']}"
318        )
319        G_time["max closeness centrality backend"] = backend
320    else:
321        G_feat["max closeness centrality"] = 0
322        # G_feat['max closeness centrality'] = np.max(list(nx.closeness_centrality(G).values()))  # max closeness centraility. high closeness --> short distance to all other nodes
323        G_time["max closeness centrality"] = 0
324        G_time["max closeness centrality backend"] = "N/A"
325
326    start_time = time.process_time()
327    if deduce_backend(nx.eigenvector_centrality.backends, backend) == backend:
328        G_feat["max eigenvector centrality"] = np.max(
329            list(nx.eigenvector_centrality(G_conv, max_iter=10000).values())
330        )  # eigenvector centraility. how 'important'/'influential' a node is
331        G_time["max eigenvector centrality"] = time.process_time() - start_time
332        print(
333            f"Execution time for {backend} : max eigenvector centrality {G_time['max eigenvector centrality']} : {G_feat['max eigenvector centrality']}"
334        )
335        G_time["max eigenvector centrality backend"] = backend
336    else:
337        G_feat["max eigenvector centrality"] = 0
338        # G_feat['max eigenvector centrality'] = np.max(list(nx.eigenvector_centrality(G, max_iter=10000).values()))  # eigenvector centraility. how 'important'/'influential' a node is
339        G_time["max eigenvector centrality"] = 0
340        G_time["max eigenvector centrality backend"] = "N/A"
341
342    start_time = time.process_time()
343    if (
344        deduce_backend(nx.degree_pearson_correlation_coefficient.backends, backend)
345        == backend
346    ):
347        G_feat["degree assortativity"] = (
348            nx.degree_pearson_correlation_coefficient(G_conv) + 1
349        ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
350        G_time["degree assortativity"] = time.process_time() - start_time
351        print(
352            f"Execution time for {backend} : degree assortativity {G_time['degree assortativity']} : {G_feat['degree assortativity']}"
353        )
354        G_time["degree assortativity backend"] = backend
355    else:
356        G_feat["degree assortativity"] = 0
357        # G_feat['degree assortativity'] = (nx.degree_pearson_correlation_coefficient(G) + 1)/2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
358        G_time["degree assortativity"] = 0
359        G_time["degree assortativity backend"] = "N/A"
360
361    start_time = time.process_time()
362    if deduce_backend(nx.find_cliques.backends, backend) == backend:
363        G_feat["max clique number"] = len(
364            max(nx.find_cliques(G_conv), key=len)
365        )  # largest clique (i.e. an induced subgraph that is complete) size
366        G_time["max clique number"] = time.process_time() - start_time
367        print(
368            f"Execution time for {backend} : max clique number {G_time['max clique number']} : {G_feat['max clique number']}"
369        )
370        G_time["max clique number backend"] = backend
371    else:
372        G_feat["max clique number"] = 0
373        # G_feat['max clique number'] = len(max(nx.find_cliques(G),key=len))  # largest clique (i.e. an induced subgraph that is complete) size
374        G_time["max clique number"] = 0
375        G_time["max clique number backend"] = "N/A"
376
377    start_time = time.process_time()
378    if (
379        deduce_backend(
380            nx.algorithms.community.modularity_max.greedy_modularity_communities.backends,
381            backend,
382        )
383        == backend
384    ):
385        G_feat["n communities"] = len(
386            nx.algorithms.community.modularity_max.greedy_modularity_communities(G_conv)
387        )  # number of communities
388        G_time["n communities"] = time.process_time() - start_time
389        print(
390            f"Execution time for {backend} : n communities {G_time['n communities']} : {G_feat['n communities']}"
391        )
392        G_time["n communities backend"] = backend
393    else:
394        G_feat["n communities"] = 0
395        # G_feat['n communities'] = len(nx.algorithms.community.modularity_max.greedy_modularity_communities(G))  # number of communities
396        G_time["n communities"] = 0
397        G_time["n communities backend"] = "N/A"
398
399    start_time = time.process_time()
400    if deduce_backend(nx.average_shortest_path_length.backends, backend) == backend:
401        G_feat["avg path length"] = nx.average_shortest_path_length(G_conv)
402        G_time["avg path length"] = time.process_time() - start_time
403        print(
404            f"Execution time for {backend} : avg path length {G_time['avg path length']} : {G_feat['avg path length']}"
405        )
406        G_time["avg path length backend"] = backend
407    else:
408        G_feat["avg path length"] = 0
409        # G_feat['avg path length'] = nx.average_shortest_path_length(G)
410        G_time["avg path length"] = 0
411        G_time["avg path length backend"] = "N/A"
412
413    start_time = time.process_time()
414    if deduce_backend(nx.betweenness_centrality.backends, backend) == backend:
415        G_feat["max betweenness centrality"] = np.max(
416            list(nx.betweenness_centrality(G_conv).values())
417        )
418        G_time["max betweenness centrality"] = time.process_time() - start_time
419        print(
420            f"Execution time for {backend} : max betweenness centrality {G_time['max betweenness centrality']} : {G_feat['max betweenness centrality']}"
421        )
422        G_time["max betweenness centrality backend"] = backend
423    else:
424        G_feat["max betweenness centrality"] = 0
425        # G_feat['max betweenness centrality'] = (np.max(list(nx.betweenness_centrality(G).values())))
426        G_time["max betweenness centrality"] = 0
427        G_time["max betweenness centrality backend"] = "N/A"
428    return G_time

Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.

Parameters
  • G (networkx.Graph): The input graph.
  • backend (str, optional): Backend to use for computation. Default is 'serial'.
Returns
  • dict: Dictionary containing the benchmarked features and their computation times.
def calc_graph_features(G, backend=None):
431def calc_graph_features(G, backend=None):
432    """
433    Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
434
435    Parameters
436    ----------
437    G : networkx.Graph
438        The input graph.
439    backend : str, optional
440        Backend to use for computation. Default is None.
441
442    Returns
443    -------
444    dict
445        Dictionary containing the calculated features.
446    """
447    try:
448        assert nx.is_connected(G)
449    except:
450        # graph isn't connect --> use largest connected component
451        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
452        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
453        G = G.subgraph(G_cc[0])
454
455    G_feat = {}  # graph features dictionary
456    start_time = time.process_time()
457
458    G_feat["n nodes"] = float(G.number_of_nodes())  # number of nodes
459    print("Execution time for 'n nodes':", time.process_time() - start_time)
460
461    start_time = time.process_time()
462    G_feat["n edges"] = float(G.number_of_edges())  # number of edges
463    print("Execution time for 'n edges':", time.process_time() - start_time)
464
465    start_time = time.process_time()
466    G_feat["density"] = nx.density(
467        G
468    )  # how close the network is to a 'complete graph' where each node is connected
469    print("Execution time for 'density':", time.process_time() - start_time)
470
471    start_time = time.process_time()
472    G_feat["diameter"] = nx.diameter(
473        G
474    )  # the farthest distance (e.g. number of edges) between two nodes in the graph
475    print("Execution time for 'diameter':", time.process_time() - start_time)
476
477    start_time = time.process_time()
478    G_feat["avg clustering"] = nx.average_clustering(
479        G
480    )  # the (averaged) fraction of possible triangles through a node.
481    print("Execution time for 'avg clustering':", time.process_time() - start_time)
482
483    start_time = time.process_time()
484    G_feat["max closeness centrality"] = np.max(
485        list(nx.closeness_centrality(G).values())
486    )  # max closeness centraility. high closeness --> short distance to all other nodes
487    print(
488        "Execution time for 'max closeness centrality':",
489        time.process_time() - start_time,
490    )
491
492    start_time = time.process_time()
493    G_feat["max eigenvector centrality"] = np.max(
494        list(nx.eigenvector_centrality(G, max_iter=10000).values())
495    )  # eigenvector centraility. how 'important'/'influential' a node is
496    print(
497        "Execution time for 'max eigenvector centrality':",
498        time.process_time() - start_time,
499    )
500
501    start_time = time.process_time()
502    G_feat["degree assortativity"] = (
503        nx.degree_pearson_correlation_coefficient(G) + 1
504    ) / 2  # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1.
505    print(
506        "Execution time for 'degree assortativity':", time.process_time() - start_time
507    )
508
509    start_time = time.process_time()
510    G_feat["max clique number"] = len(
511        max(nx.find_cliques(G), key=len)
512    )  # largest clique (i.e. an induced subgraph that is complete) size
513    print("Execution time for 'max clique number':", time.process_time() - start_time)
514
515    start_time = time.process_time()
516    G_feat["n communities"] = len(
517        nx.algorithms.community.modularity_max.greedy_modularity_communities(G)
518    )  # number of communities
519    print("Execution time for 'n communities':", time.process_time() - start_time)
520
521    start_time = time.process_time()
522    G_feat["avg path length"] = nx.average_shortest_path_length(G)
523    print("Execution time for 'avg path length':", time.process_time() - start_time)
524
525    start_time = time.process_time()
526    G_feat["max betweenness centrality"] = np.max(
527        list(nx.betweenness_centrality(G).values())
528    )
529    print(
530        "Execution time for 'max betweenness centrality':",
531        time.process_time() - start_time,
532    )
533    return G_feat

Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.

Parameters
  • G (networkx.Graph): The input graph.
  • backend (str, optional): Backend to use for computation. Default is None.
Returns
  • dict: Dictionary containing the calculated features.
def igraph_calc_graph_features(G):
536def igraph_calc_graph_features(G):
537    """
538    Calculates several graph network features using igraph. If not connected, largest subgraph is used.
539
540    Parameters
541    ----------
542    G : networkx.Graph
543        The input graph.
544
545    Returns
546    -------
547    dict
548        Dictionary containing the calculated features.
549    """
550    # G = ig.Graph.Load(G, format='gexf')
551    if type(G) != nx.classes.graph.Graph:
552        G = nx.read_gexf(G)
553    try:
554        assert nx.is_connected(G)
555    except:
556        # graph isn't connect --> use largest connected component
557        # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph
558        G_cc = sorted(nx.connected_components(G), key=len, reverse=True)
559        G = G.subgraph(G_cc[0])
560    """ 
561    Maybe rewrite the gexf files to a file format just in this case 
562    so igraph can read it? To remove the step of having to load from networkx?
563    """
564    G = ig.Graph.from_networkx(G)
565    G_feat = {}
566    G_feat["n nodes"] = float(ig.Graph.vcount(G))
567    G_feat["n edges"] = float(ig.Graph.ecount(G))
568    G_feat["density"] = ig.Graph.density(G)
569    G_feat["diameter"] = ig.Graph.diameter(G)
570    clustering = ig.Graph.transitivity_local_undirected(G, mode="zero")
571    if sum(clustering) == 0 or len(clustering) == 0:
572        G_feat["avg clustering"] = sum(clustering)
573    else:
574        G_feat["avg clustering"] = sum(clustering) / len(clustering)
575    G_feat["max closeness centrality"] = np.max(list(ig.Graph.closeness(G)))
576    G_feat["max eigenvector centrality"] = np.max(
577        ig.Graph.eigenvector_centrality(G, scale=False)
578    )
579    G_feat["degree assortativity"] = (
580        ig.Graph.assortativity_degree(G) + 1
581    ) / 2  # normalize to 0-1
582    G_feat["max clique number"] = len(max(ig.Graph.largest_cliques(G), key=len))
583    G_feat["n communities"] = ig.Graph.community_fastgreedy(G).optimal_count
584    G_feat["avg path length"] = float(ig.Graph.average_path_length(G))
585    G_feat["max betweenness centrality"] = round(
586        np.max(ig.Graph.betweenness(G))
587        / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2),
588        15,
589    )
590
591    return G_feat

Calculates several graph network features using igraph. If not connected, largest subgraph is used.

Parameters
  • G (networkx.Graph): The input graph.
Returns
  • dict: Dictionary containing the calculated features.
def similarity_measure(x1, x2):
594def similarity_measure(x1, x2):
595    """
596    Calculates the similarity between two feature values.
597    Similarity is defined as 1 minus the relative distance between features (x1 and x2).
598
599    Parameters
600    ----------
601    x1 : float
602        Feature value from graph 1 (must range between 0 and 1).
603    x2 : float
604        Feature value from graph 2 (must range between 0 and 1).
605
606    Returns
607    -------
608    float
609        Relative similarity between the two features.
610    """
611
612    return 1 - (np.abs(x1 - x2) / max(x1, x2))

Calculates the similarity between two feature values. Similarity is defined as 1 minus the relative distance between features (x1 and x2).

Parameters
  • x1 (float): Feature value from graph 1 (must range between 0 and 1).
  • x2 (float): Feature value from graph 2 (must range between 0 and 1).
Returns
  • float: Relative similarity between the two features.
def calc_similarity_score(G1_dict, G2_dict, feature_list):
615def calc_similarity_score(G1_dict, G2_dict, feature_list):
616    """
617    Calculates the similarity score of two graphs.
618
619    Parameters
620    ----------
621    G1_dict : dict or pandas.DataFrame
622        Graph 1 features dictionary or dataframe. Must be able to use a key to access values.
623    G2_dict : dict or pandas.DataFrame
624        Graph 2 features dictionary or dataframe. Must be able to use a key to access values.
625    feature_list : list
626        List of graph features to compare. Must be keys in the graph features dictionary.
627
628    Returns
629    -------
630    float
631        Similarity score (0 to 1) where 1 indicates identical graphs.
632    """
633    s_list = []
634    for feat in feature_list:
635        f1 = G1_dict[feat]
636        f2 = G2_dict[feat]
637        s_tmp = similarity_measure(f1, f2)
638        s_list.append(s_tmp)
639    s = np.sum(s_list) / len(s_list)
640    return s

Calculates the similarity score of two graphs.

Parameters
  • G1_dict (dict or pandas.DataFrame): Graph 1 features dictionary or dataframe. Must be able to use a key to access values.
  • G2_dict (dict or pandas.DataFrame): Graph 2 features dictionary or dataframe. Must be able to use a key to access values.
  • feature_list (list): List of graph features to compare. Must be keys in the graph features dictionary.
Returns
  • float: Similarity score (0 to 1) where 1 indicates identical graphs.
def process_graphs(graph_fnames, package=2):
643def process_graphs(graph_fnames, package=2):
644    """
645    Takes a list of graph files, calculates their features, and returns them as a dataframe.
646
647    Parameters
648    ----------
649    graph_fnames : list
650        List of graph filenames to process.
651    package : int, optional
652        Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
653
654    Returns
655    -------
656    pandas.DataFrame
657        Dataframe containing graph features for each graph in the filename list.
658    """
659    pool = multiprocessing.Pool()
660    tmp_feat_list = []
661    tmp_df = ""
662    if package == 1:  # networkx
663        for fname in graph_fnames:
664            G_tmp = nx.read_gexf(fname)
665            tmp_feat_list.append(calc_graph_features(G_tmp))
666        tmp_df = pd.DataFrame(tmp_feat_list)
667    elif package == 2:  # igraph
668        result = pool.map_async(igraph_calc_graph_features, graph_fnames)
669        pool.close()
670        pool.join()
671        # for fname in graph_fnames:
672        # tmp_feat_list.append(igraph_calc_graph_features(fname))
673        # print (f"type(result.get()):{type(result.get())}")
674        # <class 'list'>
675        tmp_df = pd.DataFrame(result.get())
676    tmp_df["name"] = [Path(i).stem for i in graph_fnames]
677    assert not tmp_df.empty
678    return tmp_df

Takes a list of graph files, calculates their features, and returns them as a dataframe.

Parameters
  • graph_fnames (list): List of graph filenames to process.
  • package (int, optional): Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
Returns
  • pandas.DataFrame: Dataframe containing graph features for each graph in the filename list.
def classify_graphs(class_file_list, sample_file_list, feature_list, package=2):
681def classify_graphs(class_file_list, sample_file_list, feature_list, package=2):
682    """
683    Classifies a similarity score from a list of Class and Sample graphs.
684
685    Parameters
686    ----------
687    class_file_list : list
688        List of control/reference graph files (classes).
689    sample_file_list : list
690        List of non-control/non-reference graph files (samples).
691    feature_list : list
692        List of features to use for similarity score. Must be valid keys in the graph features dictionary.
693    package : int, optional
694        Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
695
696    Returns
697    -------
698    pandas.DataFrame
699        Dataframe where each column is a class and each row is the similarity score of the sampled graph.
700    """
701    if package == 1:
702        class_feat_df = process_graphs(class_file_list, 1)
703        sample_feat_df = process_graphs(sample_file_list, 1)
704    elif package == 2:
705        class_feat_df = process_graphs(class_file_list, 2)
706        sample_feat_df = process_graphs(sample_file_list, 2)
707    class_list = []
708    for index, row in sample_feat_df.iterrows():  # for each sample graph
709        tmp_graph_feat = row
710        class_dict = {}
711        class_dict["name"] = tmp_graph_feat["name"]
712        for index2, row2 in class_feat_df.iterrows():  # for each class graph
713            tmp_class_feat = row2
714            s_tmp = calc_similarity_score(tmp_graph_feat, tmp_class_feat, feature_list)
715            class_dict[tmp_class_feat["name"]] = s_tmp
716        class_list.append(class_dict)
717    class_similarity_df = pd.DataFrame(class_list)
718    return class_similarity_df

Classifies a similarity score from a list of Class and Sample graphs.

Parameters
  • class_file_list (list): List of control/reference graph files (classes).
  • sample_file_list (list): List of non-control/non-reference graph files (samples).
  • feature_list (list): List of features to use for similarity score. Must be valid keys in the graph features dictionary.
  • package (int, optional): Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
Returns
  • pandas.DataFrame: Dataframe where each column is a class and each row is the similarity score of the sampled graph.
def process_similarity_df(class_similarity_df):
721def process_similarity_df(class_similarity_df):
722    """
723    Generates y_true and y_pred based on the similarity score dataframe.
724
725    y_true is a list where each index is a class and each value is the class value.
726    y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample.
727
728    Note
729    ----
730    This assumes the correct classification is along the diagonal of the similarity matrix/dataframe.
731
732    Parameters
733    ----------
734    class_similarity_df : pandas.DataFrame
735        Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row.
736
737    Returns
738    -------
739    tuple of lists
740        y_true, y_pred
741    """
742    num_df = class_similarity_df.drop(columns="name")
743    y_true = [i for i in range(len(class_similarity_df.columns) - 1)]
744    y_pred = list(num_df.idxmax(axis=0))
745    return y_true, y_pred

Generates y_true and y_pred based on the similarity score dataframe.

y_true is a list where each index is a class and each value is the class value. y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample.

Note

This assumes the correct classification is along the diagonal of the similarity matrix/dataframe.

Parameters
  • class_similarity_df (pandas.DataFrame): Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row.
Returns
  • tuple of lists: y_true, y_pred