griptomo.core.graph2class
1import time 2import networkx as nx 3import numpy as np 4import pandas as pd 5from pathlib import Path 6import multiprocessing 7import igraph as ig 8 9 10import warnings 11import time 12 13warnings.filterwarnings("ignore") 14 15 16def calc_bc(G, return_dict): 17 """ 18 Parallel subprocess function to calculate the betweenness centrality. 19 20 Parameters 21 ---------- 22 G : networkx.Graph 23 The input graph. 24 return_dict : dict 25 Dictionary to store the result. 26 27 Returns 28 ------- 29 dict 30 Betweenness centrality dictionary from multiple processes. 31 """ 32 return_dict[1] = np.max(list(nx.betweenness_centrality(G).values())) 33 34 35def calc_shortest_pthlen(G, return_dict): 36 """ 37 Parallel subprocess function to calculate the average shortest path length. 38 39 Parameters 40 ---------- 41 G : networkx.Graph 42 The input graph. 43 return_dict : dict 44 Dictionary to store the result. 45 46 Returns 47 ------- 48 dict 49 Average shortest path length dictionary from multiple processes. 50 """ 51 return_dict[2] = nx.average_shortest_path_length(G) 52 53 54def benchmark_igraph_graph_features(G): 55 """ 56 Benchmarks several graph network features using igraph. If not connected, largest subgraph is used. 57 58 Parameters 59 ---------- 60 G : networkx.Graph 61 The input graph. 62 63 Returns 64 ------- 65 dict 66 Dictionary containing the benchmarked features and their computation times. 67 """ 68 69 # G = ig.Graph.Load(G, format='gexf') 70 if type(G) != nx.classes.graph.Graph: 71 G = nx.read_gexf(G) 72 try: 73 assert nx.is_connected(G) 74 except: 75 # graph isn't connect --> use largest connected component 76 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 77 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 78 G = G.subgraph(G_cc[0]) 79 """ 80 Maybe rewrite the gexf files to a file format just in this case 81 so igraph can read it? To remove the step of having to load from networkx? 82 """ 83 G_time = {} 84 start_time = time.process_time() 85 G = ig.Graph.from_networkx(G) 86 G_time["conversion"] = time.process_time() - start_time 87 G_time["backend"] = "igraph" 88 G_feat = {} 89 start_time = time.process_time() 90 91 G_feat["n nodes"] = float(ig.Graph.vcount(G)) # number of nodes 92 G_time["n nodes"] = time.process_time() - start_time 93 G_time["n nodes backend"] = "igraph" 94 95 start_time = time.process_time() 96 G_feat["n edges"] = float(ig.Graph.ecount(G)) # number of edges 97 G_time["n edges"] = time.process_time() - start_time 98 G_time["n edges backend"] = "igraph" 99 100 start_time = time.process_time() 101 G_feat["density"] = ig.Graph.density( 102 G 103 ) # how close the network is to a 'complete graph' where each node is connected 104 G_time["density"] = time.process_time() - start_time 105 G_time["density backend"] = "igraph" 106 107 start_time = time.process_time() 108 G_feat["diameter"] = ig.Graph.diameter( 109 G 110 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 111 G_time["diameter"] = time.process_time() - start_time 112 G_time["diameter backend"] = "igraph" 113 114 start_time = time.process_time() 115 clustering = ig.Graph.transitivity_local_undirected(G, mode="zero") 116 if sum(clustering) == 0 or len(clustering) == 0: 117 G_feat["avg clustering"] = sum(clustering) 118 else: 119 G_feat["avg clustering"] = sum(clustering) / len(clustering) 120 G_time["avg clustering"] = time.process_time() - start_time 121 G_time["avg clustering backend"] = "igraph" 122 123 start_time = time.process_time() 124 G_feat["max closeness centrality"] = np.max( 125 list(ig.Graph.closeness(G)) 126 ) # max closeness centraility. high closeness --> short distance to all other nodes 127 G_time["max closeness centrality"] = time.process_time() - start_time 128 G_time["max closeness centrality backend"] = "igraph" 129 130 start_time = time.process_time() 131 G_feat["max eigenvector centrality"] = np.max( 132 ig.Graph.eigenvector_centrality(G, scale=False) 133 ) # eigenvector centraility. how 'important'/'influential' a node is 134 G_time["max eigenvector centrality"] = time.process_time() - start_time 135 G_time["max eigenvector centrality backend"] = "igraph" 136 137 start_time = time.process_time() 138 G_feat["degree assortativity"] = ( 139 ig.Graph.assortativity_degree(G) + 1 140 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 141 G_time["degree assortativity"] = time.process_time() - start_time 142 G_time["degree assortativity backend"] = "igraph" 143 144 start_time = time.process_time() 145 G_feat["max clique number"] = len( 146 max(ig.Graph.largest_cliques(G), key=len) 147 ) # largest clique (i.e. an induced subgraph that is complete) size 148 G_time["max clique number"] = time.process_time() - start_time 149 G_time["max clique number backend"] = "igraph" 150 151 start_time = time.process_time() 152 G_feat["n communities"] = ig.Graph.community_fastgreedy( 153 G 154 ).optimal_count # number of communities 155 G_time["n communities"] = time.process_time() - start_time 156 G_time["n communities backend"] = "igraph" 157 158 start_time = time.process_time() 159 G_feat["avg path length"] = float(ig.Graph.average_path_length(G)) 160 G_time["avg path length"] = time.process_time() - start_time 161 G_time["avg path length backend"] = "igraph" 162 163 start_time = time.process_time() 164 G_feat["max betweenness centrality"] = round( 165 np.max(ig.Graph.betweenness(G)) 166 / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2), 167 15, 168 ) 169 G_time["max betweenness centrality"] = time.process_time() - start_time 170 G_time["max betweenness centrality backend"] = "igraph" 171 172 return G_time 173 174 175def deduce_backend(backends_list, backend=None): 176 """ 177 Deduce the backend used for the graph features calculation. 178 179 Parameters 180 ---------- 181 backends_list : list 182 List of available backends. 183 backend : str, optional 184 Preferred backend to use. Default is None. 185 186 Returns 187 ------- 188 str 189 Backend to use. 190 """ 191 if backend in backends_list: 192 return backend 193 else: 194 return "serial" 195 196 197# import nx_cugraph as nxcg 198# import nx_parallel as nxp 199 200 201def benchmark_nx_graph_features(G, backend="serial"): 202 """ 203 Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism. 204 205 Parameters 206 ---------- 207 G : networkx.Graph 208 The input graph. 209 backend : str, optional 210 Backend to use for computation. Default is 'serial'. 211 212 Returns 213 ------- 214 dict 215 Dictionary containing the benchmarked features and their computation times. 216 """ 217 218 try: 219 assert nx.is_connected(G) 220 except: 221 # graph isn't connect --> use largest connected component 222 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 223 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 224 G = G.subgraph(G_cc[0]) 225 226 G_time = {} # graph features time dictionary 227 228 if backend == "cugraph": 229 start_time = time.process_time() 230 G_conv = nxcg.from_networkx(G) 231 G_time["conversion"] = time.process_time() - start_time 232 elif backend == "parallel": 233 start_time = time.process_time() 234 G_conv = nxp.ParallelGraph(G) 235 G_time["conversion"] = time.process_time() - start_time 236 elif backend == "serial": 237 G_conv = G 238 G_time["conversion"] = 0 239 else: 240 raise ValueError("backend must be 'cugraph', 'parallel', or 'serial'") 241 242 G_time["backend"] = backend 243 244 G_feat = {} # graph features dictionary 245 start_time = time.process_time() 246 247 G_feat["n nodes"] = float(G.number_of_nodes()) # number of nodes 248 G_time["n nodes"] = time.process_time() - start_time 249 print( 250 f"Execution time for {backend} : n nodes {G_time['n nodes']} : {G_feat['n nodes']}" 251 ) 252 G_time["n nodes backend"] = "serial" 253 254 start_time = time.process_time() 255 G_feat["n edges"] = float(G.number_of_edges()) # number of edges 256 G_time["n edges"] = time.process_time() - start_time 257 print( 258 f"Execution time for {backend} : n edges {G_time['n edges']} : {G_feat['n edges']}" 259 ) 260 G_time["n edges backend"] = "serial" 261 262 start_time = time.process_time() 263 G_feat["density"] = nx.density( 264 G 265 ) # how close the network is to a 'complete graph' where each node is connected 266 G_time["density"] = time.process_time() - start_time 267 print( 268 f"Execution time for {backend} : density {G_time['density']} : {G_feat['density']}" 269 ) 270 G_time["density backend"] = "serial" 271 272 start_time = time.process_time() 273 if deduce_backend(nx.diameter.backends, backend) == backend: 274 G_feat["diameter"] = nx.diameter( 275 G_conv 276 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 277 G_time["diameter"] = time.process_time() - start_time 278 print( 279 f"Execution time for {backend} : diameter {G_time['diameter']} : {G_feat['diameter']}" 280 ) 281 G_time["diameter backend"] = backend 282 elif backend == "cugraph": 283 # Calculate shortest path length, find max 284 shortest_paths = nxcg.shortest_path(G_conv) 285 G_feat["diameter"] = np.max(shortest_paths) 286 G_time["diameter"] = time.process_time() - start_time 287 else: 288 G_feat["diameter"] = 0 289 # G_feat['diameter'] = nx.diameter(G) # the farthest distance (e.g. number of edges) between two nodes in the graph 290 G_time["diameter"] = 0 291 G_time["diameter backend"] = "N/A" 292 293 start_time = time.process_time() 294 if deduce_backend(nx.average_clustering.backends, backend) == backend: 295 G_feat["avg clustering"] = nx.average_clustering( 296 G_conv 297 ) # the (averaged) fraction of possible triangles through a node. 298 G_time["avg clustering"] = time.process_time() - start_time 299 print( 300 f"Execution time for {backend} : avg clustering {G_time['avg clustering']} : {G_feat['avg clustering']}" 301 ) 302 G_time["avg clustering backend"] = backend 303 else: 304 G_feat["avg clustering"] = 0 305 # G_feat['avg clustering'] = nx.average_clustering(G) # the (averaged) fraction of possible triangles through a node. 306 G_time["avg clustering"] = 0 307 G_time["avg clustering backend"] = "N/A" 308 309 start_time = time.process_time() 310 if deduce_backend(nx.closeness_centrality.backends, backend) == backend: 311 G_feat["max closeness centrality"] = np.max( 312 list(nx.closeness_centrality(G_conv).values()) 313 ) # max closeness centraility. high closeness --> short distance to all other nodes 314 G_time["max closeness centrality"] = time.process_time() - start_time 315 print( 316 f"Execution time for {backend} : max closeness centrality {G_time['max closeness centrality']} : {G_feat['max closeness centrality']}" 317 ) 318 G_time["max closeness centrality backend"] = backend 319 else: 320 G_feat["max closeness centrality"] = 0 321 # G_feat['max closeness centrality'] = np.max(list(nx.closeness_centrality(G).values())) # max closeness centraility. high closeness --> short distance to all other nodes 322 G_time["max closeness centrality"] = 0 323 G_time["max closeness centrality backend"] = "N/A" 324 325 start_time = time.process_time() 326 if deduce_backend(nx.eigenvector_centrality.backends, backend) == backend: 327 G_feat["max eigenvector centrality"] = np.max( 328 list(nx.eigenvector_centrality(G_conv, max_iter=10000).values()) 329 ) # eigenvector centraility. how 'important'/'influential' a node is 330 G_time["max eigenvector centrality"] = time.process_time() - start_time 331 print( 332 f"Execution time for {backend} : max eigenvector centrality {G_time['max eigenvector centrality']} : {G_feat['max eigenvector centrality']}" 333 ) 334 G_time["max eigenvector centrality backend"] = backend 335 else: 336 G_feat["max eigenvector centrality"] = 0 337 # G_feat['max eigenvector centrality'] = np.max(list(nx.eigenvector_centrality(G, max_iter=10000).values())) # eigenvector centraility. how 'important'/'influential' a node is 338 G_time["max eigenvector centrality"] = 0 339 G_time["max eigenvector centrality backend"] = "N/A" 340 341 start_time = time.process_time() 342 if ( 343 deduce_backend(nx.degree_pearson_correlation_coefficient.backends, backend) 344 == backend 345 ): 346 G_feat["degree assortativity"] = ( 347 nx.degree_pearson_correlation_coefficient(G_conv) + 1 348 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 349 G_time["degree assortativity"] = time.process_time() - start_time 350 print( 351 f"Execution time for {backend} : degree assortativity {G_time['degree assortativity']} : {G_feat['degree assortativity']}" 352 ) 353 G_time["degree assortativity backend"] = backend 354 else: 355 G_feat["degree assortativity"] = 0 356 # G_feat['degree assortativity'] = (nx.degree_pearson_correlation_coefficient(G) + 1)/2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 357 G_time["degree assortativity"] = 0 358 G_time["degree assortativity backend"] = "N/A" 359 360 start_time = time.process_time() 361 if deduce_backend(nx.find_cliques.backends, backend) == backend: 362 G_feat["max clique number"] = len( 363 max(nx.find_cliques(G_conv), key=len) 364 ) # largest clique (i.e. an induced subgraph that is complete) size 365 G_time["max clique number"] = time.process_time() - start_time 366 print( 367 f"Execution time for {backend} : max clique number {G_time['max clique number']} : {G_feat['max clique number']}" 368 ) 369 G_time["max clique number backend"] = backend 370 else: 371 G_feat["max clique number"] = 0 372 # G_feat['max clique number'] = len(max(nx.find_cliques(G),key=len)) # largest clique (i.e. an induced subgraph that is complete) size 373 G_time["max clique number"] = 0 374 G_time["max clique number backend"] = "N/A" 375 376 start_time = time.process_time() 377 if ( 378 deduce_backend( 379 nx.algorithms.community.modularity_max.greedy_modularity_communities.backends, 380 backend, 381 ) 382 == backend 383 ): 384 G_feat["n communities"] = len( 385 nx.algorithms.community.modularity_max.greedy_modularity_communities(G_conv) 386 ) # number of communities 387 G_time["n communities"] = time.process_time() - start_time 388 print( 389 f"Execution time for {backend} : n communities {G_time['n communities']} : {G_feat['n communities']}" 390 ) 391 G_time["n communities backend"] = backend 392 else: 393 G_feat["n communities"] = 0 394 # G_feat['n communities'] = len(nx.algorithms.community.modularity_max.greedy_modularity_communities(G)) # number of communities 395 G_time["n communities"] = 0 396 G_time["n communities backend"] = "N/A" 397 398 start_time = time.process_time() 399 if deduce_backend(nx.average_shortest_path_length.backends, backend) == backend: 400 G_feat["avg path length"] = nx.average_shortest_path_length(G_conv) 401 G_time["avg path length"] = time.process_time() - start_time 402 print( 403 f"Execution time for {backend} : avg path length {G_time['avg path length']} : {G_feat['avg path length']}" 404 ) 405 G_time["avg path length backend"] = backend 406 else: 407 G_feat["avg path length"] = 0 408 # G_feat['avg path length'] = nx.average_shortest_path_length(G) 409 G_time["avg path length"] = 0 410 G_time["avg path length backend"] = "N/A" 411 412 start_time = time.process_time() 413 if deduce_backend(nx.betweenness_centrality.backends, backend) == backend: 414 G_feat["max betweenness centrality"] = np.max( 415 list(nx.betweenness_centrality(G_conv).values()) 416 ) 417 G_time["max betweenness centrality"] = time.process_time() - start_time 418 print( 419 f"Execution time for {backend} : max betweenness centrality {G_time['max betweenness centrality']} : {G_feat['max betweenness centrality']}" 420 ) 421 G_time["max betweenness centrality backend"] = backend 422 else: 423 G_feat["max betweenness centrality"] = 0 424 # G_feat['max betweenness centrality'] = (np.max(list(nx.betweenness_centrality(G).values()))) 425 G_time["max betweenness centrality"] = 0 426 G_time["max betweenness centrality backend"] = "N/A" 427 return G_time 428 429 430def calc_graph_features(G, backend=None): 431 """ 432 Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism. 433 434 Parameters 435 ---------- 436 G : networkx.Graph 437 The input graph. 438 backend : str, optional 439 Backend to use for computation. Default is None. 440 441 Returns 442 ------- 443 dict 444 Dictionary containing the calculated features. 445 """ 446 try: 447 assert nx.is_connected(G) 448 except: 449 # graph isn't connect --> use largest connected component 450 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 451 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 452 G = G.subgraph(G_cc[0]) 453 454 G_feat = {} # graph features dictionary 455 start_time = time.process_time() 456 457 G_feat["n nodes"] = float(G.number_of_nodes()) # number of nodes 458 print("Execution time for 'n nodes':", time.process_time() - start_time) 459 460 start_time = time.process_time() 461 G_feat["n edges"] = float(G.number_of_edges()) # number of edges 462 print("Execution time for 'n edges':", time.process_time() - start_time) 463 464 start_time = time.process_time() 465 G_feat["density"] = nx.density( 466 G 467 ) # how close the network is to a 'complete graph' where each node is connected 468 print("Execution time for 'density':", time.process_time() - start_time) 469 470 start_time = time.process_time() 471 G_feat["diameter"] = nx.diameter( 472 G 473 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 474 print("Execution time for 'diameter':", time.process_time() - start_time) 475 476 start_time = time.process_time() 477 G_feat["avg clustering"] = nx.average_clustering( 478 G 479 ) # the (averaged) fraction of possible triangles through a node. 480 print("Execution time for 'avg clustering':", time.process_time() - start_time) 481 482 start_time = time.process_time() 483 G_feat["max closeness centrality"] = np.max( 484 list(nx.closeness_centrality(G).values()) 485 ) # max closeness centraility. high closeness --> short distance to all other nodes 486 print( 487 "Execution time for 'max closeness centrality':", 488 time.process_time() - start_time, 489 ) 490 491 start_time = time.process_time() 492 G_feat["max eigenvector centrality"] = np.max( 493 list(nx.eigenvector_centrality(G, max_iter=10000).values()) 494 ) # eigenvector centraility. how 'important'/'influential' a node is 495 print( 496 "Execution time for 'max eigenvector centrality':", 497 time.process_time() - start_time, 498 ) 499 500 start_time = time.process_time() 501 G_feat["degree assortativity"] = ( 502 nx.degree_pearson_correlation_coefficient(G) + 1 503 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 504 print( 505 "Execution time for 'degree assortativity':", time.process_time() - start_time 506 ) 507 508 start_time = time.process_time() 509 G_feat["max clique number"] = len( 510 max(nx.find_cliques(G), key=len) 511 ) # largest clique (i.e. an induced subgraph that is complete) size 512 print("Execution time for 'max clique number':", time.process_time() - start_time) 513 514 start_time = time.process_time() 515 G_feat["n communities"] = len( 516 nx.algorithms.community.modularity_max.greedy_modularity_communities(G) 517 ) # number of communities 518 print("Execution time for 'n communities':", time.process_time() - start_time) 519 520 start_time = time.process_time() 521 G_feat["avg path length"] = nx.average_shortest_path_length(G) 522 print("Execution time for 'avg path length':", time.process_time() - start_time) 523 524 start_time = time.process_time() 525 G_feat["max betweenness centrality"] = np.max( 526 list(nx.betweenness_centrality(G).values()) 527 ) 528 print( 529 "Execution time for 'max betweenness centrality':", 530 time.process_time() - start_time, 531 ) 532 return G_feat 533 534 535def igraph_calc_graph_features(G): 536 """ 537 Calculates several graph network features using igraph. If not connected, largest subgraph is used. 538 539 Parameters 540 ---------- 541 G : networkx.Graph 542 The input graph. 543 544 Returns 545 ------- 546 dict 547 Dictionary containing the calculated features. 548 """ 549 # G = ig.Graph.Load(G, format='gexf') 550 if type(G) != nx.classes.graph.Graph: 551 G = nx.read_gexf(G) 552 try: 553 assert nx.is_connected(G) 554 except: 555 # graph isn't connect --> use largest connected component 556 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 557 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 558 G = G.subgraph(G_cc[0]) 559 """ 560 Maybe rewrite the gexf files to a file format just in this case 561 so igraph can read it? To remove the step of having to load from networkx? 562 """ 563 G = ig.Graph.from_networkx(G) 564 G_feat = {} 565 G_feat["n nodes"] = float(ig.Graph.vcount(G)) 566 G_feat["n edges"] = float(ig.Graph.ecount(G)) 567 G_feat["density"] = ig.Graph.density(G) 568 G_feat["diameter"] = ig.Graph.diameter(G) 569 clustering = ig.Graph.transitivity_local_undirected(G, mode="zero") 570 if sum(clustering) == 0 or len(clustering) == 0: 571 G_feat["avg clustering"] = sum(clustering) 572 else: 573 G_feat["avg clustering"] = sum(clustering) / len(clustering) 574 G_feat["max closeness centrality"] = np.max(list(ig.Graph.closeness(G))) 575 G_feat["max eigenvector centrality"] = np.max( 576 ig.Graph.eigenvector_centrality(G, scale=False) 577 ) 578 G_feat["degree assortativity"] = ( 579 ig.Graph.assortativity_degree(G) + 1 580 ) / 2 # normalize to 0-1 581 G_feat["max clique number"] = len(max(ig.Graph.largest_cliques(G), key=len)) 582 G_feat["n communities"] = ig.Graph.community_fastgreedy(G).optimal_count 583 G_feat["avg path length"] = float(ig.Graph.average_path_length(G)) 584 G_feat["max betweenness centrality"] = round( 585 np.max(ig.Graph.betweenness(G)) 586 / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2), 587 15, 588 ) 589 590 return G_feat 591 592 593def similarity_measure(x1, x2): 594 """ 595 Calculates the similarity between two feature values. 596 Similarity is defined as 1 minus the relative distance between features (x1 and x2). 597 598 Parameters 599 ---------- 600 x1 : float 601 Feature value from graph 1 (must range between 0 and 1). 602 x2 : float 603 Feature value from graph 2 (must range between 0 and 1). 604 605 Returns 606 ------- 607 float 608 Relative similarity between the two features. 609 """ 610 611 return 1 - (np.abs(x1 - x2) / max(x1, x2)) 612 613 614def calc_similarity_score(G1_dict, G2_dict, feature_list): 615 """ 616 Calculates the similarity score of two graphs. 617 618 Parameters 619 ---------- 620 G1_dict : dict or pandas.DataFrame 621 Graph 1 features dictionary or dataframe. Must be able to use a key to access values. 622 G2_dict : dict or pandas.DataFrame 623 Graph 2 features dictionary or dataframe. Must be able to use a key to access values. 624 feature_list : list 625 List of graph features to compare. Must be keys in the graph features dictionary. 626 627 Returns 628 ------- 629 float 630 Similarity score (0 to 1) where 1 indicates identical graphs. 631 """ 632 s_list = [] 633 for feat in feature_list: 634 f1 = G1_dict[feat] 635 f2 = G2_dict[feat] 636 s_tmp = similarity_measure(f1, f2) 637 s_list.append(s_tmp) 638 s = np.sum(s_list) / len(s_list) 639 return s 640 641 642def process_graphs(graph_fnames, package=2): 643 """ 644 Takes a list of graph files, calculates their features, and returns them as a dataframe. 645 646 Parameters 647 ---------- 648 graph_fnames : list 649 List of graph filenames to process. 650 package : int, optional 651 Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph). 652 653 Returns 654 ------- 655 pandas.DataFrame 656 Dataframe containing graph features for each graph in the filename list. 657 """ 658 pool = multiprocessing.Pool() 659 tmp_feat_list = [] 660 tmp_df = "" 661 if package == 1: # networkx 662 for fname in graph_fnames: 663 G_tmp = nx.read_gexf(fname) 664 tmp_feat_list.append(calc_graph_features(G_tmp)) 665 tmp_df = pd.DataFrame(tmp_feat_list) 666 elif package == 2: # igraph 667 result = pool.map_async(igraph_calc_graph_features, graph_fnames) 668 pool.close() 669 pool.join() 670 # for fname in graph_fnames: 671 # tmp_feat_list.append(igraph_calc_graph_features(fname)) 672 # print (f"type(result.get()):{type(result.get())}") 673 # <class 'list'> 674 tmp_df = pd.DataFrame(result.get()) 675 tmp_df["name"] = [Path(i).stem for i in graph_fnames] 676 assert not tmp_df.empty 677 return tmp_df 678 679 680def classify_graphs(class_file_list, sample_file_list, feature_list, package=2): 681 """ 682 Classifies a similarity score from a list of Class and Sample graphs. 683 684 Parameters 685 ---------- 686 class_file_list : list 687 List of control/reference graph files (classes). 688 sample_file_list : list 689 List of non-control/non-reference graph files (samples). 690 feature_list : list 691 List of features to use for similarity score. Must be valid keys in the graph features dictionary. 692 package : int, optional 693 Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph). 694 695 Returns 696 ------- 697 pandas.DataFrame 698 Dataframe where each column is a class and each row is the similarity score of the sampled graph. 699 """ 700 if package == 1: 701 class_feat_df = process_graphs(class_file_list, 1) 702 sample_feat_df = process_graphs(sample_file_list, 1) 703 elif package == 2: 704 class_feat_df = process_graphs(class_file_list, 2) 705 sample_feat_df = process_graphs(sample_file_list, 2) 706 class_list = [] 707 for index, row in sample_feat_df.iterrows(): # for each sample graph 708 tmp_graph_feat = row 709 class_dict = {} 710 class_dict["name"] = tmp_graph_feat["name"] 711 for index2, row2 in class_feat_df.iterrows(): # for each class graph 712 tmp_class_feat = row2 713 s_tmp = calc_similarity_score(tmp_graph_feat, tmp_class_feat, feature_list) 714 class_dict[tmp_class_feat["name"]] = s_tmp 715 class_list.append(class_dict) 716 class_similarity_df = pd.DataFrame(class_list) 717 return class_similarity_df 718 719 720def process_similarity_df(class_similarity_df): 721 """ 722 Generates y_true and y_pred based on the similarity score dataframe. 723 724 y_true is a list where each index is a class and each value is the class value. 725 y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample. 726 727 Note 728 ---- 729 This assumes the correct classification is along the diagonal of the similarity matrix/dataframe. 730 731 Parameters 732 ---------- 733 class_similarity_df : pandas.DataFrame 734 Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row. 735 736 Returns 737 ------- 738 tuple of lists 739 y_true, y_pred 740 """ 741 num_df = class_similarity_df.drop(columns="name") 742 y_true = [i for i in range(len(class_similarity_df.columns) - 1)] 743 y_pred = list(num_df.idxmax(axis=0)) 744 return y_true, y_pred
17def calc_bc(G, return_dict): 18 """ 19 Parallel subprocess function to calculate the betweenness centrality. 20 21 Parameters 22 ---------- 23 G : networkx.Graph 24 The input graph. 25 return_dict : dict 26 Dictionary to store the result. 27 28 Returns 29 ------- 30 dict 31 Betweenness centrality dictionary from multiple processes. 32 """ 33 return_dict[1] = np.max(list(nx.betweenness_centrality(G).values()))
Parallel subprocess function to calculate the betweenness centrality.
Parameters
- G (networkx.Graph): The input graph.
- return_dict (dict): Dictionary to store the result.
Returns
- dict: Betweenness centrality dictionary from multiple processes.
36def calc_shortest_pthlen(G, return_dict): 37 """ 38 Parallel subprocess function to calculate the average shortest path length. 39 40 Parameters 41 ---------- 42 G : networkx.Graph 43 The input graph. 44 return_dict : dict 45 Dictionary to store the result. 46 47 Returns 48 ------- 49 dict 50 Average shortest path length dictionary from multiple processes. 51 """ 52 return_dict[2] = nx.average_shortest_path_length(G)
Parallel subprocess function to calculate the average shortest path length.
Parameters
- G (networkx.Graph): The input graph.
- return_dict (dict): Dictionary to store the result.
Returns
- dict: Average shortest path length dictionary from multiple processes.
55def benchmark_igraph_graph_features(G): 56 """ 57 Benchmarks several graph network features using igraph. If not connected, largest subgraph is used. 58 59 Parameters 60 ---------- 61 G : networkx.Graph 62 The input graph. 63 64 Returns 65 ------- 66 dict 67 Dictionary containing the benchmarked features and their computation times. 68 """ 69 70 # G = ig.Graph.Load(G, format='gexf') 71 if type(G) != nx.classes.graph.Graph: 72 G = nx.read_gexf(G) 73 try: 74 assert nx.is_connected(G) 75 except: 76 # graph isn't connect --> use largest connected component 77 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 78 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 79 G = G.subgraph(G_cc[0]) 80 """ 81 Maybe rewrite the gexf files to a file format just in this case 82 so igraph can read it? To remove the step of having to load from networkx? 83 """ 84 G_time = {} 85 start_time = time.process_time() 86 G = ig.Graph.from_networkx(G) 87 G_time["conversion"] = time.process_time() - start_time 88 G_time["backend"] = "igraph" 89 G_feat = {} 90 start_time = time.process_time() 91 92 G_feat["n nodes"] = float(ig.Graph.vcount(G)) # number of nodes 93 G_time["n nodes"] = time.process_time() - start_time 94 G_time["n nodes backend"] = "igraph" 95 96 start_time = time.process_time() 97 G_feat["n edges"] = float(ig.Graph.ecount(G)) # number of edges 98 G_time["n edges"] = time.process_time() - start_time 99 G_time["n edges backend"] = "igraph" 100 101 start_time = time.process_time() 102 G_feat["density"] = ig.Graph.density( 103 G 104 ) # how close the network is to a 'complete graph' where each node is connected 105 G_time["density"] = time.process_time() - start_time 106 G_time["density backend"] = "igraph" 107 108 start_time = time.process_time() 109 G_feat["diameter"] = ig.Graph.diameter( 110 G 111 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 112 G_time["diameter"] = time.process_time() - start_time 113 G_time["diameter backend"] = "igraph" 114 115 start_time = time.process_time() 116 clustering = ig.Graph.transitivity_local_undirected(G, mode="zero") 117 if sum(clustering) == 0 or len(clustering) == 0: 118 G_feat["avg clustering"] = sum(clustering) 119 else: 120 G_feat["avg clustering"] = sum(clustering) / len(clustering) 121 G_time["avg clustering"] = time.process_time() - start_time 122 G_time["avg clustering backend"] = "igraph" 123 124 start_time = time.process_time() 125 G_feat["max closeness centrality"] = np.max( 126 list(ig.Graph.closeness(G)) 127 ) # max closeness centraility. high closeness --> short distance to all other nodes 128 G_time["max closeness centrality"] = time.process_time() - start_time 129 G_time["max closeness centrality backend"] = "igraph" 130 131 start_time = time.process_time() 132 G_feat["max eigenvector centrality"] = np.max( 133 ig.Graph.eigenvector_centrality(G, scale=False) 134 ) # eigenvector centraility. how 'important'/'influential' a node is 135 G_time["max eigenvector centrality"] = time.process_time() - start_time 136 G_time["max eigenvector centrality backend"] = "igraph" 137 138 start_time = time.process_time() 139 G_feat["degree assortativity"] = ( 140 ig.Graph.assortativity_degree(G) + 1 141 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 142 G_time["degree assortativity"] = time.process_time() - start_time 143 G_time["degree assortativity backend"] = "igraph" 144 145 start_time = time.process_time() 146 G_feat["max clique number"] = len( 147 max(ig.Graph.largest_cliques(G), key=len) 148 ) # largest clique (i.e. an induced subgraph that is complete) size 149 G_time["max clique number"] = time.process_time() - start_time 150 G_time["max clique number backend"] = "igraph" 151 152 start_time = time.process_time() 153 G_feat["n communities"] = ig.Graph.community_fastgreedy( 154 G 155 ).optimal_count # number of communities 156 G_time["n communities"] = time.process_time() - start_time 157 G_time["n communities backend"] = "igraph" 158 159 start_time = time.process_time() 160 G_feat["avg path length"] = float(ig.Graph.average_path_length(G)) 161 G_time["avg path length"] = time.process_time() - start_time 162 G_time["avg path length backend"] = "igraph" 163 164 start_time = time.process_time() 165 G_feat["max betweenness centrality"] = round( 166 np.max(ig.Graph.betweenness(G)) 167 / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2), 168 15, 169 ) 170 G_time["max betweenness centrality"] = time.process_time() - start_time 171 G_time["max betweenness centrality backend"] = "igraph" 172 173 return G_time
Benchmarks several graph network features using igraph. If not connected, largest subgraph is used.
Parameters
- G (networkx.Graph): The input graph.
Returns
- dict: Dictionary containing the benchmarked features and their computation times.
176def deduce_backend(backends_list, backend=None): 177 """ 178 Deduce the backend used for the graph features calculation. 179 180 Parameters 181 ---------- 182 backends_list : list 183 List of available backends. 184 backend : str, optional 185 Preferred backend to use. Default is None. 186 187 Returns 188 ------- 189 str 190 Backend to use. 191 """ 192 if backend in backends_list: 193 return backend 194 else: 195 return "serial"
Deduce the backend used for the graph features calculation.
Parameters
- backends_list (list): List of available backends.
- backend (str, optional): Preferred backend to use. Default is None.
Returns
- str: Backend to use.
202def benchmark_nx_graph_features(G, backend="serial"): 203 """ 204 Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism. 205 206 Parameters 207 ---------- 208 G : networkx.Graph 209 The input graph. 210 backend : str, optional 211 Backend to use for computation. Default is 'serial'. 212 213 Returns 214 ------- 215 dict 216 Dictionary containing the benchmarked features and their computation times. 217 """ 218 219 try: 220 assert nx.is_connected(G) 221 except: 222 # graph isn't connect --> use largest connected component 223 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 224 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 225 G = G.subgraph(G_cc[0]) 226 227 G_time = {} # graph features time dictionary 228 229 if backend == "cugraph": 230 start_time = time.process_time() 231 G_conv = nxcg.from_networkx(G) 232 G_time["conversion"] = time.process_time() - start_time 233 elif backend == "parallel": 234 start_time = time.process_time() 235 G_conv = nxp.ParallelGraph(G) 236 G_time["conversion"] = time.process_time() - start_time 237 elif backend == "serial": 238 G_conv = G 239 G_time["conversion"] = 0 240 else: 241 raise ValueError("backend must be 'cugraph', 'parallel', or 'serial'") 242 243 G_time["backend"] = backend 244 245 G_feat = {} # graph features dictionary 246 start_time = time.process_time() 247 248 G_feat["n nodes"] = float(G.number_of_nodes()) # number of nodes 249 G_time["n nodes"] = time.process_time() - start_time 250 print( 251 f"Execution time for {backend} : n nodes {G_time['n nodes']} : {G_feat['n nodes']}" 252 ) 253 G_time["n nodes backend"] = "serial" 254 255 start_time = time.process_time() 256 G_feat["n edges"] = float(G.number_of_edges()) # number of edges 257 G_time["n edges"] = time.process_time() - start_time 258 print( 259 f"Execution time for {backend} : n edges {G_time['n edges']} : {G_feat['n edges']}" 260 ) 261 G_time["n edges backend"] = "serial" 262 263 start_time = time.process_time() 264 G_feat["density"] = nx.density( 265 G 266 ) # how close the network is to a 'complete graph' where each node is connected 267 G_time["density"] = time.process_time() - start_time 268 print( 269 f"Execution time for {backend} : density {G_time['density']} : {G_feat['density']}" 270 ) 271 G_time["density backend"] = "serial" 272 273 start_time = time.process_time() 274 if deduce_backend(nx.diameter.backends, backend) == backend: 275 G_feat["diameter"] = nx.diameter( 276 G_conv 277 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 278 G_time["diameter"] = time.process_time() - start_time 279 print( 280 f"Execution time for {backend} : diameter {G_time['diameter']} : {G_feat['diameter']}" 281 ) 282 G_time["diameter backend"] = backend 283 elif backend == "cugraph": 284 # Calculate shortest path length, find max 285 shortest_paths = nxcg.shortest_path(G_conv) 286 G_feat["diameter"] = np.max(shortest_paths) 287 G_time["diameter"] = time.process_time() - start_time 288 else: 289 G_feat["diameter"] = 0 290 # G_feat['diameter'] = nx.diameter(G) # the farthest distance (e.g. number of edges) between two nodes in the graph 291 G_time["diameter"] = 0 292 G_time["diameter backend"] = "N/A" 293 294 start_time = time.process_time() 295 if deduce_backend(nx.average_clustering.backends, backend) == backend: 296 G_feat["avg clustering"] = nx.average_clustering( 297 G_conv 298 ) # the (averaged) fraction of possible triangles through a node. 299 G_time["avg clustering"] = time.process_time() - start_time 300 print( 301 f"Execution time for {backend} : avg clustering {G_time['avg clustering']} : {G_feat['avg clustering']}" 302 ) 303 G_time["avg clustering backend"] = backend 304 else: 305 G_feat["avg clustering"] = 0 306 # G_feat['avg clustering'] = nx.average_clustering(G) # the (averaged) fraction of possible triangles through a node. 307 G_time["avg clustering"] = 0 308 G_time["avg clustering backend"] = "N/A" 309 310 start_time = time.process_time() 311 if deduce_backend(nx.closeness_centrality.backends, backend) == backend: 312 G_feat["max closeness centrality"] = np.max( 313 list(nx.closeness_centrality(G_conv).values()) 314 ) # max closeness centraility. high closeness --> short distance to all other nodes 315 G_time["max closeness centrality"] = time.process_time() - start_time 316 print( 317 f"Execution time for {backend} : max closeness centrality {G_time['max closeness centrality']} : {G_feat['max closeness centrality']}" 318 ) 319 G_time["max closeness centrality backend"] = backend 320 else: 321 G_feat["max closeness centrality"] = 0 322 # G_feat['max closeness centrality'] = np.max(list(nx.closeness_centrality(G).values())) # max closeness centraility. high closeness --> short distance to all other nodes 323 G_time["max closeness centrality"] = 0 324 G_time["max closeness centrality backend"] = "N/A" 325 326 start_time = time.process_time() 327 if deduce_backend(nx.eigenvector_centrality.backends, backend) == backend: 328 G_feat["max eigenvector centrality"] = np.max( 329 list(nx.eigenvector_centrality(G_conv, max_iter=10000).values()) 330 ) # eigenvector centraility. how 'important'/'influential' a node is 331 G_time["max eigenvector centrality"] = time.process_time() - start_time 332 print( 333 f"Execution time for {backend} : max eigenvector centrality {G_time['max eigenvector centrality']} : {G_feat['max eigenvector centrality']}" 334 ) 335 G_time["max eigenvector centrality backend"] = backend 336 else: 337 G_feat["max eigenvector centrality"] = 0 338 # G_feat['max eigenvector centrality'] = np.max(list(nx.eigenvector_centrality(G, max_iter=10000).values())) # eigenvector centraility. how 'important'/'influential' a node is 339 G_time["max eigenvector centrality"] = 0 340 G_time["max eigenvector centrality backend"] = "N/A" 341 342 start_time = time.process_time() 343 if ( 344 deduce_backend(nx.degree_pearson_correlation_coefficient.backends, backend) 345 == backend 346 ): 347 G_feat["degree assortativity"] = ( 348 nx.degree_pearson_correlation_coefficient(G_conv) + 1 349 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 350 G_time["degree assortativity"] = time.process_time() - start_time 351 print( 352 f"Execution time for {backend} : degree assortativity {G_time['degree assortativity']} : {G_feat['degree assortativity']}" 353 ) 354 G_time["degree assortativity backend"] = backend 355 else: 356 G_feat["degree assortativity"] = 0 357 # G_feat['degree assortativity'] = (nx.degree_pearson_correlation_coefficient(G) + 1)/2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 358 G_time["degree assortativity"] = 0 359 G_time["degree assortativity backend"] = "N/A" 360 361 start_time = time.process_time() 362 if deduce_backend(nx.find_cliques.backends, backend) == backend: 363 G_feat["max clique number"] = len( 364 max(nx.find_cliques(G_conv), key=len) 365 ) # largest clique (i.e. an induced subgraph that is complete) size 366 G_time["max clique number"] = time.process_time() - start_time 367 print( 368 f"Execution time for {backend} : max clique number {G_time['max clique number']} : {G_feat['max clique number']}" 369 ) 370 G_time["max clique number backend"] = backend 371 else: 372 G_feat["max clique number"] = 0 373 # G_feat['max clique number'] = len(max(nx.find_cliques(G),key=len)) # largest clique (i.e. an induced subgraph that is complete) size 374 G_time["max clique number"] = 0 375 G_time["max clique number backend"] = "N/A" 376 377 start_time = time.process_time() 378 if ( 379 deduce_backend( 380 nx.algorithms.community.modularity_max.greedy_modularity_communities.backends, 381 backend, 382 ) 383 == backend 384 ): 385 G_feat["n communities"] = len( 386 nx.algorithms.community.modularity_max.greedy_modularity_communities(G_conv) 387 ) # number of communities 388 G_time["n communities"] = time.process_time() - start_time 389 print( 390 f"Execution time for {backend} : n communities {G_time['n communities']} : {G_feat['n communities']}" 391 ) 392 G_time["n communities backend"] = backend 393 else: 394 G_feat["n communities"] = 0 395 # G_feat['n communities'] = len(nx.algorithms.community.modularity_max.greedy_modularity_communities(G)) # number of communities 396 G_time["n communities"] = 0 397 G_time["n communities backend"] = "N/A" 398 399 start_time = time.process_time() 400 if deduce_backend(nx.average_shortest_path_length.backends, backend) == backend: 401 G_feat["avg path length"] = nx.average_shortest_path_length(G_conv) 402 G_time["avg path length"] = time.process_time() - start_time 403 print( 404 f"Execution time for {backend} : avg path length {G_time['avg path length']} : {G_feat['avg path length']}" 405 ) 406 G_time["avg path length backend"] = backend 407 else: 408 G_feat["avg path length"] = 0 409 # G_feat['avg path length'] = nx.average_shortest_path_length(G) 410 G_time["avg path length"] = 0 411 G_time["avg path length backend"] = "N/A" 412 413 start_time = time.process_time() 414 if deduce_backend(nx.betweenness_centrality.backends, backend) == backend: 415 G_feat["max betweenness centrality"] = np.max( 416 list(nx.betweenness_centrality(G_conv).values()) 417 ) 418 G_time["max betweenness centrality"] = time.process_time() - start_time 419 print( 420 f"Execution time for {backend} : max betweenness centrality {G_time['max betweenness centrality']} : {G_feat['max betweenness centrality']}" 421 ) 422 G_time["max betweenness centrality backend"] = backend 423 else: 424 G_feat["max betweenness centrality"] = 0 425 # G_feat['max betweenness centrality'] = (np.max(list(nx.betweenness_centrality(G).values()))) 426 G_time["max betweenness centrality"] = 0 427 G_time["max betweenness centrality backend"] = "N/A" 428 return G_time
Benchmarks several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
Parameters
- G (networkx.Graph): The input graph.
- backend (str, optional): Backend to use for computation. Default is 'serial'.
Returns
- dict: Dictionary containing the benchmarked features and their computation times.
431def calc_graph_features(G, backend=None): 432 """ 433 Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism. 434 435 Parameters 436 ---------- 437 G : networkx.Graph 438 The input graph. 439 backend : str, optional 440 Backend to use for computation. Default is None. 441 442 Returns 443 ------- 444 dict 445 Dictionary containing the calculated features. 446 """ 447 try: 448 assert nx.is_connected(G) 449 except: 450 # graph isn't connect --> use largest connected component 451 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 452 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 453 G = G.subgraph(G_cc[0]) 454 455 G_feat = {} # graph features dictionary 456 start_time = time.process_time() 457 458 G_feat["n nodes"] = float(G.number_of_nodes()) # number of nodes 459 print("Execution time for 'n nodes':", time.process_time() - start_time) 460 461 start_time = time.process_time() 462 G_feat["n edges"] = float(G.number_of_edges()) # number of edges 463 print("Execution time for 'n edges':", time.process_time() - start_time) 464 465 start_time = time.process_time() 466 G_feat["density"] = nx.density( 467 G 468 ) # how close the network is to a 'complete graph' where each node is connected 469 print("Execution time for 'density':", time.process_time() - start_time) 470 471 start_time = time.process_time() 472 G_feat["diameter"] = nx.diameter( 473 G 474 ) # the farthest distance (e.g. number of edges) between two nodes in the graph 475 print("Execution time for 'diameter':", time.process_time() - start_time) 476 477 start_time = time.process_time() 478 G_feat["avg clustering"] = nx.average_clustering( 479 G 480 ) # the (averaged) fraction of possible triangles through a node. 481 print("Execution time for 'avg clustering':", time.process_time() - start_time) 482 483 start_time = time.process_time() 484 G_feat["max closeness centrality"] = np.max( 485 list(nx.closeness_centrality(G).values()) 486 ) # max closeness centraility. high closeness --> short distance to all other nodes 487 print( 488 "Execution time for 'max closeness centrality':", 489 time.process_time() - start_time, 490 ) 491 492 start_time = time.process_time() 493 G_feat["max eigenvector centrality"] = np.max( 494 list(nx.eigenvector_centrality(G, max_iter=10000).values()) 495 ) # eigenvector centraility. how 'important'/'influential' a node is 496 print( 497 "Execution time for 'max eigenvector centrality':", 498 time.process_time() - start_time, 499 ) 500 501 start_time = time.process_time() 502 G_feat["degree assortativity"] = ( 503 nx.degree_pearson_correlation_coefficient(G) + 1 504 ) / 2 # tendency of a node to be connected with other nodes of the same degree, normalized to 0 and 1. 505 print( 506 "Execution time for 'degree assortativity':", time.process_time() - start_time 507 ) 508 509 start_time = time.process_time() 510 G_feat["max clique number"] = len( 511 max(nx.find_cliques(G), key=len) 512 ) # largest clique (i.e. an induced subgraph that is complete) size 513 print("Execution time for 'max clique number':", time.process_time() - start_time) 514 515 start_time = time.process_time() 516 G_feat["n communities"] = len( 517 nx.algorithms.community.modularity_max.greedy_modularity_communities(G) 518 ) # number of communities 519 print("Execution time for 'n communities':", time.process_time() - start_time) 520 521 start_time = time.process_time() 522 G_feat["avg path length"] = nx.average_shortest_path_length(G) 523 print("Execution time for 'avg path length':", time.process_time() - start_time) 524 525 start_time = time.process_time() 526 G_feat["max betweenness centrality"] = np.max( 527 list(nx.betweenness_centrality(G).values()) 528 ) 529 print( 530 "Execution time for 'max betweenness centrality':", 531 time.process_time() - start_time, 532 ) 533 return G_feat
Calculates several graph network features using networkx. If not connected, largest subgraph is used. Uses multiprocessing for parallelism.
Parameters
- G (networkx.Graph): The input graph.
- backend (str, optional): Backend to use for computation. Default is None.
Returns
- dict: Dictionary containing the calculated features.
536def igraph_calc_graph_features(G): 537 """ 538 Calculates several graph network features using igraph. If not connected, largest subgraph is used. 539 540 Parameters 541 ---------- 542 G : networkx.Graph 543 The input graph. 544 545 Returns 546 ------- 547 dict 548 Dictionary containing the calculated features. 549 """ 550 # G = ig.Graph.Load(G, format='gexf') 551 if type(G) != nx.classes.graph.Graph: 552 G = nx.read_gexf(G) 553 try: 554 assert nx.is_connected(G) 555 except: 556 # graph isn't connect --> use largest connected component 557 # see ref: https://stackoverflow.com/questions/26105764/how-do-i-get-the-giant-component-of-a-networkx-graph 558 G_cc = sorted(nx.connected_components(G), key=len, reverse=True) 559 G = G.subgraph(G_cc[0]) 560 """ 561 Maybe rewrite the gexf files to a file format just in this case 562 so igraph can read it? To remove the step of having to load from networkx? 563 """ 564 G = ig.Graph.from_networkx(G) 565 G_feat = {} 566 G_feat["n nodes"] = float(ig.Graph.vcount(G)) 567 G_feat["n edges"] = float(ig.Graph.ecount(G)) 568 G_feat["density"] = ig.Graph.density(G) 569 G_feat["diameter"] = ig.Graph.diameter(G) 570 clustering = ig.Graph.transitivity_local_undirected(G, mode="zero") 571 if sum(clustering) == 0 or len(clustering) == 0: 572 G_feat["avg clustering"] = sum(clustering) 573 else: 574 G_feat["avg clustering"] = sum(clustering) / len(clustering) 575 G_feat["max closeness centrality"] = np.max(list(ig.Graph.closeness(G))) 576 G_feat["max eigenvector centrality"] = np.max( 577 ig.Graph.eigenvector_centrality(G, scale=False) 578 ) 579 G_feat["degree assortativity"] = ( 580 ig.Graph.assortativity_degree(G) + 1 581 ) / 2 # normalize to 0-1 582 G_feat["max clique number"] = len(max(ig.Graph.largest_cliques(G), key=len)) 583 G_feat["n communities"] = ig.Graph.community_fastgreedy(G).optimal_count 584 G_feat["avg path length"] = float(ig.Graph.average_path_length(G)) 585 G_feat["max betweenness centrality"] = round( 586 np.max(ig.Graph.betweenness(G)) 587 / (((G_feat["n nodes"] - 1) * (G_feat["n nodes"] - 2)) / 2), 588 15, 589 ) 590 591 return G_feat
Calculates several graph network features using igraph. If not connected, largest subgraph is used.
Parameters
- G (networkx.Graph): The input graph.
Returns
- dict: Dictionary containing the calculated features.
594def similarity_measure(x1, x2): 595 """ 596 Calculates the similarity between two feature values. 597 Similarity is defined as 1 minus the relative distance between features (x1 and x2). 598 599 Parameters 600 ---------- 601 x1 : float 602 Feature value from graph 1 (must range between 0 and 1). 603 x2 : float 604 Feature value from graph 2 (must range between 0 and 1). 605 606 Returns 607 ------- 608 float 609 Relative similarity between the two features. 610 """ 611 612 return 1 - (np.abs(x1 - x2) / max(x1, x2))
Calculates the similarity between two feature values. Similarity is defined as 1 minus the relative distance between features (x1 and x2).
Parameters
- x1 (float): Feature value from graph 1 (must range between 0 and 1).
- x2 (float): Feature value from graph 2 (must range between 0 and 1).
Returns
- float: Relative similarity between the two features.
615def calc_similarity_score(G1_dict, G2_dict, feature_list): 616 """ 617 Calculates the similarity score of two graphs. 618 619 Parameters 620 ---------- 621 G1_dict : dict or pandas.DataFrame 622 Graph 1 features dictionary or dataframe. Must be able to use a key to access values. 623 G2_dict : dict or pandas.DataFrame 624 Graph 2 features dictionary or dataframe. Must be able to use a key to access values. 625 feature_list : list 626 List of graph features to compare. Must be keys in the graph features dictionary. 627 628 Returns 629 ------- 630 float 631 Similarity score (0 to 1) where 1 indicates identical graphs. 632 """ 633 s_list = [] 634 for feat in feature_list: 635 f1 = G1_dict[feat] 636 f2 = G2_dict[feat] 637 s_tmp = similarity_measure(f1, f2) 638 s_list.append(s_tmp) 639 s = np.sum(s_list) / len(s_list) 640 return s
Calculates the similarity score of two graphs.
Parameters
- G1_dict (dict or pandas.DataFrame): Graph 1 features dictionary or dataframe. Must be able to use a key to access values.
- G2_dict (dict or pandas.DataFrame): Graph 2 features dictionary or dataframe. Must be able to use a key to access values.
- feature_list (list): List of graph features to compare. Must be keys in the graph features dictionary.
Returns
- float: Similarity score (0 to 1) where 1 indicates identical graphs.
643def process_graphs(graph_fnames, package=2): 644 """ 645 Takes a list of graph files, calculates their features, and returns them as a dataframe. 646 647 Parameters 648 ---------- 649 graph_fnames : list 650 List of graph filenames to process. 651 package : int, optional 652 Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph). 653 654 Returns 655 ------- 656 pandas.DataFrame 657 Dataframe containing graph features for each graph in the filename list. 658 """ 659 pool = multiprocessing.Pool() 660 tmp_feat_list = [] 661 tmp_df = "" 662 if package == 1: # networkx 663 for fname in graph_fnames: 664 G_tmp = nx.read_gexf(fname) 665 tmp_feat_list.append(calc_graph_features(G_tmp)) 666 tmp_df = pd.DataFrame(tmp_feat_list) 667 elif package == 2: # igraph 668 result = pool.map_async(igraph_calc_graph_features, graph_fnames) 669 pool.close() 670 pool.join() 671 # for fname in graph_fnames: 672 # tmp_feat_list.append(igraph_calc_graph_features(fname)) 673 # print (f"type(result.get()):{type(result.get())}") 674 # <class 'list'> 675 tmp_df = pd.DataFrame(result.get()) 676 tmp_df["name"] = [Path(i).stem for i in graph_fnames] 677 assert not tmp_df.empty 678 return tmp_df
Takes a list of graph files, calculates their features, and returns them as a dataframe.
Parameters
- graph_fnames (list): List of graph filenames to process.
- package (int, optional): Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
Returns
- pandas.DataFrame: Dataframe containing graph features for each graph in the filename list.
681def classify_graphs(class_file_list, sample_file_list, feature_list, package=2): 682 """ 683 Classifies a similarity score from a list of Class and Sample graphs. 684 685 Parameters 686 ---------- 687 class_file_list : list 688 List of control/reference graph files (classes). 689 sample_file_list : list 690 List of non-control/non-reference graph files (samples). 691 feature_list : list 692 List of features to use for similarity score. Must be valid keys in the graph features dictionary. 693 package : int, optional 694 Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph). 695 696 Returns 697 ------- 698 pandas.DataFrame 699 Dataframe where each column is a class and each row is the similarity score of the sampled graph. 700 """ 701 if package == 1: 702 class_feat_df = process_graphs(class_file_list, 1) 703 sample_feat_df = process_graphs(sample_file_list, 1) 704 elif package == 2: 705 class_feat_df = process_graphs(class_file_list, 2) 706 sample_feat_df = process_graphs(sample_file_list, 2) 707 class_list = [] 708 for index, row in sample_feat_df.iterrows(): # for each sample graph 709 tmp_graph_feat = row 710 class_dict = {} 711 class_dict["name"] = tmp_graph_feat["name"] 712 for index2, row2 in class_feat_df.iterrows(): # for each class graph 713 tmp_class_feat = row2 714 s_tmp = calc_similarity_score(tmp_graph_feat, tmp_class_feat, feature_list) 715 class_dict[tmp_class_feat["name"]] = s_tmp 716 class_list.append(class_dict) 717 class_similarity_df = pd.DataFrame(class_list) 718 return class_similarity_df
Classifies a similarity score from a list of Class and Sample graphs.
Parameters
- class_file_list (list): List of control/reference graph files (classes).
- sample_file_list (list): List of non-control/non-reference graph files (samples).
- feature_list (list): List of features to use for similarity score. Must be valid keys in the graph features dictionary.
- package (int, optional): Integer value to choose between networkx (1) and igraph (2). Default is 2 (igraph).
Returns
- pandas.DataFrame: Dataframe where each column is a class and each row is the similarity score of the sampled graph.
721def process_similarity_df(class_similarity_df): 722 """ 723 Generates y_true and y_pred based on the similarity score dataframe. 724 725 y_true is a list where each index is a class and each value is the class value. 726 y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample. 727 728 Note 729 ---- 730 This assumes the correct classification is along the diagonal of the similarity matrix/dataframe. 731 732 Parameters 733 ---------- 734 class_similarity_df : pandas.DataFrame 735 Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row. 736 737 Returns 738 ------- 739 tuple of lists 740 y_true, y_pred 741 """ 742 num_df = class_similarity_df.drop(columns="name") 743 y_true = [i for i in range(len(class_similarity_df.columns) - 1)] 744 y_pred = list(num_df.idxmax(axis=0)) 745 return y_true, y_pred
Generates y_true and y_pred based on the similarity score dataframe.
y_true is a list where each index is a class and each value is the class value. y_pred is a list where each index is a sample and each value is the maximum similarity score for that sample.
Note
This assumes the correct classification is along the diagonal of the similarity matrix/dataframe.
Parameters
- class_similarity_df (pandas.DataFrame): Dataframe where each column is a class graph and each row is a sample graph. A_ij is the similarity score between graphs i and j. The exception is one column 'name' which contains the names of the sampled graphs for each row.
Returns
- tuple of lists: y_true, y_pred