griptomo.parsl.apps
1from parsl import python_app 2 3 4@python_app 5def convert_density_file_to_centroids( 6 protein_name, 7 mrc_file_path, 8 DBSCAN_epsilon, 9 DBSCAN_min_samples, 10 density_threshold, 11 output_dir, 12): 13 """ 14 Convert a density file to centroids using DBSCAN clustering. 15 16 Parameters 17 ---------- 18 protein_name : str 19 Name of the protein. 20 mrc_file_path : str 21 Path to the MRC density file. 22 DBSCAN_epsilon : float 23 The maximum distance between two samples for one to be considered as in the neighborhood of the other. 24 DBSCAN_min_samples : int 25 The number of samples in a neighborhood for a point to be considered as a core point. 26 density_threshold : float 27 The threshold value for density. 28 output_dir : str 29 Directory where the output will be saved. 30 31 Returns 32 ------- 33 tuple 34 A tuple containing: 35 - centroids (numpy.ndarray): The centroids of the clusters. 36 - centroid_dir (str): The directory where the centroids are saved. 37 """ 38 import os 39 import numpy as np 40 import csv 41 42 from griptomo.core import density2graph as d2g 43 44 fname = os.path.basename(mrc_file_path) 45 density_name = fname[:-4] 46 47 centroid_dir = os.path.join( 48 output_dir, 49 protein_name, 50 density_name, 51 f"density_{density_threshold}", 52 f"min_samples_{DBSCAN_min_samples}", 53 f"epsilon_{DBSCAN_epsilon}", 54 ) 55 if os.path.exists(centroid_dir): 56 print(f"{centroid_dir} already exists.") 57 else: 58 os.makedirs(centroid_dir) 59 print(f"{centroid_dir} does not exist. Creating it.") 60 61 centroid_filename = os.path.join( 62 centroid_dir, 63 "centroids.npy", 64 ) 65 66 if os.path.exists(centroid_filename): 67 print(f"{centroid_filename} already exists.") 68 centroids = np.load(centroid_filename) 69 else: 70 print(f"{centroid_filename} does not exist. Generating coarse model.") 71 ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels 72 mrc = d2g.load_density_file(mrc_file_path) # load density file 73 D_norm = d2g.normalize_mrc_data(mrc.data) 74 total_voxel_count = mrc.data.size 75 D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold) 76 xyz_data = d2g.generate_point_cloud_from_mrc_data( 77 D_thresholded, density_threshold 78 ) 79 point_cloud_node_count = len(xyz_data) 80 81 ### perform clustering and get cluster centers 82 model = d2g.cluster_data( 83 xyz_data, DBSCAN_epsilon, DBSCAN_min_samples 84 ) # cluster thresholded data using DBSCAN 85 noisy_node_count = np.sum(model.labels_ == -1) 86 non_noise_cluster_count = len(np.unique(model.labels_)) 87 if noisy_node_count > 0: 88 non_noise_cluster_count -= 1 89 centroids = d2g.get_cluster_centroids( 90 xyz_data, model 91 ) # coarse grain model by getting cluster centroids 92 # Save centroids in output folder. 93 np.save(centroid_filename, centroids) 94 # Save centroids_run_info in output folder. 95 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 96 with open(run_info_filename, "w", newline="") as csvfile: 97 fieldnames = [ 98 "Total Voxel Count", 99 "Point Cloud Node Count", 100 "Non Noise Cluster Count", 101 "Noisy Node Count", 102 ] 103 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 104 105 writer.writeheader() 106 writer.writerow( 107 { 108 "Total Voxel Count": total_voxel_count, 109 "Point Cloud Node Count": point_cloud_node_count, 110 "Non Noise Cluster Count": non_noise_cluster_count, 111 "Noisy Node Count": noisy_node_count, 112 } 113 ) 114 return (centroids, centroid_dir) 115 116 117@python_app 118def extract_graph_features_from_centroids( 119 centroids, 120 cutoff, 121 output_dir, 122 force_overwrite=False, 123): 124 """ 125 Extracts graph features from centroid coordinates and saves them to a CSV file. 126 127 Parameters 128 ---------- 129 centroids : str or numpy.ndarray 130 Path to a file containing centroid coordinates or a numpy array of centroid coordinates. 131 cutoff : float 132 Distance threshold for creating edges in the graph. 133 output_dir : str 134 Directory where the output CSV file will be saved. 135 force_overwrite : bool, optional 136 If True, overwrite the existing output file. Defaults to False. 137 138 Returns 139 ------- 140 str 141 Path to the output CSV file containing the graph features. 142 """ 143 import os 144 145 feature_output_dir = os.path.join(output_dir, f"cutoff_{cutoff}") 146 147 if os.path.exists(feature_output_dir): 148 print(f"{feature_output_dir} already exists.") 149 else: 150 os.makedirs(feature_output_dir) 151 print(f"{feature_output_dir} does not exist. Creating it.") 152 153 feature_output_filename = os.path.join(feature_output_dir, "graph_features.csv") 154 155 if os.path.exists(feature_output_filename) and not force_overwrite: 156 print(f"{feature_output_filename} already exists and force_overwrite is False.") 157 else: 158 import numpy as np 159 import csv 160 161 from griptomo.core import ig_extract_features as g2c 162 from griptomo.core.density2graph import create_igraph_from_point_cloud 163 164 # If the centroids are not already loaded, load them. 165 if isinstance(centroids, str): 166 centroids = np.load(centroids) 167 168 graph = create_igraph_from_point_cloud(centroids, cutoff) 169 graph_features = g2c.igraph_calc_graph_features(graph) 170 171 if os.path.exists(feature_output_filename): 172 print(f"{feature_output_filename} already exists but will be overwritten.") 173 else: 174 print(f"{feature_output_filename} does not exist. Writing graph features.") 175 with open(feature_output_filename, "w", newline="") as csvfile: 176 fieldnames = list(graph_features.keys()) 177 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 178 writer.writeheader() 179 writer.writerow(graph_features) 180 181 return feature_output_filename 182 183 184@python_app 185def convert_density_file_to_centroids_timed( 186 protein_name, 187 mrc_file_path, 188 DBSCAN_epsilon, 189 DBSCAN_min_samples, 190 density_threshold, 191 output_dir, 192): 193 """ 194 Convert a density file to centroids using DBSCAN clustering. 195 Additionally, save the timings of each step to a CSV file. 196 197 Parameters 198 ---------- 199 protein_name : str 200 Name of the protein. 201 mrc_file_path : str 202 Path to the MRC density file. 203 DBSCAN_epsilon : float 204 The maximum distance between two samples for one to be considered as in the neighborhood of the other. 205 DBSCAN_min_samples : int 206 The number of samples in a neighborhood for a point to be considered as a core point. 207 density_threshold : float 208 The threshold value for density. 209 output_dir : str 210 Directory where the output will be saved. 211 212 Returns 213 ------- 214 tuple 215 A tuple containing: 216 - centroids (numpy.ndarray): The centroids of the clusters. 217 - centroid_dir (str): The directory where the centroids are saved. 218 """ 219 import time 220 221 times = {} 222 223 start_time = time.perf_counter() 224 import os 225 import numpy as np 226 import csv 227 228 from griptomo.core import density2graph as d2g 229 230 times["module import"] = time.perf_counter() - start_time 231 232 fname = os.path.basename(mrc_file_path) 233 density_name = fname[:-4] 234 235 centroid_dir = os.path.join( 236 output_dir, 237 protein_name, 238 density_name, 239 f"density_{density_threshold}", 240 f"min_samples_{DBSCAN_min_samples}", 241 f"epsilon_{DBSCAN_epsilon}", 242 ) 243 if os.path.exists(centroid_dir): 244 print(f"{centroid_dir} already exists.") 245 else: 246 os.makedirs(centroid_dir) 247 print(f"{centroid_dir} does not exist. Creating it.") 248 249 centroid_filename = os.path.join( 250 centroid_dir, 251 "centroids.npy", 252 ) 253 254 if os.path.exists(centroid_filename): 255 print(f"{centroid_filename} already exists.") 256 start_time = time.perf_counter() 257 centroids = np.load(centroid_filename) 258 times["load centroids"] = time.perf_counter() - start_time 259 else: 260 print(f"{centroid_filename} does not exist. Generating coarse model.") 261 ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels 262 start_time = time.perf_counter() 263 mrc = d2g.load_density_file(mrc_file_path) # load density file 264 times["load density file"] = time.perf_counter() - start_time 265 266 start_time = time.perf_counter() 267 D_norm = d2g.normalize_mrc_data(mrc.data) 268 times["normalize data"] = time.perf_counter() - start_time 269 total_voxel_count = mrc.data.size 270 271 start_time = time.perf_counter() 272 D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold) 273 times["threshold data"] = time.perf_counter() - start_time 274 275 start_time = time.perf_counter() 276 xyz_data = d2g.generate_point_cloud_from_mrc_data( 277 D_thresholded, density_threshold 278 ) 279 times["generate point cloud"] = time.perf_counter() - start_time 280 281 point_cloud_node_count = len(xyz_data) 282 283 start_time = time.perf_counter() 284 ### perform clustering and get cluster centers 285 model = d2g.cluster_data( 286 xyz_data, DBSCAN_epsilon, DBSCAN_min_samples 287 ) # cluster thresholded data using DBSCAN 288 times["cluster data"] = time.perf_counter() - start_time 289 290 noisy_node_count = np.sum(model.labels_ == -1) 291 non_noise_cluster_count = len(np.unique(model.labels_)) 292 if noisy_node_count > 0: 293 non_noise_cluster_count -= 1 294 295 start_time = time.perf_counter() 296 centroids = d2g.get_cluster_centroids( 297 xyz_data, model 298 ) # coarse grain model by getting cluster centroids 299 times["get cluster centroids"] = time.perf_counter() - start_time 300 301 start_time = time.perf_counter() 302 np.save(centroid_filename, centroids) 303 times["save centroids"] = time.perf_counter() - start_time 304 305 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 306 with open(run_info_filename, "w", newline="") as csvfile: 307 fieldnames = [ 308 "Total Voxel Count", 309 "Point Cloud Node Count", 310 "Non Noise Cluster Count", 311 "Noisy Node Count", 312 ] 313 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 314 315 writer.writeheader() 316 writer.writerow( 317 { 318 "Total Voxel Count": total_voxel_count, 319 "Point Cloud Node Count": point_cloud_node_count, 320 "Non Noise Cluster Count": non_noise_cluster_count, 321 "Noisy Node Count": noisy_node_count, 322 } 323 ) 324 # Print timing information to a file 325 times_filename = centroid_filename.replace("centroids.npy", "times.csv") 326 with open(times_filename, "w", newline="") as csvfile: 327 fieldnames = list(times.keys()) 328 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 329 writer.writeheader() 330 writer.writerow(times) 331 return (centroids, centroid_dir) 332 333 334@python_app 335def extract_graph_features_from_centroids_timed( 336 centroids, 337 cutoff, 338 output_dir, 339 force_overwrite=False, 340 skip_clique_num=False, 341): 342 """ 343 Extracts graph features from centroid coordinates and saves them to a CSV file. 344 Additionally, save the times of each step to a separate CSV file. 345 346 Parameters 347 ---------- 348 centroids : str or numpy.ndarray 349 Path to a file containing centroid coordinates or a numpy array of centroid coordinates. 350 cutoff : float 351 Distance threshold for creating edges in the graph. 352 output_dir : str 353 Directory where the output CSV file will be saved. 354 force_overwrite : bool, optional 355 If True, overwrite the existing output file. Defaults to False. 356 skip_clique_num : bool, optional 357 If True, skip the calculation of the number of maximal cliques. Defaults to False. 358 359 Returns 360 ------- 361 str 362 Path to the output CSV file containing the graph features. 363 """ 364 import os 365 366 feature_output_dir = os.path.join(output_dir, f"cutoff_{cutoff}") 367 368 if os.path.exists(feature_output_dir): 369 print(f"{feature_output_dir} already exists.") 370 else: 371 os.makedirs(feature_output_dir) 372 print(f"{feature_output_dir} does not exist. Creating it.") 373 374 feature_output_filename = os.path.join(feature_output_dir, "graph_features.csv") 375 376 if os.path.exists(feature_output_filename) and not force_overwrite: 377 print(f"{feature_output_filename} already exists and force_overwrite is False.") 378 else: 379 import time 380 381 times = {} 382 383 start_time = time.perf_counter() 384 import numpy as np 385 import csv 386 387 from griptomo.core import ig_extract_features as g2c 388 from griptomo.core.density2graph import create_igraph_from_point_cloud 389 390 times["module import"] = time.perf_counter() - start_time 391 392 # If the centroids are not already loaded, load them. 393 if isinstance(centroids, str): 394 centroids = np.load(centroids) 395 396 graph = create_igraph_from_point_cloud(centroids, cutoff) 397 times["convert to igraph"] = time.perf_counter() - start_time 398 399 start_time = time.perf_counter() 400 graph_features, graph_times = g2c.igraph_calc_graph_features_timed( 401 graph, skip_clique_num=skip_clique_num 402 ) 403 times["extract features"] = time.perf_counter() - start_time 404 405 if os.path.exists(feature_output_filename): 406 print(f"{feature_output_filename} already exists but will be overwritten.") 407 else: 408 print(f"{feature_output_filename} does not exist. Writing graph features.") 409 with open(feature_output_filename, "w", newline="") as csvfile: 410 fieldnames = list(graph_features.keys()) 411 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 412 writer.writeheader() 413 writer.writerow(graph_features) 414 graph_times_filename = feature_output_filename.replace( 415 "graph_features.csv", "graph_times.csv" 416 ) 417 with open(graph_times_filename, "w", newline="") as csvfile: 418 fieldnames = list(graph_times.keys()) 419 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 420 writer.writeheader() 421 writer.writerow(graph_times) 422 times_filename = feature_output_filename.replace( 423 "graph_features.csv", "times.csv" 424 ) 425 with open(times_filename, "w", newline="") as csvfile: 426 fieldnames = list(times.keys()) 427 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 428 writer.writeheader() 429 writer.writerow(times) 430 431 return feature_output_filename 432 433 434@python_app 435def convert_density_file_to_point_cloud_hdbscan_timed( 436 protein_name, 437 mrc_file_path, 438 HDBSCAN_min_cluster_size, 439 HDBSCAN_min_samples, 440 coarsening_dim, 441 density_threshold, 442 output_dir, 443 averaged=False, 444 density_coarsening=False, 445 precoarsened=False, 446 standardize=False, 447 threshold_ratio=False, 448): 449 """ 450 Convert a density file to centroids using HDBSCAN clustering. 451 Additionally, save the timings of each step to a CSV file. 452 453 Parameters 454 ---------- 455 protein_name : str 456 Name of the protein. 457 mrc_file_path : str 458 Path to the MRC density file. 459 HDBSCAN_min_cluster_size : float 460 The minimum number of points in a cluster. 461 HDBSCAN_min_samples : int 462 The number of samples in a neighborhood for a point to be considered as a core point. 463 coarsening_dim : int 464 The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud. 465 density_threshold : float 466 The threshold value for density. 467 output_dir : str 468 Directory where the output will be saved. 469 averaged : bool, optional 470 If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False. 471 density_coarsening : bool, optional 472 If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False. 473 precoarsened : bool, optional 474 If True, pre-coarsen the data before extracting the cluster centroids. Defaults to False. 475 standardize : bool, optional 476 If True, standardize the data before thresholding. Defaults to False. 477 threshold_ratio : bool, optional 478 If True, use the threshold ratio to identify the threshold. Defaults to False. 479 480 Returns 481 ------- 482 tuple 483 A tuple containing: 484 - centroids (numpy.ndarray): The centroids of the clusters. 485 - centroid_dir (str): The directory where the centroids are saved. 486 """ 487 import time 488 489 times = {} 490 491 start_time = time.perf_counter() 492 import os 493 import numpy as np 494 import csv 495 496 from griptomo.core import density2graph as d2g 497 498 times["module import"] = time.perf_counter() - start_time 499 500 fname = os.path.basename(mrc_file_path) 501 density_name = fname[:-4] 502 503 centroid_dir = os.path.join( 504 output_dir, 505 protein_name, 506 density_name, 507 f"density_{density_threshold}", 508 f"min_samples_{HDBSCAN_min_samples}", 509 f"min_cluster_size_{HDBSCAN_min_cluster_size}", 510 f"coarsening_dim_{coarsening_dim}", 511 ) 512 if os.path.exists(centroid_dir): 513 print(f"{centroid_dir} already exists.") 514 else: 515 print(f"{centroid_dir} does not exist. Creating it.") 516 os.makedirs(centroid_dir) 517 518 centroid_filename = os.path.join( 519 centroid_dir, 520 "centroids.npy", 521 ) 522 523 if os.path.exists(centroid_filename): 524 print(f"{centroid_filename} already exists.") 525 start_time = time.perf_counter() 526 centroids = np.load(centroid_filename) 527 times["load centroids"] = time.perf_counter() - start_time 528 else: 529 print(f"{centroid_filename} does not exist. Generating coarse model.") 530 ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels 531 start_time = time.perf_counter() 532 mrc = d2g.load_density_file(mrc_file_path) # load density file 533 times["load density file"] = time.perf_counter() - start_time 534 535 start_time = time.perf_counter() 536 if standardize: 537 D_standardized = d2g.standardize_mrc_data(mrc.data) 538 times["standardize data"] = time.perf_counter() - start_time 539 start_time = time.perf_counter() 540 D_norm = d2g.normalize_mrc_data(D_standardized) 541 times["normalize data"] = time.perf_counter() - start_time 542 else: 543 D_norm = d2g.normalize_mrc_data(mrc.data) 544 times["normalize data"] = time.perf_counter() - start_time 545 total_voxel_count = mrc.data.size 546 547 start_time = time.perf_counter() 548 if threshold_ratio: 549 threshold = d2g.identify_threshold_ratio(D_norm, density_threshold) 550 else: 551 threshold = density_threshold 552 start_time = time.perf_counter() 553 xyz_data = d2g.generate_point_cloud_from_mrc_data(D_norm, threshold) 554 times["generate point cloud"] = time.perf_counter() - start_time 555 556 if density_coarsening: 557 xyz_densities = D_norm[xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2]] + 1 558 xyz_densities = xyz_densities / 2 559 fine_mean_density = np.mean(xyz_densities) 560 fine_median_density = np.median(xyz_densities) 561 fine_max_density = np.max(xyz_densities) 562 fine_min_density = np.min(xyz_densities) 563 fine_std_density = np.std(xyz_densities) 564 fine_sum_density = np.sum(xyz_densities) 565 xyz_densities_filename = os.path.join( 566 centroid_dir, 567 "xyz_densities.npy", 568 ) 569 np.save(xyz_densities_filename, xyz_densities) 570 else: 571 xyz_densities = None 572 573 point_cloud_node_count = len(xyz_data) 574 575 if precoarsened: 576 start_time = time.perf_counter() 577 coarsened = d2g.voxel_coarsening( 578 coarsening_dim, 579 xyz_data, 580 np.zeros(xyz_data.shape[0], dtype=np.int64), 581 density=xyz_densities, 582 averaged=averaged, 583 ) 584 if xyz_densities is not None: 585 xyz_densities = coarsened[1] 586 xyz_data = coarsened[0] 587 else: 588 xyz_data = coarsened 589 times["precoarsening"] = time.perf_counter() - start_time 590 591 start_time = time.perf_counter() 592 ### perform clustering and get cluster centers 593 model = d2g.cluster_data_hdbscan( 594 xyz_data, HDBSCAN_min_cluster_size, HDBSCAN_min_samples 595 ) # cluster thresholded data using DBSCAN 596 times["cluster data"] = time.perf_counter() - start_time 597 598 noisy_node_count = np.sum(model.labels_ == -1) 599 non_noise_cluster_count = len(np.unique(model.labels_)) 600 if noisy_node_count > 0: 601 non_noise_cluster_count -= 1 602 if non_noise_cluster_count == 0: 603 print("No clusters found for {centroid_dir}") 604 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 605 with open(run_info_filename, "w", newline="") as csvfile: 606 fieldnames = ["Total Voxel Count", "Point Cloud Node Count"] 607 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 608 writer.writeheader() 609 writer.writerow( 610 { 611 "Total Voxel Count": total_voxel_count, 612 "Point Cloud Node Count": point_cloud_node_count, 613 } 614 ) 615 # Print timing information to a file 616 times_filename = centroid_filename.replace("centroids.npy", "times.csv") 617 with open(times_filename, "w", newline="") as csvfile: 618 fieldnames = list(times.keys()) 619 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 620 writer.writeheader() 621 writer.writerow(times) 622 raise ValueError("No clusters found.") 623 largest_cluster = np.argmax(np.bincount(model.labels_[model.labels_ != -1])) 624 largest_cluster_size = np.sum(model.labels_ == largest_cluster) 625 626 xyz_filename = os.path.join( 627 centroid_dir, 628 "xyz_data.npy", 629 ) 630 labels_filename = os.path.join( 631 centroid_dir, 632 "labels.npy", 633 ) 634 np.save(xyz_filename, xyz_data) 635 np.save(labels_filename, model.labels_) 636 637 if not precoarsened: 638 start_time = time.perf_counter() 639 coarsened = d2g.voxel_coarsening( 640 coarsening_dim, 641 xyz_data, 642 model.labels_, 643 density=xyz_densities, 644 averaged=averaged, 645 ) # coarse grain model by getting cluster centroids 646 if density_coarsening: 647 centroids, centroid_densities = coarsened 648 else: 649 centroids = coarsened 650 coarsened_size = centroids.shape[0] 651 times["voxel coarsening"] = time.perf_counter() - start_time 652 else: 653 centroids = xyz_data[model.labels_ == largest_cluster] 654 centroid_densities = ( 655 xyz_densities[model.labels_ == largest_cluster] 656 if xyz_densities is not None 657 else None 658 ) 659 coarsened_size = centroids.shape[0] 660 if density_coarsening: 661 coarse_mean_density = np.mean(centroid_densities) 662 coarse_median_density = np.median(centroid_densities) 663 coarse_max_density = np.max(centroid_densities) 664 coarse_min_density = np.min(centroid_densities) 665 coarse_std_density = np.std(centroid_densities) 666 coarse_sum_density = np.sum(centroid_densities) 667 else: 668 centroids = coarsened 669 670 coarsened_size = centroids.shape[0] 671 times["voxel coarsening"] = time.perf_counter() - start_time 672 673 start_time = time.perf_counter() 674 if density_coarsening: 675 centroid_densities_filename = os.path.join( 676 centroid_dir, 677 "centroid_densities.npy", 678 ) 679 np.save(centroid_densities_filename, centroid_densities) 680 np.save(centroid_filename, centroids) 681 times["save centroids"] = time.perf_counter() - start_time 682 683 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 684 with open(run_info_filename, "w", newline="") as csvfile: 685 fieldnames = [ 686 "Total Voxel Count", 687 "Point Cloud Node Count", 688 "Non Noise Cluster Count", 689 "Noisy Node Count", 690 "Largest Cluster Size", 691 "Coarsened Size", 692 ] 693 if density_coarsening: 694 fieldnames += [ 695 "Fine Mean Density", 696 "Fine Median Density", 697 "Fine Max Density", 698 "Fine Min Density", 699 "Fine Std Density", 700 "Fine Sum Density", 701 "Coarse Mean Density", 702 "Coarse Median Density", 703 "Coarse Max Density", 704 "Coarse Min Density", 705 "Coarse Std Density", 706 "Coarse Sum Density", 707 ] 708 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 709 710 writer.writeheader() 711 if density_coarsening: 712 writer.writerow( 713 { 714 "Total Voxel Count": total_voxel_count, 715 "Point Cloud Node Count": point_cloud_node_count, 716 "Non Noise Cluster Count": non_noise_cluster_count, 717 "Noisy Node Count": noisy_node_count, 718 "Largest Cluster Size": largest_cluster_size, 719 "Coarsened Size": coarsened_size, 720 "Fine Mean Density": fine_mean_density, 721 "Fine Median Density": fine_median_density, 722 "Fine Max Density": fine_max_density, 723 "Fine Min Density": fine_min_density, 724 "Fine Std Density": fine_std_density, 725 "Fine Sum Density": fine_sum_density, 726 "Coarse Mean Density": coarse_mean_density, 727 "Coarse Median Density": coarse_median_density, 728 "Coarse Max Density": coarse_max_density, 729 "Coarse Min Density": coarse_min_density, 730 "Coarse Std Density": coarse_std_density, 731 "Coarse Sum Density": coarse_sum_density, 732 } 733 ) 734 else: 735 writer.writerow( 736 { 737 "Total Voxel Count": total_voxel_count, 738 "Point Cloud Node Count": point_cloud_node_count, 739 "Non Noise Cluster Count": non_noise_cluster_count, 740 "Noisy Node Count": noisy_node_count, 741 "Largest Cluster Size": largest_cluster_size, 742 "Coarsened Size": coarsened_size, 743 } 744 ) 745 # Print timing information to a file 746 times_filename = centroid_filename.replace("centroids.npy", "times.csv") 747 with open(times_filename, "w", newline="") as csvfile: 748 fieldnames = list(times.keys()) 749 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 750 writer.writeheader() 751 writer.writerow(times) 752 return (centroids, centroid_dir) 753 754 755@python_app 756def convert_density_file_to_point_cloud_optics_timed( 757 protein_name, 758 mrc_file_path, 759 OPTICS_min_samples, 760 coarsening_dim, 761 density_threshold, 762 output_dir, 763 OPTICS_min_cluster_size=None, 764 OPTICS_max_epsilon=None, 765 averaged=False, 766 density_coarsening=False, 767): 768 """ 769 Convert a density file to centroids using OPTICS clustering. 770 Additionally, save the timings of each step to a CSV file. 771 772 Parameters 773 ---------- 774 protein_name : str 775 Name of the protein. 776 mrc_file_path : str 777 Path to the MRC density file. 778 OPTICS_min_samples : int 779 The number of samples in a neighborhood for a point to be considered as a core point. 780 coarsening_dim : int 781 The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud. 782 density_threshold : float 783 The threshold value for density. 784 output_dir : str 785 Directory where the output will be saved. 786 OPTICS_min_cluster_size : float, optional 787 The minimum number of points in a cluster. Defaults to None. 788 OPTICS_max_epsilon : float, optional 789 The maximum distance between points considered by OPTICS. Defaults to None, which uses np.inf. A lower value may reduce computation time. 790 averaged : bool, optional 791 If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False. 792 density_coarsening : bool, optional 793 If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False. 794 795 Returns 796 ------- 797 tuple 798 A tuple containing: 799 - centroids (numpy.ndarray): The centroids of the clusters. 800 - centroid_dir (str): The directory where the centroids are saved. 801 """ 802 import time 803 804 times = {} 805 806 start_time = time.perf_counter() 807 import os 808 import numpy as np 809 import csv 810 811 from griptomo.core import density2graph as d2g 812 813 times["module import"] = time.perf_counter() - start_time 814 815 OPTICS_max_epsilon = ( 816 OPTICS_max_epsilon if OPTICS_max_epsilon is not None else np.inf 817 ) 818 819 fname = os.path.basename(mrc_file_path) 820 density_name = fname[:-4] 821 822 centroid_dir = os.path.join( 823 output_dir, 824 protein_name, 825 density_name, 826 f"density_{density_threshold}", 827 f"min_samples_{OPTICS_min_samples}", 828 f"max_eps_{OPTICS_max_epsilon}", 829 f"coarsening_dim_{coarsening_dim}", 830 ) 831 if os.path.exists(centroid_dir): 832 print(f"{centroid_dir} already exists.") 833 else: 834 print(f"{centroid_dir} does not exist. Creating it.") 835 os.makedirs(centroid_dir) 836 837 centroid_filename = os.path.join( 838 centroid_dir, 839 "centroids.npy", 840 ) 841 842 if os.path.exists(centroid_filename): 843 print(f"{centroid_filename} already exists.") 844 start_time = time.perf_counter() 845 centroids = np.load(centroid_filename) 846 times["load centroids"] = time.perf_counter() - start_time 847 else: 848 print(f"{centroid_filename} does not exist. Generating coarse model.") 849 ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels 850 start_time = time.perf_counter() 851 mrc = d2g.load_density_file(mrc_file_path) # load density file 852 times["load density file"] = time.perf_counter() - start_time 853 854 start_time = time.perf_counter() 855 D_norm = d2g.normalize_mrc_data(mrc.data) 856 times["normalize data"] = time.perf_counter() - start_time 857 total_voxel_count = mrc.data.size 858 859 start_time = time.perf_counter() 860 D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold) 861 times["threshold data"] = time.perf_counter() - start_time 862 863 start_time = time.perf_counter() 864 xyz_data = d2g.generate_point_cloud_from_mrc_data( 865 D_thresholded, density_threshold 866 ) 867 times["generate point cloud"] = time.perf_counter() - start_time 868 869 point_cloud_node_count = len(xyz_data) 870 871 start_time = time.perf_counter() 872 ### perform clustering and get cluster centers 873 model = d2g.cluster_data_optics( 874 xyz_data, OPTICS_min_samples, OPTICS_min_cluster_size, OPTICS_max_epsilon 875 ) # cluster thresholded data using DBSCAN 876 times["cluster data"] = time.perf_counter() - start_time 877 878 noisy_node_count = np.sum(model.labels_ == -1) 879 non_noise_cluster_count = len(np.unique(model.labels_)) 880 if noisy_node_count > 0: 881 non_noise_cluster_count -= 1 882 if non_noise_cluster_count == 0: 883 print("No clusters found for {centroid_dir}") 884 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 885 with open(run_info_filename, "w", newline="") as csvfile: 886 fieldnames = ["Total Voxel Count", "Point Cloud Node Count"] 887 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 888 writer.writeheader() 889 writer.writerow( 890 { 891 "Total Voxel Count": total_voxel_count, 892 "Point Cloud Node Count": point_cloud_node_count, 893 } 894 ) 895 # Print timing information to a file 896 times_filename = centroid_filename.replace("centroids.npy", "times.csv") 897 with open(times_filename, "w", newline="") as csvfile: 898 fieldnames = list(times.keys()) 899 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 900 writer.writeheader() 901 writer.writerow(times) 902 raise ValueError("No clusters found.") 903 largest_cluster = np.argmax(np.bincount(model.labels_[model.labels_ != -1])) 904 largest_cluster_size = np.sum(model.labels_ == largest_cluster) 905 906 if density_coarsening: 907 xyz_densities = D_norm[xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2]] + 1 908 xyz_densities = xyz_densities / 2 909 fine_mean_density = np.mean(xyz_densities) 910 fine_median_density = np.median(xyz_densities) 911 fine_max_density = np.max(xyz_densities) 912 fine_min_density = np.min(xyz_densities) 913 fine_std_density = np.std(xyz_densities) 914 fine_sum_density = np.sum(xyz_densities) 915 xyz_densities_filename = os.path.join( 916 centroid_dir, 917 "xyz_densities.npy", 918 ) 919 np.save(xyz_densities_filename, xyz_densities) 920 else: 921 xyz_densities = None 922 923 xyz_filename = os.path.join( 924 centroid_dir, 925 "xyz_data.npy", 926 ) 927 labels_filename = os.path.join( 928 centroid_dir, 929 "labels.npy", 930 ) 931 np.save(xyz_filename, xyz_data) 932 np.save(labels_filename, model.labels_) 933 934 start_time = time.perf_counter() 935 coarsened = d2g.voxel_coarsening( 936 coarsening_dim, 937 xyz_data, 938 model.labels_, 939 density=xyz_densities, 940 averaged=averaged, 941 ) # coarse grain model by getting cluster centroids 942 if density_coarsening: 943 centroids, centroid_densities = coarsened 944 coarse_mean_density = np.mean(centroid_densities) 945 coarse_median_density = np.median(centroid_densities) 946 coarse_max_density = np.max(centroid_densities) 947 coarse_min_density = np.min(centroid_densities) 948 coarse_std_density = np.std(centroid_densities) 949 coarse_sum_density = np.sum(centroid_densities) 950 else: 951 centroids = coarsened 952 953 coarsened_size = centroids.shape[0] 954 times["voxel coarsening"] = time.perf_counter() - start_time 955 956 start_time = time.perf_counter() 957 if density_coarsening: 958 centroid_densities_filename = os.path.join( 959 centroid_dir, 960 "centroid_densities.npy", 961 ) 962 np.save(centroid_densities_filename, centroid_densities) 963 np.save(centroid_filename, centroids) 964 times["save centroids"] = time.perf_counter() - start_time 965 966 run_info_filename = centroid_filename.replace(".npy", "_run_info.csv") 967 with open(run_info_filename, "w", newline="") as csvfile: 968 fieldnames = [ 969 "Total Voxel Count", 970 "Point Cloud Node Count", 971 "Non Noise Cluster Count", 972 "Noisy Node Count", 973 "Largest Cluster Size", 974 "Coarsened Size", 975 ] 976 if density_coarsening: 977 fieldnames += [ 978 "Fine Mean Density", 979 "Fine Median Density", 980 "Fine Max Density", 981 "Fine Min Density", 982 "Fine Std Density", 983 "Fine Sum Density", 984 "Coarse Mean Density", 985 "Coarse Median Density", 986 "Coarse Max Density", 987 "Coarse Min Density", 988 "Coarse Std Density", 989 "Coarse Sum Density", 990 ] 991 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 992 993 writer.writeheader() 994 if density_coarsening: 995 writer.writerow( 996 { 997 "Total Voxel Count": total_voxel_count, 998 "Point Cloud Node Count": point_cloud_node_count, 999 "Non Noise Cluster Count": non_noise_cluster_count, 1000 "Noisy Node Count": noisy_node_count, 1001 "Largest Cluster Size": largest_cluster_size, 1002 "Coarsened Size": coarsened_size, 1003 "Fine Mean Density": fine_mean_density, 1004 "Fine Median Density": fine_median_density, 1005 "Fine Max Density": fine_max_density, 1006 "Fine Min Density": fine_min_density, 1007 "Fine Std Density": fine_std_density, 1008 "Fine Sum Density": fine_sum_density, 1009 "Coarse Mean Density": coarse_mean_density, 1010 "Coarse Median Density": coarse_median_density, 1011 "Coarse Max Density": coarse_max_density, 1012 "Coarse Min Density": coarse_min_density, 1013 "Coarse Std Density": coarse_std_density, 1014 "Coarse Sum Density": coarse_sum_density, 1015 } 1016 ) 1017 else: 1018 writer.writerow( 1019 { 1020 "Total Voxel Count": total_voxel_count, 1021 "Point Cloud Node Count": point_cloud_node_count, 1022 "Non Noise Cluster Count": non_noise_cluster_count, 1023 "Noisy Node Count": noisy_node_count, 1024 "Largest Cluster Size": largest_cluster_size, 1025 "Coarsened Size": coarsened_size, 1026 } 1027 ) 1028 # Print timing information to a file 1029 times_filename = centroid_filename.replace("centroids.npy", "times.csv") 1030 with open(times_filename, "w", newline="") as csvfile: 1031 fieldnames = list(times.keys()) 1032 writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 1033 writer.writeheader() 1034 writer.writerow(times) 1035 return (centroids, centroid_dir)
Convert a density file to centroids using DBSCAN clustering.
Parameters
- protein_name (str): Name of the protein.
- mrc_file_path (str): Path to the MRC density file.
- DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
- DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
- density_threshold (float): The threshold value for density.
- output_dir (str): Directory where the output will be saved.
Returns
- tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.
Extracts graph features from centroid coordinates and saves them to a CSV file.
Parameters
- centroids (str or numpy.ndarray): Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
- cutoff (float): Distance threshold for creating edges in the graph.
- output_dir (str): Directory where the output CSV file will be saved.
- force_overwrite (bool, optional): If True, overwrite the existing output file. Defaults to False.
Returns
- str: Path to the output CSV file containing the graph features.
Convert a density file to centroids using DBSCAN clustering. Additionally, save the timings of each step to a CSV file.
Parameters
- protein_name (str): Name of the protein.
- mrc_file_path (str): Path to the MRC density file.
- DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
- DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
- density_threshold (float): The threshold value for density.
- output_dir (str): Directory where the output will be saved.
Returns
- tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.
Extracts graph features from centroid coordinates and saves them to a CSV file. Additionally, save the times of each step to a separate CSV file.
Parameters
- centroids (str or numpy.ndarray): Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
- cutoff (float): Distance threshold for creating edges in the graph.
- output_dir (str): Directory where the output CSV file will be saved.
- force_overwrite (bool, optional): If True, overwrite the existing output file. Defaults to False.
- skip_clique_num (bool, optional): If True, skip the calculation of the number of maximal cliques. Defaults to False.
Returns
- str: Path to the output CSV file containing the graph features.
Convert a density file to centroids using HDBSCAN clustering. Additionally, save the timings of each step to a CSV file.
Parameters
- protein_name (str): Name of the protein.
- mrc_file_path (str): Path to the MRC density file.
- HDBSCAN_min_cluster_size (float): The minimum number of points in a cluster.
- HDBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
coarsening_dim (int):
The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
- density_threshold (float): The threshold value for density.
- output_dir (str): Directory where the output will be saved.
- averaged (bool, optional): If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
- density_coarsening (bool, optional): If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.
- precoarsened (bool, optional): If True, pre-coarsen the data before extracting the cluster centroids. Defaults to False.
- standardize (bool, optional): If True, standardize the data before thresholding. Defaults to False.
- threshold_ratio (bool, optional): If True, use the threshold ratio to identify the threshold. Defaults to False.
Returns
- tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.
Convert a density file to centroids using OPTICS clustering. Additionally, save the timings of each step to a CSV file.
Parameters
- protein_name (str): Name of the protein.
- mrc_file_path (str): Path to the MRC density file.
- OPTICS_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
coarsening_dim (int):
The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
- density_threshold (float): The threshold value for density.
- output_dir (str): Directory where the output will be saved.
- OPTICS_min_cluster_size (float, optional): The minimum number of points in a cluster. Defaults to None.
- OPTICS_max_epsilon (float, optional): The maximum distance between points considered by OPTICS. Defaults to None, which uses np.inf. A lower value may reduce computation time.
- averaged (bool, optional): If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
- density_coarsening (bool, optional): If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.
Returns
- tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.