griptomo.core.density2graph
1# August George, 2022, PNNL 2 3import argparse 4import mrcfile 5import matplotlib.pyplot as plt 6 7try: 8 import cupy as cp 9 from cuml.cluster import DBSCAN as DBSCAN_GPU, HDBSCAN as HDBSCAN_GPU 10 from cuml.preprocessing import StandardScaler as StandardScaler_GPU 11 import cugraph as cg 12 import cudf 13 14 if cp.cuda.is_available(): 15 IS_GPU_AVAILABLE = True 16 print("GPU support for GRIP-Tomo clustering is available.") 17 else: 18 raise Exception("No GPU detected.") 19except Exception as e: 20 print( 21 f"GPU support for GRIP-Tomo clustering is not available because of {e}. Using CPU." 22 ) 23 IS_GPU_AVAILABLE = False 24import numpy as np 25from hdbscan import HDBSCAN 26from sklearn.cluster import DBSCAN, OPTICS 27from sklearn.preprocessing import StandardScaler 28import networkx as nx 29from pathlib import Path 30from warnings import warn 31from scipy.spatial import KDTree 32import igraph as ig 33 34 35def _ensure_array(data, device=None): 36 """Ensures that the input data is a numpy array.""" 37 if isinstance(data, np.ndarray): 38 if device == "gpu" and IS_GPU_AVAILABLE: 39 return cp.asarray(data) 40 return data 41 elif isinstance(data, mrcfile.mrcfile.MrcFile): 42 array_data = np.asarray(data.data) 43 if device == "gpu" and IS_GPU_AVAILABLE: 44 return cp.asarray(array_data) 45 return array_data 46 if IS_GPU_AVAILABLE: 47 if isinstance(data, cp.ndarray): 48 return data 49 else: 50 raise ValueError(f"Unsupported data type: {type(data)}") 51 52 53def load_density_file(fname, print_warning=True): 54 """ 55 Load a .mrc file using the mrcfile package. 56 57 Parameters 58 ---------- 59 fname : str 60 Filename or filepath of the .mrc file. 61 print_warning : bool, optional 62 If True, prints a warning message about data inversion. Default is True. 63 64 Returns 65 ------- 66 mrcfile.mrcfile.MrcFile 67 MRC file object containing the header and data properties. 68 """ 69 # load .mrc tomogram file as a MRC object which has header and data properties. 70 # see: https://mrcfile.readthedocs.io/en/latest/usage_guide.html 71 mrc = mrcfile.mmap(fname, mode="r") # memory mapped mode for large files 72 # warn('WARNING: check the inversion of the data. White voxels should correspond to high density.') 73 if print_warning: 74 print( 75 "WARNING: check the inversion of the data. White voxels should correspond to high density." 76 ) 77 return mrc 78 79 80def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True): 81 """ 82 Normalizes threshold value and densities then applies a cutoff threshold. 83 84 Parameters 85 ---------- 86 mrc : mrcfile.mrcfile.MrcFile 87 MRC data. 88 mrc_t : float 89 Raw (unnormalized) pixel intensity cutoff threshold. 90 noise_stdev : float, optional 91 Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added). 92 norm_T : bool, optional 93 If True, threshold value is normalized. Default is True. 94 95 Returns 96 ------- 97 numpy.ndarray or cupy.ndarray 98 Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0]. 99 """ 100 warn("This method is deprecated.", DeprecationWarning, stacklevel=2) 101 # load and normalize data, normalize threshold value 102 if noise_stdev == 0: 103 D = mrc.data 104 else: 105 assert noise_stdev >= 0 106 print("add_Gaussian_noise") 107 D = add_Gaussian_noise(mrc, scale=noise_stdev) 108 109 # get x,y,z coordinates above threshold 110 if norm_T == False: 111 print("Do not normalize") 112 x, y, z = np.where(D > mrc_t) 113 else: 114 D_min = np.min(D) 115 D_max = np.max(D) 116 D_norm = (D - D_min) / ( 117 D_max - D_min 118 ) # normalize to 0,1: (x_i-x_min) / (x_max - x_min) 119 120 t_norm = (mrc_t - D_min) / (D_max - D_min) 121 # Doo Nam confirmed that normalization of density ran well. 122 # Doo Nam confirmed that as long as he puts reasonable mrc_t, t_norm is 0~1 123 # However, since both density and t_norm are normalized, there was no practice difference for xyz_data (regardless of norm_T) 124 125 x, y, z = np.where(D_norm > t_norm) 126 xyz_data = np.transpose(np.stack([x, y, z])) 127 return xyz_data 128 129 130### end of def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True): 131 132 133def normalize_mrc_data( 134 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 135 lower_bound: float = -1, 136 upper_bound: float = 1, 137 device: str = "cpu", 138) -> np.ndarray: 139 """Normalizes the data of an MRC file. 140 141 Normalizes the data from `mrc` to the range of 142 [`lower_bound`, `upper_bound`]. 143 144 Parameters 145 ---------- 146 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 147 MRC file or MRC data (numpy array or cupy array). 148 lower_bound : float, optional 149 Lower bound of the normalized data. Default is -1. 150 upper_bound : float, optional 151 Upper bound of the normalized data. Default is 1. 152 device : str, optional 153 Device to use for computation. Default is cpu. 154 155 Returns 156 ------- 157 numpy.ndarray or cupy.ndarray 158 Array of normalized MRC data. 159 160 Notes 161 ----- 162 x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x)) 163 """ 164 D = _ensure_array(mrc_data, device) 165 assert lower_bound < upper_bound 166 D_min = np.min(D) 167 D_max = np.max(D) 168 D_norm = lower_bound + (upper_bound - lower_bound) * (D - D_min) / (D_max - D_min) 169 return D_norm 170 171 172def standardize_mrc_data( 173 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, device: str = "cpu" 174) -> np.ndarray: 175 """Standardizes the data using scikit-learn's StandardScaler. 176 177 Parameters 178 ---------- 179 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 180 MRC file or MRC data (numpy array or cupy array). 181 device : str, optional 182 Device to use for computation. Default is cpu. 183 184 Returns 185 ------- 186 numpy.ndarray or cupy.ndarray 187 Array of standardized MRC data. 188 189 Notes 190 ----- 191 Standardization: z = (x - u) / s 192 u: mean of the data 193 s: standard deviation of the data 194 195 """ 196 D = _ensure_array(mrc_data, device) 197 198 if device == "gpu" and IS_GPU_AVAILABLE: 199 scaler = StandardScaler_GPU() 200 else: 201 scaler = StandardScaler() 202 D_standardized = scaler.fit_transform(D.reshape(-1, 1)).reshape(D.shape) 203 204 return D_standardized 205 206 207def threshold_mrc_data( 208 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 209 threshold: float, 210 device: str = "cpu", 211) -> np.ndarray: 212 """ 213 Keeps only the MRC data above the threshold value, setting the other data to the min value. 214 215 Parameters 216 ---------- 217 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 218 MRC file or MRC data (numpy array or cupy array). 219 threshold : float 220 Threshold value. 221 device : str, optional 222 Device to use for computation. Default is cpu. 223 224 Returns 225 ------- 226 numpy.ndarray or cupy.ndarray 227 Thresholded MRC data. 228 229 Notes 230 ------ 231 This function is agnostic towards data normalization. 232 """ 233 D = _ensure_array(mrc_data, device) 234 thresholded_D = np.where(D > threshold, D, np.min(D)) 235 return thresholded_D 236 237 238def identify_threshold_ratio( 239 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, ratio: float, device: str = "cpu" 240) -> np.ndarray: 241 """ 242 Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1. 243 244 Parameters 245 ---------- 246 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 247 MRC file or MRC data (numpy array or cupy array). 248 ratio : float 249 Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points). 250 device : str, optional 251 Device to use for computation. Default is cpu. 252 253 Returns 254 ------- 255 numpy.ndarray or cupy.ndarray 256 Density value corresponding to the requested threshold ratio. 257 """ 258 assert 0 <= ratio <= 1 259 D = _ensure_array(mrc_data, device) 260 sorted_densities = np.sort(D.flatten()) 261 threshold_index = int(round(ratio * len(sorted_densities))) 262 if threshold_index == len(sorted_densities): 263 raise ValueError( 264 f"Threshold index is at the end of the sorted densities. Please choose a lower ratio." 265 ) 266 return sorted_densities[threshold_index] 267 268 269def generate_point_cloud_from_mrc_data( 270 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 271 threshold: float, 272 device: str = "cpu", 273) -> np.ndarray: 274 """ 275 Generates an array of X, Y, and Z coordinates of the data that is above a threshold. 276 277 Parameters 278 ---------- 279 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 280 MRC file or MRC data (numpy array or cupy array). 281 threshold : float 282 Threshold value. 283 device : str, optional 284 Device to use for computation. Default is cpu. 285 286 Returns 287 ------- 288 numpy.ndarray or cupy.ndarray 289 Array of X, Y, and Z coordinates of the data that is above the threshold. 290 291 Notes 292 ----- 293 This function is agnostic towards data normalization. 294 """ 295 D = _ensure_array(mrc_data, device) 296 x, y, z = np.where(D > threshold) 297 xyz_data = np.transpose(np.stack([x, y, z])) 298 return xyz_data 299 300 301def augment_mrc_data( 302 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 303 offset_percent: float = 10, 304 n: int = 1, 305) -> list[np.ndarray] | np.ndarray: 306 """ 307 Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage. 308 309 Parameters 310 ---------- 311 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray 312 MRC file or MRC data (numpy array). 313 offset_percent : float, optional 314 Offset percentage for random noise. Default is 10. 315 n : int, optional 316 Number of augmented datasets to generate. Default is 1. 317 318 Returns 319 ------- 320 list[numpy.ndarray] or numpy.ndarray 321 List of augmented MRC datasets or a single augmented dataset. 322 """ 323 D = _ensure_array(mrc_data) 324 assert 0.0 <= offset_percent <= 100 325 assert n >= 1 326 327 random_uniform = np.random.uniform 328 329 if n == 1: 330 random_offsets = ( 331 random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) / 100 332 ) 333 adjusted_data = D * (1 + random_offsets) 334 return adjusted_data 335 else: 336 adjusted_data_list = [] 337 for _ in range(n): 338 random_offsets = ( 339 random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) 340 / 100 341 ) 342 adjusted_data = D * (1 + random_offsets) 343 adjusted_data_list.append(adjusted_data) 344 return adjusted_data_list 345 346 347def calculate_point_cloud_mass( 348 point_cloud: np.ndarray, voxel_dim: float, density: float 349) -> float: 350 """ 351 Calculates the mass of an array of voxel xyz coordinates. 352 353 Parameters 354 ---------- 355 point_cloud : numpy.ndarray or cupy.ndarray 356 Array of voxel xyz coordinates. 357 voxel_dim : float 358 Voxel dimension. 359 density : float 360 Density value. 361 362 Returns 363 ------- 364 float 365 Estimated mass of the point cloud. 366 367 Notes 368 ----- 369 The estimated mass = n_voxels * voxel_dim * density. 370 For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3. 371 The units for `voxel_dim` and `density` must be consistent. 372 Use 0.82 kDa / nm3 for large globular proteins. 373 """ 374 n_voxels = point_cloud.shape[0] 375 return n_voxels * voxel_dim * density 376 377 378def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples): 379 """ 380 Clusters data using DBSCAN from sklearn or cuml. 381 382 Parameters 383 ---------- 384 xyz_data : numpy.ndarray or cupy.ndarray 385 Array of x, y, z coordinates. 386 DBSCAN_epsilon : float 387 The maximum distance between two samples for one to be considered as in the neighborhood of the other. 388 DBSCAN_min_samples : int 389 The number of samples in a neighborhood for a point to be considered as a core point. 390 391 Returns 392 ------- 393 sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN: clustering results stored in an object 394 395 See Also 396 ------- 397 sklearn.cluster.DBSCAN 398 cuml.cluster.DBSCAN 399 cluster_data_hdbscan 400 cluster_data_optics 401 """ 402 if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray): 403 clusterer = DBSCAN_GPU 404 else: 405 clusterer = DBSCAN 406 try: 407 model = clusterer( 408 eps=DBSCAN_epsilon, min_samples=DBSCAN_min_samples 409 ) # apply coarse-graining (DBSCAN) 410 model.fit_predict(xyz_data) 411 return model 412 except: 413 print( 414 'Perhaps, "Fatal Python error: _PyErr_NormalizeException: Cannot recover from MemoryErrors while normalizing exceptions. Python runtime state: initialized" at tahoma for empiar inverted mrc' 415 ) 416 print(f"Failed xyz_data.shape:{xyz_data.shape}") 417 print(f"DBSCAN_epsilon:{DBSCAN_epsilon}") 418 print(f"DBSCAN_min_samples:{DBSCAN_min_samples}") 419 return False 420 421 422def cluster_data_hdbscan(xyz_data, HDBSCAN_min_cluster_size, HDBSCAN_min_samples): 423 """ 424 Clusters data using HDBSCAN from hdbscan or cuml. 425 426 Parameters 427 ---------- 428 xyz_data : numpy.ndarray or cupy.ndarray 429 Array of x, y, z coordinates. 430 HDBSCAN_min_cluster_size : int 431 The minimum number of points in a cluster. 432 HDBSCAN_min_samples : int 433 The number of samples in a neighborhood for a point to be considered as a core point. 434 435 Returns 436 ------- 437 hdbscan.HDBSCAN or cuml.cluster.HDBSCAN: clustering results stored in an object 438 439 See Also 440 ------- 441 hdbscan.HDBSCAN 442 cuml.cluster.HDBSCAN 443 cluster_data 444 cluster_data_optics 445 """ 446 if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray): 447 clusterer = HDBSCAN_GPU 448 else: 449 clusterer = HDBSCAN 450 try: 451 model = clusterer( 452 min_cluster_size=HDBSCAN_min_cluster_size, 453 min_samples=HDBSCAN_min_samples, 454 allow_single_cluster=False, 455 core_dist_n_jobs=-1, 456 approx_min_span_tree=False, 457 cluster_selection_method="eom", 458 ) # apply clustering (HDBSCAN) 459 model.fit(xyz_data) 460 return model 461 except Exception as e: 462 print(f"Error while performing HDBSCAN: {e}") 463 print(f"Failed xyz_data.shape:{xyz_data.shape}") 464 print(f"HDBSCAN_min_cluster_size:{HDBSCAN_min_cluster_size}") 465 print(f"HDBSCAN_min_samples:{HDBSCAN_min_samples}") 466 return False 467 468 469def cluster_data_optics( 470 xyz_data, 471 OPTICS_min_samples, 472 OPTICS_min_cluster_size=None, 473 OPTICS_max_epsilon=np.inf, 474): 475 """ 476 Clusters data using OPTICS from sklearn. 477 478 Parameters 479 ---------- 480 xyz_data : numpy.ndarray or cupy.ndarray 481 Array of x, y, z coordinates. 482 OPTICS_min_samples : int 483 The number of samples in a neighborhood for a point to be considered as a core point. 484 OPTICS_min_cluster_size : int, optional 485 The minimum number of points in a cluster. Default is None. 486 OPTICS_max_epsilon : float, optional 487 The maximum distance between points considered by OPTICS. Default is numpy.inf. 488 489 Returns 490 ------- 491 sklearn.cluster.OPTICS: clustering results stored in an object 492 493 See Also 494 ------- 495 sklearn.cluster.OPTICS 496 cluster_data_hdbscan 497 cluster_data 498 """ 499 try: 500 model = OPTICS( 501 min_cluster_size=OPTICS_min_cluster_size, 502 min_samples=OPTICS_min_samples, 503 max_eps=OPTICS_max_epsilon, 504 n_jobs=-1, 505 ) # apply clustering (OPTICS) 506 model.fit(xyz_data) 507 return model 508 except Exception as e: 509 print(f"Error while performing OPTICS: {e}") 510 print(f"Failed xyz_data.shape:{xyz_data.shape}") 511 print(f"OPTICS_min_cluster_size:{OPTICS_min_cluster_size}") 512 print(f"OPTICS_min_samples:{OPTICS_min_samples}") 513 return False 514 515 516def voxel_coarsening(voxel_dim, point_cloud, labels, density=None, averaged=False): 517 """ 518 Coarsens largest cluster into voxel_dim^3 chunks. 519 520 Parameters 521 ---------- 522 voxel_dim : int 523 Chunk dimension for coarsening. 524 point_cloud : numpy.ndarray or cupy.ndarray 525 Array of x, y, z coordinates. 526 labels : numpy.ndarray or cupy.ndarray 527 Array of cluster labels. 528 density : numpy.ndarray or cupy.ndarray, optional 529 Array of density values. Default is None. 530 averaged : bool, optional 531 If True, the coarsened point cloud will be the average of the points in each chunk, using the density as weights, if provided. Default is False. 532 533 Returns 534 ------- 535 numpy.ndarray or cupy.ndarray 536 Coordinates of the centroid of each chunk that contain more than half of the non-empty voxels. 537 """ 538 max_x_dim, max_y_dim, max_z_dim = np.max(point_cloud, axis=0) + 1 539 coarsened_dim_x = int(np.ceil(max_x_dim / voxel_dim)) 540 coarsened_dim_y = int(np.ceil(max_y_dim / voxel_dim)) 541 coarsened_dim_z = int(np.ceil(max_z_dim / voxel_dim)) 542 largest_cluster = np.argmax(np.bincount(labels[labels != -1])) 543 dense_bins_count = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z)) 544 label_mask = labels == largest_cluster 545 isolated_points = point_cloud[label_mask] 546 coarsened_points = isolated_points // voxel_dim 547 coarsened_points = coarsened_points.astype(int) 548 if density is not None: 549 isolated_density = density[label_mask] 550 coarse_dense_count = np.zeros_like(dense_bins_count) 551 if not averaged: 552 for point in coarsened_points: 553 dense_bins_count[point[0], point[1], point[2]] += 1 554 x, y, z = np.where(dense_bins_count >= (voxel_dim**3) / 2) 555 coarsened_point_cloud = np.transpose(np.stack([x, y, z])) * 3 + 1 556 else: 557 dense_bins = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z, 3)) 558 if density is not None: 559 np.add.at( 560 dense_bins, 561 ( 562 coarsened_points[:, 0], 563 coarsened_points[:, 1], 564 coarsened_points[:, 2], 565 ), 566 isolated_points * isolated_density[:, np.newaxis], 567 ) 568 np.add.at( 569 dense_bins_count, 570 ( 571 coarsened_points[:, 0], 572 coarsened_points[:, 1], 573 coarsened_points[:, 2], 574 ), 575 isolated_density, 576 ) 577 np.add.at( 578 coarse_dense_count, 579 ( 580 coarsened_points[:, 0], 581 coarsened_points[:, 1], 582 coarsened_points[:, 2], 583 ), 584 1, 585 ) 586 else: 587 for coarse_point, raw_point in zip(coarsened_points, isolated_points): 588 dense_bins[ 589 coarse_point[0], coarse_point[1], coarse_point[2] 590 ] += raw_point 591 dense_bins_count[coarse_point[0], coarse_point[1], coarse_point[2]] += 1 592 dense_bin_points = np.where(dense_bins_count > 0) 593 coarsened_point_cloud = ( 594 dense_bins[dense_bin_points] 595 / dense_bins_count[dense_bin_points][:, np.newaxis] 596 ) 597 if density is not None: 598 coarsened_density = ( 599 dense_bins_count[dense_bin_points] 600 / coarse_dense_count[dense_bin_points] 601 ) 602 return coarsened_point_cloud, coarsened_density 603 return coarsened_point_cloud 604 605 606def get_cluster_centroids(xyz_data, model): 607 """ 608 Coarse grain density model using cluster centroids. 609 610 Parameters 611 ---------- 612 xyz_data : numpy.ndarray or cupy.ndarray 613 Array of x, y, z coordinates. 614 model : sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN 615 DBSCAN clustering results stored in an object. 616 617 Returns 618 ------- 619 numpy.ndarray or cupy.ndarray 620 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 621 """ 622 samples_w_lbls = np.concatenate((xyz_data, model.labels_[:, np.newaxis]), axis=1) 623 if -1 in set(model.labels_): # if noise detected 624 coarse_model = np.zeros( 625 (len(set(model.labels_)) - 1, 3) 626 ) # remove last label which is noise 627 for i in range(len(set(model.labels_)) - 1): 628 # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python 629 tmp_T = np.transpose( 630 samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))] 631 ) 632 x_mean = np.mean(tmp_T[0]) 633 y_mean = np.mean(tmp_T[1]) 634 z_mean = np.mean(tmp_T[2]) 635 coarse_model[i] = [x_mean, y_mean, z_mean] 636 else: 637 coarse_model = np.zeros((len(set(model.labels_)), 3)) 638 for i in range(len(set(model.labels_))): 639 # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python 640 tmp_T = np.transpose( 641 samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))] 642 ) 643 x_mean = np.mean(tmp_T[0]) 644 y_mean = np.mean(tmp_T[1]) 645 z_mean = np.mean(tmp_T[2]) 646 coarse_model[i] = [x_mean, y_mean, z_mean] 647 return coarse_model 648 649 650def plot_clustering_results(xyz_data, coarse_model, figsize=3): 651 """ 652 Creates a 3D scatter plot containing both the xyz data and the cluster centroids. 653 654 Note: should rotate afterwards for better visualization. 655 656 Parameters 657 ---------- 658 xyz_data : numpy.ndarray or cupy.ndarray 659 Array of x, y, z coordinates. 660 coarse_model : numpy.ndarray or cupy.ndarray 661 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 662 figsize : int, optional 663 Size of the figure. Default is 3. 664 665 Returns 666 ------- 667 matplotlib.figure.Figure 668 3D scatter plot figure. 669 """ 670 fig = plt.figure(figsize=(figsize, figsize)) 671 plt.title("clustering results") 672 ax = fig.add_subplot(projection="3d") 673 ax.scatter( 674 xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3 675 ) 676 ax.scatter( 677 coarse_model[:, 0], 678 coarse_model[:, 1], 679 coarse_model[:, 2], 680 c="k", 681 s=10, 682 alpha=0.9, 683 ) 684 return fig 685 686 687def create_cugraph_from_point_cloud(point_cloud, proximity_px): 688 """ 689 Creates a cuGraph object from a point cloud with edges assigned based on proximity. 690 691 Parameters 692 ---------- 693 point_cloud : numpy.ndarray or cupy.ndarray 694 Array of x, y, z coordinates. 695 proximity_px : float 696 Pairwise cutoff distance for assigning edges to nodes, in pixels. 697 savepath : str, optional 698 Path to save the cuGraph object as a GML file. If None, don't save. 699 Default is None. 700 701 Returns 702 ------- 703 cugraph.Graph 704 Unweighted, undirected cuGraph graph object. 705 706 Notes 707 ----- 708 The cuGraph graph is unweighted and undirected. 709 Uses the KDTree algorithm to find the neighbors under the cutoff distance. 710 """ 711 if IS_GPU_AVAILABLE: 712 if isinstance(point_cloud, cp.ndarray): 713 point_cloud = cp.asnumpy(point_cloud) 714 tree = KDTree(point_cloud) 715 edgelist = cp.array(tree.query_pairs(proximity_px, output_type="ndarray")) 716 edgelist_cudf = cudf.DataFrame(edgelist, columns=["source", "destination"]) 717 edgelist_cudf = cg.symmetrize_df(edgelist_cudf, "source", "destination") 718 G = cg.Graph() 719 G.from_cudf_edgelist(edgelist_cudf, source="source", destination="destination") 720 return G 721 else: 722 raise ValueError("GPU support is not available.") 723 724 725def create_igraph_from_point_cloud(point_cloud, proximity_px, savepath=None): 726 """ 727 Creates an igraph object from a point cloud with edges assigned based on proximity. 728 729 Parameters 730 ---------- 731 point_cloud : numpy.ndarray 732 Array of x, y, z coordinates. 733 proximity_px : float 734 Pairwise cutoff distance for assigning edges to nodes, in pixels. 735 savepath : str, optional 736 Path to save the igraph object as a GML file. If None, don't save. 737 Default is None. 738 739 Returns 740 ------- 741 igraph.Graph 742 Unweighted, undirected igraph graph object. 743 744 Notes 745 ----- 746 The igraph graph is unweighted and undirected. 747 Uses the KDTree algorithm to find the neighbors under the cutoff distance. 748 """ 749 tree = KDTree(point_cloud) 750 edgelist = tree.query_pairs(proximity_px, output_type="ndarray") 751 G = ig.Graph(edgelist) 752 if savepath: 753 G.write(savepath, format="gml") 754 return G 755 756 757def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True): 758 """ 759 Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf). 760 761 Parameters 762 ---------- 763 coarse_model : numpy.ndarray or cupy.ndarray 764 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 765 proximity_px : float 766 Pairwise cutoff distance for assigning edges to nodes, in pixels. 767 out_fname : str 768 Filename for output. 769 save : bool, optional 770 Flag to save file (True) or not (False). Default is True. 771 772 Returns 773 ------- 774 networkx.Graph 775 Networkx graph representation of coarse model (cluster centroids). 776 """ 777 tree = KDTree(coarse_model) 778 edgelist = tree.query_pairs(proximity_px, output_type="ndarray") 779 780 # Arsam Firoozfar "It appears that the current code using nx is slightly faster than using igraph to directly create graph from matrix." 3/31/2023 781 782 G = nx.from_edgelist(edgelist) 783 784 if save: 785 nx.write_gexf(G, f"{out_fname}.gexf") 786 return G 787 788 789def add_Gaussian_noise(mrc, loc=0.0, scale=1.0): 790 """ 791 Adds Gaussian white noise to the data in an mrc file. 792 793 Parameters 794 ---------- 795 mrc : mrcfile.mrcfile.MrcFile 796 MRC object to add noise to. 797 loc : float, optional 798 Mean of Gaussian distribution. Default is 0. 799 scale : float, optional 800 Standard deviation of Gaussian distribution. Default is 1. 801 802 Returns 803 ------- 804 numpy.ndarray or cupy.ndarray 805 Data with noise added. 806 """ 807 D = mrc.data 808 noise = np.random.normal(loc=loc, scale=scale, size=D.shape) 809 D_w_noise = D + noise 810 return D_w_noise 811 812 813# TODO: Add documentation for leave_2D_density 814def leave_2D_density(mrc_data, middle_z_index, xyz_data): 815 three_D_ndarray_left = np.zeros( 816 (mrc_data.shape[0], mrc_data.shape[1], mrc_data.shape[2]) 817 ) 818 for k in range(xyz_data.shape[0]): 819 # print (xyz_data[k]) # 0 0 64 820 # print (type(xyz_data[k])) #<class 'numpy.ndarray'> 821 x_to_keep = xyz_data[k][0] # 0 822 y_to_keep = xyz_data[k][1] # 0 823 z_to_keep = xyz_data[k][2] # 64 824 three_D_ndarray_left[x_to_keep, y_to_keep, z_to_keep] = mrc_data[ 825 x_to_keep, y_to_keep, z_to_keep 826 ] 827 return three_D_ndarray_left 828 829 830def main(args): 831 """ 832 Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network. 833 Outputs a .png file of the coarse grained model, and a .gexf graph xml file. 834 835 Parameters 836 ---------- 837 args : argparse.Namespace 838 Argument parser object containing the following attributes: 839 - fname : str 840 .mrc filename (white density w/ black background). 841 - mrc_t : float 842 Unnormalized pixel intensity threshold level. 843 - eps : float 844 DBSCAN epsilon (inter cluster distance). 845 - ms : int 846 DBSCAN min samples (minimum number of samples in cluster). 847 - d_cut : float 848 Pairwise distance cutoff for assigning edges to graph, in pixels. 849 """ 850 fname = Path(args.fname) 851 mrc_t = args.mrc_t 852 DBSCAN_epsilon = args.eps # try 1 853 DBSCAN_min_samples = args.ms # try 4 854 d_cut = args.d_cut # try 8 855 out_fname = fname.with_suffix("") 856 857 mrc = load_density_file(fname) 858 xyz_data = normalize_and_threshold_data(mrc, mrc_t) 859 860 # print (f"xyz_data:{xyz_data}") 861 862 model = cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples) 863 coarse_model = get_cluster_centroids(xyz_data, model) 864 G = create_and_save_graph(coarse_model, d_cut, out_fname) 865 fig = plot_clustering_results(xyz_data, coarse_model) 866 867 868if __name__ == "__main__": 869 870 # example: >> python density2graph.py fname.mrc 0.5 1 4 8 871 parser = argparse.ArgumentParser() 872 parser.add_argument( 873 "fname", 874 help="tomogram .mrc filename (white density w/ black background)", 875 type=str, 876 ) 877 parser.add_argument( 878 "t", help="pixel intensity threshold cutoff (unormalized)", type=float 879 ) 880 parser.add_argument( 881 "eps", help="DBSCAN epsilon (inter cluster distance) in pixels", type=float 882 ) 883 parser.add_argument( 884 "ms", help="DBSCAN min samples (minimum number of samples in cluster)", type=int 885 ) 886 parser.add_argument( 887 "d_cut", 888 help="pairwise distance cutoff for assigning edges in pixels", 889 type=float, 890 ) 891 args = parser.parse_args() 892 main(args)
54def load_density_file(fname, print_warning=True): 55 """ 56 Load a .mrc file using the mrcfile package. 57 58 Parameters 59 ---------- 60 fname : str 61 Filename or filepath of the .mrc file. 62 print_warning : bool, optional 63 If True, prints a warning message about data inversion. Default is True. 64 65 Returns 66 ------- 67 mrcfile.mrcfile.MrcFile 68 MRC file object containing the header and data properties. 69 """ 70 # load .mrc tomogram file as a MRC object which has header and data properties. 71 # see: https://mrcfile.readthedocs.io/en/latest/usage_guide.html 72 mrc = mrcfile.mmap(fname, mode="r") # memory mapped mode for large files 73 # warn('WARNING: check the inversion of the data. White voxels should correspond to high density.') 74 if print_warning: 75 print( 76 "WARNING: check the inversion of the data. White voxels should correspond to high density." 77 ) 78 return mrc
Load a .mrc file using the mrcfile package.
Parameters
- fname (str): Filename or filepath of the .mrc file.
- print_warning (bool, optional): If True, prints a warning message about data inversion. Default is True.
Returns
- mrcfile.mrcfile.MrcFile: MRC file object containing the header and data properties.
81def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True): 82 """ 83 Normalizes threshold value and densities then applies a cutoff threshold. 84 85 Parameters 86 ---------- 87 mrc : mrcfile.mrcfile.MrcFile 88 MRC data. 89 mrc_t : float 90 Raw (unnormalized) pixel intensity cutoff threshold. 91 noise_stdev : float, optional 92 Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added). 93 norm_T : bool, optional 94 If True, threshold value is normalized. Default is True. 95 96 Returns 97 ------- 98 numpy.ndarray or cupy.ndarray 99 Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0]. 100 """ 101 warn("This method is deprecated.", DeprecationWarning, stacklevel=2) 102 # load and normalize data, normalize threshold value 103 if noise_stdev == 0: 104 D = mrc.data 105 else: 106 assert noise_stdev >= 0 107 print("add_Gaussian_noise") 108 D = add_Gaussian_noise(mrc, scale=noise_stdev) 109 110 # get x,y,z coordinates above threshold 111 if norm_T == False: 112 print("Do not normalize") 113 x, y, z = np.where(D > mrc_t) 114 else: 115 D_min = np.min(D) 116 D_max = np.max(D) 117 D_norm = (D - D_min) / ( 118 D_max - D_min 119 ) # normalize to 0,1: (x_i-x_min) / (x_max - x_min) 120 121 t_norm = (mrc_t - D_min) / (D_max - D_min) 122 # Doo Nam confirmed that normalization of density ran well. 123 # Doo Nam confirmed that as long as he puts reasonable mrc_t, t_norm is 0~1 124 # However, since both density and t_norm are normalized, there was no practice difference for xyz_data (regardless of norm_T) 125 126 x, y, z = np.where(D_norm > t_norm) 127 xyz_data = np.transpose(np.stack([x, y, z])) 128 return xyz_data
Normalizes threshold value and densities then applies a cutoff threshold.
Parameters
- mrc (mrcfile.mrcfile.MrcFile): MRC data.
- mrc_t (float): Raw (unnormalized) pixel intensity cutoff threshold.
- noise_stdev (float, optional): Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added).
- norm_T (bool, optional): If True, threshold value is normalized. Default is True.
Returns
- numpy.ndarray or cupy.ndarray: Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0].
134def normalize_mrc_data( 135 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 136 lower_bound: float = -1, 137 upper_bound: float = 1, 138 device: str = "cpu", 139) -> np.ndarray: 140 """Normalizes the data of an MRC file. 141 142 Normalizes the data from `mrc` to the range of 143 [`lower_bound`, `upper_bound`]. 144 145 Parameters 146 ---------- 147 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 148 MRC file or MRC data (numpy array or cupy array). 149 lower_bound : float, optional 150 Lower bound of the normalized data. Default is -1. 151 upper_bound : float, optional 152 Upper bound of the normalized data. Default is 1. 153 device : str, optional 154 Device to use for computation. Default is cpu. 155 156 Returns 157 ------- 158 numpy.ndarray or cupy.ndarray 159 Array of normalized MRC data. 160 161 Notes 162 ----- 163 x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x)) 164 """ 165 D = _ensure_array(mrc_data, device) 166 assert lower_bound < upper_bound 167 D_min = np.min(D) 168 D_max = np.max(D) 169 D_norm = lower_bound + (upper_bound - lower_bound) * (D - D_min) / (D_max - D_min) 170 return D_norm
Normalizes the data of an MRC file.
Normalizes the data from mrc to the range of
[lower_bound, upper_bound].
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
- lower_bound (float, optional): Lower bound of the normalized data. Default is -1.
- upper_bound (float, optional): Upper bound of the normalized data. Default is 1.
- device (str, optional): Device to use for computation. Default is cpu.
Returns
- numpy.ndarray or cupy.ndarray: Array of normalized MRC data.
Notes
x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x))
173def standardize_mrc_data( 174 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, device: str = "cpu" 175) -> np.ndarray: 176 """Standardizes the data using scikit-learn's StandardScaler. 177 178 Parameters 179 ---------- 180 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 181 MRC file or MRC data (numpy array or cupy array). 182 device : str, optional 183 Device to use for computation. Default is cpu. 184 185 Returns 186 ------- 187 numpy.ndarray or cupy.ndarray 188 Array of standardized MRC data. 189 190 Notes 191 ----- 192 Standardization: z = (x - u) / s 193 u: mean of the data 194 s: standard deviation of the data 195 196 """ 197 D = _ensure_array(mrc_data, device) 198 199 if device == "gpu" and IS_GPU_AVAILABLE: 200 scaler = StandardScaler_GPU() 201 else: 202 scaler = StandardScaler() 203 D_standardized = scaler.fit_transform(D.reshape(-1, 1)).reshape(D.shape) 204 205 return D_standardized
Standardizes the data using scikit-learn's StandardScaler.
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
- device (str, optional): Device to use for computation. Default is cpu.
Returns
- numpy.ndarray or cupy.ndarray: Array of standardized MRC data.
Notes
Standardization: z = (x - u) / s u: mean of the data s: standard deviation of the data
208def threshold_mrc_data( 209 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 210 threshold: float, 211 device: str = "cpu", 212) -> np.ndarray: 213 """ 214 Keeps only the MRC data above the threshold value, setting the other data to the min value. 215 216 Parameters 217 ---------- 218 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 219 MRC file or MRC data (numpy array or cupy array). 220 threshold : float 221 Threshold value. 222 device : str, optional 223 Device to use for computation. Default is cpu. 224 225 Returns 226 ------- 227 numpy.ndarray or cupy.ndarray 228 Thresholded MRC data. 229 230 Notes 231 ------ 232 This function is agnostic towards data normalization. 233 """ 234 D = _ensure_array(mrc_data, device) 235 thresholded_D = np.where(D > threshold, D, np.min(D)) 236 return thresholded_D
Keeps only the MRC data above the threshold value, setting the other data to the min value.
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
- threshold (float): Threshold value.
- device (str, optional): Device to use for computation. Default is cpu.
Returns
- numpy.ndarray or cupy.ndarray: Thresholded MRC data.
Notes
This function is agnostic towards data normalization.
239def identify_threshold_ratio( 240 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, ratio: float, device: str = "cpu" 241) -> np.ndarray: 242 """ 243 Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1. 244 245 Parameters 246 ---------- 247 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 248 MRC file or MRC data (numpy array or cupy array). 249 ratio : float 250 Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points). 251 device : str, optional 252 Device to use for computation. Default is cpu. 253 254 Returns 255 ------- 256 numpy.ndarray or cupy.ndarray 257 Density value corresponding to the requested threshold ratio. 258 """ 259 assert 0 <= ratio <= 1 260 D = _ensure_array(mrc_data, device) 261 sorted_densities = np.sort(D.flatten()) 262 threshold_index = int(round(ratio * len(sorted_densities))) 263 if threshold_index == len(sorted_densities): 264 raise ValueError( 265 f"Threshold index is at the end of the sorted densities. Please choose a lower ratio." 266 ) 267 return sorted_densities[threshold_index]
Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1.
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
- ratio (float): Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points).
- device (str, optional): Device to use for computation. Default is cpu.
Returns
- numpy.ndarray or cupy.ndarray: Density value corresponding to the requested threshold ratio.
270def generate_point_cloud_from_mrc_data( 271 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 272 threshold: float, 273 device: str = "cpu", 274) -> np.ndarray: 275 """ 276 Generates an array of X, Y, and Z coordinates of the data that is above a threshold. 277 278 Parameters 279 ---------- 280 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray 281 MRC file or MRC data (numpy array or cupy array). 282 threshold : float 283 Threshold value. 284 device : str, optional 285 Device to use for computation. Default is cpu. 286 287 Returns 288 ------- 289 numpy.ndarray or cupy.ndarray 290 Array of X, Y, and Z coordinates of the data that is above the threshold. 291 292 Notes 293 ----- 294 This function is agnostic towards data normalization. 295 """ 296 D = _ensure_array(mrc_data, device) 297 x, y, z = np.where(D > threshold) 298 xyz_data = np.transpose(np.stack([x, y, z])) 299 return xyz_data
Generates an array of X, Y, and Z coordinates of the data that is above a threshold.
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
- threshold (float): Threshold value.
- device (str, optional): Device to use for computation. Default is cpu.
Returns
- numpy.ndarray or cupy.ndarray: Array of X, Y, and Z coordinates of the data that is above the threshold.
Notes
This function is agnostic towards data normalization.
302def augment_mrc_data( 303 mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, 304 offset_percent: float = 10, 305 n: int = 1, 306) -> list[np.ndarray] | np.ndarray: 307 """ 308 Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage. 309 310 Parameters 311 ---------- 312 mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray 313 MRC file or MRC data (numpy array). 314 offset_percent : float, optional 315 Offset percentage for random noise. Default is 10. 316 n : int, optional 317 Number of augmented datasets to generate. Default is 1. 318 319 Returns 320 ------- 321 list[numpy.ndarray] or numpy.ndarray 322 List of augmented MRC datasets or a single augmented dataset. 323 """ 324 D = _ensure_array(mrc_data) 325 assert 0.0 <= offset_percent <= 100 326 assert n >= 1 327 328 random_uniform = np.random.uniform 329 330 if n == 1: 331 random_offsets = ( 332 random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) / 100 333 ) 334 adjusted_data = D * (1 + random_offsets) 335 return adjusted_data 336 else: 337 adjusted_data_list = [] 338 for _ in range(n): 339 random_offsets = ( 340 random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) 341 / 100 342 ) 343 adjusted_data = D * (1 + random_offsets) 344 adjusted_data_list.append(adjusted_data) 345 return adjusted_data_list
Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage.
Parameters
- mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray): MRC file or MRC data (numpy array).
- offset_percent (float, optional): Offset percentage for random noise. Default is 10.
- n (int, optional): Number of augmented datasets to generate. Default is 1.
Returns
- list[numpy.ndarray] or numpy.ndarray: List of augmented MRC datasets or a single augmented dataset.
348def calculate_point_cloud_mass( 349 point_cloud: np.ndarray, voxel_dim: float, density: float 350) -> float: 351 """ 352 Calculates the mass of an array of voxel xyz coordinates. 353 354 Parameters 355 ---------- 356 point_cloud : numpy.ndarray or cupy.ndarray 357 Array of voxel xyz coordinates. 358 voxel_dim : float 359 Voxel dimension. 360 density : float 361 Density value. 362 363 Returns 364 ------- 365 float 366 Estimated mass of the point cloud. 367 368 Notes 369 ----- 370 The estimated mass = n_voxels * voxel_dim * density. 371 For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3. 372 The units for `voxel_dim` and `density` must be consistent. 373 Use 0.82 kDa / nm3 for large globular proteins. 374 """ 375 n_voxels = point_cloud.shape[0] 376 return n_voxels * voxel_dim * density
Calculates the mass of an array of voxel xyz coordinates.
Parameters
- point_cloud (numpy.ndarray or cupy.ndarray): Array of voxel xyz coordinates.
- voxel_dim (float): Voxel dimension.
- density (float): Density value.
Returns
- float: Estimated mass of the point cloud.
Notes
The estimated mass = n_voxels * voxel_dim * density.
For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3.
The units for voxel_dim and density must be consistent.
Use 0.82 kDa / nm3 for large globular proteins.
379def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples): 380 """ 381 Clusters data using DBSCAN from sklearn or cuml. 382 383 Parameters 384 ---------- 385 xyz_data : numpy.ndarray or cupy.ndarray 386 Array of x, y, z coordinates. 387 DBSCAN_epsilon : float 388 The maximum distance between two samples for one to be considered as in the neighborhood of the other. 389 DBSCAN_min_samples : int 390 The number of samples in a neighborhood for a point to be considered as a core point. 391 392 Returns 393 ------- 394 sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN: clustering results stored in an object 395 396 See Also 397 ------- 398 sklearn.cluster.DBSCAN 399 cuml.cluster.DBSCAN 400 cluster_data_hdbscan 401 cluster_data_optics 402 """ 403 if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray): 404 clusterer = DBSCAN_GPU 405 else: 406 clusterer = DBSCAN 407 try: 408 model = clusterer( 409 eps=DBSCAN_epsilon, min_samples=DBSCAN_min_samples 410 ) # apply coarse-graining (DBSCAN) 411 model.fit_predict(xyz_data) 412 return model 413 except: 414 print( 415 'Perhaps, "Fatal Python error: _PyErr_NormalizeException: Cannot recover from MemoryErrors while normalizing exceptions. Python runtime state: initialized" at tahoma for empiar inverted mrc' 416 ) 417 print(f"Failed xyz_data.shape:{xyz_data.shape}") 418 print(f"DBSCAN_epsilon:{DBSCAN_epsilon}") 419 print(f"DBSCAN_min_samples:{DBSCAN_min_samples}") 420 return False
Clusters data using DBSCAN from sklearn or cuml.
Parameters
- xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
- DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
Returns
- sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN (clustering results stored in an object):
See Also
sklearn.cluster.DBSCAN
cuml.cluster.DBSCAN
cluster_data_hdbscan
cluster_data_optics
423def cluster_data_hdbscan(xyz_data, HDBSCAN_min_cluster_size, HDBSCAN_min_samples): 424 """ 425 Clusters data using HDBSCAN from hdbscan or cuml. 426 427 Parameters 428 ---------- 429 xyz_data : numpy.ndarray or cupy.ndarray 430 Array of x, y, z coordinates. 431 HDBSCAN_min_cluster_size : int 432 The minimum number of points in a cluster. 433 HDBSCAN_min_samples : int 434 The number of samples in a neighborhood for a point to be considered as a core point. 435 436 Returns 437 ------- 438 hdbscan.HDBSCAN or cuml.cluster.HDBSCAN: clustering results stored in an object 439 440 See Also 441 ------- 442 hdbscan.HDBSCAN 443 cuml.cluster.HDBSCAN 444 cluster_data 445 cluster_data_optics 446 """ 447 if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray): 448 clusterer = HDBSCAN_GPU 449 else: 450 clusterer = HDBSCAN 451 try: 452 model = clusterer( 453 min_cluster_size=HDBSCAN_min_cluster_size, 454 min_samples=HDBSCAN_min_samples, 455 allow_single_cluster=False, 456 core_dist_n_jobs=-1, 457 approx_min_span_tree=False, 458 cluster_selection_method="eom", 459 ) # apply clustering (HDBSCAN) 460 model.fit(xyz_data) 461 return model 462 except Exception as e: 463 print(f"Error while performing HDBSCAN: {e}") 464 print(f"Failed xyz_data.shape:{xyz_data.shape}") 465 print(f"HDBSCAN_min_cluster_size:{HDBSCAN_min_cluster_size}") 466 print(f"HDBSCAN_min_samples:{HDBSCAN_min_samples}") 467 return False
Clusters data using HDBSCAN from hdbscan or cuml.
Parameters
- xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- HDBSCAN_min_cluster_size (int): The minimum number of points in a cluster.
- HDBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
Returns
- hdbscan.HDBSCAN or cuml.cluster.HDBSCAN (clustering results stored in an object):
See Also
hdbscan.HDBSCAN
cuml.cluster.HDBSCAN
cluster_data
cluster_data_optics
470def cluster_data_optics( 471 xyz_data, 472 OPTICS_min_samples, 473 OPTICS_min_cluster_size=None, 474 OPTICS_max_epsilon=np.inf, 475): 476 """ 477 Clusters data using OPTICS from sklearn. 478 479 Parameters 480 ---------- 481 xyz_data : numpy.ndarray or cupy.ndarray 482 Array of x, y, z coordinates. 483 OPTICS_min_samples : int 484 The number of samples in a neighborhood for a point to be considered as a core point. 485 OPTICS_min_cluster_size : int, optional 486 The minimum number of points in a cluster. Default is None. 487 OPTICS_max_epsilon : float, optional 488 The maximum distance between points considered by OPTICS. Default is numpy.inf. 489 490 Returns 491 ------- 492 sklearn.cluster.OPTICS: clustering results stored in an object 493 494 See Also 495 ------- 496 sklearn.cluster.OPTICS 497 cluster_data_hdbscan 498 cluster_data 499 """ 500 try: 501 model = OPTICS( 502 min_cluster_size=OPTICS_min_cluster_size, 503 min_samples=OPTICS_min_samples, 504 max_eps=OPTICS_max_epsilon, 505 n_jobs=-1, 506 ) # apply clustering (OPTICS) 507 model.fit(xyz_data) 508 return model 509 except Exception as e: 510 print(f"Error while performing OPTICS: {e}") 511 print(f"Failed xyz_data.shape:{xyz_data.shape}") 512 print(f"OPTICS_min_cluster_size:{OPTICS_min_cluster_size}") 513 print(f"OPTICS_min_samples:{OPTICS_min_samples}") 514 return False
Clusters data using OPTICS from sklearn.
Parameters
- xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- OPTICS_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
- OPTICS_min_cluster_size (int, optional): The minimum number of points in a cluster. Default is None.
- OPTICS_max_epsilon (float, optional): The maximum distance between points considered by OPTICS. Default is numpy.inf.
Returns
- sklearn.cluster.OPTICS (clustering results stored in an object):
See Also
sklearn.cluster.OPTICS
cluster_data_hdbscan
cluster_data
517def voxel_coarsening(voxel_dim, point_cloud, labels, density=None, averaged=False): 518 """ 519 Coarsens largest cluster into voxel_dim^3 chunks. 520 521 Parameters 522 ---------- 523 voxel_dim : int 524 Chunk dimension for coarsening. 525 point_cloud : numpy.ndarray or cupy.ndarray 526 Array of x, y, z coordinates. 527 labels : numpy.ndarray or cupy.ndarray 528 Array of cluster labels. 529 density : numpy.ndarray or cupy.ndarray, optional 530 Array of density values. Default is None. 531 averaged : bool, optional 532 If True, the coarsened point cloud will be the average of the points in each chunk, using the density as weights, if provided. Default is False. 533 534 Returns 535 ------- 536 numpy.ndarray or cupy.ndarray 537 Coordinates of the centroid of each chunk that contain more than half of the non-empty voxels. 538 """ 539 max_x_dim, max_y_dim, max_z_dim = np.max(point_cloud, axis=0) + 1 540 coarsened_dim_x = int(np.ceil(max_x_dim / voxel_dim)) 541 coarsened_dim_y = int(np.ceil(max_y_dim / voxel_dim)) 542 coarsened_dim_z = int(np.ceil(max_z_dim / voxel_dim)) 543 largest_cluster = np.argmax(np.bincount(labels[labels != -1])) 544 dense_bins_count = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z)) 545 label_mask = labels == largest_cluster 546 isolated_points = point_cloud[label_mask] 547 coarsened_points = isolated_points // voxel_dim 548 coarsened_points = coarsened_points.astype(int) 549 if density is not None: 550 isolated_density = density[label_mask] 551 coarse_dense_count = np.zeros_like(dense_bins_count) 552 if not averaged: 553 for point in coarsened_points: 554 dense_bins_count[point[0], point[1], point[2]] += 1 555 x, y, z = np.where(dense_bins_count >= (voxel_dim**3) / 2) 556 coarsened_point_cloud = np.transpose(np.stack([x, y, z])) * 3 + 1 557 else: 558 dense_bins = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z, 3)) 559 if density is not None: 560 np.add.at( 561 dense_bins, 562 ( 563 coarsened_points[:, 0], 564 coarsened_points[:, 1], 565 coarsened_points[:, 2], 566 ), 567 isolated_points * isolated_density[:, np.newaxis], 568 ) 569 np.add.at( 570 dense_bins_count, 571 ( 572 coarsened_points[:, 0], 573 coarsened_points[:, 1], 574 coarsened_points[:, 2], 575 ), 576 isolated_density, 577 ) 578 np.add.at( 579 coarse_dense_count, 580 ( 581 coarsened_points[:, 0], 582 coarsened_points[:, 1], 583 coarsened_points[:, 2], 584 ), 585 1, 586 ) 587 else: 588 for coarse_point, raw_point in zip(coarsened_points, isolated_points): 589 dense_bins[ 590 coarse_point[0], coarse_point[1], coarse_point[2] 591 ] += raw_point 592 dense_bins_count[coarse_point[0], coarse_point[1], coarse_point[2]] += 1 593 dense_bin_points = np.where(dense_bins_count > 0) 594 coarsened_point_cloud = ( 595 dense_bins[dense_bin_points] 596 / dense_bins_count[dense_bin_points][:, np.newaxis] 597 ) 598 if density is not None: 599 coarsened_density = ( 600 dense_bins_count[dense_bin_points] 601 / coarse_dense_count[dense_bin_points] 602 ) 603 return coarsened_point_cloud, coarsened_density 604 return coarsened_point_cloud
Coarsens largest cluster into voxel_dim^3 chunks.
Parameters
- voxel_dim (int): Chunk dimension for coarsening.
- point_cloud (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- labels (numpy.ndarray or cupy.ndarray): Array of cluster labels.
- density (numpy.ndarray or cupy.ndarray, optional): Array of density values. Default is None.
- averaged (bool, optional): If True, the coarsened point cloud will be the average of the points in each chunk, using the density as weights, if provided. Default is False.
Returns
- numpy.ndarray or cupy.ndarray: Coordinates of the centroid of each chunk that contain more than half of the non-empty voxels.
607def get_cluster_centroids(xyz_data, model): 608 """ 609 Coarse grain density model using cluster centroids. 610 611 Parameters 612 ---------- 613 xyz_data : numpy.ndarray or cupy.ndarray 614 Array of x, y, z coordinates. 615 model : sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN 616 DBSCAN clustering results stored in an object. 617 618 Returns 619 ------- 620 numpy.ndarray or cupy.ndarray 621 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 622 """ 623 samples_w_lbls = np.concatenate((xyz_data, model.labels_[:, np.newaxis]), axis=1) 624 if -1 in set(model.labels_): # if noise detected 625 coarse_model = np.zeros( 626 (len(set(model.labels_)) - 1, 3) 627 ) # remove last label which is noise 628 for i in range(len(set(model.labels_)) - 1): 629 # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python 630 tmp_T = np.transpose( 631 samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))] 632 ) 633 x_mean = np.mean(tmp_T[0]) 634 y_mean = np.mean(tmp_T[1]) 635 z_mean = np.mean(tmp_T[2]) 636 coarse_model[i] = [x_mean, y_mean, z_mean] 637 else: 638 coarse_model = np.zeros((len(set(model.labels_)), 3)) 639 for i in range(len(set(model.labels_))): 640 # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python 641 tmp_T = np.transpose( 642 samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))] 643 ) 644 x_mean = np.mean(tmp_T[0]) 645 y_mean = np.mean(tmp_T[1]) 646 z_mean = np.mean(tmp_T[2]) 647 coarse_model[i] = [x_mean, y_mean, z_mean] 648 return coarse_model
Coarse grain density model using cluster centroids.
Parameters
- xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- model (sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN): DBSCAN clustering results stored in an object.
Returns
- numpy.ndarray or cupy.ndarray: Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
651def plot_clustering_results(xyz_data, coarse_model, figsize=3): 652 """ 653 Creates a 3D scatter plot containing both the xyz data and the cluster centroids. 654 655 Note: should rotate afterwards for better visualization. 656 657 Parameters 658 ---------- 659 xyz_data : numpy.ndarray or cupy.ndarray 660 Array of x, y, z coordinates. 661 coarse_model : numpy.ndarray or cupy.ndarray 662 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 663 figsize : int, optional 664 Size of the figure. Default is 3. 665 666 Returns 667 ------- 668 matplotlib.figure.Figure 669 3D scatter plot figure. 670 """ 671 fig = plt.figure(figsize=(figsize, figsize)) 672 plt.title("clustering results") 673 ax = fig.add_subplot(projection="3d") 674 ax.scatter( 675 xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3 676 ) 677 ax.scatter( 678 coarse_model[:, 0], 679 coarse_model[:, 1], 680 coarse_model[:, 2], 681 c="k", 682 s=10, 683 alpha=0.9, 684 ) 685 return fig
Creates a 3D scatter plot containing both the xyz data and the cluster centroids.
Note: should rotate afterwards for better visualization.
Parameters
- xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- coarse_model (numpy.ndarray or cupy.ndarray): Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
- figsize (int, optional): Size of the figure. Default is 3.
Returns
- matplotlib.figure.Figure: 3D scatter plot figure.
688def create_cugraph_from_point_cloud(point_cloud, proximity_px): 689 """ 690 Creates a cuGraph object from a point cloud with edges assigned based on proximity. 691 692 Parameters 693 ---------- 694 point_cloud : numpy.ndarray or cupy.ndarray 695 Array of x, y, z coordinates. 696 proximity_px : float 697 Pairwise cutoff distance for assigning edges to nodes, in pixels. 698 savepath : str, optional 699 Path to save the cuGraph object as a GML file. If None, don't save. 700 Default is None. 701 702 Returns 703 ------- 704 cugraph.Graph 705 Unweighted, undirected cuGraph graph object. 706 707 Notes 708 ----- 709 The cuGraph graph is unweighted and undirected. 710 Uses the KDTree algorithm to find the neighbors under the cutoff distance. 711 """ 712 if IS_GPU_AVAILABLE: 713 if isinstance(point_cloud, cp.ndarray): 714 point_cloud = cp.asnumpy(point_cloud) 715 tree = KDTree(point_cloud) 716 edgelist = cp.array(tree.query_pairs(proximity_px, output_type="ndarray")) 717 edgelist_cudf = cudf.DataFrame(edgelist, columns=["source", "destination"]) 718 edgelist_cudf = cg.symmetrize_df(edgelist_cudf, "source", "destination") 719 G = cg.Graph() 720 G.from_cudf_edgelist(edgelist_cudf, source="source", destination="destination") 721 return G 722 else: 723 raise ValueError("GPU support is not available.")
Creates a cuGraph object from a point cloud with edges assigned based on proximity.
Parameters
- point_cloud (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
- proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
- savepath (str, optional): Path to save the cuGraph object as a GML file. If None, don't save. Default is None.
Returns
- cugraph.Graph: Unweighted, undirected cuGraph graph object.
Notes
The cuGraph graph is unweighted and undirected. Uses the KDTree algorithm to find the neighbors under the cutoff distance.
726def create_igraph_from_point_cloud(point_cloud, proximity_px, savepath=None): 727 """ 728 Creates an igraph object from a point cloud with edges assigned based on proximity. 729 730 Parameters 731 ---------- 732 point_cloud : numpy.ndarray 733 Array of x, y, z coordinates. 734 proximity_px : float 735 Pairwise cutoff distance for assigning edges to nodes, in pixels. 736 savepath : str, optional 737 Path to save the igraph object as a GML file. If None, don't save. 738 Default is None. 739 740 Returns 741 ------- 742 igraph.Graph 743 Unweighted, undirected igraph graph object. 744 745 Notes 746 ----- 747 The igraph graph is unweighted and undirected. 748 Uses the KDTree algorithm to find the neighbors under the cutoff distance. 749 """ 750 tree = KDTree(point_cloud) 751 edgelist = tree.query_pairs(proximity_px, output_type="ndarray") 752 G = ig.Graph(edgelist) 753 if savepath: 754 G.write(savepath, format="gml") 755 return G
Creates an igraph object from a point cloud with edges assigned based on proximity.
Parameters
- point_cloud (numpy.ndarray): Array of x, y, z coordinates.
- proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
- savepath (str, optional): Path to save the igraph object as a GML file. If None, don't save. Default is None.
Returns
- igraph.Graph: Unweighted, undirected igraph graph object.
Notes
The igraph graph is unweighted and undirected. Uses the KDTree algorithm to find the neighbors under the cutoff distance.
758def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True): 759 """ 760 Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf). 761 762 Parameters 763 ---------- 764 coarse_model : numpy.ndarray or cupy.ndarray 765 Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0]. 766 proximity_px : float 767 Pairwise cutoff distance for assigning edges to nodes, in pixels. 768 out_fname : str 769 Filename for output. 770 save : bool, optional 771 Flag to save file (True) or not (False). Default is True. 772 773 Returns 774 ------- 775 networkx.Graph 776 Networkx graph representation of coarse model (cluster centroids). 777 """ 778 tree = KDTree(coarse_model) 779 edgelist = tree.query_pairs(proximity_px, output_type="ndarray") 780 781 # Arsam Firoozfar "It appears that the current code using nx is slightly faster than using igraph to directly create graph from matrix." 3/31/2023 782 783 G = nx.from_edgelist(edgelist) 784 785 if save: 786 nx.write_gexf(G, f"{out_fname}.gexf") 787 return G
Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf).
Parameters
- coarse_model (numpy.ndarray or cupy.ndarray): Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
- proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
- out_fname (str): Filename for output.
- save (bool, optional): Flag to save file (True) or not (False). Default is True.
Returns
- networkx.Graph: Networkx graph representation of coarse model (cluster centroids).
790def add_Gaussian_noise(mrc, loc=0.0, scale=1.0): 791 """ 792 Adds Gaussian white noise to the data in an mrc file. 793 794 Parameters 795 ---------- 796 mrc : mrcfile.mrcfile.MrcFile 797 MRC object to add noise to. 798 loc : float, optional 799 Mean of Gaussian distribution. Default is 0. 800 scale : float, optional 801 Standard deviation of Gaussian distribution. Default is 1. 802 803 Returns 804 ------- 805 numpy.ndarray or cupy.ndarray 806 Data with noise added. 807 """ 808 D = mrc.data 809 noise = np.random.normal(loc=loc, scale=scale, size=D.shape) 810 D_w_noise = D + noise 811 return D_w_noise
Adds Gaussian white noise to the data in an mrc file.
Parameters
- mrc (mrcfile.mrcfile.MrcFile): MRC object to add noise to.
- loc (float, optional): Mean of Gaussian distribution. Default is 0.
- scale (float, optional): Standard deviation of Gaussian distribution. Default is 1.
Returns
- numpy.ndarray or cupy.ndarray: Data with noise added.
815def leave_2D_density(mrc_data, middle_z_index, xyz_data): 816 three_D_ndarray_left = np.zeros( 817 (mrc_data.shape[0], mrc_data.shape[1], mrc_data.shape[2]) 818 ) 819 for k in range(xyz_data.shape[0]): 820 # print (xyz_data[k]) # 0 0 64 821 # print (type(xyz_data[k])) #<class 'numpy.ndarray'> 822 x_to_keep = xyz_data[k][0] # 0 823 y_to_keep = xyz_data[k][1] # 0 824 z_to_keep = xyz_data[k][2] # 64 825 three_D_ndarray_left[x_to_keep, y_to_keep, z_to_keep] = mrc_data[ 826 x_to_keep, y_to_keep, z_to_keep 827 ] 828 return three_D_ndarray_left
831def main(args): 832 """ 833 Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network. 834 Outputs a .png file of the coarse grained model, and a .gexf graph xml file. 835 836 Parameters 837 ---------- 838 args : argparse.Namespace 839 Argument parser object containing the following attributes: 840 - fname : str 841 .mrc filename (white density w/ black background). 842 - mrc_t : float 843 Unnormalized pixel intensity threshold level. 844 - eps : float 845 DBSCAN epsilon (inter cluster distance). 846 - ms : int 847 DBSCAN min samples (minimum number of samples in cluster). 848 - d_cut : float 849 Pairwise distance cutoff for assigning edges to graph, in pixels. 850 """ 851 fname = Path(args.fname) 852 mrc_t = args.mrc_t 853 DBSCAN_epsilon = args.eps # try 1 854 DBSCAN_min_samples = args.ms # try 4 855 d_cut = args.d_cut # try 8 856 out_fname = fname.with_suffix("") 857 858 mrc = load_density_file(fname) 859 xyz_data = normalize_and_threshold_data(mrc, mrc_t) 860 861 # print (f"xyz_data:{xyz_data}") 862 863 model = cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples) 864 coarse_model = get_cluster_centroids(xyz_data, model) 865 G = create_and_save_graph(coarse_model, d_cut, out_fname) 866 fig = plot_clustering_results(xyz_data, coarse_model)
Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network. Outputs a .png file of the coarse grained model, and a .gexf graph xml file.
Parameters
- args (argparse.Namespace):
Argument parser object containing the following attributes:
- fname : str .mrc filename (white density w/ black background).
- mrc_t : float Unnormalized pixel intensity threshold level.
- eps : float DBSCAN epsilon (inter cluster distance).
- ms : int DBSCAN min samples (minimum number of samples in cluster).
- d_cut : float Pairwise distance cutoff for assigning edges to graph, in pixels.