griptomo.core.density2graph

View Source

  1# August George, 2022, PNNL
  2
  3import argparse
  4import mrcfile
  5import matplotlib.pyplot as plt
  6
  7try:
  8    import cupy as cp
  9    from cuml.cluster import DBSCAN as DBSCAN_GPU, HDBSCAN as HDBSCAN_GPU
 10    from cuml.preprocessing import StandardScaler as StandardScaler_GPU
 11    import cugraph as cg
 12    import cudf
 13
 14    if cp.cuda.is_available():
 15        IS_GPU_AVAILABLE = True
 16        print("GPU support for GRIP-Tomo clustering is available.")
 17    else:
 18        raise Exception("No GPU detected.")
 19except Exception as e:
 20    print(
 21        f"GPU support for GRIP-Tomo clustering is not available because of {e}. Using CPU."
 22    )
 23    IS_GPU_AVAILABLE = False
 24import numpy as np
 25from hdbscan import HDBSCAN
 26from sklearn.cluster import DBSCAN, OPTICS
 27from sklearn.preprocessing import StandardScaler
 28import networkx as nx
 29from pathlib import Path
 30from warnings import warn
 31from scipy.spatial import KDTree
 32import igraph as ig
 33
 34
 35def _ensure_array(data, device=None):
 36    """Ensures that the input data is a numpy array."""
 37    if isinstance(data, np.ndarray):
 38        if device == "gpu" and IS_GPU_AVAILABLE:
 39            return cp.asarray(data)
 40        return data
 41    elif isinstance(data, mrcfile.mrcfile.MrcFile):
 42        array_data = np.asarray(data.data)
 43        if device == "gpu" and IS_GPU_AVAILABLE:
 44            return cp.asarray(array_data)
 45        return array_data
 46    if IS_GPU_AVAILABLE:
 47        if isinstance(data, cp.ndarray):
 48            return data
 49    else:
 50        raise ValueError(f"Unsupported data type: {type(data)}")
 51
 52
 53def load_density_file(fname, print_warning=True):
 54    """
 55    Load a .mrc file using the mrcfile package.
 56
 57    Parameters
 58    ----------
 59    fname : str
 60        Filename or filepath of the .mrc file.
 61    print_warning : bool, optional
 62        If True, prints a warning message about data inversion. Default is True.
 63
 64    Returns
 65    -------
 66    mrcfile.mrcfile.MrcFile
 67        MRC file object containing the header and data properties.
 68    """
 69    # load .mrc tomogram file as a MRC object which has header and data properties.
 70    # see: https://mrcfile.readthedocs.io/en/latest/usage_guide.html
 71    mrc = mrcfile.mmap(fname, mode="r")  # memory mapped mode for large files
 72    # warn('WARNING: check the inversion of the data. White voxels should correspond to high density.')
 73    if print_warning:
 74        print(
 75            "WARNING: check the inversion of the data. White voxels should correspond to high density."
 76        )
 77    return mrc
 78
 79
 80def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True):
 81    """
 82    Normalizes threshold value and densities then applies a cutoff threshold.
 83
 84    Parameters
 85    ----------
 86    mrc : mrcfile.mrcfile.MrcFile
 87        MRC data.
 88    mrc_t : float
 89        Raw (unnormalized) pixel intensity cutoff threshold.
 90    noise_stdev : float, optional
 91        Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added).
 92    norm_T : bool, optional
 93        If True, threshold value is normalized. Default is True.
 94
 95    Returns
 96    -------
 97    numpy.ndarray or cupy.ndarray
 98        Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0].
 99    """
100    warn("This method is deprecated.", DeprecationWarning, stacklevel=2)
101    # load and normalize data, normalize threshold value
102    if noise_stdev == 0:
103        D = mrc.data
104    else:
105        assert noise_stdev >= 0
106        print("add_Gaussian_noise")
107        D = add_Gaussian_noise(mrc, scale=noise_stdev)
108
109    # get x,y,z coordinates above threshold
110    if norm_T == False:
111        print("Do not normalize")
112        x, y, z = np.where(D > mrc_t)
113    else:
114        D_min = np.min(D)
115        D_max = np.max(D)
116        D_norm = (D - D_min) / (
117            D_max - D_min
118        )  # normalize to 0,1: (x_i-x_min) / (x_max - x_min)
119
120        t_norm = (mrc_t - D_min) / (D_max - D_min)
121        # Doo Nam confirmed that normalization of density ran well.
122        # Doo Nam confirmed that as long as he puts reasonable mrc_t, t_norm is 0~1
123        # However, since both density and t_norm are normalized, there was no practice difference for xyz_data (regardless of norm_T)
124
125        x, y, z = np.where(D_norm > t_norm)
126    xyz_data = np.transpose(np.stack([x, y, z]))
127    return xyz_data
128
129
130### end of def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True):
131
132
133def normalize_mrc_data(
134    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
135    lower_bound: float = -1,
136    upper_bound: float = 1,
137    device: str = "cpu",
138) -> np.ndarray:
139    """Normalizes the data of an MRC file.
140
141    Normalizes the data from `mrc` to the range of
142    [`lower_bound`, `upper_bound`].
143
144    Parameters
145    ----------
146    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
147        MRC file or MRC data (numpy array or cupy array).
148    lower_bound : float, optional
149        Lower bound of the normalized data. Default is -1.
150    upper_bound : float, optional
151        Upper bound of the normalized data. Default is 1.
152    device : str, optional
153        Device to use for computation. Default is cpu.
154
155    Returns
156    -------
157    numpy.ndarray or cupy.ndarray
158        Array of normalized MRC data.
159
160    Notes
161    -----
162    x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x))
163    """
164    D = _ensure_array(mrc_data, device)
165    assert lower_bound < upper_bound
166    D_min = np.min(D)
167    D_max = np.max(D)
168    D_norm = lower_bound + (upper_bound - lower_bound) * (D - D_min) / (D_max - D_min)
169    return D_norm
170
171
172def standardize_mrc_data(
173    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, device: str = "cpu"
174) -> np.ndarray:
175    """Standardizes the data using scikit-learn's StandardScaler.
176
177    Parameters
178    ----------
179    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
180        MRC file or MRC data (numpy array or cupy array).
181    device : str, optional
182        Device to use for computation. Default is cpu.
183
184    Returns
185    -------
186    numpy.ndarray or cupy.ndarray
187        Array of standardized MRC data.
188
189    Notes
190    -----
191    Standardization: z = (x - u) / s
192    u: mean of the data
193    s: standard deviation of the data
194
195    """
196    D = _ensure_array(mrc_data, device)
197
198    if device == "gpu" and IS_GPU_AVAILABLE:
199        scaler = StandardScaler_GPU()
200    else:
201        scaler = StandardScaler()
202    D_standardized = scaler.fit_transform(D.reshape(-1, 1)).reshape(D.shape)
203
204    return D_standardized
205
206
207def threshold_mrc_data(
208    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
209    threshold: float,
210    device: str = "cpu",
211) -> np.ndarray:
212    """
213    Keeps only the MRC data above the threshold value, setting the other data to the min value.
214
215    Parameters
216    ----------
217    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
218        MRC file or MRC data (numpy array or cupy array).
219    threshold : float
220        Threshold value.
221    device : str, optional
222        Device to use for computation. Default is cpu.
223
224    Returns
225    -------
226    numpy.ndarray or cupy.ndarray
227        Thresholded MRC data.
228
229    Notes
230    ------
231    This function is agnostic towards data normalization.
232    """
233    D = _ensure_array(mrc_data, device)
234    thresholded_D = np.where(D > threshold, D, np.min(D))
235    return thresholded_D
236
237
238def identify_threshold_ratio(
239    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, ratio: float, device: str = "cpu"
240) -> np.ndarray:
241    """
242    Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1.
243
244    Parameters
245    ----------
246    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
247        MRC file or MRC data (numpy array or cupy array).
248    ratio : float
249        Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points).
250    device : str, optional
251        Device to use for computation. Default is cpu.
252
253    Returns
254    -------
255    numpy.ndarray or cupy.ndarray
256        Density value corresponding to the requested threshold ratio.
257    """
258    assert 0 <= ratio <= 1
259    D = _ensure_array(mrc_data, device)
260    sorted_densities = np.sort(D.flatten())
261    threshold_index = int(round(ratio * len(sorted_densities)))
262    if threshold_index == len(sorted_densities):
263        raise ValueError(
264            f"Threshold index is at the end of the sorted densities. Please choose a lower ratio."
265        )
266    return sorted_densities[threshold_index]
267
268
269def generate_point_cloud_from_mrc_data(
270    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
271    threshold: float,
272    device: str = "cpu",
273) -> np.ndarray:
274    """
275    Generates an array of X, Y, and Z coordinates of the data that is above a threshold.
276
277    Parameters
278    ----------
279    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
280        MRC file or MRC data (numpy array or cupy array).
281    threshold : float
282        Threshold value.
283    device : str, optional
284        Device to use for computation. Default is cpu.
285
286    Returns
287    -------
288    numpy.ndarray or cupy.ndarray
289        Array of X, Y, and Z coordinates of the data that is above the threshold.
290
291    Notes
292    -----
293    This function is agnostic towards data normalization.
294    """
295    D = _ensure_array(mrc_data, device)
296    x, y, z = np.where(D > threshold)
297    xyz_data = np.transpose(np.stack([x, y, z]))
298    return xyz_data
299
300
301def augment_mrc_data(
302    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
303    offset_percent: float = 10,
304    n: int = 1,
305) -> list[np.ndarray] | np.ndarray:
306    """
307    Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage.
308
309    Parameters
310    ----------
311    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray
312        MRC file or MRC data (numpy array).
313    offset_percent : float, optional
314        Offset percentage for random noise. Default is 10.
315    n : int, optional
316        Number of augmented datasets to generate. Default is 1.
317
318    Returns
319    -------
320    list[numpy.ndarray] or numpy.ndarray
321        List of augmented MRC datasets or a single augmented dataset.
322    """
323    D = _ensure_array(mrc_data)
324    assert 0.0 <= offset_percent <= 100
325    assert n >= 1
326
327    random_uniform = np.random.uniform
328
329    if n == 1:
330        random_offsets = (
331            random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) / 100
332        )
333        adjusted_data = D * (1 + random_offsets)
334        return adjusted_data
335    else:
336        adjusted_data_list = []
337        for _ in range(n):
338            random_offsets = (
339                random_uniform(low=-offset_percent, high=offset_percent, size=D.shape)
340                / 100
341            )
342            adjusted_data = D * (1 + random_offsets)
343            adjusted_data_list.append(adjusted_data)
344        return adjusted_data_list
345
346
347def calculate_point_cloud_mass(
348    point_cloud: np.ndarray, voxel_dim: float, density: float
349) -> float:
350    """
351    Calculates the mass of an array of voxel xyz coordinates.
352
353    Parameters
354    ----------
355    point_cloud : numpy.ndarray or cupy.ndarray
356        Array of voxel xyz coordinates.
357    voxel_dim : float
358        Voxel dimension.
359    density : float
360        Density value.
361
362    Returns
363    -------
364    float
365        Estimated mass of the point cloud.
366
367    Notes
368    -----
369    The estimated mass = n_voxels * voxel_dim * density.
370    For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3.
371    The units for `voxel_dim` and `density` must be consistent.
372    Use 0.82 kDa / nm3 for large globular proteins.
373    """
374    n_voxels = point_cloud.shape[0]
375    return n_voxels * voxel_dim * density
376
377
378def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples):
379    """
380    Clusters data using DBSCAN from sklearn or cuml.
381
382    Parameters
383    ----------
384    xyz_data : numpy.ndarray or cupy.ndarray
385        Array of x, y, z coordinates.
386    DBSCAN_epsilon : float
387        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
388    DBSCAN_min_samples : int
389        The number of samples in a neighborhood for a point to be considered as a core point.
390
391    Returns
392    -------
393    sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN: clustering results stored in an object
394
395    See Also
396    -------
397    sklearn.cluster.DBSCAN
398    cuml.cluster.DBSCAN
399    cluster_data_hdbscan
400    cluster_data_optics
401    """
402    if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray):
403        clusterer = DBSCAN_GPU
404    else:
405        clusterer = DBSCAN
406    try:
407        model = clusterer(
408            eps=DBSCAN_epsilon, min_samples=DBSCAN_min_samples
409        )  # apply coarse-graining (DBSCAN)
410        model.fit_predict(xyz_data)
411        return model
412    except:
413        print(
414            'Perhaps, "Fatal Python error: _PyErr_NormalizeException: Cannot recover from MemoryErrors while normalizing exceptions. Python runtime state: initialized" at tahoma for empiar inverted mrc'
415        )
416        print(f"Failed xyz_data.shape:{xyz_data.shape}")
417        print(f"DBSCAN_epsilon:{DBSCAN_epsilon}")
418        print(f"DBSCAN_min_samples:{DBSCAN_min_samples}")
419        return False
420
421
422def cluster_data_hdbscan(xyz_data, HDBSCAN_min_cluster_size, HDBSCAN_min_samples):
423    """
424    Clusters data using HDBSCAN from hdbscan or cuml.
425
426    Parameters
427    ----------
428    xyz_data : numpy.ndarray or cupy.ndarray
429        Array of x, y, z coordinates.
430    HDBSCAN_min_cluster_size : int
431        The minimum number of points in a cluster.
432    HDBSCAN_min_samples : int
433        The number of samples in a neighborhood for a point to be considered as a core point.
434
435    Returns
436    -------
437    hdbscan.HDBSCAN or cuml.cluster.HDBSCAN: clustering results stored in an object
438
439    See Also
440    -------
441    hdbscan.HDBSCAN
442    cuml.cluster.HDBSCAN
443    cluster_data
444    cluster_data_optics
445    """
446    if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray):
447        clusterer = HDBSCAN_GPU
448    else:
449        clusterer = HDBSCAN
450    try:
451        model = clusterer(
452            min_cluster_size=HDBSCAN_min_cluster_size,
453            min_samples=HDBSCAN_min_samples,
454            allow_single_cluster=False,
455            core_dist_n_jobs=-1,
456            approx_min_span_tree=False,
457            cluster_selection_method="eom",
458        )  # apply clustering (HDBSCAN)
459        model.fit(xyz_data)
460        return model
461    except Exception as e:
462        print(f"Error while performing HDBSCAN: {e}")
463        print(f"Failed xyz_data.shape:{xyz_data.shape}")
464        print(f"HDBSCAN_min_cluster_size:{HDBSCAN_min_cluster_size}")
465        print(f"HDBSCAN_min_samples:{HDBSCAN_min_samples}")
466        return False
467
468
469def cluster_data_optics(
470    xyz_data,
471    OPTICS_min_samples,
472    OPTICS_min_cluster_size=None,
473    OPTICS_max_epsilon=np.inf,
474):
475    """
476    Clusters data using OPTICS from sklearn.
477
478    Parameters
479    ----------
480    xyz_data : numpy.ndarray or cupy.ndarray
481        Array of x, y, z coordinates.
482    OPTICS_min_samples : int
483        The number of samples in a neighborhood for a point to be considered as a core point.
484    OPTICS_min_cluster_size : int, optional
485        The minimum number of points in a cluster. Default is None.
486    OPTICS_max_epsilon : float, optional
487        The maximum distance between points considered by OPTICS. Default is numpy.inf.
488
489    Returns
490    -------
491    sklearn.cluster.OPTICS: clustering results stored in an object
492
493    See Also
494    -------
495    sklearn.cluster.OPTICS
496    cluster_data_hdbscan
497    cluster_data
498    """
499    try:
500        model = OPTICS(
501            min_cluster_size=OPTICS_min_cluster_size,
502            min_samples=OPTICS_min_samples,
503            max_eps=OPTICS_max_epsilon,
504            n_jobs=-1,
505        )  # apply clustering (OPTICS)
506        model.fit(xyz_data)
507        return model
508    except Exception as e:
509        print(f"Error while performing OPTICS: {e}")
510        print(f"Failed xyz_data.shape:{xyz_data.shape}")
511        print(f"OPTICS_min_cluster_size:{OPTICS_min_cluster_size}")
512        print(f"OPTICS_min_samples:{OPTICS_min_samples}")
513        return False
514
515
516def voxel_coarsening(voxel_dim, point_cloud, labels, density=None, averaged=False):
517    """
518    Coarsens largest cluster into voxel_dim^3 chunks.
519
520    Parameters
521    ----------
522    voxel_dim : int
523        Chunk dimension for coarsening.
524    point_cloud : numpy.ndarray or cupy.ndarray
525        Array of x, y, z coordinates.
526    labels : numpy.ndarray or cupy.ndarray
527        Array of cluster labels.
528    density : numpy.ndarray or cupy.ndarray, optional
529        Array of density values. Default is None.
530    averaged : bool, optional
531        If True, the coarsened point cloud will be the average of the points in each chunk, using the density as weights, if provided. Default is False.
532
533    Returns
534    -------
535    numpy.ndarray or cupy.ndarray
536        Coordinates of the centroid of each chunk that contain more than half of the non-empty voxels.
537    """
538    max_x_dim, max_y_dim, max_z_dim = np.max(point_cloud, axis=0) + 1
539    coarsened_dim_x = int(np.ceil(max_x_dim / voxel_dim))
540    coarsened_dim_y = int(np.ceil(max_y_dim / voxel_dim))
541    coarsened_dim_z = int(np.ceil(max_z_dim / voxel_dim))
542    largest_cluster = np.argmax(np.bincount(labels[labels != -1]))
543    dense_bins_count = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z))
544    label_mask = labels == largest_cluster
545    isolated_points = point_cloud[label_mask]
546    coarsened_points = isolated_points // voxel_dim
547    coarsened_points = coarsened_points.astype(int)
548    if density is not None:
549        isolated_density = density[label_mask]
550        coarse_dense_count = np.zeros_like(dense_bins_count)
551    if not averaged:
552        for point in coarsened_points:
553            dense_bins_count[point[0], point[1], point[2]] += 1
554        x, y, z = np.where(dense_bins_count >= (voxel_dim**3) / 2)
555        coarsened_point_cloud = np.transpose(np.stack([x, y, z])) * 3 + 1
556    else:
557        dense_bins = np.zeros((coarsened_dim_x, coarsened_dim_y, coarsened_dim_z, 3))
558        if density is not None:
559            np.add.at(
560                dense_bins,
561                (
562                    coarsened_points[:, 0],
563                    coarsened_points[:, 1],
564                    coarsened_points[:, 2],
565                ),
566                isolated_points * isolated_density[:, np.newaxis],
567            )
568            np.add.at(
569                dense_bins_count,
570                (
571                    coarsened_points[:, 0],
572                    coarsened_points[:, 1],
573                    coarsened_points[:, 2],
574                ),
575                isolated_density,
576            )
577            np.add.at(
578                coarse_dense_count,
579                (
580                    coarsened_points[:, 0],
581                    coarsened_points[:, 1],
582                    coarsened_points[:, 2],
583                ),
584                1,
585            )
586        else:
587            for coarse_point, raw_point in zip(coarsened_points, isolated_points):
588                dense_bins[
589                    coarse_point[0], coarse_point[1], coarse_point[2]
590                ] += raw_point
591                dense_bins_count[coarse_point[0], coarse_point[1], coarse_point[2]] += 1
592        dense_bin_points = np.where(dense_bins_count > 0)
593        coarsened_point_cloud = (
594            dense_bins[dense_bin_points]
595            / dense_bins_count[dense_bin_points][:, np.newaxis]
596        )
597        if density is not None:
598            coarsened_density = (
599                dense_bins_count[dense_bin_points]
600                / coarse_dense_count[dense_bin_points]
601            )
602            return coarsened_point_cloud, coarsened_density
603    return coarsened_point_cloud
604
605
606def get_cluster_centroids(xyz_data, model):
607    """
608    Coarse grain density model using cluster centroids.
609
610    Parameters
611    ----------
612    xyz_data : numpy.ndarray or cupy.ndarray
613        Array of x, y, z coordinates.
614    model : sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN
615        DBSCAN clustering results stored in an object.
616
617    Returns
618    -------
619    numpy.ndarray or cupy.ndarray
620        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
621    """
622    samples_w_lbls = np.concatenate((xyz_data, model.labels_[:, np.newaxis]), axis=1)
623    if -1 in set(model.labels_):  # if noise detected
624        coarse_model = np.zeros(
625            (len(set(model.labels_)) - 1, 3)
626        )  # remove last label which is noise
627        for i in range(len(set(model.labels_)) - 1):
628            # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
629            tmp_T = np.transpose(
630                samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))]
631            )
632            x_mean = np.mean(tmp_T[0])
633            y_mean = np.mean(tmp_T[1])
634            z_mean = np.mean(tmp_T[2])
635            coarse_model[i] = [x_mean, y_mean, z_mean]
636    else:
637        coarse_model = np.zeros((len(set(model.labels_)), 3))
638        for i in range(len(set(model.labels_))):
639            # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
640            tmp_T = np.transpose(
641                samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))]
642            )
643            x_mean = np.mean(tmp_T[0])
644            y_mean = np.mean(tmp_T[1])
645            z_mean = np.mean(tmp_T[2])
646            coarse_model[i] = [x_mean, y_mean, z_mean]
647    return coarse_model
648
649
650def plot_clustering_results(xyz_data, coarse_model, figsize=3):
651    """
652    Creates a 3D scatter plot containing both the xyz data and the cluster centroids.
653
654    Note: should rotate afterwards for better visualization.
655
656    Parameters
657    ----------
658    xyz_data : numpy.ndarray or cupy.ndarray
659        Array of x, y, z coordinates.
660    coarse_model : numpy.ndarray or cupy.ndarray
661        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
662    figsize : int, optional
663        Size of the figure. Default is 3.
664
665    Returns
666    -------
667    matplotlib.figure.Figure
668        3D scatter plot figure.
669    """
670    fig = plt.figure(figsize=(figsize, figsize))
671    plt.title("clustering results")
672    ax = fig.add_subplot(projection="3d")
673    ax.scatter(
674        xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3
675    )
676    ax.scatter(
677        coarse_model[:, 0],
678        coarse_model[:, 1],
679        coarse_model[:, 2],
680        c="k",
681        s=10,
682        alpha=0.9,
683    )
684    return fig
685
686
687def create_cugraph_from_point_cloud(point_cloud, proximity_px):
688    """
689    Creates a cuGraph object from a point cloud with edges assigned based on proximity.
690
691    Parameters
692    ----------
693    point_cloud : numpy.ndarray or cupy.ndarray
694        Array of x, y, z coordinates.
695    proximity_px : float
696        Pairwise cutoff distance for assigning edges to nodes, in pixels.
697    savepath : str, optional
698        Path to save the cuGraph object as a GML file. If None, don't save.
699        Default is None.
700
701    Returns
702    -------
703    cugraph.Graph
704        Unweighted, undirected cuGraph graph object.
705
706    Notes
707    -----
708    The cuGraph graph is unweighted and undirected.
709    Uses the KDTree algorithm to find the neighbors under the cutoff distance.
710    """
711    if IS_GPU_AVAILABLE:
712        if isinstance(point_cloud, cp.ndarray):
713            point_cloud = cp.asnumpy(point_cloud)
714        tree = KDTree(point_cloud)
715        edgelist = cp.array(tree.query_pairs(proximity_px, output_type="ndarray"))
716        edgelist_cudf = cudf.DataFrame(edgelist, columns=["source", "destination"])
717        edgelist_cudf = cg.symmetrize_df(edgelist_cudf, "source", "destination")
718        G = cg.Graph()
719        G.from_cudf_edgelist(edgelist_cudf, source="source", destination="destination")
720        return G
721    else:
722        raise ValueError("GPU support is not available.")
723
724
725def create_igraph_from_point_cloud(point_cloud, proximity_px, savepath=None):
726    """
727    Creates an igraph object from a point cloud with edges assigned based on proximity.
728
729    Parameters
730    ----------
731    point_cloud : numpy.ndarray
732        Array of x, y, z coordinates.
733    proximity_px : float
734        Pairwise cutoff distance for assigning edges to nodes, in pixels.
735    savepath : str, optional
736        Path to save the igraph object as a GML file. If None, don't save.
737        Default is None.
738
739    Returns
740    -------
741    igraph.Graph
742        Unweighted, undirected igraph graph object.
743
744    Notes
745    -----
746    The igraph graph is unweighted and undirected.
747    Uses the KDTree algorithm to find the neighbors under the cutoff distance.
748    """
749    tree = KDTree(point_cloud)
750    edgelist = tree.query_pairs(proximity_px, output_type="ndarray")
751    G = ig.Graph(edgelist)
752    if savepath:
753        G.write(savepath, format="gml")
754    return G
755
756
757def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True):
758    """
759    Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf).
760
761    Parameters
762    ----------
763    coarse_model : numpy.ndarray or cupy.ndarray
764        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
765    proximity_px : float
766        Pairwise cutoff distance for assigning edges to nodes, in pixels.
767    out_fname : str
768        Filename for output.
769    save : bool, optional
770        Flag to save file (True) or not (False). Default is True.
771
772    Returns
773    -------
774    networkx.Graph
775        Networkx graph representation of coarse model (cluster centroids).
776    """
777    tree = KDTree(coarse_model)
778    edgelist = tree.query_pairs(proximity_px, output_type="ndarray")
779
780    # Arsam Firoozfar "It appears that the current code using nx is slightly faster than using igraph to directly create graph from matrix." 3/31/2023
781
782    G = nx.from_edgelist(edgelist)
783
784    if save:
785        nx.write_gexf(G, f"{out_fname}.gexf")
786    return G
787
788
789def add_Gaussian_noise(mrc, loc=0.0, scale=1.0):
790    """
791    Adds Gaussian white noise to the data in an mrc file.
792
793    Parameters
794    ----------
795    mrc : mrcfile.mrcfile.MrcFile
796        MRC object to add noise to.
797    loc : float, optional
798        Mean of Gaussian distribution. Default is 0.
799    scale : float, optional
800        Standard deviation of Gaussian distribution. Default is 1.
801
802    Returns
803    -------
804    numpy.ndarray or cupy.ndarray
805        Data with noise added.
806    """
807    D = mrc.data
808    noise = np.random.normal(loc=loc, scale=scale, size=D.shape)
809    D_w_noise = D + noise
810    return D_w_noise
811
812
813# TODO: Add documentation for leave_2D_density
814def leave_2D_density(mrc_data, middle_z_index, xyz_data):
815    three_D_ndarray_left = np.zeros(
816        (mrc_data.shape[0], mrc_data.shape[1], mrc_data.shape[2])
817    )
818    for k in range(xyz_data.shape[0]):
819        # print (xyz_data[k]) # 0 0 64
820        # print (type(xyz_data[k])) #<class 'numpy.ndarray'>
821        x_to_keep = xyz_data[k][0]  # 0
822        y_to_keep = xyz_data[k][1]  # 0
823        z_to_keep = xyz_data[k][2]  # 64
824        three_D_ndarray_left[x_to_keep, y_to_keep, z_to_keep] = mrc_data[
825            x_to_keep, y_to_keep, z_to_keep
826        ]
827    return three_D_ndarray_left
828
829
830def main(args):
831    """
832    Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network.
833    Outputs a .png file of the coarse grained model, and a .gexf graph xml file.
834
835    Parameters
836    ----------
837    args : argparse.Namespace
838        Argument parser object containing the following attributes:
839        - fname : str
840            .mrc filename (white density w/ black background).
841        - mrc_t : float
842            Unnormalized pixel intensity threshold level.
843        - eps : float
844            DBSCAN epsilon (inter cluster distance).
845        - ms : int
846            DBSCAN min samples (minimum number of samples in cluster).
847        - d_cut : float
848            Pairwise distance cutoff for assigning edges to graph, in pixels.
849    """
850    fname = Path(args.fname)
851    mrc_t = args.mrc_t
852    DBSCAN_epsilon = args.eps  # try 1
853    DBSCAN_min_samples = args.ms  # try 4
854    d_cut = args.d_cut  # try 8
855    out_fname = fname.with_suffix("")
856
857    mrc = load_density_file(fname)
858    xyz_data = normalize_and_threshold_data(mrc, mrc_t)
859
860    # print (f"xyz_data:{xyz_data}")
861
862    model = cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples)
863    coarse_model = get_cluster_centroids(xyz_data, model)
864    G = create_and_save_graph(coarse_model, d_cut, out_fname)
865    fig = plot_clustering_results(xyz_data, coarse_model)
866
867
868if __name__ == "__main__":
869
870    # example: >> python density2graph.py fname.mrc 0.5 1 4 8
871    parser = argparse.ArgumentParser()
872    parser.add_argument(
873        "fname",
874        help="tomogram .mrc filename (white density w/ black background)",
875        type=str,
876    )
877    parser.add_argument(
878        "t", help="pixel intensity threshold cutoff (unormalized)", type=float
879    )
880    parser.add_argument(
881        "eps", help="DBSCAN epsilon (inter cluster distance) in pixels", type=float
882    )
883    parser.add_argument(
884        "ms", help="DBSCAN min samples (minimum number of samples in cluster)", type=int
885    )
886    parser.add_argument(
887        "d_cut",
888        help="pairwise distance cutoff for assigning edges in pixels",
889        type=float,
890    )
891    args = parser.parse_args()
892    main(args)

def load_density_file(fname, print_warning=True): View Source

54def load_density_file(fname, print_warning=True):
55    """
56    Load a .mrc file using the mrcfile package.
57
58    Parameters
59    ----------
60    fname : str
61        Filename or filepath of the .mrc file.
62    print_warning : bool, optional
63        If True, prints a warning message about data inversion. Default is True.
64
65    Returns
66    -------
67    mrcfile.mrcfile.MrcFile
68        MRC file object containing the header and data properties.
69    """
70    # load .mrc tomogram file as a MRC object which has header and data properties.
71    # see: https://mrcfile.readthedocs.io/en/latest/usage_guide.html
72    mrc = mrcfile.mmap(fname, mode="r")  # memory mapped mode for large files
73    # warn('WARNING: check the inversion of the data. White voxels should correspond to high density.')
74    if print_warning:
75        print(
76            "WARNING: check the inversion of the data. White voxels should correspond to high density."
77        )
78    return mrc

Load a .mrc file using the mrcfile package.

Parameters

fname (str): Filename or filepath of the .mrc file.
print_warning (bool, optional): If True, prints a warning message about data inversion. Default is True.

Returns

mrcfile.mrcfile.MrcFile: MRC file object containing the header and data properties.

def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True): View Source

 81def normalize_and_threshold_data(mrc, mrc_t, noise_stdev=0.0, norm_T=True):
 82    """
 83    Normalizes threshold value and densities then applies a cutoff threshold.
 84
 85    Parameters
 86    ----------
 87    mrc : mrcfile.mrcfile.MrcFile
 88        MRC data.
 89    mrc_t : float
 90        Raw (unnormalized) pixel intensity cutoff threshold.
 91    noise_stdev : float, optional
 92        Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added).
 93    norm_T : bool, optional
 94        If True, threshold value is normalized. Default is True.
 95
 96    Returns
 97    -------
 98    numpy.ndarray or cupy.ndarray
 99        Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0].
100    """
101    warn("This method is deprecated.", DeprecationWarning, stacklevel=2)
102    # load and normalize data, normalize threshold value
103    if noise_stdev == 0:
104        D = mrc.data
105    else:
106        assert noise_stdev >= 0
107        print("add_Gaussian_noise")
108        D = add_Gaussian_noise(mrc, scale=noise_stdev)
109
110    # get x,y,z coordinates above threshold
111    if norm_T == False:
112        print("Do not normalize")
113        x, y, z = np.where(D > mrc_t)
114    else:
115        D_min = np.min(D)
116        D_max = np.max(D)
117        D_norm = (D - D_min) / (
118            D_max - D_min
119        )  # normalize to 0,1: (x_i-x_min) / (x_max - x_min)
120
121        t_norm = (mrc_t - D_min) / (D_max - D_min)
122        # Doo Nam confirmed that normalization of density ran well.
123        # Doo Nam confirmed that as long as he puts reasonable mrc_t, t_norm is 0~1
124        # However, since both density and t_norm are normalized, there was no practice difference for xyz_data (regardless of norm_T)
125
126        x, y, z = np.where(D_norm > t_norm)
127    xyz_data = np.transpose(np.stack([x, y, z]))
128    return xyz_data

Normalizes threshold value and densities then applies a cutoff threshold.

Parameters

mrc (mrcfile.mrcfile.MrcFile): MRC data.
mrc_t (float): Raw (unnormalized) pixel intensity cutoff threshold.
noise_stdev (float, optional): Standard deviation of Gaussian noise (mean=0) to add. Default is 0 (no noise added).
norm_T (bool, optional): If True, threshold value is normalized. Default is True.

Returns

numpy.ndarray or cupy.ndarray: Array of x, y, z coordinates which are above the cutoff threshold. A[0] = [x0, y0, z0].

def normalize_mrc_data( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, lower_bound: float = -1, upper_bound: float = 1, device: str = 'cpu') -> numpy.ndarray: View Source

134def normalize_mrc_data(
135    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
136    lower_bound: float = -1,
137    upper_bound: float = 1,
138    device: str = "cpu",
139) -> np.ndarray:
140    """Normalizes the data of an MRC file.
141
142    Normalizes the data from `mrc` to the range of
143    [`lower_bound`, `upper_bound`].
144
145    Parameters
146    ----------
147    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
148        MRC file or MRC data (numpy array or cupy array).
149    lower_bound : float, optional
150        Lower bound of the normalized data. Default is -1.
151    upper_bound : float, optional
152        Upper bound of the normalized data. Default is 1.
153    device : str, optional
154        Device to use for computation. Default is cpu.
155
156    Returns
157    -------
158    numpy.ndarray or cupy.ndarray
159        Array of normalized MRC data.
160
161    Notes
162    -----
163    x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x))
164    """
165    D = _ensure_array(mrc_data, device)
166    assert lower_bound < upper_bound
167    D_min = np.min(D)
168    D_max = np.max(D)
169    D_norm = lower_bound + (upper_bound - lower_bound) * (D - D_min) / (D_max - D_min)
170    return D_norm

Normalizes the data of an MRC file.

Normalizes the data from mrc to the range of [lower_bound, upper_bound].

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
lower_bound (float, optional): Lower bound of the normalized data. Default is -1.
upper_bound (float, optional): Upper bound of the normalized data. Default is 1.
device (str, optional): Device to use for computation. Default is cpu.

Returns

numpy.ndarray or cupy.ndarray: Array of normalized MRC data.

Notes

x_norm = lower_bound + (x - min(x))(upper_bound-lower_bound)/(max(x)-min(x))

def standardize_mrc_data( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, device: str = 'cpu') -> numpy.ndarray: View Source

173def standardize_mrc_data(
174    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, device: str = "cpu"
175) -> np.ndarray:
176    """Standardizes the data using scikit-learn's StandardScaler.
177
178    Parameters
179    ----------
180    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
181        MRC file or MRC data (numpy array or cupy array).
182    device : str, optional
183        Device to use for computation. Default is cpu.
184
185    Returns
186    -------
187    numpy.ndarray or cupy.ndarray
188        Array of standardized MRC data.
189
190    Notes
191    -----
192    Standardization: z = (x - u) / s
193    u: mean of the data
194    s: standard deviation of the data
195
196    """
197    D = _ensure_array(mrc_data, device)
198
199    if device == "gpu" and IS_GPU_AVAILABLE:
200        scaler = StandardScaler_GPU()
201    else:
202        scaler = StandardScaler()
203    D_standardized = scaler.fit_transform(D.reshape(-1, 1)).reshape(D.shape)
204
205    return D_standardized

Standardizes the data using scikit-learn's StandardScaler.

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
device (str, optional): Device to use for computation. Default is cpu.

Returns

numpy.ndarray or cupy.ndarray: Array of standardized MRC data.

Notes

Standardization: z = (x - u) / s u: mean of the data s: standard deviation of the data

def threshold_mrc_data( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, threshold: float, device: str = 'cpu') -> numpy.ndarray: View Source

208def threshold_mrc_data(
209    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
210    threshold: float,
211    device: str = "cpu",
212) -> np.ndarray:
213    """
214    Keeps only the MRC data above the threshold value, setting the other data to the min value.
215
216    Parameters
217    ----------
218    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
219        MRC file or MRC data (numpy array or cupy array).
220    threshold : float
221        Threshold value.
222    device : str, optional
223        Device to use for computation. Default is cpu.
224
225    Returns
226    -------
227    numpy.ndarray or cupy.ndarray
228        Thresholded MRC data.
229
230    Notes
231    ------
232    This function is agnostic towards data normalization.
233    """
234    D = _ensure_array(mrc_data, device)
235    thresholded_D = np.where(D > threshold, D, np.min(D))
236    return thresholded_D

Keeps only the MRC data above the threshold value, setting the other data to the min value.

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
threshold (float): Threshold value.
device (str, optional): Device to use for computation. Default is cpu.

Returns

numpy.ndarray or cupy.ndarray: Thresholded MRC data.

Notes

This function is agnostic towards data normalization.

def identify_threshold_ratio( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, ratio: float, device: str = 'cpu') -> numpy.ndarray: View Source

239def identify_threshold_ratio(
240    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray, ratio: float, device: str = "cpu"
241) -> np.ndarray:
242    """
243    Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1.
244
245    Parameters
246    ----------
247    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
248        MRC file or MRC data (numpy array or cupy array).
249    ratio : float
250        Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points).
251    device : str, optional
252        Device to use for computation. Default is cpu.
253
254    Returns
255    -------
256    numpy.ndarray or cupy.ndarray
257        Density value corresponding to the requested threshold ratio.
258    """
259    assert 0 <= ratio <= 1
260    D = _ensure_array(mrc_data, device)
261    sorted_densities = np.sort(D.flatten())
262    threshold_index = int(round(ratio * len(sorted_densities)))
263    if threshold_index == len(sorted_densities):
264        raise ValueError(
265            f"Threshold index is at the end of the sorted densities. Please choose a lower ratio."
266        )
267    return sorted_densities[threshold_index]

Given the MRC file, finds the density value that corresponds to the requested threshold ratio from 0 to 1.

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
ratio (float): Threshold ratio from 0 to 1. 1 = max density (remove all points), 0 = min density (keep all points).
device (str, optional): Device to use for computation. Default is cpu.

Returns

numpy.ndarray or cupy.ndarray: Density value corresponding to the requested threshold ratio.

def generate_point_cloud_from_mrc_data( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, threshold: float, device: str = 'cpu') -> numpy.ndarray: View Source

270def generate_point_cloud_from_mrc_data(
271    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
272    threshold: float,
273    device: str = "cpu",
274) -> np.ndarray:
275    """
276    Generates an array of X, Y, and Z coordinates of the data that is above a threshold.
277
278    Parameters
279    ----------
280    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray
281        MRC file or MRC data (numpy array or cupy array).
282    threshold : float
283        Threshold value.
284    device : str, optional
285        Device to use for computation. Default is cpu.
286
287    Returns
288    -------
289    numpy.ndarray or cupy.ndarray
290        Array of X, Y, and Z coordinates of the data that is above the threshold.
291
292    Notes
293    -----
294    This function is agnostic towards data normalization.
295    """
296    D = _ensure_array(mrc_data, device)
297    x, y, z = np.where(D > threshold)
298    xyz_data = np.transpose(np.stack([x, y, z]))
299    return xyz_data

Generates an array of X, Y, and Z coordinates of the data that is above a threshold.

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray or cupy.ndarray): MRC file or MRC data (numpy array or cupy array).
threshold (float): Threshold value.
device (str, optional): Device to use for computation. Default is cpu.

Returns

numpy.ndarray or cupy.ndarray: Array of X, Y, and Z coordinates of the data that is above the threshold.

Notes

This function is agnostic towards data normalization.

def augment_mrc_data( mrc_data: mrcfile.mrcfile.MrcFile | numpy.ndarray, offset_percent: float = 10, n: int = 1) -> list[numpy.ndarray] | numpy.ndarray: View Source

302def augment_mrc_data(
303    mrc_data: mrcfile.mrcfile.MrcFile | np.ndarray,
304    offset_percent: float = 10,
305    n: int = 1,
306) -> list[np.ndarray] | np.ndarray:
307    """
308    Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage.
309
310    Parameters
311    ----------
312    mrc_data : mrcfile.mrcfile.MrcFile or numpy.ndarray
313        MRC file or MRC data (numpy array).
314    offset_percent : float, optional
315        Offset percentage for random noise. Default is 10.
316    n : int, optional
317        Number of augmented datasets to generate. Default is 1.
318
319    Returns
320    -------
321    list[numpy.ndarray] or numpy.ndarray
322        List of augmented MRC datasets or a single augmented dataset.
323    """
324    D = _ensure_array(mrc_data)
325    assert 0.0 <= offset_percent <= 100
326    assert n >= 1
327
328    random_uniform = np.random.uniform
329
330    if n == 1:
331        random_offsets = (
332            random_uniform(low=-offset_percent, high=offset_percent, size=D.shape) / 100
333        )
334        adjusted_data = D * (1 + random_offsets)
335        return adjusted_data
336    else:
337        adjusted_data_list = []
338        for _ in range(n):
339            random_offsets = (
340                random_uniform(low=-offset_percent, high=offset_percent, size=D.shape)
341                / 100
342            )
343            adjusted_data = D * (1 + random_offsets)
344            adjusted_data_list.append(adjusted_data)
345        return adjusted_data_list

Generates N augmented MRC datasets that are injected with random noise from a uniform distribution bounded by the offset percentage.

Parameters

mrc_data (mrcfile.mrcfile.MrcFile or numpy.ndarray): MRC file or MRC data (numpy array).
offset_percent (float, optional): Offset percentage for random noise. Default is 10.
n (int, optional): Number of augmented datasets to generate. Default is 1.

Returns

list[numpy.ndarray] or numpy.ndarray: List of augmented MRC datasets or a single augmented dataset.

def calculate_point_cloud_mass(point_cloud: numpy.ndarray, voxel_dim: float, density: float) -> float: View Source

348def calculate_point_cloud_mass(
349    point_cloud: np.ndarray, voxel_dim: float, density: float
350) -> float:
351    """
352    Calculates the mass of an array of voxel xyz coordinates.
353
354    Parameters
355    ----------
356    point_cloud : numpy.ndarray or cupy.ndarray
357        Array of voxel xyz coordinates.
358    voxel_dim : float
359        Voxel dimension.
360    density : float
361        Density value.
362
363    Returns
364    -------
365    float
366        Estimated mass of the point cloud.
367
368    Notes
369    -----
370    The estimated mass = n_voxels * voxel_dim * density.
371    For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3.
372    The units for `voxel_dim` and `density` must be consistent.
373    Use 0.82 kDa / nm3 for large globular proteins.
374    """
375    n_voxels = point_cloud.shape[0]
376    return n_voxels * voxel_dim * density

Calculates the mass of an array of voxel xyz coordinates.

Parameters

point_cloud (numpy.ndarray or cupy.ndarray): Array of voxel xyz coordinates.
voxel_dim (float): Voxel dimension.
density (float): Density value.

Returns

float: Estimated mass of the point cloud.

Notes

The estimated mass = n_voxels * voxel_dim * density. For example, mass in kDa = 10 voxels * 1 nm3/ voxel * 0.82 kDa / nm3. The units for voxel_dim and density must be consistent. Use 0.82 kDa / nm3 for large globular proteins.

def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples): View Source

379def cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples):
380    """
381    Clusters data using DBSCAN from sklearn or cuml.
382
383    Parameters
384    ----------
385    xyz_data : numpy.ndarray or cupy.ndarray
386        Array of x, y, z coordinates.
387    DBSCAN_epsilon : float
388        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
389    DBSCAN_min_samples : int
390        The number of samples in a neighborhood for a point to be considered as a core point.
391
392    Returns
393    -------
394    sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN: clustering results stored in an object
395
396    See Also
397    -------
398    sklearn.cluster.DBSCAN
399    cuml.cluster.DBSCAN
400    cluster_data_hdbscan
401    cluster_data_optics
402    """
403    if IS_GPU_AVAILABLE and isinstance(xyz_data, cp.ndarray):
404        clusterer = DBSCAN_GPU
405    else:
406        clusterer = DBSCAN
407    try:
408        model = clusterer(
409            eps=DBSCAN_epsilon, min_samples=DBSCAN_min_samples
410        )  # apply coarse-graining (DBSCAN)
411        model.fit_predict(xyz_data)
412        return model
413    except:
414        print(
415            'Perhaps, "Fatal Python error: _PyErr_NormalizeException: Cannot recover from MemoryErrors while normalizing exceptions. Python runtime state: initialized" at tahoma for empiar inverted mrc'
416        )
417        print(f"Failed xyz_data.shape:{xyz_data.shape}")
418        print(f"DBSCAN_epsilon:{DBSCAN_epsilon}")
419        print(f"DBSCAN_min_samples:{DBSCAN_min_samples}")
420        return False

Clusters data using DBSCAN from sklearn or cuml.

Parameters

xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.

Returns

sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN (clustering results stored in an object):

Parameters

xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
HDBSCAN_min_cluster_size (int): The minimum number of points in a cluster.
HDBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.

Returns

hdbscan.HDBSCAN or cuml.cluster.HDBSCAN (clustering results stored in an object):

Parameters

xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
OPTICS_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
OPTICS_min_cluster_size (int, optional): The minimum number of points in a cluster. Default is None.
OPTICS_max_epsilon (float, optional): The maximum distance between points considered by OPTICS. Default is numpy.inf.

Returns

sklearn.cluster.OPTICS (clustering results stored in an object):

Parameters

voxel_dim (int): Chunk dimension for coarsening.
point_cloud (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
labels (numpy.ndarray or cupy.ndarray): Array of cluster labels.
density (numpy.ndarray or cupy.ndarray, optional): Array of density values. Default is None.
averaged (bool, optional): If True, the coarsened point cloud will be the average of the points in each chunk, using the density as weights, if provided. Default is False.

Returns

numpy.ndarray or cupy.ndarray: Coordinates of the centroid of each chunk that contain more than half of the non-empty voxels.

def get_cluster_centroids(xyz_data, model): View Source

607def get_cluster_centroids(xyz_data, model):
608    """
609    Coarse grain density model using cluster centroids.
610
611    Parameters
612    ----------
613    xyz_data : numpy.ndarray or cupy.ndarray
614        Array of x, y, z coordinates.
615    model : sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN
616        DBSCAN clustering results stored in an object.
617
618    Returns
619    -------
620    numpy.ndarray or cupy.ndarray
621        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
622    """
623    samples_w_lbls = np.concatenate((xyz_data, model.labels_[:, np.newaxis]), axis=1)
624    if -1 in set(model.labels_):  # if noise detected
625        coarse_model = np.zeros(
626            (len(set(model.labels_)) - 1, 3)
627        )  # remove last label which is noise
628        for i in range(len(set(model.labels_)) - 1):
629            # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
630            tmp_T = np.transpose(
631                samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))]
632            )
633            x_mean = np.mean(tmp_T[0])
634            y_mean = np.mean(tmp_T[1])
635            z_mean = np.mean(tmp_T[2])
636            coarse_model[i] = [x_mean, y_mean, z_mean]
637    else:
638        coarse_model = np.zeros((len(set(model.labels_)), 3))
639        for i in range(len(set(model.labels_))):
640            # https://stackoverflow.com/questions/55604239/find-which-points-belong-to-a-cluster-in-dbscan-in-python
641            tmp_T = np.transpose(
642                samples_w_lbls[np.in1d(samples_w_lbls[:, -1], np.asarray([i]))]
643            )
644            x_mean = np.mean(tmp_T[0])
645            y_mean = np.mean(tmp_T[1])
646            z_mean = np.mean(tmp_T[2])
647            coarse_model[i] = [x_mean, y_mean, z_mean]
648    return coarse_model

Coarse grain density model using cluster centroids.

Parameters

xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
model (sklearn.cluster.DBSCAN or cuml.cluster.DBSCAN): DBSCAN clustering results stored in an object.

Returns

numpy.ndarray or cupy.ndarray: Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].

def plot_clustering_results(xyz_data, coarse_model, figsize=3): View Source

651def plot_clustering_results(xyz_data, coarse_model, figsize=3):
652    """
653    Creates a 3D scatter plot containing both the xyz data and the cluster centroids.
654
655    Note: should rotate afterwards for better visualization.
656
657    Parameters
658    ----------
659    xyz_data : numpy.ndarray or cupy.ndarray
660        Array of x, y, z coordinates.
661    coarse_model : numpy.ndarray or cupy.ndarray
662        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
663    figsize : int, optional
664        Size of the figure. Default is 3.
665
666    Returns
667    -------
668    matplotlib.figure.Figure
669        3D scatter plot figure.
670    """
671    fig = plt.figure(figsize=(figsize, figsize))
672    plt.title("clustering results")
673    ax = fig.add_subplot(projection="3d")
674    ax.scatter(
675        xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2], c="purple", s=2, alpha=0.3
676    )
677    ax.scatter(
678        coarse_model[:, 0],
679        coarse_model[:, 1],
680        coarse_model[:, 2],
681        c="k",
682        s=10,
683        alpha=0.9,
684    )
685    return fig

Creates a 3D scatter plot containing both the xyz data and the cluster centroids.

Note: should rotate afterwards for better visualization.

Parameters

xyz_data (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
coarse_model (numpy.ndarray or cupy.ndarray): Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
figsize (int, optional): Size of the figure. Default is 3.

Returns

matplotlib.figure.Figure: 3D scatter plot figure.

def create_cugraph_from_point_cloud(point_cloud, proximity_px): View Source

688def create_cugraph_from_point_cloud(point_cloud, proximity_px):
689    """
690    Creates a cuGraph object from a point cloud with edges assigned based on proximity.
691
692    Parameters
693    ----------
694    point_cloud : numpy.ndarray or cupy.ndarray
695        Array of x, y, z coordinates.
696    proximity_px : float
697        Pairwise cutoff distance for assigning edges to nodes, in pixels.
698    savepath : str, optional
699        Path to save the cuGraph object as a GML file. If None, don't save.
700        Default is None.
701
702    Returns
703    -------
704    cugraph.Graph
705        Unweighted, undirected cuGraph graph object.
706
707    Notes
708    -----
709    The cuGraph graph is unweighted and undirected.
710    Uses the KDTree algorithm to find the neighbors under the cutoff distance.
711    """
712    if IS_GPU_AVAILABLE:
713        if isinstance(point_cloud, cp.ndarray):
714            point_cloud = cp.asnumpy(point_cloud)
715        tree = KDTree(point_cloud)
716        edgelist = cp.array(tree.query_pairs(proximity_px, output_type="ndarray"))
717        edgelist_cudf = cudf.DataFrame(edgelist, columns=["source", "destination"])
718        edgelist_cudf = cg.symmetrize_df(edgelist_cudf, "source", "destination")
719        G = cg.Graph()
720        G.from_cudf_edgelist(edgelist_cudf, source="source", destination="destination")
721        return G
722    else:
723        raise ValueError("GPU support is not available.")

Creates a cuGraph object from a point cloud with edges assigned based on proximity.

Parameters

point_cloud (numpy.ndarray or cupy.ndarray): Array of x, y, z coordinates.
proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
savepath (str, optional): Path to save the cuGraph object as a GML file. If None, don't save. Default is None.

Returns

cugraph.Graph: Unweighted, undirected cuGraph graph object.

Notes

The cuGraph graph is unweighted and undirected. Uses the KDTree algorithm to find the neighbors under the cutoff distance.

def create_igraph_from_point_cloud(point_cloud, proximity_px, savepath=None): View Source

726def create_igraph_from_point_cloud(point_cloud, proximity_px, savepath=None):
727    """
728    Creates an igraph object from a point cloud with edges assigned based on proximity.
729
730    Parameters
731    ----------
732    point_cloud : numpy.ndarray
733        Array of x, y, z coordinates.
734    proximity_px : float
735        Pairwise cutoff distance for assigning edges to nodes, in pixels.
736    savepath : str, optional
737        Path to save the igraph object as a GML file. If None, don't save.
738        Default is None.
739
740    Returns
741    -------
742    igraph.Graph
743        Unweighted, undirected igraph graph object.
744
745    Notes
746    -----
747    The igraph graph is unweighted and undirected.
748    Uses the KDTree algorithm to find the neighbors under the cutoff distance.
749    """
750    tree = KDTree(point_cloud)
751    edgelist = tree.query_pairs(proximity_px, output_type="ndarray")
752    G = ig.Graph(edgelist)
753    if savepath:
754        G.write(savepath, format="gml")
755    return G

Creates an igraph object from a point cloud with edges assigned based on proximity.

Parameters

point_cloud (numpy.ndarray): Array of x, y, z coordinates.
proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
savepath (str, optional): Path to save the igraph object as a GML file. If None, don't save. Default is None.

Returns

igraph.Graph: Unweighted, undirected igraph graph object.

Notes

The igraph graph is unweighted and undirected. Uses the KDTree algorithm to find the neighbors under the cutoff distance.

def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True): View Source

758def create_and_save_graph(coarse_model, proximity_px, out_fname, save=True):
759    """
760    Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf).
761
762    Parameters
763    ----------
764    coarse_model : numpy.ndarray or cupy.ndarray
765        Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
766    proximity_px : float
767        Pairwise cutoff distance for assigning edges to nodes, in pixels.
768    out_fname : str
769        Filename for output.
770    save : bool, optional
771        Flag to save file (True) or not (False). Default is True.
772
773    Returns
774    -------
775    networkx.Graph
776        Networkx graph representation of coarse model (cluster centroids).
777    """
778    tree = KDTree(coarse_model)
779    edgelist = tree.query_pairs(proximity_px, output_type="ndarray")
780
781    # Arsam Firoozfar "It appears that the current code using nx is slightly faster than using igraph to directly create graph from matrix." 3/31/2023
782
783    G = nx.from_edgelist(edgelist)
784
785    if save:
786        nx.write_gexf(G, f"{out_fname}.gexf")
787    return G

Creates a Networkx graph from the coarse grained model (cluster centroids) and saves it as a graph XML file (.gexf).

Parameters

coarse_model (numpy.ndarray or cupy.ndarray): Array of cluster centroids. A[0] = [centroid_x0, centroid_y0, centroid_z0].
proximity_px (float): Pairwise cutoff distance for assigning edges to nodes, in pixels.
out_fname (str): Filename for output.
save (bool, optional): Flag to save file (True) or not (False). Default is True.

Returns

networkx.Graph: Networkx graph representation of coarse model (cluster centroids).

def add_Gaussian_noise(mrc, loc=0.0, scale=1.0): View Source

790def add_Gaussian_noise(mrc, loc=0.0, scale=1.0):
791    """
792    Adds Gaussian white noise to the data in an mrc file.
793
794    Parameters
795    ----------
796    mrc : mrcfile.mrcfile.MrcFile
797        MRC object to add noise to.
798    loc : float, optional
799        Mean of Gaussian distribution. Default is 0.
800    scale : float, optional
801        Standard deviation of Gaussian distribution. Default is 1.
802
803    Returns
804    -------
805    numpy.ndarray or cupy.ndarray
806        Data with noise added.
807    """
808    D = mrc.data
809    noise = np.random.normal(loc=loc, scale=scale, size=D.shape)
810    D_w_noise = D + noise
811    return D_w_noise

Adds Gaussian white noise to the data in an mrc file.

Parameters

mrc (mrcfile.mrcfile.MrcFile): MRC object to add noise to.
loc (float, optional): Mean of Gaussian distribution. Default is 0.
scale (float, optional): Standard deviation of Gaussian distribution. Default is 1.

Returns

numpy.ndarray or cupy.ndarray: Data with noise added.

def leave_2D_density(mrc_data, middle_z_index, xyz_data): View Source

815def leave_2D_density(mrc_data, middle_z_index, xyz_data):
816    three_D_ndarray_left = np.zeros(
817        (mrc_data.shape[0], mrc_data.shape[1], mrc_data.shape[2])
818    )
819    for k in range(xyz_data.shape[0]):
820        # print (xyz_data[k]) # 0 0 64
821        # print (type(xyz_data[k])) #<class 'numpy.ndarray'>
822        x_to_keep = xyz_data[k][0]  # 0
823        y_to_keep = xyz_data[k][1]  # 0
824        z_to_keep = xyz_data[k][2]  # 64
825        three_D_ndarray_left[x_to_keep, y_to_keep, z_to_keep] = mrc_data[
826            x_to_keep, y_to_keep, z_to_keep
827        ]
828    return three_D_ndarray_left

def main(args): View Source

831def main(args):
832    """
833    Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network.
834    Outputs a .png file of the coarse grained model, and a .gexf graph xml file.
835
836    Parameters
837    ----------
838    args : argparse.Namespace
839        Argument parser object containing the following attributes:
840        - fname : str
841            .mrc filename (white density w/ black background).
842        - mrc_t : float
843            Unnormalized pixel intensity threshold level.
844        - eps : float
845            DBSCAN epsilon (inter cluster distance).
846        - ms : int
847            DBSCAN min samples (minimum number of samples in cluster).
848        - d_cut : float
849            Pairwise distance cutoff for assigning edges to graph, in pixels.
850    """
851    fname = Path(args.fname)
852    mrc_t = args.mrc_t
853    DBSCAN_epsilon = args.eps  # try 1
854    DBSCAN_min_samples = args.ms  # try 4
855    d_cut = args.d_cut  # try 8
856    out_fname = fname.with_suffix("")
857
858    mrc = load_density_file(fname)
859    xyz_data = normalize_and_threshold_data(mrc, mrc_t)
860
861    # print (f"xyz_data:{xyz_data}")
862
863    model = cluster_data(xyz_data, DBSCAN_epsilon, DBSCAN_min_samples)
864    coarse_model = get_cluster_centroids(xyz_data, model)
865    G = create_and_save_graph(coarse_model, d_cut, out_fname)
866    fig = plot_clustering_results(xyz_data, coarse_model)

Takes a 3D density (.mrc), applies threshold, coarse-grains data, and converts it into a graph network. Outputs a .png file of the coarse grained model, and a .gexf graph xml file.

Parameters

args (argparse.Namespace): Argument parser object containing the following attributes:
- fname : str .mrc filename (white density w/ black background).
- mrc_t : float Unnormalized pixel intensity threshold level.
- eps : float DBSCAN epsilon (inter cluster distance).
- ms : int DBSCAN min samples (minimum number of samples in cluster).
- d_cut : float Pairwise distance cutoff for assigning edges to graph, in pixels.