griptomo.parsl.apps

View Source

   1from parsl import python_app
   2
   3
   4@python_app
   5def convert_density_file_to_centroids(
   6    protein_name,
   7    mrc_file_path,
   8    DBSCAN_epsilon,
   9    DBSCAN_min_samples,
  10    density_threshold,
  11    output_dir,
  12):
  13    """
  14    Convert a density file to centroids using DBSCAN clustering.
  15
  16    Parameters
  17    ----------
  18    protein_name : str
  19        Name of the protein.
  20    mrc_file_path : str
  21        Path to the MRC density file.
  22    DBSCAN_epsilon : float
  23        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
  24    DBSCAN_min_samples : int
  25        The number of samples in a neighborhood for a point to be considered as a core point.
  26    density_threshold : float
  27        The threshold value for density.
  28    output_dir : str
  29        Directory where the output will be saved.
  30
  31    Returns
  32    -------
  33    tuple
  34        A tuple containing:
  35        - centroids (numpy.ndarray): The centroids of the clusters.
  36        - centroid_dir (str): The directory where the centroids are saved.
  37    """
  38    import os
  39    import numpy as np
  40    import csv
  41
  42    from griptomo.core import density2graph as d2g
  43
  44    fname = os.path.basename(mrc_file_path)
  45    density_name = fname[:-4]
  46
  47    centroid_dir = os.path.join(
  48        output_dir,
  49        protein_name,
  50        density_name,
  51        f"density_{density_threshold}",
  52        f"min_samples_{DBSCAN_min_samples}",
  53        f"epsilon_{DBSCAN_epsilon}",
  54    )
  55    if os.path.exists(centroid_dir):
  56        print(f"{centroid_dir} already exists.")
  57    else:
  58        os.makedirs(centroid_dir)
  59        print(f"{centroid_dir} does not exist. Creating it.")
  60
  61    centroid_filename = os.path.join(
  62        centroid_dir,
  63        "centroids.npy",
  64    )
  65
  66    if os.path.exists(centroid_filename):
  67        print(f"{centroid_filename} already exists.")
  68        centroids = np.load(centroid_filename)
  69    else:
  70        print(f"{centroid_filename} does not exist. Generating coarse model.")
  71        ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels
  72        mrc = d2g.load_density_file(mrc_file_path)  # load density file
  73        D_norm = d2g.normalize_mrc_data(mrc.data)
  74        total_voxel_count = mrc.data.size
  75        D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold)
  76        xyz_data = d2g.generate_point_cloud_from_mrc_data(
  77            D_thresholded, density_threshold
  78        )
  79        point_cloud_node_count = len(xyz_data)
  80
  81        ### perform clustering and get cluster centers
  82        model = d2g.cluster_data(
  83            xyz_data, DBSCAN_epsilon, DBSCAN_min_samples
  84        )  # cluster thresholded data using DBSCAN
  85        noisy_node_count = np.sum(model.labels_ == -1)
  86        non_noise_cluster_count = len(np.unique(model.labels_))
  87        if noisy_node_count > 0:
  88            non_noise_cluster_count -= 1
  89        centroids = d2g.get_cluster_centroids(
  90            xyz_data, model
  91        )  # coarse grain model by getting cluster centroids
  92        # Save centroids in output folder.
  93        np.save(centroid_filename, centroids)
  94        # Save centroids_run_info in output folder.
  95        run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
  96        with open(run_info_filename, "w", newline="") as csvfile:
  97            fieldnames = [
  98                "Total Voxel Count",
  99                "Point Cloud Node Count",
 100                "Non Noise Cluster Count",
 101                "Noisy Node Count",
 102            ]
 103            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 104
 105            writer.writeheader()
 106            writer.writerow(
 107                {
 108                    "Total Voxel Count": total_voxel_count,
 109                    "Point Cloud Node Count": point_cloud_node_count,
 110                    "Non Noise Cluster Count": non_noise_cluster_count,
 111                    "Noisy Node Count": noisy_node_count,
 112                }
 113            )
 114    return (centroids, centroid_dir)
 115
 116
 117@python_app
 118def extract_graph_features_from_centroids(
 119    centroids,
 120    cutoff,
 121    output_dir,
 122    force_overwrite=False,
 123):
 124    """
 125    Extracts graph features from centroid coordinates and saves them to a CSV file.
 126
 127    Parameters
 128    ----------
 129    centroids : str or numpy.ndarray
 130        Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
 131    cutoff : float
 132        Distance threshold for creating edges in the graph.
 133    output_dir : str
 134        Directory where the output CSV file will be saved.
 135    force_overwrite : bool, optional
 136        If True, overwrite the existing output file. Defaults to False.
 137
 138    Returns
 139    -------
 140    str
 141        Path to the output CSV file containing the graph features.
 142    """
 143    import os
 144
 145    feature_output_dir = os.path.join(output_dir, f"cutoff_{cutoff}")
 146
 147    if os.path.exists(feature_output_dir):
 148        print(f"{feature_output_dir} already exists.")
 149    else:
 150        os.makedirs(feature_output_dir)
 151        print(f"{feature_output_dir} does not exist. Creating it.")
 152
 153    feature_output_filename = os.path.join(feature_output_dir, "graph_features.csv")
 154
 155    if os.path.exists(feature_output_filename) and not force_overwrite:
 156        print(f"{feature_output_filename} already exists and force_overwrite is False.")
 157    else:
 158        import numpy as np
 159        import csv
 160
 161        from griptomo.core import ig_extract_features as g2c
 162        from griptomo.core.density2graph import create_igraph_from_point_cloud
 163
 164        # If the centroids are not already loaded, load them.
 165        if isinstance(centroids, str):
 166            centroids = np.load(centroids)
 167
 168        graph = create_igraph_from_point_cloud(centroids, cutoff)
 169        graph_features = g2c.igraph_calc_graph_features(graph)
 170
 171        if os.path.exists(feature_output_filename):
 172            print(f"{feature_output_filename} already exists but will be overwritten.")
 173        else:
 174            print(f"{feature_output_filename} does not exist. Writing graph features.")
 175        with open(feature_output_filename, "w", newline="") as csvfile:
 176            fieldnames = list(graph_features.keys())
 177            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 178            writer.writeheader()
 179            writer.writerow(graph_features)
 180
 181    return feature_output_filename
 182
 183
 184@python_app
 185def convert_density_file_to_centroids_timed(
 186    protein_name,
 187    mrc_file_path,
 188    DBSCAN_epsilon,
 189    DBSCAN_min_samples,
 190    density_threshold,
 191    output_dir,
 192):
 193    """
 194    Convert a density file to centroids using DBSCAN clustering.
 195    Additionally, save the timings of each step to a CSV file.
 196
 197    Parameters
 198    ----------
 199    protein_name : str
 200        Name of the protein.
 201    mrc_file_path : str
 202        Path to the MRC density file.
 203    DBSCAN_epsilon : float
 204        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
 205    DBSCAN_min_samples : int
 206        The number of samples in a neighborhood for a point to be considered as a core point.
 207    density_threshold : float
 208        The threshold value for density.
 209    output_dir : str
 210        Directory where the output will be saved.
 211
 212    Returns
 213    -------
 214    tuple
 215        A tuple containing:
 216        - centroids (numpy.ndarray): The centroids of the clusters.
 217        - centroid_dir (str): The directory where the centroids are saved.
 218    """
 219    import time
 220
 221    times = {}
 222
 223    start_time = time.perf_counter()
 224    import os
 225    import numpy as np
 226    import csv
 227
 228    from griptomo.core import density2graph as d2g
 229
 230    times["module import"] = time.perf_counter() - start_time
 231
 232    fname = os.path.basename(mrc_file_path)
 233    density_name = fname[:-4]
 234
 235    centroid_dir = os.path.join(
 236        output_dir,
 237        protein_name,
 238        density_name,
 239        f"density_{density_threshold}",
 240        f"min_samples_{DBSCAN_min_samples}",
 241        f"epsilon_{DBSCAN_epsilon}",
 242    )
 243    if os.path.exists(centroid_dir):
 244        print(f"{centroid_dir} already exists.")
 245    else:
 246        os.makedirs(centroid_dir)
 247        print(f"{centroid_dir} does not exist. Creating it.")
 248
 249    centroid_filename = os.path.join(
 250        centroid_dir,
 251        "centroids.npy",
 252    )
 253
 254    if os.path.exists(centroid_filename):
 255        print(f"{centroid_filename} already exists.")
 256        start_time = time.perf_counter()
 257        centroids = np.load(centroid_filename)
 258        times["load centroids"] = time.perf_counter() - start_time
 259    else:
 260        print(f"{centroid_filename} does not exist. Generating coarse model.")
 261        ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels
 262        start_time = time.perf_counter()
 263        mrc = d2g.load_density_file(mrc_file_path)  # load density file
 264        times["load density file"] = time.perf_counter() - start_time
 265
 266        start_time = time.perf_counter()
 267        D_norm = d2g.normalize_mrc_data(mrc.data)
 268        times["normalize data"] = time.perf_counter() - start_time
 269        total_voxel_count = mrc.data.size
 270
 271        start_time = time.perf_counter()
 272        D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold)
 273        times["threshold data"] = time.perf_counter() - start_time
 274
 275        start_time = time.perf_counter()
 276        xyz_data = d2g.generate_point_cloud_from_mrc_data(
 277            D_thresholded, density_threshold
 278        )
 279        times["generate point cloud"] = time.perf_counter() - start_time
 280
 281        point_cloud_node_count = len(xyz_data)
 282
 283        start_time = time.perf_counter()
 284        ### perform clustering and get cluster centers
 285        model = d2g.cluster_data(
 286            xyz_data, DBSCAN_epsilon, DBSCAN_min_samples
 287        )  # cluster thresholded data using DBSCAN
 288        times["cluster data"] = time.perf_counter() - start_time
 289
 290        noisy_node_count = np.sum(model.labels_ == -1)
 291        non_noise_cluster_count = len(np.unique(model.labels_))
 292        if noisy_node_count > 0:
 293            non_noise_cluster_count -= 1
 294
 295        start_time = time.perf_counter()
 296        centroids = d2g.get_cluster_centroids(
 297            xyz_data, model
 298        )  # coarse grain model by getting cluster centroids
 299        times["get cluster centroids"] = time.perf_counter() - start_time
 300
 301        start_time = time.perf_counter()
 302        np.save(centroid_filename, centroids)
 303        times["save centroids"] = time.perf_counter() - start_time
 304
 305        run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
 306        with open(run_info_filename, "w", newline="") as csvfile:
 307            fieldnames = [
 308                "Total Voxel Count",
 309                "Point Cloud Node Count",
 310                "Non Noise Cluster Count",
 311                "Noisy Node Count",
 312            ]
 313            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 314
 315            writer.writeheader()
 316            writer.writerow(
 317                {
 318                    "Total Voxel Count": total_voxel_count,
 319                    "Point Cloud Node Count": point_cloud_node_count,
 320                    "Non Noise Cluster Count": non_noise_cluster_count,
 321                    "Noisy Node Count": noisy_node_count,
 322                }
 323            )
 324            # Print timing information to a file
 325            times_filename = centroid_filename.replace("centroids.npy", "times.csv")
 326            with open(times_filename, "w", newline="") as csvfile:
 327                fieldnames = list(times.keys())
 328                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 329                writer.writeheader()
 330                writer.writerow(times)
 331    return (centroids, centroid_dir)
 332
 333
 334@python_app
 335def extract_graph_features_from_centroids_timed(
 336    centroids,
 337    cutoff,
 338    output_dir,
 339    force_overwrite=False,
 340    skip_clique_num=False,
 341):
 342    """
 343    Extracts graph features from centroid coordinates and saves them to a CSV file.
 344    Additionally, save the times of each step to a separate CSV file.
 345
 346    Parameters
 347    ----------
 348    centroids : str or numpy.ndarray
 349        Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
 350    cutoff : float
 351        Distance threshold for creating edges in the graph.
 352    output_dir : str
 353        Directory where the output CSV file will be saved.
 354    force_overwrite : bool, optional
 355        If True, overwrite the existing output file. Defaults to False.
 356    skip_clique_num : bool, optional
 357        If True, skip the calculation of the number of maximal cliques. Defaults to False.
 358
 359    Returns
 360    -------
 361    str
 362        Path to the output CSV file containing the graph features.
 363    """
 364    import os
 365
 366    feature_output_dir = os.path.join(output_dir, f"cutoff_{cutoff}")
 367
 368    if os.path.exists(feature_output_dir):
 369        print(f"{feature_output_dir} already exists.")
 370    else:
 371        os.makedirs(feature_output_dir)
 372        print(f"{feature_output_dir} does not exist. Creating it.")
 373
 374    feature_output_filename = os.path.join(feature_output_dir, "graph_features.csv")
 375
 376    if os.path.exists(feature_output_filename) and not force_overwrite:
 377        print(f"{feature_output_filename} already exists and force_overwrite is False.")
 378    else:
 379        import time
 380
 381        times = {}
 382
 383        start_time = time.perf_counter()
 384        import numpy as np
 385        import csv
 386
 387        from griptomo.core import ig_extract_features as g2c
 388        from griptomo.core.density2graph import create_igraph_from_point_cloud
 389
 390        times["module import"] = time.perf_counter() - start_time
 391
 392        # If the centroids are not already loaded, load them.
 393        if isinstance(centroids, str):
 394            centroids = np.load(centroids)
 395
 396        graph = create_igraph_from_point_cloud(centroids, cutoff)
 397        times["convert to igraph"] = time.perf_counter() - start_time
 398
 399        start_time = time.perf_counter()
 400        graph_features, graph_times = g2c.igraph_calc_graph_features_timed(
 401            graph, skip_clique_num=skip_clique_num
 402        )
 403        times["extract features"] = time.perf_counter() - start_time
 404
 405        if os.path.exists(feature_output_filename):
 406            print(f"{feature_output_filename} already exists but will be overwritten.")
 407        else:
 408            print(f"{feature_output_filename} does not exist. Writing graph features.")
 409        with open(feature_output_filename, "w", newline="") as csvfile:
 410            fieldnames = list(graph_features.keys())
 411            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 412            writer.writeheader()
 413            writer.writerow(graph_features)
 414        graph_times_filename = feature_output_filename.replace(
 415            "graph_features.csv", "graph_times.csv"
 416        )
 417        with open(graph_times_filename, "w", newline="") as csvfile:
 418            fieldnames = list(graph_times.keys())
 419            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 420            writer.writeheader()
 421            writer.writerow(graph_times)
 422        times_filename = feature_output_filename.replace(
 423            "graph_features.csv", "times.csv"
 424        )
 425        with open(times_filename, "w", newline="") as csvfile:
 426            fieldnames = list(times.keys())
 427            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 428            writer.writeheader()
 429            writer.writerow(times)
 430
 431    return feature_output_filename
 432
 433
 434@python_app
 435def convert_density_file_to_point_cloud_hdbscan_timed(
 436    protein_name,
 437    mrc_file_path,
 438    HDBSCAN_min_cluster_size,
 439    HDBSCAN_min_samples,
 440    coarsening_dim,
 441    density_threshold,
 442    output_dir,
 443    averaged=False,
 444    density_coarsening=False,
 445    precoarsened=False,
 446    standardize=False,
 447    threshold_ratio=False,
 448):
 449    """
 450    Convert a density file to centroids using HDBSCAN clustering.
 451    Additionally, save the timings of each step to a CSV file.
 452
 453    Parameters
 454    ----------
 455    protein_name : str
 456        Name of the protein.
 457    mrc_file_path : str
 458        Path to the MRC density file.
 459    HDBSCAN_min_cluster_size : float
 460        The minimum number of points in a cluster.
 461    HDBSCAN_min_samples : int
 462        The number of samples in a neighborhood for a point to be considered as a core point.
 463    coarsening_dim : int
 464    The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
 465    density_threshold : float
 466        The threshold value for density.
 467    output_dir : str
 468        Directory where the output will be saved.
 469    averaged : bool, optional
 470        If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
 471    density_coarsening : bool, optional
 472        If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.
 473    precoarsened : bool, optional
 474        If True, pre-coarsen the data before extracting the cluster centroids. Defaults to False.
 475    standardize : bool, optional
 476        If True, standardize the data before thresholding. Defaults to False.
 477    threshold_ratio : bool, optional
 478        If True, use the threshold ratio to identify the threshold. Defaults to False.
 479
 480    Returns
 481    -------
 482    tuple
 483        A tuple containing:
 484        - centroids (numpy.ndarray): The centroids of the clusters.
 485        - centroid_dir (str): The directory where the centroids are saved.
 486    """
 487    import time
 488
 489    times = {}
 490
 491    start_time = time.perf_counter()
 492    import os
 493    import numpy as np
 494    import csv
 495
 496    from griptomo.core import density2graph as d2g
 497
 498    times["module import"] = time.perf_counter() - start_time
 499
 500    fname = os.path.basename(mrc_file_path)
 501    density_name = fname[:-4]
 502
 503    centroid_dir = os.path.join(
 504        output_dir,
 505        protein_name,
 506        density_name,
 507        f"density_{density_threshold}",
 508        f"min_samples_{HDBSCAN_min_samples}",
 509        f"min_cluster_size_{HDBSCAN_min_cluster_size}",
 510        f"coarsening_dim_{coarsening_dim}",
 511    )
 512    if os.path.exists(centroid_dir):
 513        print(f"{centroid_dir} already exists.")
 514    else:
 515        print(f"{centroid_dir} does not exist. Creating it.")
 516        os.makedirs(centroid_dir)
 517
 518    centroid_filename = os.path.join(
 519        centroid_dir,
 520        "centroids.npy",
 521    )
 522
 523    if os.path.exists(centroid_filename):
 524        print(f"{centroid_filename} already exists.")
 525        start_time = time.perf_counter()
 526        centroids = np.load(centroid_filename)
 527        times["load centroids"] = time.perf_counter() - start_time
 528    else:
 529        print(f"{centroid_filename} does not exist. Generating coarse model.")
 530        ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels
 531        start_time = time.perf_counter()
 532        mrc = d2g.load_density_file(mrc_file_path)  # load density file
 533        times["load density file"] = time.perf_counter() - start_time
 534
 535        start_time = time.perf_counter()
 536        if standardize:
 537            D_standardized = d2g.standardize_mrc_data(mrc.data)
 538            times["standardize data"] = time.perf_counter() - start_time
 539            start_time = time.perf_counter()
 540            D_norm = d2g.normalize_mrc_data(D_standardized)
 541            times["normalize data"] = time.perf_counter() - start_time
 542        else:
 543            D_norm = d2g.normalize_mrc_data(mrc.data)
 544            times["normalize data"] = time.perf_counter() - start_time
 545        total_voxel_count = mrc.data.size
 546
 547        start_time = time.perf_counter()
 548        if threshold_ratio:
 549            threshold = d2g.identify_threshold_ratio(D_norm, density_threshold)
 550        else:
 551            threshold = density_threshold
 552        start_time = time.perf_counter()
 553        xyz_data = d2g.generate_point_cloud_from_mrc_data(D_norm, threshold)
 554        times["generate point cloud"] = time.perf_counter() - start_time
 555
 556        if density_coarsening:
 557            xyz_densities = D_norm[xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2]] + 1
 558            xyz_densities = xyz_densities / 2
 559            fine_mean_density = np.mean(xyz_densities)
 560            fine_median_density = np.median(xyz_densities)
 561            fine_max_density = np.max(xyz_densities)
 562            fine_min_density = np.min(xyz_densities)
 563            fine_std_density = np.std(xyz_densities)
 564            fine_sum_density = np.sum(xyz_densities)
 565            xyz_densities_filename = os.path.join(
 566                centroid_dir,
 567                "xyz_densities.npy",
 568            )
 569            np.save(xyz_densities_filename, xyz_densities)
 570        else:
 571            xyz_densities = None
 572
 573        point_cloud_node_count = len(xyz_data)
 574
 575        if precoarsened:
 576            start_time = time.perf_counter()
 577            coarsened = d2g.voxel_coarsening(
 578                coarsening_dim,
 579                xyz_data,
 580                np.zeros(xyz_data.shape[0], dtype=np.int64),
 581                density=xyz_densities,
 582                averaged=averaged,
 583            )
 584            if xyz_densities is not None:
 585                xyz_densities = coarsened[1]
 586                xyz_data = coarsened[0]
 587            else:
 588                xyz_data = coarsened
 589            times["precoarsening"] = time.perf_counter() - start_time
 590
 591        start_time = time.perf_counter()
 592        ### perform clustering and get cluster centers
 593        model = d2g.cluster_data_hdbscan(
 594            xyz_data, HDBSCAN_min_cluster_size, HDBSCAN_min_samples
 595        )  # cluster thresholded data using DBSCAN
 596        times["cluster data"] = time.perf_counter() - start_time
 597
 598        noisy_node_count = np.sum(model.labels_ == -1)
 599        non_noise_cluster_count = len(np.unique(model.labels_))
 600        if noisy_node_count > 0:
 601            non_noise_cluster_count -= 1
 602        if non_noise_cluster_count == 0:
 603            print("No clusters found for {centroid_dir}")
 604            run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
 605            with open(run_info_filename, "w", newline="") as csvfile:
 606                fieldnames = ["Total Voxel Count", "Point Cloud Node Count"]
 607                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 608                writer.writeheader()
 609                writer.writerow(
 610                    {
 611                        "Total Voxel Count": total_voxel_count,
 612                        "Point Cloud Node Count": point_cloud_node_count,
 613                    }
 614                )
 615            # Print timing information to a file
 616            times_filename = centroid_filename.replace("centroids.npy", "times.csv")
 617            with open(times_filename, "w", newline="") as csvfile:
 618                fieldnames = list(times.keys())
 619                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 620                writer.writeheader()
 621                writer.writerow(times)
 622            raise ValueError("No clusters found.")
 623        largest_cluster = np.argmax(np.bincount(model.labels_[model.labels_ != -1]))
 624        largest_cluster_size = np.sum(model.labels_ == largest_cluster)
 625
 626        xyz_filename = os.path.join(
 627            centroid_dir,
 628            "xyz_data.npy",
 629        )
 630        labels_filename = os.path.join(
 631            centroid_dir,
 632            "labels.npy",
 633        )
 634        np.save(xyz_filename, xyz_data)
 635        np.save(labels_filename, model.labels_)
 636
 637        if not precoarsened:
 638            start_time = time.perf_counter()
 639            coarsened = d2g.voxel_coarsening(
 640                coarsening_dim,
 641                xyz_data,
 642                model.labels_,
 643                density=xyz_densities,
 644                averaged=averaged,
 645            )  # coarse grain model by getting cluster centroids
 646            if density_coarsening:
 647                centroids, centroid_densities = coarsened
 648            else:
 649                centroids = coarsened
 650            coarsened_size = centroids.shape[0]
 651            times["voxel coarsening"] = time.perf_counter() - start_time
 652        else:
 653            centroids = xyz_data[model.labels_ == largest_cluster]
 654            centroid_densities = (
 655                xyz_densities[model.labels_ == largest_cluster]
 656                if xyz_densities is not None
 657                else None
 658            )
 659            coarsened_size = centroids.shape[0]
 660        if density_coarsening:
 661            coarse_mean_density = np.mean(centroid_densities)
 662            coarse_median_density = np.median(centroid_densities)
 663            coarse_max_density = np.max(centroid_densities)
 664            coarse_min_density = np.min(centroid_densities)
 665            coarse_std_density = np.std(centroid_densities)
 666            coarse_sum_density = np.sum(centroid_densities)
 667        else:
 668            centroids = coarsened
 669
 670        coarsened_size = centroids.shape[0]
 671        times["voxel coarsening"] = time.perf_counter() - start_time
 672
 673        start_time = time.perf_counter()
 674        if density_coarsening:
 675            centroid_densities_filename = os.path.join(
 676                centroid_dir,
 677                "centroid_densities.npy",
 678            )
 679            np.save(centroid_densities_filename, centroid_densities)
 680        np.save(centroid_filename, centroids)
 681        times["save centroids"] = time.perf_counter() - start_time
 682
 683        run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
 684        with open(run_info_filename, "w", newline="") as csvfile:
 685            fieldnames = [
 686                "Total Voxel Count",
 687                "Point Cloud Node Count",
 688                "Non Noise Cluster Count",
 689                "Noisy Node Count",
 690                "Largest Cluster Size",
 691                "Coarsened Size",
 692            ]
 693            if density_coarsening:
 694                fieldnames += [
 695                    "Fine Mean Density",
 696                    "Fine Median Density",
 697                    "Fine Max Density",
 698                    "Fine Min Density",
 699                    "Fine Std Density",
 700                    "Fine Sum Density",
 701                    "Coarse Mean Density",
 702                    "Coarse Median Density",
 703                    "Coarse Max Density",
 704                    "Coarse Min Density",
 705                    "Coarse Std Density",
 706                    "Coarse Sum Density",
 707                ]
 708            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 709
 710            writer.writeheader()
 711            if density_coarsening:
 712                writer.writerow(
 713                    {
 714                        "Total Voxel Count": total_voxel_count,
 715                        "Point Cloud Node Count": point_cloud_node_count,
 716                        "Non Noise Cluster Count": non_noise_cluster_count,
 717                        "Noisy Node Count": noisy_node_count,
 718                        "Largest Cluster Size": largest_cluster_size,
 719                        "Coarsened Size": coarsened_size,
 720                        "Fine Mean Density": fine_mean_density,
 721                        "Fine Median Density": fine_median_density,
 722                        "Fine Max Density": fine_max_density,
 723                        "Fine Min Density": fine_min_density,
 724                        "Fine Std Density": fine_std_density,
 725                        "Fine Sum Density": fine_sum_density,
 726                        "Coarse Mean Density": coarse_mean_density,
 727                        "Coarse Median Density": coarse_median_density,
 728                        "Coarse Max Density": coarse_max_density,
 729                        "Coarse Min Density": coarse_min_density,
 730                        "Coarse Std Density": coarse_std_density,
 731                        "Coarse Sum Density": coarse_sum_density,
 732                    }
 733                )
 734            else:
 735                writer.writerow(
 736                    {
 737                        "Total Voxel Count": total_voxel_count,
 738                        "Point Cloud Node Count": point_cloud_node_count,
 739                        "Non Noise Cluster Count": non_noise_cluster_count,
 740                        "Noisy Node Count": noisy_node_count,
 741                        "Largest Cluster Size": largest_cluster_size,
 742                        "Coarsened Size": coarsened_size,
 743                    }
 744                )
 745        # Print timing information to a file
 746        times_filename = centroid_filename.replace("centroids.npy", "times.csv")
 747        with open(times_filename, "w", newline="") as csvfile:
 748            fieldnames = list(times.keys())
 749            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 750            writer.writeheader()
 751            writer.writerow(times)
 752    return (centroids, centroid_dir)
 753
 754
 755@python_app
 756def convert_density_file_to_point_cloud_optics_timed(
 757    protein_name,
 758    mrc_file_path,
 759    OPTICS_min_samples,
 760    coarsening_dim,
 761    density_threshold,
 762    output_dir,
 763    OPTICS_min_cluster_size=None,
 764    OPTICS_max_epsilon=None,
 765    averaged=False,
 766    density_coarsening=False,
 767):
 768    """
 769    Convert a density file to centroids using OPTICS clustering.
 770    Additionally, save the timings of each step to a CSV file.
 771
 772    Parameters
 773    ----------
 774    protein_name : str
 775        Name of the protein.
 776    mrc_file_path : str
 777        Path to the MRC density file.
 778    OPTICS_min_samples : int
 779        The number of samples in a neighborhood for a point to be considered as a core point.
 780    coarsening_dim : int
 781    The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
 782    density_threshold : float
 783        The threshold value for density.
 784    output_dir : str
 785        Directory where the output will be saved.
 786    OPTICS_min_cluster_size : float, optional
 787        The minimum number of points in a cluster. Defaults to None.
 788    OPTICS_max_epsilon : float, optional
 789        The maximum distance between points considered by OPTICS. Defaults to None, which uses np.inf. A lower value may reduce computation time.
 790    averaged : bool, optional
 791        If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
 792    density_coarsening : bool, optional
 793        If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.
 794
 795    Returns
 796    -------
 797    tuple
 798        A tuple containing:
 799        - centroids (numpy.ndarray): The centroids of the clusters.
 800        - centroid_dir (str): The directory where the centroids are saved.
 801    """
 802    import time
 803
 804    times = {}
 805
 806    start_time = time.perf_counter()
 807    import os
 808    import numpy as np
 809    import csv
 810
 811    from griptomo.core import density2graph as d2g
 812
 813    times["module import"] = time.perf_counter() - start_time
 814
 815    OPTICS_max_epsilon = (
 816        OPTICS_max_epsilon if OPTICS_max_epsilon is not None else np.inf
 817    )
 818
 819    fname = os.path.basename(mrc_file_path)
 820    density_name = fname[:-4]
 821
 822    centroid_dir = os.path.join(
 823        output_dir,
 824        protein_name,
 825        density_name,
 826        f"density_{density_threshold}",
 827        f"min_samples_{OPTICS_min_samples}",
 828        f"max_eps_{OPTICS_max_epsilon}",
 829        f"coarsening_dim_{coarsening_dim}",
 830    )
 831    if os.path.exists(centroid_dir):
 832        print(f"{centroid_dir} already exists.")
 833    else:
 834        print(f"{centroid_dir} does not exist. Creating it.")
 835        os.makedirs(centroid_dir)
 836
 837    centroid_filename = os.path.join(
 838        centroid_dir,
 839        "centroids.npy",
 840    )
 841
 842    if os.path.exists(centroid_filename):
 843        print(f"{centroid_filename} already exists.")
 844        start_time = time.perf_counter()
 845        centroids = np.load(centroid_filename)
 846        times["load centroids"] = time.perf_counter() - start_time
 847    else:
 848        print(f"{centroid_filename} does not exist. Generating coarse model.")
 849        ### load density file, normalize the data and threshold, and extract the x,y,z coordinates of the remaining pixels
 850        start_time = time.perf_counter()
 851        mrc = d2g.load_density_file(mrc_file_path)  # load density file
 852        times["load density file"] = time.perf_counter() - start_time
 853
 854        start_time = time.perf_counter()
 855        D_norm = d2g.normalize_mrc_data(mrc.data)
 856        times["normalize data"] = time.perf_counter() - start_time
 857        total_voxel_count = mrc.data.size
 858
 859        start_time = time.perf_counter()
 860        D_thresholded = d2g.threshold_mrc_data(D_norm, density_threshold)
 861        times["threshold data"] = time.perf_counter() - start_time
 862
 863        start_time = time.perf_counter()
 864        xyz_data = d2g.generate_point_cloud_from_mrc_data(
 865            D_thresholded, density_threshold
 866        )
 867        times["generate point cloud"] = time.perf_counter() - start_time
 868
 869        point_cloud_node_count = len(xyz_data)
 870
 871        start_time = time.perf_counter()
 872        ### perform clustering and get cluster centers
 873        model = d2g.cluster_data_optics(
 874            xyz_data, OPTICS_min_samples, OPTICS_min_cluster_size, OPTICS_max_epsilon
 875        )  # cluster thresholded data using DBSCAN
 876        times["cluster data"] = time.perf_counter() - start_time
 877
 878        noisy_node_count = np.sum(model.labels_ == -1)
 879        non_noise_cluster_count = len(np.unique(model.labels_))
 880        if noisy_node_count > 0:
 881            non_noise_cluster_count -= 1
 882        if non_noise_cluster_count == 0:
 883            print("No clusters found for {centroid_dir}")
 884            run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
 885            with open(run_info_filename, "w", newline="") as csvfile:
 886                fieldnames = ["Total Voxel Count", "Point Cloud Node Count"]
 887                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 888                writer.writeheader()
 889                writer.writerow(
 890                    {
 891                        "Total Voxel Count": total_voxel_count,
 892                        "Point Cloud Node Count": point_cloud_node_count,
 893                    }
 894                )
 895            # Print timing information to a file
 896            times_filename = centroid_filename.replace("centroids.npy", "times.csv")
 897            with open(times_filename, "w", newline="") as csvfile:
 898                fieldnames = list(times.keys())
 899                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 900                writer.writeheader()
 901                writer.writerow(times)
 902            raise ValueError("No clusters found.")
 903        largest_cluster = np.argmax(np.bincount(model.labels_[model.labels_ != -1]))
 904        largest_cluster_size = np.sum(model.labels_ == largest_cluster)
 905
 906        if density_coarsening:
 907            xyz_densities = D_norm[xyz_data[:, 0], xyz_data[:, 1], xyz_data[:, 2]] + 1
 908            xyz_densities = xyz_densities / 2
 909            fine_mean_density = np.mean(xyz_densities)
 910            fine_median_density = np.median(xyz_densities)
 911            fine_max_density = np.max(xyz_densities)
 912            fine_min_density = np.min(xyz_densities)
 913            fine_std_density = np.std(xyz_densities)
 914            fine_sum_density = np.sum(xyz_densities)
 915            xyz_densities_filename = os.path.join(
 916                centroid_dir,
 917                "xyz_densities.npy",
 918            )
 919            np.save(xyz_densities_filename, xyz_densities)
 920        else:
 921            xyz_densities = None
 922
 923        xyz_filename = os.path.join(
 924            centroid_dir,
 925            "xyz_data.npy",
 926        )
 927        labels_filename = os.path.join(
 928            centroid_dir,
 929            "labels.npy",
 930        )
 931        np.save(xyz_filename, xyz_data)
 932        np.save(labels_filename, model.labels_)
 933
 934        start_time = time.perf_counter()
 935        coarsened = d2g.voxel_coarsening(
 936            coarsening_dim,
 937            xyz_data,
 938            model.labels_,
 939            density=xyz_densities,
 940            averaged=averaged,
 941        )  # coarse grain model by getting cluster centroids
 942        if density_coarsening:
 943            centroids, centroid_densities = coarsened
 944            coarse_mean_density = np.mean(centroid_densities)
 945            coarse_median_density = np.median(centroid_densities)
 946            coarse_max_density = np.max(centroid_densities)
 947            coarse_min_density = np.min(centroid_densities)
 948            coarse_std_density = np.std(centroid_densities)
 949            coarse_sum_density = np.sum(centroid_densities)
 950        else:
 951            centroids = coarsened
 952
 953        coarsened_size = centroids.shape[0]
 954        times["voxel coarsening"] = time.perf_counter() - start_time
 955
 956        start_time = time.perf_counter()
 957        if density_coarsening:
 958            centroid_densities_filename = os.path.join(
 959                centroid_dir,
 960                "centroid_densities.npy",
 961            )
 962            np.save(centroid_densities_filename, centroid_densities)
 963        np.save(centroid_filename, centroids)
 964        times["save centroids"] = time.perf_counter() - start_time
 965
 966        run_info_filename = centroid_filename.replace(".npy", "_run_info.csv")
 967        with open(run_info_filename, "w", newline="") as csvfile:
 968            fieldnames = [
 969                "Total Voxel Count",
 970                "Point Cloud Node Count",
 971                "Non Noise Cluster Count",
 972                "Noisy Node Count",
 973                "Largest Cluster Size",
 974                "Coarsened Size",
 975            ]
 976            if density_coarsening:
 977                fieldnames += [
 978                    "Fine Mean Density",
 979                    "Fine Median Density",
 980                    "Fine Max Density",
 981                    "Fine Min Density",
 982                    "Fine Std Density",
 983                    "Fine Sum Density",
 984                    "Coarse Mean Density",
 985                    "Coarse Median Density",
 986                    "Coarse Max Density",
 987                    "Coarse Min Density",
 988                    "Coarse Std Density",
 989                    "Coarse Sum Density",
 990                ]
 991            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 992
 993            writer.writeheader()
 994            if density_coarsening:
 995                writer.writerow(
 996                    {
 997                        "Total Voxel Count": total_voxel_count,
 998                        "Point Cloud Node Count": point_cloud_node_count,
 999                        "Non Noise Cluster Count": non_noise_cluster_count,
1000                        "Noisy Node Count": noisy_node_count,
1001                        "Largest Cluster Size": largest_cluster_size,
1002                        "Coarsened Size": coarsened_size,
1003                        "Fine Mean Density": fine_mean_density,
1004                        "Fine Median Density": fine_median_density,
1005                        "Fine Max Density": fine_max_density,
1006                        "Fine Min Density": fine_min_density,
1007                        "Fine Std Density": fine_std_density,
1008                        "Fine Sum Density": fine_sum_density,
1009                        "Coarse Mean Density": coarse_mean_density,
1010                        "Coarse Median Density": coarse_median_density,
1011                        "Coarse Max Density": coarse_max_density,
1012                        "Coarse Min Density": coarse_min_density,
1013                        "Coarse Std Density": coarse_std_density,
1014                        "Coarse Sum Density": coarse_sum_density,
1015                    }
1016                )
1017            else:
1018                writer.writerow(
1019                    {
1020                        "Total Voxel Count": total_voxel_count,
1021                        "Point Cloud Node Count": point_cloud_node_count,
1022                        "Non Noise Cluster Count": non_noise_cluster_count,
1023                        "Noisy Node Count": noisy_node_count,
1024                        "Largest Cluster Size": largest_cluster_size,
1025                        "Coarsened Size": coarsened_size,
1026                    }
1027                )
1028        # Print timing information to a file
1029        times_filename = centroid_filename.replace("centroids.npy", "times.csv")
1030        with open(times_filename, "w", newline="") as csvfile:
1031            fieldnames = list(times.keys())
1032            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
1033            writer.writeheader()
1034            writer.writerow(times)
1035    return (centroids, centroid_dir)

convert_density_file_to_centroids = <parsl.app.python.PythonApp object>

Convert a density file to centroids using DBSCAN clustering.

Parameters

protein_name (str): Name of the protein.
mrc_file_path (str): Path to the MRC density file.
DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
density_threshold (float): The threshold value for density.
output_dir (str): Directory where the output will be saved.

Returns

tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.

extract_graph_features_from_centroids = <parsl.app.python.PythonApp object>

Extracts graph features from centroid coordinates and saves them to a CSV file.

Parameters

centroids (str or numpy.ndarray): Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
cutoff (float): Distance threshold for creating edges in the graph.
output_dir (str): Directory where the output CSV file will be saved.
force_overwrite (bool, optional): If True, overwrite the existing output file. Defaults to False.

Returns

str: Path to the output CSV file containing the graph features.

convert_density_file_to_centroids_timed = <parsl.app.python.PythonApp object>

Convert a density file to centroids using DBSCAN clustering. Additionally, save the timings of each step to a CSV file.

Parameters

protein_name (str): Name of the protein.
mrc_file_path (str): Path to the MRC density file.
DBSCAN_epsilon (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
DBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
density_threshold (float): The threshold value for density.
output_dir (str): Directory where the output will be saved.

Returns

tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.

extract_graph_features_from_centroids_timed = <parsl.app.python.PythonApp object>

Extracts graph features from centroid coordinates and saves them to a CSV file. Additionally, save the times of each step to a separate CSV file.

Parameters

centroids (str or numpy.ndarray): Path to a file containing centroid coordinates or a numpy array of centroid coordinates.
cutoff (float): Distance threshold for creating edges in the graph.
output_dir (str): Directory where the output CSV file will be saved.
force_overwrite (bool, optional): If True, overwrite the existing output file. Defaults to False.
skip_clique_num (bool, optional): If True, skip the calculation of the number of maximal cliques. Defaults to False.

Returns

str: Path to the output CSV file containing the graph features.

convert_density_file_to_point_cloud_hdbscan_timed = <parsl.app.python.PythonApp object>

Convert a density file to centroids using HDBSCAN clustering. Additionally, save the timings of each step to a CSV file.

Parameters

protein_name (str): Name of the protein.
mrc_file_path (str): Path to the MRC density file.
HDBSCAN_min_cluster_size (float): The minimum number of points in a cluster.
HDBSCAN_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
coarsening_dim (int):
The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
density_threshold (float): The threshold value for density.
output_dir (str): Directory where the output will be saved.
averaged (bool, optional): If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
density_coarsening (bool, optional): If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.
precoarsened (bool, optional): If True, pre-coarsen the data before extracting the cluster centroids. Defaults to False.
standardize (bool, optional): If True, standardize the data before thresholding. Defaults to False.
threshold_ratio (bool, optional): If True, use the threshold ratio to identify the threshold. Defaults to False.

Returns

tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.

convert_density_file_to_point_cloud_optics_timed = <parsl.app.python.PythonApp object>

Convert a density file to centroids using OPTICS clustering. Additionally, save the timings of each step to a CSV file.

Parameters

protein_name (str): Name of the protein.
mrc_file_path (str): Path to the MRC density file.
OPTICS_min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.
coarsening_dim (int):
The dimensionality passed in to griptomo.core.density2graph.voxel_coarsening's voxel_dim argument to be used for performing coarsening on the thresholded point cloud.
density_threshold (float): The threshold value for density.
output_dir (str): Directory where the output will be saved.
OPTICS_min_cluster_size (float, optional): The minimum number of points in a cluster. Defaults to None.
OPTICS_max_epsilon (float, optional): The maximum distance between points considered by OPTICS. Defaults to None, which uses np.inf. A lower value may reduce computation time.
averaged (bool, optional): If True, average the points in the coarsening box instead of returning the center of the box. Defaults to False.
density_coarsening (bool, optional): If True, use the density values of each voxel in the box to provide weight when averaging during coarsening. Defaults to False.

Returns

tuple: A tuple containing:
- centroids (numpy.ndarray): The centroids of the clusters.
- centroid_dir (str): The directory where the centroids are saved.