corems.mass_spectra.calc.lc_calc

View Source

   1import numpy as np
   2import pandas as pd
   3from ripser import ripser
   4from scipy import sparse
   5from scipy.spatial import KDTree
   6
   7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
   8from corems.mass_spectra.calc import SignalProcessing as sp
   9from corems.mass_spectra.factory.chromat_data import EIC_Data
  10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  11
  12
  13def find_closest(A, target):
  14    """Find the index of closest value in A to each value in target.
  15
  16    Parameters
  17    ----------
  18    A : :obj:`~numpy.array`
  19        The array to search (blueprint). A must be sorted.
  20    target : :obj:`~numpy.array`
  21        The array of values to search for. target must be sorted.
  22
  23    Returns
  24    -------
  25    :obj:`~numpy.array`
  26        The indices of the closest values in A to each value in target.
  27    """
  28    idx = A.searchsorted(target)
  29    idx = np.clip(idx, 1, len(A) - 1)
  30    left = A[idx - 1]
  31    right = A[idx]
  32    idx -= target - left < right - target
  33    return idx
  34
  35
  36class LCCalculations:
  37    """Methods for performing LC calculations on mass spectra data.
  38
  39    Notes
  40    -----
  41    This class is intended to be used as a mixin for the LCMSBase class.
  42
  43    Methods
  44    -------
  45    * get_max_eic(eic_data).
  46        Returns the maximum EIC value from the given EIC data. A static method.
  47    * smooth_tic(tic).
  48        Smooths the TIC data using the specified smoothing method and settings.
  49    * eic_centroid_detector(rt, eic, max_eic).
  50        Performs EIC centroid detection on the given EIC data.
  51    * find_nearest_scan(rt).
  52        Finds the nearest scan to the given retention time.
  53    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
  54        Returns an averaged mass spectrum object.
  55    * find_mass_features(ms_level=1).
  56        Find regions of interest for a given MS level (default is MS1).
  57    * integrate_mass_features(drop_if_fail=False, ms_level=1).
  58        Integrate mass features of interest and extracts EICs.
  59    * find_c13_mass_features().
  60        Evaluate mass features and mark likely C13 isotopes.
  61    * deconvolute_ms1_mass_features().
  62        Deconvolute mass features' ms1 mass spectra.
  63    """
  64
  65    @staticmethod
  66    def get_max_eic(eic_data: dict):
  67        """Returns the maximum EIC value from the given EIC data.
  68
  69        Notes
  70        -----
  71        This is a static method.
  72
  73        Parameters
  74        ----------
  75        eic_data : dict
  76            A dictionary containing EIC data.
  77
  78        Returns
  79        -------
  80        float
  81            The maximum EIC value.
  82        """
  83        max_eic = 0
  84        for eic_data in eic_data.values():
  85            ind_max_eic = max(eic_data.get("EIC"))
  86            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  87
  88        return max_eic
  89
  90    def smooth_tic(self, tic):
  91        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  92
  93        Parameters
  94        ----------
  95        tic : numpy.ndarray
  96            The TIC (or EIC) data to be smoothed.
  97
  98        Returns
  99        -------
 100        numpy.ndarray
 101            The smoothed TIC data.
 102        """
 103        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 104
 105        pol_order = self.parameters.lc_ms.savgol_pol_order
 106
 107        window_len = self.parameters.lc_ms.smooth_window
 108
 109        window = self.parameters.lc_ms.smooth_method
 110
 111        return sp.smooth_signal(
 112            tic, window_len, window, pol_order, implemented_smooth_method
 113        )
 114
 115    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 116        """Performs EIC centroid detection on the given EIC data.
 117
 118        Parameters
 119        ----------
 120        rt : numpy.ndarray
 121            The retention time data.
 122        eic : numpy.ndarray
 123            The EIC data.
 124        max_eic : float
 125            The maximum EIC value.
 126        apex_indexes : list, optional
 127            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 128
 129        Returns
 130        -------
 131        numpy.ndarray
 132            The indexes of left, apex, and right limits as a generator.
 133        """
 134
 135        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 136
 137        max_height = self.parameters.lc_ms.peak_height_max_percent
 138
 139        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 140
 141        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 142
 143        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 144
 145        include_indexes = sp.peak_picking_first_derivative(
 146            domain=rt,
 147            signal=eic,
 148            max_height=max_height,
 149            max_prominence=max_prominence,
 150            max_signal=max_eic,
 151            min_peak_datapoints=min_peak_datapoints,
 152            peak_derivative_threshold=peak_derivative_threshold,
 153            signal_threshold=signal_threshold,
 154            correct_baseline=False,
 155            plot_res=False,
 156            apex_indexes=apex_indexes,
 157        )
 158
 159        return include_indexes
 160
 161    def find_nearest_scan(self, rt):
 162        """Finds the nearest scan to the given retention time.
 163
 164        Parameters
 165        ----------
 166        rt : float
 167            The retention time (in minutes) to find the nearest scan for.
 168
 169        Returns
 170        -------
 171        int
 172            The scan number of the nearest scan.
 173        """
 174        array_rt = np.array(self.retention_time)
 175
 176        scan_index = (np.abs(array_rt - rt)).argmin()
 177
 178        real_scan = self.scans_number[scan_index]
 179
 180        return real_scan
 181
 182    def add_peak_metrics(self):
 183        """Add peak metrics to the mass features.
 184
 185        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
 186        """
 187        # Check that at least some mass features have eic data
 188        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 189            raise ValueError(
 190                "No mass features have EIC data. Run integrate_mass_features first."
 191            )
 192
 193        for mass_feature in self.mass_features.values():
 194            # Check if the mass feature has been integrated
 195            if mass_feature._eic_data is not None:
 196                # Calculate peak metrics
 197                mass_feature.calc_half_height_width()
 198                mass_feature.calc_tailing_factor()
 199                mass_feature.calc_dispersity_index()
 200
 201    def get_average_mass_spectrum(
 202        self,
 203        scan_list,
 204        apex_scan,
 205        spectrum_mode="profile",
 206        ms_level=1,
 207        auto_process=True,
 208        use_parser=False,
 209        perform_checks=True,
 210        polarity=None,
 211        ms_params=None,
 212    ):
 213        """Returns an averaged mass spectrum object
 214
 215        Parameters
 216        ----------
 217        scan_list : list
 218            List of scan numbers to average.
 219        apex_scan : int
 220            Number of the apex scan
 221        spectrum_mode : str, optional
 222            The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging.
 223        ms_level : int, optional
 224            The MS level to use. Defaults to 1.
 225        auto_process : bool, optional
 226            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 227        use_parser : bool, optional
 228            If True, the mass spectra will be obtained from the parser. Defaults to False.
 229        perform_checks : bool, optional
 230            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary!  ms_unprocessed dictionary also must be indexed on scan
 231        polarity : int, optional
 232            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
 233        ms_params : MSParameters, optional
 234            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 235
 236        Returns
 237        -------
 238        MassSpectrumProfile
 239            The averaged mass spectrum object.
 240
 241        Raises
 242        ------
 243        ValueError
 244            If the spectrum mode is not "profile".
 245            If the MS level is not found in the unprocessed mass spectra dictionary.
 246            If not all scan numbers are found in the unprocessed mass spectra dictionary.
 247        """
 248        if perform_checks:
 249            if spectrum_mode != "profile":
 250                raise ValueError("Averaging only supported for profile mode")
 251
 252        if polarity is None:
 253            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 254            if self.polarity == "negative":
 255                polarity = -1
 256            elif self.polarity == "positive":
 257                polarity = 1
 258            else:
 259                raise ValueError(
 260                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
 261                )
 262
 263        # if not using_parser, check that scan numbers are in _ms_unprocessed
 264        if not use_parser:
 265            if perform_checks:
 266                # Set index to scan for faster lookup
 267                ms_df = (
 268                    self._ms_unprocessed[ms_level]
 269                    .copy()
 270                    .set_index("scan", drop=False)
 271                    .sort_index()
 272                )
 273                my_ms_df = ms_df.loc[scan_list]
 274                # Check that all scan numbers are in the ms_df
 275                if not all(np.isin(scan_list, ms_df.index)):
 276                    raise ValueError(
 277                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
 278                    )
 279            else:
 280                my_ms_df = (
 281                    pd.DataFrame({"scan": scan_list})
 282                    .set_index("scan")
 283                    .join(self._ms_unprocessed[ms_level], how="left")
 284                )
 285
 286        if use_parser:
 287            ms_list = [
 288                self.spectra_parser.get_mass_spectrum_from_scan(
 289                    x, spectrum_mode=spectrum_mode, auto_process=False
 290                )
 291                for x in scan_list
 292            ]
 293            ms_mz = [x._mz_exp for x in ms_list]
 294            ms_int = [x._abundance for x in ms_list]
 295            my_ms_df = []
 296            for i in np.arange(len(ms_mz)):
 297                my_ms_df.append(
 298                    pd.DataFrame(
 299                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 300                    )
 301                )
 302            my_ms_df = pd.concat(my_ms_df)
 303
 304        if not self.check_if_grid(my_ms_df):
 305            my_ms_df = self.grid_data(my_ms_df)
 306
 307        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 308
 309        ms = ms_from_array_profile(
 310            my_ms_ave.mz,
 311            my_ms_ave.intensity,
 312            self.file_location,
 313            polarity=polarity,
 314            auto_process=False,
 315        )
 316
 317        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 318        if ms is not None:
 319            if ms_params is not None:
 320                ms.parameters = ms_params
 321            ms.scan_number = apex_scan
 322            if auto_process:
 323                ms.process_mass_spec()
 324        return ms
 325
 326    def find_mass_features(self, ms_level=1, grid=True):
 327        """Find mass features within an LCMSBase object
 328
 329        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
 330
 331        Parameters
 332        ----------
 333        ms_level : int, optional
 334            The MS level to use for peak picking Default is 1.
 335        grid : bool, optional
 336            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 
 337            used for persistent homology peak picking for profile data only. Default is True.
 338
 339        Raises
 340        ------
 341        ValueError
 342            If no MS level data is found on the object.
 343            If persistent homology peak picking is attempted on non-profile mode data.
 344            If data is not gridded and grid is False.
 345            If peak picking method is not implemented.
 346
 347        Returns
 348        -------
 349        None, but assigns the mass_features and eics attributes to the object.
 350
 351        """
 352        pp_method = self.parameters.lc_ms.peak_picking_method
 353
 354        if pp_method == "persistent homology":
 355            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 356            if all(msx_scan_df["ms_format"] == "profile"):
 357                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 358                self.cluster_mass_features(drop_children=True, sort_by="persistence")
 359            else:
 360                raise ValueError(
 361                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 362                        ms_level
 363                    )
 364                )
 365        elif pp_method == "centroided_persistent_homology":
 366            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 367            if all(msx_scan_df["ms_format"] == "centroid"):
 368                self.find_mass_features_ph_centroid(ms_level=ms_level)
 369            else:
 370                raise ValueError(
 371                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
 372                        ms_level
 373                    )
 374                )
 375        else:
 376            raise ValueError("Peak picking method not implemented")
 377
 378    def integrate_mass_features(
 379        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 380    ):
 381        """Integrate mass features and extract EICs.
 382
 383        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 384
 385        Parameters
 386        ----------
 387        drop_if_fail : bool, optional
 388            Whether to drop mass features if the EIC limit calculations fail.
 389            Default is True.
 390        drop_duplicates : bool, optional
 391            Whether to mass features that appear to be duplicates
 392            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 393            Default is True.
 394        ms_level : int, optional
 395            The MS level to use. Default is 1.
 396
 397        Raises
 398        ------
 399        ValueError
 400            If no mass features are found.
 401            If no MS level data is found for the given MS level (either in data or in the scan data)
 402
 403        Returns
 404        -------
 405        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 406
 407        Notes
 408        -----
 409        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
 410        """
 411        # Check if there is data
 412        if ms_level in self._ms_unprocessed.keys():
 413            raw_data = self._ms_unprocessed[ms_level].copy()
 414        else:
 415            raise ValueError("No MS level " + str(ms_level) + " data found")
 416        if self.mass_features is not None:
 417            mf_df = self.mass_features_to_df().copy()
 418        else:
 419            raise ValueError(
 420                "No mass features found, did you run find_mass_features() first?"
 421            )
 422        # Check if mass_spectrum exists on each mass feature
 423        if not all(
 424            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
 425        ):
 426            raise ValueError(
 427                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
 428            )
 429
 430        # Subset scan data to only include correct ms_level
 431        scan_df_sub = self.scan_df[
 432            self.scan_df["ms_level"] == int(ms_level)
 433        ].reset_index(drop=True)
 434        if scan_df_sub.empty:
 435            raise ValueError("No MS level " + ms_level + " data found in scan data")
 436        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 437
 438        mzs_to_extract = np.unique(mf_df["mz"].values)
 439        mzs_to_extract.sort()
 440
 441        # Get EICs for each unique mz in mass features list
 442        for mz in mzs_to_extract:
 443            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 444            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 445            raw_data_sub = raw_data[
 446                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
 447            ].reset_index(drop=True)
 448            raw_data_sub = (
 449                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 450            )
 451            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 452            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 453            myEIC = EIC_Data(
 454                scans=raw_data_sub["scan"].values,
 455                time=raw_data_sub["scan_time"].values,
 456                eic=raw_data_sub["intensity"].values,
 457            )
 458            # Smooth EIC
 459            smoothed_eic = self.smooth_tic(myEIC.eic)
 460            smoothed_eic[smoothed_eic < 0] = 0
 461            myEIC.eic_smoothed = smoothed_eic
 462            self.eics[mz] = myEIC
 463
 464        # Get limits of mass features using EIC centroid detector and integrate
 465        mf_df["area"] = np.nan
 466        for idx, mass_feature in mf_df.iterrows():
 467            mz = mass_feature.mz
 468            apex_scan = mass_feature.apex_scan
 469
 470            # Pull EIC data and find apex scan index
 471            myEIC = self.eics[mz]
 472            self.mass_features[idx]._eic_data = myEIC
 473            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 474
 475            # Find left and right limits of peak using EIC centroid detector, add to EICData
 476            centroid_eics = self.eic_centroid_detector(
 477                myEIC.time,
 478                myEIC.eic_smoothed,
 479                mass_feature.intensity * 1.1,
 480                apex_indexes=[int(apex_index)],
 481            )
 482            l_a_r_scan_idx = [i for i in centroid_eics]
 483            if len(l_a_r_scan_idx) > 0:
 484                # Add start and final scan to mass_features and EICData
 485                left_scan, right_scan = (
 486                    myEIC.scans[l_a_r_scan_idx[0][0]],
 487                    myEIC.scans[l_a_r_scan_idx[0][2]],
 488                )
 489                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 490                myEIC.apexes = myEIC.apexes + mf_scan_apex
 491                self.mass_features[idx].start_scan = left_scan
 492                self.mass_features[idx].final_scan = right_scan
 493
 494                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 495                area = np.trapz(
 496                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 497                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 498                )
 499                mf_df.at[idx, "area"] = area
 500                myEIC.areas = myEIC.areas + [area]
 501                self.eics[mz] = myEIC
 502                self.mass_features[idx]._area = area
 503            else:
 504                if drop_if_fail is True:
 505                    self.mass_features.pop(idx)
 506
 507        if drop_duplicates:
 508            # Prepare mass feature dataframe
 509            mf_df = self.mass_features_to_df().copy()
 510
 511            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
 512            # Kepp the first mass fea
 513            for idx, mass_feature in mf_df.iterrows():
 514                mz = mass_feature.mz
 515                apex_scan = mass_feature.apex_scan
 516
 517                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 518                mf_df_sub = mf_df[
 519                    mf_df["mz_diff_ppm"]
 520                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 521                    * 10**6
 522                ].copy()
 523
 524                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 525                for idx2, mass_feature2 in mf_df_sub.iterrows():
 526                    if idx2 != idx:
 527                        if (
 528                            mass_feature2.start_scan >= mass_feature.start_scan
 529                            and mass_feature2.final_scan <= mass_feature.final_scan
 530                        ):
 531                            if idx2 in self.mass_features.keys():
 532                                self.mass_features.pop(idx2)
 533
 534    def find_c13_mass_features(self):
 535        """Mark likely C13 isotopes and connect to monoisoitopic mass features.
 536
 537        Returns
 538        -------
 539        None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 540
 541        Raises
 542        ------
 543        ValueError
 544            If no mass features are found.
 545        """
 546        verbose = self.parameters.lc_ms.verbose_processing
 547        if verbose:
 548            print("evaluating mass features for C13 isotopes")
 549        if self.mass_features is None:
 550            raise ValueError("No mass features found, run find_mass_features() first")
 551
 552        # Data prep fo sparse distance matrix
 553        dims = ["mz", "scan_time"]
 554        mf_df = self.mass_features_to_df().copy()
 555        # Drop mass features that have no area (these are likely to be noise)
 556        mf_df = mf_df[mf_df["area"].notnull()]
 557        mf_df["mf_id"] = mf_df.index.values
 558        dims = ["mz", "scan_time"]
 559
 560        # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 561        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 562
 563        mz_diff = 1.003355  # C13-C12 mass difference
 564        tol = [
 565            mf_df["mz"].median()
 566            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 567            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
 568        ]  # mz, in relative; scan_time in minutes
 569
 570        # Compute inter-feature distances
 571        distances = None
 572        for i in range(len(dims)):
 573            # Construct k-d tree
 574            values = mf_df[dims[i]].values
 575            tree = KDTree(values.reshape(-1, 1))
 576
 577            max_tol = tol[i]
 578            if dims[i] == "mz":
 579                # Maximum absolute tolerance
 580                max_tol = mz_diff + tol[i]
 581
 582            # Compute sparse distance matrix
 583            # the larger the max_tol, the slower this operation is
 584            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 585
 586            # Only consider forward case, exclude diagonal
 587            sdm = sparse.triu(sdm, k=1)
 588
 589            if dims[i] == "mz":
 590                min_tol = mz_diff - tol[i]
 591                # Get only the ones that are above the min tol
 592                idx = sdm.data > min_tol
 593
 594                # Reconstruct sparse distance matrix
 595                sdm = sparse.coo_matrix(
 596                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 597                    shape=(len(values), len(values)),
 598                )
 599
 600            # Cast as binary matrix
 601            sdm.data = np.ones_like(sdm.data)
 602
 603            # Stack distances
 604            if distances is None:
 605                distances = sdm
 606            else:
 607                distances = distances.multiply(sdm)
 608
 609        # Extract indices of within-tolerance points
 610        distances = distances.tocoo()
 611        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 612
 613        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 614        pairs_mf = pairs.copy()
 615        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 616        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 617
 618        # Connect monoisotopic masses with isotopologes within mass_features
 619        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 620        for mono in monos:
 621            self.mass_features[mono].monoisotopic_mf_id = mono
 622        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 623        while not pairs_iso_df.empty:
 624            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 625            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 626            for iso in m1_isos:
 627                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 628                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 629                if len(parent) > 1:
 630                    # Choose the parent that is closest in time to the isotopologue
 631                    parent_time = [self.mass_features[p].retention_time for p in parent]
 632                    time_diff = [
 633                        np.abs(self.mass_features[iso].retention_time - x)
 634                        for x in parent_time
 635                    ]
 636                    parent = parent[np.argmin(time_diff)]
 637                else:
 638                    parent = parent[0]
 639                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 640                    parent
 641                ].monoisotopic_mf_id
 642                if self.mass_features[iso].monoisotopic_mf_id is not None:
 643                    mass_diff = (
 644                        self.mass_features[iso].mz
 645                        - self.mass_features[
 646                            self.mass_features[iso].monoisotopic_mf_id
 647                        ].mz
 648                    )
 649                    self.mass_features[iso].isotopologue_type = "13C" + str(
 650                        int(round(mass_diff, 0))
 651                    )
 652
 653            # Drop the mono and iso from the pairs_iso_df
 654            pairs_iso_df = pairs_iso_df.drop(
 655                index=monos, errors="ignore"
 656            )  # Drop pairs where the parent is a child that is a child of a root
 657            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 658            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 659
 660            if not pairs_iso_df.empty:
 661                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 662                monos = np.setdiff1d(
 663                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 664                )
 665        if verbose:
 666            # Report fraction of compounds annotated with isotopes
 667            mf_df["c13_flag"] = np.where(
 668                np.logical_or(
 669                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 670                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 671                ),
 672                1,
 673                0,
 674            )
 675            print(
 676                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 677                + " of mass features have or are C13 isotopes"
 678            )
 679
 680    def deconvolute_ms1_mass_features(self):
 681        """Deconvolute MS1 mass features
 682
 683        Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features
 684
 685        Parameters
 686        ----------
 687        None
 688
 689        Returns
 690        -------
 691        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 692        and associated_mass_features_deconvoluted attributes to the mass features in the
 693        mass_features attribute of the LCMSBase object.
 694
 695        Raises
 696        ------
 697        ValueError
 698            If no mass features are found, must run find_mass_features() first.
 699            If no EICs are found, did you run integrate_mass_features() first?
 700
 701        """
 702        # Checks for set mass_features and eics
 703        if self.mass_features is None:
 704            raise ValueError(
 705                "No mass features found, did you run find_mass_features() first?"
 706            )
 707
 708        if self.eics == {}:
 709            raise ValueError(
 710                "No EICs found, did you run integrate_mass_features() first?"
 711            )
 712
 713        if 1 not in self._ms_unprocessed.keys():
 714            raise ValueError("No unprocessed MS1 spectra found.")
 715
 716        # Prep ms1 data
 717        ms1_data = self._ms_unprocessed[1].copy()
 718        ms1_data = ms1_data.set_index("scan")
 719
 720        # Prep mass feature summary
 721        mass_feature_df = self.mass_features_to_df()
 722
 723        # Loop through each mass feature
 724        for mf_id, mass_feature in self.mass_features.items():
 725            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
 726            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 727                continue
 728
 729            # Get the left and right limits of the EIC of the mass feature
 730            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 731
 732            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
 733            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 734            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 735
 736            # Get the centroided masses of the mass feature
 737            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 738
 739            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 740            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 741                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 742            ]
 743
 744            # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz
 745            ms1_data_sub["mz_diff_rel"] = (
 746                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 747                / ms1_data_sub["mz"]
 748            )
 749            ms1_data_sub = ms1_data_sub[
 750                ms1_data_sub["mz_diff_rel"]
 751                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 752            ].reset_index(drop=True)
 753
 754            # Group by mass_feature_mz and scan and sum intensity
 755            ms1_data_sub_group = (
 756                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 757                .sum()
 758                .reset_index()
 759            )
 760
 761            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 762            corr = (
 763                ms1_data_sub_group.pivot(
 764                    index="scan", columns="mass_feature_mz", values="intensity"
 765                )
 766                .fillna(0)
 767                .corr()
 768            )
 769
 770            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8
 771            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 772            corr_subset = corr.loc[mass_feature.mz,]
 773            corr_subset = corr_subset[corr_subset > decon_corr_min]
 774
 775            # Get the masses from the mass spectrum that are the result of the deconvolution
 776            mzs_decon = corr_subset.index.values
 777
 778            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 779            mzs_decon_idx = [
 780                id
 781                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 782                if mz in mzs_decon
 783            ]
 784            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 785
 786            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 787            if (
 788                mass_feature.ms1_peak.abundance
 789                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 790            ):
 791                mass_feature.mass_spectrum_deconvoluted_parent = True
 792            else:
 793                mass_feature.mass_spectrum_deconvoluted_parent = False
 794
 795            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 796            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 797            mass_feature_df_sub = mass_feature_df[
 798                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 799                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 800            ].copy()
 801            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 802            mass_feature_df_sub["mz_diff_ppm"] = [
 803                np.abs(mzs_decon - mz).min() / mz * 10**6
 804                for mz in mass_feature_df_sub["mz"]
 805            ]
 806            # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses
 807            mfs_associated_decon = mass_feature_df_sub[
 808                mass_feature_df_sub["mz_diff_ppm"]
 809                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 810            ].index.values
 811
 812            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
 813
 814
 815class PHCalculations:
 816    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 817
 818    Notes
 819    -----
 820    This class is intended to be used as a mixin for the LCMSBase class.
 821
 822    Methods
 823    -------
 824    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 825        Sparse implementation of a mean filter.
 826    * embed_unique_indices(a).
 827        Creates an array of indices, sorted by unique element.
 828    * sparse_upper_star(idx, V).
 829        Sparse implementation of an upper star filtration.
 830    * check_if_grid(data).
 831        Check if the data is gridded in mz space.
 832    * grid_data(data).
 833        Grid the data in the mz dimension.
 834    * find_mass_features_ph(ms_level=1, grid=True).
 835        Find mass features within an LCMSBase object using persistent homology.
 836    * cluster_mass_features(drop_children=True).
 837        Cluster regions of interest.
 838    """
 839
 840    @staticmethod
 841    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 842        """Sparse implementation of a mean filter.
 843
 844        Parameters
 845        ----------
 846        idx : :obj:`~numpy.array`
 847            Edge indices for each dimension (MxN).
 848        V : :obj:`~numpy.array`
 849            Array of intensity data (Mx1).
 850        radius : float or list
 851            Radius of the sparse filter in each dimension. Values less than
 852            zero indicate no connectivity in that dimension.
 853
 854        Returns
 855        -------
 856        :obj:`~numpy.array`
 857            Filtered intensities (Mx1).
 858
 859        Notes
 860        -----
 861        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 862        This is a static method.
 863        """
 864
 865        # Copy indices
 866        idx = idx.copy().astype(V.dtype)
 867
 868        # Scale
 869        for i, r in enumerate(radius):
 870            # Increase inter-index distance
 871            if r < 1:
 872                idx[:, i] *= 2
 873
 874            # Do nothing
 875            elif r == 1:
 876                pass
 877
 878            # Decrease inter-index distance
 879            else:
 880                idx[:, i] /= r
 881
 882        # Connectivity matrix
 883        cmat = KDTree(idx)
 884        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 885        cmat.setdiag(1)
 886
 887        # Pair indices
 888        I, J = cmat.nonzero()
 889
 890        # Delete cmat
 891        cmat_shape = cmat.shape
 892        del cmat
 893
 894        # Sum over columns
 895        V_sum = sparse.bsr_matrix(
 896            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 897        ).diagonal(0)
 898
 899        # Count over columns
 900        V_count = sparse.bsr_matrix(
 901            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 902        ).diagonal(0)
 903
 904        return V_sum / V_count
 905
 906    @staticmethod
 907    def embed_unique_indices(a):
 908        """Creates an array of indices, sorted by unique element.
 909
 910        Parameters
 911        ----------
 912        a : :obj:`~numpy.array`
 913            Array of unique elements (Mx1).
 914
 915        Returns
 916        -------
 917        :obj:`~numpy.array`
 918            Array of indices (Mx1).
 919
 920        Notes
 921        -----
 922        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 923        This is a static method.
 924        """
 925
 926        def count_tens(n):
 927            # Count tens
 928            ntens = (n - 1) // 10
 929
 930            while True:
 931                ntens_test = (ntens + n - 1) // 10
 932
 933                if ntens_test == ntens:
 934                    return ntens
 935                else:
 936                    ntens = ntens_test
 937
 938        def arange_exclude_10s(n):
 939            # How many 10s will there be?
 940            ntens = count_tens(n)
 941
 942            # Base array
 943            arr = np.arange(0, n + ntens)
 944
 945            # Exclude 10s
 946            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 947
 948            return arr
 949
 950        # Creates an array of indices, sorted by unique element
 951        idx_sort = np.argsort(a)
 952        idx_unsort = np.argsort(idx_sort)
 953
 954        # Sorts records array so all unique elements are together
 955        sorted_a = a[idx_sort]
 956
 957        # Returns the unique values, the index of the first occurrence,
 958        # and the count for each element
 959        vals, idx_start, count = np.unique(
 960            sorted_a, return_index=True, return_counts=True
 961        )
 962
 963        # Splits the indices into separate arrays
 964        splits = np.split(idx_sort, idx_start[1:])
 965
 966        # Creates unique indices for each split
 967        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 968
 969        # Reorders according to input array
 970        idx_unq = idx_unq[idx_unsort]
 971
 972        # Magnitude of each index
 973        exp = np.log10(
 974            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 975        )
 976        idx_unq_mag = np.power(10, np.floor(exp) + 1)
 977
 978        # Result
 979        return a + idx_unq / idx_unq_mag
 980    
 981    @staticmethod
 982    def roll_up_dataframe(
 983        df : pd.DataFrame,
 984        sort_by : str,
 985        tol : list,
 986        relative : list,
 987        dims : list
 988    ):
 989        """Subset data by rolling up into apex in appropriate dimensions.
 990        
 991        Parameters
 992        ----------
 993        data : pd.DataFrame
 994            The input data containing "dims" columns and the "sort_by" column.
 995        sort_by : str
 996            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 
 997            (i.e., the mass feature with the highest value in the sort_by column).
 998        dims : list
 999            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1000        tol : list
1001            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1002            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1003            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1004            If relative is False, the tolerance will be used as is.
1005        relative : list
1006            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1007
1008        Returns
1009        -------
1010        pd.DataFrame
1011            A DataFrame with only the rolled up mass features, with the original index and columns.
1012
1013            
1014        Raises
1015        ------
1016        ValueError
1017            If the input data is not a pandas DataFrame.
1018            If the input data does not have columns for each of the dimensions in "dims".
1019            If the length of "dims", "tol", and "relative" do not match.
1020        """
1021        og_columns = df.columns.copy()
1022
1023        # Unindex the data, but keep the original index
1024        if df.index.name is not None:
1025            og_index = df.index.name
1026        else:
1027            og_index = "index"
1028        df = df.reset_index(drop=False)
1029
1030        # Sort data by sort_by column, and reindex
1031        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1032
1033        # Check that data is a DataFrame and has columns for each of the dims
1034        if not isinstance(df, pd.DataFrame):
1035            raise ValueError("Data must be a pandas DataFrame")
1036        for dim in dims:
1037            if dim not in df.columns:
1038                raise ValueError(f"Data must have a column for {dim}")
1039        if len(dims) != len(tol) or len(dims) != len(relative):
1040            raise ValueError(
1041                "Dimensions, tolerances, and relative flags must be the same length"
1042            )
1043        
1044        # Compute inter-feature distances
1045        distances = None
1046        for i in range(len(dims)):
1047            # Construct k-d tree
1048            values = df[dims[i]].values
1049            tree = KDTree(values.reshape(-1, 1))
1050
1051            max_tol = tol[i]
1052            if relative[i] is True:
1053                # Maximum absolute tolerance
1054                max_tol = tol[i] * values.max()
1055
1056            # Compute sparse distance matrix
1057            # the larger the max_tol, the slower this operation is
1058            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1059
1060            # Only consider forward case, exclude diagonal
1061            sdm = sparse.triu(sdm, k=1)
1062
1063            # Filter relative distances
1064            if relative[i] is True:
1065                # Compute relative distances
1066                rel_dists = sdm.data / values[sdm.row]  # or col?
1067
1068                # Indices of relative distances less than tolerance
1069                idx = rel_dists <= tol[i]
1070
1071                # Reconstruct sparse distance matrix
1072                sdm = sparse.coo_matrix(
1073                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1074                    shape=(len(values), len(values)),
1075                )
1076
1077            # Cast as binary matrix
1078            sdm.data = np.ones_like(sdm.data)
1079
1080            # Stack distances
1081            if distances is None:
1082                distances = sdm
1083            else:
1084                distances = distances.multiply(sdm)
1085        
1086        # Extract indices of within-tolerance points
1087        distances = distances.tocoo()
1088        pairs = np.stack((distances.row, distances.col), axis=1)
1089        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1090        pairs_df = pairs_df.set_index("parent")
1091
1092        to_drop = []
1093        while not pairs_df.empty:
1094            # Find root_parents and their children
1095            root_parents = np.setdiff1d(
1096                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1097            )
1098            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1099            to_drop = np.append(to_drop, children_of_roots)
1100
1101            # Remove root_children as possible parents from pairs_df for next iteration
1102            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1103            pairs_df = pairs_df.reset_index().set_index("child")
1104            # Remove root_children as possible children from pairs_df for next iteration
1105            pairs_df = pairs_df.drop(index=children_of_roots)
1106
1107            # Prepare for next iteration
1108            pairs_df = pairs_df.reset_index().set_index("parent")
1109
1110        # Drop mass features that are not cluster parents
1111        df_sub = df.drop(index=np.array(to_drop))
1112
1113        # Set index back to og_index and only keep the columns that are in the original dataframe
1114        df_sub = df_sub.set_index(og_index)
1115
1116        # sort the dataframe by the original index
1117        df_sub = df_sub.sort_index()
1118        df_sub = df_sub[og_columns]
1119
1120        return df_sub
1121    
1122    def sparse_upper_star(self, idx, V):
1123        """Sparse implementation of an upper star filtration.
1124
1125        Parameters
1126        ----------
1127        idx : :obj:`~numpy.array`
1128            Edge indices for each dimension (MxN).
1129        V : :obj:`~numpy.array`
1130            Array of intensity data (Mx1).
1131        Returns
1132        -------
1133        idx : :obj:`~numpy.array`
1134            Index of filtered points (Mx1).
1135        persistence : :obj:`~numpy.array`
1136            Persistence of each filtered point (Mx1).
1137
1138        Notes
1139        -----
1140        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1141        """
1142
1143        # Invert
1144        V = -1 * V.copy().astype(int)
1145
1146        # Embed indices
1147        V = self.embed_unique_indices(V)
1148
1149        # Connectivity matrix
1150        cmat = KDTree(idx)
1151        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1152        cmat.setdiag(1)
1153        cmat = sparse.triu(cmat)
1154
1155        # Pairwise minimums
1156        I, J = cmat.nonzero()
1157        d = np.maximum(V[I], V[J])
1158
1159        # Delete connectiity matrix
1160        cmat_shape = cmat.shape
1161        del cmat
1162
1163        # Sparse distance matrix
1164        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1165
1166        # Delete pairwise mins
1167        del d, I, J
1168
1169        # Persistence homology
1170        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1171
1172        # Bound death values
1173        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1174
1175        # Construct tree to query against
1176        tree = KDTree(V.reshape((-1, 1)))
1177
1178        # Get the indexes of the first nearest neighbor by birth
1179        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1180
1181        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1182
1183    def check_if_grid(self, data):
1184        """Check if the data are gridded in mz space.
1185
1186        Parameters
1187        ----------
1188        data : DataFrame
1189            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1190
1191        Returns
1192        -------
1193        bool
1194            True if the data is gridded in the mz direction, False otherwise.
1195
1196        Notes
1197        -----
1198        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1199        """
1200        # Calculate the difference between consecutive mz values in a single scan
1201        dat_check = data.copy().reset_index(drop=True)
1202        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1203        mz_diff_min = (
1204            dat_check.groupby("scan")["mz_diff"].min().min()
1205        )  # within each scan, what is the smallest mz difference between consecutive mz values
1206
1207        # Find the mininum mz difference between mz values in the data; regardless of scan
1208        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1209        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1210        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1211
1212        # Get minimum mz_diff between mz values in the data
1213        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1214
1215        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1216        if mz_diff_min_raw < mz_diff_min:
1217            return False
1218        else:
1219            return True
1220
1221    def grid_data(self, data):
1222        """Grid the data in the mz dimension.
1223
1224        Data must be gridded prior to persistent homology calculations.
1225
1226        Parameters
1227        ----------
1228        data : DataFrame
1229            The input data containing mz, scan, scan_time, and intensity columns.
1230
1231        Returns
1232        -------
1233        DataFrame
1234            The gridded data with mz, scan, scan_time, and intensity columns.
1235
1236        Raises
1237        ------
1238        ValueError
1239            If gridding fails.
1240        """
1241
1242        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1243        data_w = data.copy().reset_index(drop=True)
1244        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1245        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1246
1247        # Need high intensity mz values first so they are parents in the output pairs stack
1248        dat_mz = data_w[["mz", "intensity"]].sort_values(
1249            by=["intensity"], ascending=False
1250        )
1251        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1252
1253        # Construct KD tree
1254        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1255        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1256        sdm = sparse.triu(sdm, k=1)
1257        sdm.data = np.ones_like(sdm.data)
1258        distances = sdm.tocoo()
1259        pairs = np.stack((distances.row, distances.col), axis=1)
1260
1261        # Cull pairs to just get root
1262        to_drop = []
1263        while len(pairs) > 0:
1264            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1265            id_root_parents = np.isin(pairs[:, 0], root_parents)
1266            children_of_roots = np.unique(pairs[id_root_parents, 1])
1267            to_drop = np.append(to_drop, children_of_roots)
1268
1269            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1270            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1271            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1272        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1273        mz_dat_np = (
1274            dat_mz[["mz"]]
1275            .sort_values(by=["mz"])
1276            .reset_index(drop=True)
1277            .values.flatten()
1278        )
1279
1280        # Sort data by mz and recast mz to nearest value in mz_dat_np
1281        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1282        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1283        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1284
1285        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1286        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1287        new_data_w = (
1288            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1289            .groupby(["scan", "mz"])["intensity"]
1290            .sum()
1291            .reset_index()
1292        )
1293        new_data_w = (
1294            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1295            .reset_index(drop=True)
1296            .copy()
1297        )
1298
1299        # Check if grid worked and return
1300        if self.check_if_grid(new_data_w):
1301            return new_data_w
1302        else:
1303            raise ValueError("Gridding failed")
1304
1305    def find_mass_features_ph(self, ms_level=1, grid=True):
1306        """Find mass features within an LCMSBase object using persistent homology.
1307
1308        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1309
1310        Parameters
1311        ----------
1312        ms_level : int, optional
1313            The MS level to use. Default is 1.
1314        grid : bool, optional
1315            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1316
1317        Raises
1318        ------
1319        ValueError
1320            If no MS level data is found on the object.
1321            If data is not gridded and grid is False.
1322
1323        Returns
1324        -------
1325        None, but assigns the mass_features attribute to the object.
1326
1327        Notes
1328        -----
1329        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1330        """
1331        # Check that ms_level is a key in self._ms_uprocessed
1332        if ms_level not in self._ms_unprocessed.keys():
1333            raise ValueError(
1334                "No MS level "
1335                + str(ms_level)
1336                + " data found, did you instantiate with parser specific to MS level?"
1337            )
1338
1339        # Get ms data
1340        data = self._ms_unprocessed[ms_level].copy()
1341
1342        # Drop rows with missing intensity values and reset index
1343        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1344
1345        # Threshold data
1346        dims = ["mz", "scan_time"]
1347        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1348        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1349
1350        # Check if gridded, if not, grid
1351        gridded_mz = self.check_if_grid(data_thres)
1352        if gridded_mz is False:
1353            if grid is False:
1354                raise ValueError(
1355                    "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function"
1356                )
1357            else:
1358                data_thres = self.grid_data(data_thres)
1359
1360        # Add build factors and add scan_time
1361        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1362        factors = {
1363            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1364            for dim in dims
1365        }  # this is return a float64 index
1366
1367        # Build indexes
1368        index = {
1369            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1370            for dim in factors
1371        }
1372
1373        # Smooth data
1374        iterations = self.parameters.lc_ms.ph_smooth_it
1375        smooth_radius = [
1376            self.parameters.lc_ms.ph_smooth_radius_mz,
1377            self.parameters.lc_ms.ph_smooth_radius_scan,
1378        ]  # mz, scan_time smoothing radius (in steps)
1379
1380        index = np.vstack([index[dim] for dim in dims]).T
1381        V = data_thres["intensity"].values
1382        resid = np.inf
1383        for i in range(iterations):
1384            # Previous iteration
1385            V_prev = V.copy()
1386            resid_prev = resid
1387            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1388
1389            # Calculate residual with previous iteration
1390            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1391
1392            # Evaluate convergence
1393            if i > 0:
1394                # Percent change in residual
1395                test = np.abs(resid - resid_prev) / resid_prev
1396
1397                # Exit criteria
1398                if test <= 0:
1399                    break
1400
1401        # Overwrite values
1402        data_thres["intensity"] = V
1403
1404        # Use persistent homology to find regions of interest
1405        pidx, pers = self.sparse_upper_star(index, V)
1406        pidx = pidx[pers > 1]
1407        pers = pers[pers > 1]
1408
1409        # Get peaks
1410        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1411
1412        # Add persistence column
1413        peaks["persistence"] = pers
1414        mass_features = peaks.sort_values(
1415            by="persistence", ascending=False
1416        ).reset_index(drop=True)
1417
1418        # Filter by persistence threshold
1419        persistence_threshold = (
1420            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1421        )
1422        mass_features = mass_features.loc[
1423            mass_features["persistence"] > persistence_threshold, :
1424        ].reset_index(drop=True)
1425
1426        # Rename scan column to apex_scan
1427        mass_features = mass_features.rename(
1428            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1429        )
1430
1431        # Populate mass_features attribute
1432        self.mass_features = {}
1433        for row in mass_features.itertuples():
1434            row_dict = mass_features.iloc[row.Index].to_dict()
1435            lcms_feature = LCMSMassFeature(self, **row_dict)
1436            self.mass_features[lcms_feature.id] = lcms_feature
1437
1438        if self.parameters.lc_ms.verbose_processing:
1439            print("Found " + str(len(mass_features)) + " initial mass features")
1440
1441    def find_mass_features_ph_centroid(self, ms_level=1):
1442        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1443        
1444        Parameters
1445        ----------
1446        ms_level : int, optional
1447            The MS level to use. Default is 1.
1448        
1449        Raises
1450        ------
1451        ValueError
1452            If no MS level data is found on the object.
1453
1454        Returns
1455        -------
1456        None, but assigns the mass_features attribute to the object.        
1457        """
1458        # Check that ms_level is a key in self._ms_uprocessed
1459        if ms_level not in self._ms_unprocessed.keys():
1460            raise ValueError(
1461                "No MS level "
1462                + str(ms_level)
1463                + " data found, did you instantiate with parser specific to MS level?"
1464            )
1465        
1466        # Get ms data
1467        data = self._ms_unprocessed[ms_level].copy()
1468
1469        # Drop rows with missing intensity values and reset index
1470        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1471        
1472        # Threshold data
1473        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1474        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1475        data_thres['persistence'] = data_thres['intensity']
1476
1477        # Use this as the starting point for the mass features, adding scan_time
1478        mf_df = data_thres
1479        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1480
1481        # Define tolerances and dimensions for rolling up
1482        tol = [
1483            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1484            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1485        ]
1486        relative = [True, False]    
1487        dims = ["mz", "scan_time"]
1488        print("here")
1489        mf_df = self.roll_up_dataframe(
1490            df=mf_df,
1491            sort_by="persistence",
1492            dims=dims,
1493            tol=tol,
1494            relative=relative
1495        )
1496
1497        # Rename scan column to apex_scan
1498        mass_features = mf_df.rename(
1499            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1500        )
1501        # Sort my persistence and reset index
1502        mass_features = mass_features.sort_values(
1503            by="persistence", ascending=False
1504        ).reset_index(drop=True)
1505
1506        # Populate mass_features attribute
1507        self.mass_features = {}
1508        for row in mass_features.itertuples():
1509            row_dict = mass_features.iloc[row.Index].to_dict()
1510            lcms_feature = LCMSMassFeature(self, **row_dict)
1511            self.mass_features[lcms_feature.id] = lcms_feature
1512
1513        if self.parameters.lc_ms.verbose_processing:
1514            print("Found " + str(len(mass_features)) + " initial mass features")
1515    
1516    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1517        """Cluster mass features
1518
1519        Based on their proximity in the mz and scan_time dimensions, priorizies the mass features with the highest persistence.
1520
1521        Parameters
1522        ----------
1523        drop_children : bool, optional
1524            Whether to drop the mass features that are not cluster parents. Default is True.
1525        sort_by : str, optional
1526            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1527
1528        Raises
1529        ------
1530        ValueError
1531            If no mass features are found.
1532            If too many mass features are found.
1533
1534        Returns
1535        -------
1536        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1537        """
1538        verbose = self.parameters.lc_ms.verbose_processing
1539
1540        if self.mass_features is None:
1541            raise ValueError("No mass features found, run find_mass_features() first")
1542        if len(self.mass_features) > 400000:
1543            raise ValueError(
1544                "Too many mass featuers of interest found, run find_mass_features() with a higher intensity threshold"
1545            )
1546        dims = ["mz", "scan_time"]
1547        mf_df_og = self.mass_features_to_df()
1548        mf_df = mf_df_og.copy()
1549
1550        tol = [
1551            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1552            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1553        ]  # mz, in relative; scan_time in minutes
1554        relative = [True, False]
1555
1556        # Roll up mass features based on their proximity in the declared dimensions
1557        mf_df_new = self.roll_up_dataframe(
1558            df=mf_df,
1559            sort_by=sort_by,
1560            dims=dims,
1561            tol=tol,
1562            relative=relative
1563        )
1564
1565        mf_df["cluster_parent"] = np.where(
1566            np.isin(mf_df.index, mf_df_new.index), True, False
1567        )
1568
1569        # get mass feature ids of features that are not cluster parents
1570        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1571        if drop_children is True:
1572            # Drop mass features that are not cluster parents from self
1573            self.mass_features = {
1574                k: v
1575                for k, v in self.mass_features.items()
1576                if k not in cluster_daughters
1577            }
1578        else:
1579            return cluster_daughters

def find_closest(A, target): View Source

14def find_closest(A, target):
15    """Find the index of closest value in A to each value in target.
16
17    Parameters
18    ----------
19    A : :obj:`~numpy.array`
20        The array to search (blueprint). A must be sorted.
21    target : :obj:`~numpy.array`
22        The array of values to search for. target must be sorted.
23
24    Returns
25    -------
26    :obj:`~numpy.array`
27        The indices of the closest values in A to each value in target.
28    """
29    idx = A.searchsorted(target)
30    idx = np.clip(idx, 1, len(A) - 1)
31    left = A[idx - 1]
32    right = A[idx]
33    idx -= target - left < right - target
34    return idx

Find the index of closest value in A to each value in target.

Parameters

A (~numpy.array): The array to search (blueprint). A must be sorted.
target (~numpy.array): The array of values to search for. target must be sorted.

Returns

~numpy.array: The indices of the closest values in A to each value in target.