corems.mass_spectra.calc.lc_calc

   1import numpy as np
   2import pandas as pd
   3from ripser import ripser
   4from scipy import sparse
   5from scipy.spatial import KDTree
   6
   7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
   8from corems.mass_spectra.calc import SignalProcessing as sp
   9from corems.mass_spectra.factory.chromat_data import EIC_Data
  10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  11
  12
  13def find_closest(A, target):
  14    """Find the index of closest value in A to each value in target.
  15
  16    Parameters
  17    ----------
  18    A : :obj:`~numpy.array`
  19        The array to search (blueprint). A must be sorted.
  20    target : :obj:`~numpy.array`
  21        The array of values to search for. target must be sorted.
  22
  23    Returns
  24    -------
  25    :obj:`~numpy.array`
  26        The indices of the closest values in A to each value in target.
  27    """
  28    idx = A.searchsorted(target)
  29    idx = np.clip(idx, 1, len(A) - 1)
  30    left = A[idx - 1]
  31    right = A[idx]
  32    idx -= target - left < right - target
  33    return idx
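
# Illustrative sketch (not part of the library source): how find_closest pairs each
# target with its nearest value in a sorted blueprint array. The arrays below are
# made-up values chosen only to show the expected behavior.
# >>> A = np.array([100.0, 200.0, 300.0])
# >>> target = np.array([101.0, 290.0])
# >>> find_closest(A, target)
# array([0, 2])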
  34
  35
  36class LCCalculations:
  37    """Methods for performing LC calculations on mass spectra data.
  38
  39    Notes
  40    -----
  41    This class is intended to be used as a mixin for the LCMSBase class.
  42
  43    Methods
  44    -------
  45    * get_max_eic(eic_data).
  46        Returns the maximum EIC value from the given EIC data. A static method.
  47    * smooth_tic(tic).
  48        Smooths the TIC data using the specified smoothing method and settings.
  49    * eic_centroid_detector(rt, eic, max_eic).
  50        Performs EIC centroid detection on the given EIC data.
  51    * find_nearest_scan(rt).
  52        Finds the nearest scan to the given retention time.
  53    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
  54        Returns an averaged mass spectrum object.
   55    * find_mass_features(ms_level=1, grid=True).
   56        Finds regions of interest for a given MS level (default is MS1).
   57    * integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1).
   58        Integrates mass features of interest and extracts EICs.
   59    * find_c13_mass_features().
   60        Evaluates mass features and marks likely C13 isotopes.
   61    * deconvolute_ms1_mass_features().
   62        Deconvolutes each mass feature's MS1 mass spectrum.
  63    """
  64
  65    @staticmethod
  66    def get_max_eic(eic_data: dict):
  67        """Returns the maximum EIC value from the given EIC data.
  68
  69        Notes
  70        -----
  71        This is a static method.
  72
  73        Parameters
  74        ----------
  75        eic_data : dict
  76            A dictionary containing EIC data.
  77
  78        Returns
  79        -------
  80        float
  81            The maximum EIC value.
  82        """
  83        max_eic = 0
   84        for eic in eic_data.values():
   85            ind_max_eic = max(eic.get("EIC"))
  86            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  87
  88        return max_eic
  89
  90    def smooth_tic(self, tic):
  91        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  92
  93        Parameters
  94        ----------
  95        tic : numpy.ndarray
  96            The TIC (or EIC) data to be smoothed.
  97
  98        Returns
  99        -------
 100        numpy.ndarray
 101            The smoothed TIC data.
 102        """
 103        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 104
 105        pol_order = self.parameters.lc_ms.savgol_pol_order
 106
 107        window_len = self.parameters.lc_ms.smooth_window
 108
 109        window = self.parameters.lc_ms.smooth_method
 110
 111        return sp.smooth_signal(
 112            tic, window_len, window, pol_order, implemented_smooth_method
 113        )
 114
 115    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 116        """Performs EIC centroid detection on the given EIC data.
 117
 118        Parameters
 119        ----------
 120        rt : numpy.ndarray
 121            The retention time data.
 122        eic : numpy.ndarray
 123            The EIC data.
 124        max_eic : float
 125            The maximum EIC value.
 126        apex_indexes : list, optional
 127            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 128
 129        Returns
 130        -------
  131        generator
  132            The (left, apex, right) indexes of each detected peak.
 133        """
 134
 135        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 136
 137        max_height = self.parameters.lc_ms.peak_height_max_percent
 138
 139        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 140
 141        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 142
 143        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 144
 145        include_indexes = sp.peak_picking_first_derivative(
 146            domain=rt,
 147            signal=eic,
 148            max_height=max_height,
 149            max_prominence=max_prominence,
 150            max_signal=max_eic,
 151            min_peak_datapoints=min_peak_datapoints,
 152            peak_derivative_threshold=peak_derivative_threshold,
 153            signal_threshold=signal_threshold,
 154            correct_baseline=False,
 155            plot_res=False,
 156            apex_indexes=apex_indexes,
 157        )
 158
 159        return include_indexes
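
    # Illustrative sketch (not part of the library source): consuming the limits
    # returned by eic_centroid_detector. Here "lcms" is a hypothetical, already
    # processed LCMSBase object and "eic" a hypothetical EIC_Data instance taken
    # from lcms.eics; each yielded triple holds (left, apex, right) indexes.
    # >>> eic = next(iter(lcms.eics.values()))
    # >>> limits = lcms.eic_centroid_detector(
    # ...     eic.time, eic.eic_smoothed, max(eic.eic_smoothed)
    # ... )
    # >>> for left, apex, right in limits:
    # ...     print(eic.time[left], eic.time[apex], eic.time[right])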
 160
 161    def find_nearest_scan(self, rt):
 162        """Finds the nearest scan to the given retention time.
 163
 164        Parameters
 165        ----------
 166        rt : float
 167            The retention time (in minutes) to find the nearest scan for.
 168
 169        Returns
 170        -------
 171        int
 172            The scan number of the nearest scan.
 173        """
 174        array_rt = np.array(self.retention_time)
 175
 176        scan_index = (np.abs(array_rt - rt)).argmin()
 177
 178        real_scan = self.scans_number[scan_index]
 179
 180        return real_scan
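
    # Illustrative sketch (not part of the library source): "lcms" is a hypothetical
    # LCMSBase object with populated retention_time and scans_number attributes.
    # >>> lcms.find_nearest_scan(5.2)  # retention time in minutes
    # 1042  # nearest scan number (made-up output)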
 181
 182    def add_peak_metrics(self):
 183        """Add peak metrics to the mass features.
 184
 185        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
 186        """
 187        # Check that at least some mass features have eic data
 188        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 189            raise ValueError(
 190                "No mass features have EIC data. Run integrate_mass_features first."
 191            )
 192
 193        for mass_feature in self.mass_features.values():
 194            # Check if the mass feature has been integrated
 195            if mass_feature._eic_data is not None:
 196                # Calculate peak metrics
 197                mass_feature.calc_half_height_width()
 198                mass_feature.calc_tailing_factor()
 199                mass_feature.calc_dispersity_index()
 200
 201    def get_average_mass_spectrum(
 202        self,
 203        scan_list,
 204        apex_scan,
 205        spectrum_mode="profile",
 206        ms_level=1,
 207        auto_process=True,
 208        use_parser=False,
 209        perform_checks=True,
 210        polarity=None,
 211        ms_params=None,
 212    ):
 213        """Returns an averaged mass spectrum object
 214
 215        Parameters
 216        ----------
 217        scan_list : list
 218            List of scan numbers to average.
 219        apex_scan : int
 220            Number of the apex scan
 221        spectrum_mode : str, optional
  222            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
 223        ms_level : int, optional
 224            The MS level to use. Defaults to 1.
 225        auto_process : bool, optional
 226            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 227        use_parser : bool, optional
 228            If True, the mass spectra will be obtained from the parser. Defaults to False.
 229        perform_checks : bool, optional
  230            If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are present in the ms_unprocessed dictionary, which must also be indexed on scan.
 231        polarity : int, optional
 232            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
 233        ms_params : MSParameters, optional
 234            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 235
 236        Returns
 237        -------
 238        MassSpectrumProfile
 239            The averaged mass spectrum object.
 240
 241        Raises
 242        ------
 243        ValueError
 244            If the spectrum mode is not "profile".
 245            If the MS level is not found in the unprocessed mass spectra dictionary.
 246            If not all scan numbers are found in the unprocessed mass spectra dictionary.
 247        """
 248        if perform_checks:
 249            if spectrum_mode != "profile":
 250                raise ValueError("Averaging only supported for profile mode")
 251
 252        if polarity is None:
 253            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 254            if self.polarity == "negative":
 255                polarity = -1
 256            elif self.polarity == "positive":
 257                polarity = 1
 258            else:
 259                raise ValueError(
  260                    "Polarity not set for dataset, must be set to either 'positive' or 'negative'"
 261                )
 262
 263        # if not using_parser, check that scan numbers are in _ms_unprocessed
 264        if not use_parser:
 265            if perform_checks:
 266                # Set index to scan for faster lookup
 267                ms_df = (
 268                    self._ms_unprocessed[ms_level]
 269                    .copy()
 270                    .set_index("scan", drop=False)
 271                    .sort_index()
 272                )
  273                # Check that all scan numbers are in the ms_df before subsetting
  274                if not all(np.isin(scan_list, ms_df.index)):
  275                    raise ValueError(
  276                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
  277                    )
  278                my_ms_df = ms_df.loc[scan_list]
 279            else:
 280                my_ms_df = (
 281                    pd.DataFrame({"scan": scan_list})
 282                    .set_index("scan")
 283                    .join(self._ms_unprocessed[ms_level], how="left")
 284                )
 285
 286        if use_parser:
 287            ms_list = [
 288                self.spectra_parser.get_mass_spectrum_from_scan(
 289                    x, spectrum_mode=spectrum_mode, auto_process=False
 290                )
 291                for x in scan_list
 292            ]
 293            ms_mz = [x._mz_exp for x in ms_list]
 294            ms_int = [x._abundance for x in ms_list]
 295            my_ms_df = []
 296            for i in np.arange(len(ms_mz)):
 297                my_ms_df.append(
 298                    pd.DataFrame(
 299                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 300                    )
 301                )
 302            my_ms_df = pd.concat(my_ms_df)
 303
 304        if not self.check_if_grid(my_ms_df):
 305            my_ms_df = self.grid_data(my_ms_df)
 306
 307        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 308
 309        ms = ms_from_array_profile(
 310            my_ms_ave.mz,
 311            my_ms_ave.intensity,
 312            self.file_location,
 313            polarity=polarity,
 314            auto_process=False,
 315        )
 316
 317        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 318        if ms is not None:
 319            if ms_params is not None:
 320                ms.parameters = ms_params
 321            ms.scan_number = apex_scan
 322            if auto_process:
 323                ms.process_mass_spec()
 324        return ms
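
    # Illustrative sketch (not part of the library source): averaging the profile MS1
    # scans around an apex scan on a hypothetical LCMSBase object "lcms" whose
    # _ms_unprocessed data are already populated.
    # >>> ms_avg = lcms.get_average_mass_spectrum(
    # ...     scan_list=[100, 101, 102], apex_scan=101, ms_level=1, polarity=-1
    # ... )
    # >>> ms_avg.scan_number
    # 101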
 325
 326    def find_mass_features(self, ms_level=1, grid=True):
 327        """Find mass features within an LCMSBase object
 328
 329        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
 330
 331        Parameters
 332        ----------
 333        ms_level : int, optional
  334            The MS level to use for peak picking. Default is 1.
 335        grid : bool, optional
  336            If True, regrid the data before running the persistent homology calculations (after checking whether the data are already gridded).
  337            Used only for persistent homology peak picking on profile data. Default is True.
 338
 339        Raises
 340        ------
 341        ValueError
 342            If no MS level data is found on the object.
 343            If persistent homology peak picking is attempted on non-profile mode data.
 344            If data is not gridded and grid is False.
 345            If peak picking method is not implemented.
 346
 347        Returns
 348        -------
 349        None, but assigns the mass_features and eics attributes to the object.
 350
 351        """
 352        pp_method = self.parameters.lc_ms.peak_picking_method
 353
 354        if pp_method == "persistent homology":
 355            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 356            if all(msx_scan_df["ms_format"] == "profile"):
 357                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 358                self.cluster_mass_features(drop_children=True, sort_by="persistence")
 359            else:
 360                raise ValueError(
 361                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 362                        ms_level
 363                    )
 364                )
 365        elif pp_method == "centroided_persistent_homology":
 366            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 367            if all(msx_scan_df["ms_format"] == "centroid"):
 368                self.find_mass_features_ph_centroid(ms_level=ms_level)
 369            else:
 370                raise ValueError(
 371                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
 372                        ms_level
 373                    )
 374                )
 375        else:
 376            raise ValueError("Peak picking method not implemented")
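
    # Illustrative sketch (not part of the library source): a typical ordering of the
    # mass feature workflow on a hypothetical LCMSBase object "lcms" holding profile
    # MS1 data (add_associated_ms1 arguments omitted; see its own documentation).
    # >>> lcms.find_mass_features(ms_level=1)
    # >>> lcms.add_associated_ms1(...)
    # >>> lcms.integrate_mass_features(ms_level=1)
    # >>> lcms.add_peak_metrics()
    # >>> lcms.find_c13_mass_features()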
 377
 378    def integrate_mass_features(
 379        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 380    ):
 381        """Integrate mass features and extract EICs.
 382
 383        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 384
 385        Parameters
 386        ----------
 387        drop_if_fail : bool, optional
 388            Whether to drop mass features if the EIC limit calculations fail.
 389            Default is True.
 390        drop_duplicates : bool, optional
  391            Whether to drop mass features that appear to be duplicates
 392            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 393            Default is True.
 394        ms_level : int, optional
 395            The MS level to use. Default is 1.
 396
 397        Raises
 398        ------
 399        ValueError
 400            If no mass features are found.
 401            If no MS level data is found for the given MS level (either in data or in the scan data)
 402
 403        Returns
 404        -------
 405        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 406
 407        Notes
 408        -----
 409        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
 410        """
 411        # Check if there is data
 412        if ms_level in self._ms_unprocessed.keys():
 413            raw_data = self._ms_unprocessed[ms_level].copy()
 414        else:
 415            raise ValueError("No MS level " + str(ms_level) + " data found")
 416        if self.mass_features is not None:
 417            mf_df = self.mass_features_to_df().copy()
 418        else:
 419            raise ValueError(
 420                "No mass features found, did you run find_mass_features() first?"
 421            )
 422        # Check if mass_spectrum exists on each mass feature
 423        if not all(
 424            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
 425        ):
 426            raise ValueError(
 427                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
 428            )
 429
 430        # Subset scan data to only include correct ms_level
 431        scan_df_sub = self.scan_df[
 432            self.scan_df["ms_level"] == int(ms_level)
 433        ].reset_index(drop=True)
 434        if scan_df_sub.empty:
  435            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
 436        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 437
 438        mzs_to_extract = np.unique(mf_df["mz"].values)
 439        mzs_to_extract.sort()
 440
 441        # Get EICs for each unique mz in mass features list
 442        for mz in mzs_to_extract:
 443            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 444            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 445            raw_data_sub = raw_data[
 446                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
 447            ].reset_index(drop=True)
 448            raw_data_sub = (
 449                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 450            )
 451            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 452            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 453            myEIC = EIC_Data(
 454                scans=raw_data_sub["scan"].values,
 455                time=raw_data_sub["scan_time"].values,
 456                eic=raw_data_sub["intensity"].values,
 457            )
 458            # Smooth EIC
 459            smoothed_eic = self.smooth_tic(myEIC.eic)
 460            smoothed_eic[smoothed_eic < 0] = 0
 461            myEIC.eic_smoothed = smoothed_eic
 462            self.eics[mz] = myEIC
 463
 464        # Get limits of mass features using EIC centroid detector and integrate
 465        mf_df["area"] = np.nan
 466        for idx, mass_feature in mf_df.iterrows():
 467            mz = mass_feature.mz
 468            apex_scan = mass_feature.apex_scan
 469
 470            # Pull EIC data and find apex scan index
 471            myEIC = self.eics[mz]
 472            self.mass_features[idx]._eic_data = myEIC
 473            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 474
 475            # Find left and right limits of peak using EIC centroid detector, add to EICData
 476            centroid_eics = self.eic_centroid_detector(
 477                myEIC.time,
 478                myEIC.eic_smoothed,
 479                mass_feature.intensity * 1.1,
 480                apex_indexes=[int(apex_index)],
 481            )
 482            l_a_r_scan_idx = [i for i in centroid_eics]
 483            if len(l_a_r_scan_idx) > 0:
 484                # Add start and final scan to mass_features and EICData
 485                left_scan, right_scan = (
 486                    myEIC.scans[l_a_r_scan_idx[0][0]],
 487                    myEIC.scans[l_a_r_scan_idx[0][2]],
 488                )
 489                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 490                myEIC.apexes = myEIC.apexes + mf_scan_apex
 491                self.mass_features[idx].start_scan = left_scan
 492                self.mass_features[idx].final_scan = right_scan
 493
 494                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 495                area = np.trapz(
 496                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 497                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 498                )
 499                mf_df.at[idx, "area"] = area
 500                myEIC.areas = myEIC.areas + [area]
 501                self.eics[mz] = myEIC
 502                self.mass_features[idx]._area = area
 503            else:
 504                if drop_if_fail is True:
 505                    self.mass_features.pop(idx)
 506
 507        if drop_duplicates:
 508            # Prepare mass feature dataframe
 509            mf_df = self.mass_features_to_df().copy()
 510
  511            # For each mass feature, find all mass features within the clustering tolerance (ppm) and drop those whose start and end scans fall within another mass feature,
  512            # keeping the first mass feature
 513            for idx, mass_feature in mf_df.iterrows():
 514                mz = mass_feature.mz
 515                apex_scan = mass_feature.apex_scan
 516
 517                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 518                mf_df_sub = mf_df[
 519                    mf_df["mz_diff_ppm"]
 520                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 521                    * 10**6
 522                ].copy()
 523
 524                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 525                for idx2, mass_feature2 in mf_df_sub.iterrows():
 526                    if idx2 != idx:
 527                        if (
 528                            mass_feature2.start_scan >= mass_feature.start_scan
 529                            and mass_feature2.final_scan <= mass_feature.final_scan
 530                        ):
 531                            if idx2 in self.mass_features.keys():
 532                                self.mass_features.pop(idx2)
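
    # Illustrative sketch (not part of the library source): the ppm window used when
    # extracting each EIC above. For a made-up mz of 500.0 and a 5 ppm tolerance, the
    # extraction window is 500.0 +/- 500.0 * 5 / 1e6.
    # >>> mz, tol_ppm = 500.0, 5
    # >>> (mz - tol_ppm * mz / 1e6, mz + tol_ppm * mz / 1e6)
    # (499.9975, 500.0025)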
 533
 534    def find_c13_mass_features(self):
  535        """Mark likely C13 isotopes and connect them to their monoisotopic mass features.
 536
 537        Returns
 538        -------
  539        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 540
 541        Raises
 542        ------
 543        ValueError
 544            If no mass features are found.
 545        """
 546        verbose = self.parameters.lc_ms.verbose_processing
 547        if verbose:
 548            print("evaluating mass features for C13 isotopes")
 549        if self.mass_features is None:
 550            raise ValueError("No mass features found, run find_mass_features() first")
 551
  552        # Data prep for sparse distance matrix
 553        dims = ["mz", "scan_time"]
 554        mf_df = self.mass_features_to_df().copy()
 555        # Drop mass features that have no area (these are likely to be noise)
 556        mf_df = mf_df[mf_df["area"].notnull()]
 557        mf_df["mf_id"] = mf_df.index.values
 558        dims = ["mz", "scan_time"]
 559
  560        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 561        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 562
 563        mz_diff = 1.003355  # C13-C12 mass difference
 564        tol = [
 565            mf_df["mz"].median()
 566            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 567            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
  568        ]  # mz tolerance (absolute, derived from the relative tolerance at the median mz); scan_time tolerance in minutes
 569
 570        # Compute inter-feature distances
 571        distances = None
 572        for i in range(len(dims)):
 573            # Construct k-d tree
 574            values = mf_df[dims[i]].values
 575            tree = KDTree(values.reshape(-1, 1))
 576
 577            max_tol = tol[i]
 578            if dims[i] == "mz":
 579                # Maximum absolute tolerance
 580                max_tol = mz_diff + tol[i]
 581
 582            # Compute sparse distance matrix
 583            # the larger the max_tol, the slower this operation is
 584            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 585
 586            # Only consider forward case, exclude diagonal
 587            sdm = sparse.triu(sdm, k=1)
 588
 589            if dims[i] == "mz":
 590                min_tol = mz_diff - tol[i]
 591                # Get only the ones that are above the min tol
 592                idx = sdm.data > min_tol
 593
 594                # Reconstruct sparse distance matrix
 595                sdm = sparse.coo_matrix(
 596                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 597                    shape=(len(values), len(values)),
 598                )
 599
 600            # Cast as binary matrix
 601            sdm.data = np.ones_like(sdm.data)
 602
 603            # Stack distances
 604            if distances is None:
 605                distances = sdm
 606            else:
 607                distances = distances.multiply(sdm)
 608
 609        # Extract indices of within-tolerance points
 610        distances = distances.tocoo()
 611        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 612
 613        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 614        pairs_mf = pairs.copy()
 615        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 616        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 617
  618        # Connect monoisotopic masses with isotopologues within mass_features
 619        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 620        for mono in monos:
 621            self.mass_features[mono].monoisotopic_mf_id = mono
 622        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 623        while not pairs_iso_df.empty:
 624            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 625            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 626            for iso in m1_isos:
 627                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 628                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 629                if len(parent) > 1:
 630                    # Choose the parent that is closest in time to the isotopologue
 631                    parent_time = [self.mass_features[p].retention_time for p in parent]
 632                    time_diff = [
 633                        np.abs(self.mass_features[iso].retention_time - x)
 634                        for x in parent_time
 635                    ]
 636                    parent = parent[np.argmin(time_diff)]
 637                else:
 638                    parent = parent[0]
 639                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 640                    parent
 641                ].monoisotopic_mf_id
 642                if self.mass_features[iso].monoisotopic_mf_id is not None:
 643                    mass_diff = (
 644                        self.mass_features[iso].mz
 645                        - self.mass_features[
 646                            self.mass_features[iso].monoisotopic_mf_id
 647                        ].mz
 648                    )
 649                    self.mass_features[iso].isotopologue_type = "13C" + str(
 650                        int(round(mass_diff, 0))
 651                    )
 652
 653            # Drop the mono and iso from the pairs_iso_df
 654            pairs_iso_df = pairs_iso_df.drop(
 655                index=monos, errors="ignore"
 656            )  # Drop pairs where the parent is a child that is a child of a root
 657            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 658            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 659
 660            if not pairs_iso_df.empty:
 661                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 662                monos = np.setdiff1d(
 663                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 664                )
 665        if verbose:
 666            # Report fraction of compounds annotated with isotopes
 667            mf_df["c13_flag"] = np.where(
 668                np.logical_or(
 669                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 670                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 671                ),
 672                1,
 673                0,
 674            )
 675            print(
 676                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 677                + " of mass features have or are C13 isotopes"
 678            )
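
    # Illustrative sketch (not part of the library source): the mz window used above to
    # pair a monoisotopic feature with its 13C isotopologue. With a made-up relative
    # tolerance of 5e-6 and a median mz of 400, candidate pairs must differ by
    # 1.003355 +/- 400 * 5e-6 in mz (and fall within the scan_time tolerance).
    # >>> mz_diff, tol_abs = 1.003355, 400 * 5e-6
    # >>> (round(mz_diff - tol_abs, 6), round(mz_diff + tol_abs, 6))
    # (1.001355, 1.005355)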
 679
 680    def deconvolute_ms1_mass_features(self):
 681        """Deconvolute MS1 mass features
 682
  683        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.
 684
 685        Parameters
 686        ----------
 687        None
 688
 689        Returns
 690        -------
 691        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 692        and associated_mass_features_deconvoluted attributes to the mass features in the
 693        mass_features attribute of the LCMSBase object.
 694
 695        Raises
 696        ------
 697        ValueError
 698            If no mass features are found, must run find_mass_features() first.
 699            If no EICs are found, did you run integrate_mass_features() first?
 700
 701        """
 702        # Checks for set mass_features and eics
 703        if self.mass_features is None:
 704            raise ValueError(
 705                "No mass features found, did you run find_mass_features() first?"
 706            )
 707
 708        if self.eics == {}:
 709            raise ValueError(
 710                "No EICs found, did you run integrate_mass_features() first?"
 711            )
 712
 713        if 1 not in self._ms_unprocessed.keys():
 714            raise ValueError("No unprocessed MS1 spectra found.")
 715
 716        # Prep ms1 data
 717        ms1_data = self._ms_unprocessed[1].copy()
 718        ms1_data = ms1_data.set_index("scan")
 719
 720        # Prep mass feature summary
 721        mass_feature_df = self.mass_features_to_df()
 722
 723        # Loop through each mass feature
 724        for mf_id, mass_feature in self.mass_features.items():
  725            # Check that the mass_feature.mz attribute matches the mz of its associated MS1 peak; skip the mass feature if it does not
 726            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 727                continue
 728
 729            # Get the left and right limits of the EIC of the mass feature
 730            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 731
 732            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
 733            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 734            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 735
 736            # Get the centroided masses of the mass feature
 737            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 738
 739            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 740            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 741                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 742            ]
 743
  744            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster mz tolerance
 745            ms1_data_sub["mz_diff_rel"] = (
 746                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 747                / ms1_data_sub["mz"]
 748            )
 749            ms1_data_sub = ms1_data_sub[
 750                ms1_data_sub["mz_diff_rel"]
 751                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 752            ].reset_index(drop=True)
 753
 754            # Group by mass_feature_mz and scan and sum intensity
 755            ms1_data_sub_group = (
 756                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 757                .sum()
 758                .reset_index()
 759            )
 760
 761            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 762            corr = (
 763                ms1_data_sub_group.pivot(
 764                    index="scan", columns="mass_feature_mz", values="intensity"
 765                )
 766                .fillna(0)
 767                .corr()
 768            )
 769
  770            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above ms1_deconvolution_corr_min
 771            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 772            corr_subset = corr.loc[mass_feature.mz,]
 773            corr_subset = corr_subset[corr_subset > decon_corr_min]
 774
 775            # Get the masses from the mass spectrum that are the result of the deconvolution
 776            mzs_decon = corr_subset.index.values
 777
 778            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 779            mzs_decon_idx = [
 780                id
 781                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 782                if mz in mzs_decon
 783            ]
 784            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 785
 786            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 787            if (
 788                mass_feature.ms1_peak.abundance
 789                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 790            ):
 791                mass_feature.mass_spectrum_deconvoluted_parent = True
 792            else:
 793                mass_feature.mass_spectrum_deconvoluted_parent = False
 794
 795            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 796            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 797            mass_feature_df_sub = mass_feature_df[
 798                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 799                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 800            ].copy()
 801            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 802            mass_feature_df_sub["mz_diff_ppm"] = [
 803                np.abs(mzs_decon - mz).min() / mz * 10**6
 804                for mz in mass_feature_df_sub["mz"]
 805            ]
  806            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
 807            mfs_associated_decon = mass_feature_df_sub[
 808                mass_feature_df_sub["mz_diff_ppm"]
 809                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 810            ].index.values
 811
 812            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
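
    # Illustrative sketch (not part of the library source): the correlation step at the
    # heart of the deconvolution, on a tiny made-up table of summed intensities. Masses
    # whose intensity profiles across scans correlate strongly with the mass feature's
    # own mz are grouped into its deconvoluted spectrum.
    # >>> demo = pd.DataFrame({
    # ...     "scan": [1, 1, 2, 2, 3, 3],
    # ...     "mass_feature_mz": [200.1, 350.2, 200.1, 350.2, 200.1, 350.2],
    # ...     "intensity": [10, 5, 100, 52, 30, 14],
    # ... })
    # >>> demo.pivot(index="scan", columns="mass_feature_mz", values="intensity").corr()
    # (a 2x2 correlation matrix; values near 1 indicate co-varying masses)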
 813
 814
 815class PHCalculations:
 816    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 817
 818    Notes
 819    -----
 820    This class is intended to be used as a mixin for the LCMSBase class.
 821
 822    Methods
 823    -------
 824    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 825        Sparse implementation of a mean filter.
 826    * embed_unique_indices(a).
 827        Creates an array of indices, sorted by unique element.
 828    * sparse_upper_star(idx, V).
 829        Sparse implementation of an upper star filtration.
 830    * check_if_grid(data).
 831        Check if the data is gridded in mz space.
 832    * grid_data(data).
 833        Grid the data in the mz dimension.
 834    * find_mass_features_ph(ms_level=1, grid=True).
 835        Find mass features within an LCMSBase object using persistent homology.
 836    * cluster_mass_features(drop_children=True).
 837        Cluster regions of interest.
 838    """
 839
 840    @staticmethod
 841    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 842        """Sparse implementation of a mean filter.
 843
 844        Parameters
 845        ----------
 846        idx : :obj:`~numpy.array`
 847            Edge indices for each dimension (MxN).
 848        V : :obj:`~numpy.array`
 849            Array of intensity data (Mx1).
 850        radius : float or list
 851            Radius of the sparse filter in each dimension. Values less than
  852            one indicate no connectivity in that dimension.
 853
 854        Returns
 855        -------
 856        :obj:`~numpy.array`
 857            Filtered intensities (Mx1).
 858
 859        Notes
 860        -----
 861        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 862        This is a static method.
 863        """
 864
 865        # Copy indices
 866        idx = idx.copy().astype(V.dtype)
 867
 868        # Scale
 869        for i, r in enumerate(radius):
 870            # Increase inter-index distance
 871            if r < 1:
 872                idx[:, i] *= 2
 873
 874            # Do nothing
 875            elif r == 1:
 876                pass
 877
 878            # Decrease inter-index distance
 879            else:
 880                idx[:, i] /= r
 881
 882        # Connectivity matrix
 883        cmat = KDTree(idx)
 884        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 885        cmat.setdiag(1)
 886
 887        # Pair indices
 888        I, J = cmat.nonzero()
 889
 890        # Delete cmat
 891        cmat_shape = cmat.shape
 892        del cmat
 893
 894        # Sum over columns
 895        V_sum = sparse.bsr_matrix(
 896            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 897        ).diagonal(0)
 898
 899        # Count over columns
 900        V_count = sparse.bsr_matrix(
 901            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 902        ).diagonal(0)
 903
 904        return V_sum / V_count
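
    # Illustrative sketch (not part of the library source): mean-filtering three points
    # on a one-dimensional index. With radius [1], every point within one index step
    # contributes, so each value is averaged with its immediate neighbors.
    # >>> idx = np.array([[0.0], [1.0], [2.0]])
    # >>> V = np.array([0.0, 9.0, 0.0])
    # >>> PHCalculations.sparse_mean_filter(idx, V, radius=[1])
    # array([4.5, 3. , 4.5])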
 905
 906    @staticmethod
 907    def embed_unique_indices(a):
 908        """Creates an array of indices, sorted by unique element.
 909
 910        Parameters
 911        ----------
 912        a : :obj:`~numpy.array`
 913            Array of unique elements (Mx1).
 914
 915        Returns
 916        -------
 917        :obj:`~numpy.array`
 918            Array of indices (Mx1).
 919
 920        Notes
 921        -----
 922        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 923        This is a static method.
 924        """
 925
 926        def count_tens(n):
 927            # Count tens
 928            ntens = (n - 1) // 10
 929
 930            while True:
 931                ntens_test = (ntens + n - 1) // 10
 932
 933                if ntens_test == ntens:
 934                    return ntens
 935                else:
 936                    ntens = ntens_test
 937
 938        def arange_exclude_10s(n):
 939            # How many 10s will there be?
 940            ntens = count_tens(n)
 941
 942            # Base array
 943            arr = np.arange(0, n + ntens)
 944
 945            # Exclude 10s
 946            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 947
 948            return arr
 949
 950        # Creates an array of indices, sorted by unique element
 951        idx_sort = np.argsort(a)
 952        idx_unsort = np.argsort(idx_sort)
 953
 954        # Sorts records array so all unique elements are together
 955        sorted_a = a[idx_sort]
 956
 957        # Returns the unique values, the index of the first occurrence,
 958        # and the count for each element
 959        vals, idx_start, count = np.unique(
 960            sorted_a, return_index=True, return_counts=True
 961        )
 962
 963        # Splits the indices into separate arrays
 964        splits = np.split(idx_sort, idx_start[1:])
 965
 966        # Creates unique indices for each split
 967        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 968
 969        # Reorders according to input array
 970        idx_unq = idx_unq[idx_unsort]
 971
 972        # Magnitude of each index
 973        exp = np.log10(
 974            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 975        )
 976        idx_unq_mag = np.power(10, np.floor(exp) + 1)
 977
 978        # Result
 979        return a + idx_unq / idx_unq_mag
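
    # Illustrative sketch (not part of the library source): duplicate values receive
    # small fractional offsets so every element becomes unique while the integer part
    # is preserved. The input below is made up.
    # >>> PHCalculations.embed_unique_indices(np.array([5, 5, 5, 7]))
    # array([5. , 5.1, 5.2, 7. ])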
 980    
 981    @staticmethod
 982    def roll_up_dataframe(
 983        df : pd.DataFrame,
 984        sort_by : str,
 985        tol : list,
 986        relative : list,
 987        dims : list
 988    ):
 989        """Subset data by rolling up into apex in appropriate dimensions.
 990        
 991        Parameters
 992        ----------
  993        df : pd.DataFrame
 994            The input data containing "dims" columns and the "sort_by" column.
 995        sort_by : str
 996            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 
 997            (i.e., the mass feature with the highest value in the sort_by column).
 998        dims : list
 999            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1000        tol : list
1001            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1002            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1003            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1004            If relative is False, the tolerance will be used as is.
1005        relative : list
1006            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1007
1008        Returns
1009        -------
1010        pd.DataFrame
1011            A DataFrame with only the rolled up mass features, with the original index and columns.
1012
1013            
1014        Raises
1015        ------
1016        ValueError
1017            If the input data is not a pandas DataFrame.
1018            If the input data does not have columns for each of the dimensions in "dims".
1019            If the length of "dims", "tol", and "relative" do not match.
1020        """
 1021        # Check that data is a DataFrame and has columns for each of the dims
 1022        if not isinstance(df, pd.DataFrame):
 1023            raise ValueError("Data must be a pandas DataFrame")
 1024        for dim in dims:
 1025            if dim not in df.columns:
 1026                raise ValueError(f"Data must have a column for {dim}")
 1027        if len(dims) != len(tol) or len(dims) != len(relative):
 1028            raise ValueError(
 1029                "Dimensions, tolerances, and relative flags must be the same length"
 1030            )
 1031
 1032        og_columns = df.columns.copy()
 1033
 1034        # Unindex the data, but keep the original index
 1035        if df.index.name is not None:
 1036            og_index = df.index.name
 1037        else:
 1038            og_index = "index"
 1039        df = df.reset_index(drop=False)
 1040
 1041        # Sort data by sort_by column, and reindex
 1042        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1043        
1044        # Compute inter-feature distances
1045        distances = None
1046        for i in range(len(dims)):
1047            # Construct k-d tree
1048            values = df[dims[i]].values
1049            tree = KDTree(values.reshape(-1, 1))
1050
1051            max_tol = tol[i]
1052            if relative[i] is True:
1053                # Maximum absolute tolerance
1054                max_tol = tol[i] * values.max()
1055
1056            # Compute sparse distance matrix
1057            # the larger the max_tol, the slower this operation is
1058            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1059
1060            # Only consider forward case, exclude diagonal
1061            sdm = sparse.triu(sdm, k=1)
1062
1063            # Filter relative distances
1064            if relative[i] is True:
1065                # Compute relative distances
1066                rel_dists = sdm.data / values[sdm.row]  # or col?
1067
1068                # Indices of relative distances less than tolerance
1069                idx = rel_dists <= tol[i]
1070
1071                # Reconstruct sparse distance matrix
1072                sdm = sparse.coo_matrix(
1073                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1074                    shape=(len(values), len(values)),
1075                )
1076
1077            # Cast as binary matrix
1078            sdm.data = np.ones_like(sdm.data)
1079
1080            # Stack distances
1081            if distances is None:
1082                distances = sdm
1083            else:
1084                distances = distances.multiply(sdm)
1085        
1086        # Extract indices of within-tolerance points
1087        distances = distances.tocoo()
1088        pairs = np.stack((distances.row, distances.col), axis=1)
1089        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1090        pairs_df = pairs_df.set_index("parent")
1091
1092        to_drop = []
1093        while not pairs_df.empty:
1094            # Find root_parents and their children
1095            root_parents = np.setdiff1d(
1096                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1097            )
1098            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1099            to_drop = np.append(to_drop, children_of_roots)
1100
1101            # Remove root_children as possible parents from pairs_df for next iteration
1102            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1103            pairs_df = pairs_df.reset_index().set_index("child")
1104            # Remove root_children as possible children from pairs_df for next iteration
1105            pairs_df = pairs_df.drop(index=children_of_roots)
1106
1107            # Prepare for next iteration
1108            pairs_df = pairs_df.reset_index().set_index("parent")
1109
1110        # Drop mass features that are not cluster parents
1111        df_sub = df.drop(index=np.array(to_drop))
1112
1113        # Set index back to og_index and only keep the columns that are in the original dataframe
1114        df_sub = df_sub.set_index(og_index)
1115
1116        # sort the dataframe by the original index
1117        df_sub = df_sub.sort_index()
1118        df_sub = df_sub[og_columns]
1119
1120        return df_sub
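
    # Illustrative sketch (not part of the library source): rolling up a tiny made-up
    # feature table so that, of two features within 0.01 mz and 0.1 scan_time of each
    # other, only the more intense one is kept.
    # >>> demo = pd.DataFrame({
    # ...     "mz": [200.000, 200.005, 350.000],
    # ...     "scan_time": [5.00, 5.02, 9.00],
    # ...     "intensity": [1000.0, 400.0, 800.0],
    # ... })
    # >>> PHCalculations.roll_up_dataframe(
    # ...     demo, sort_by="intensity", dims=["mz", "scan_time"],
    # ...     tol=[0.01, 0.1], relative=[False, False],
    # ... )
    # (rows 0 and 2 remain; row 1 is rolled up into its more intense neighbor)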
1121    
1122    def sparse_upper_star(self, idx, V):
1123        """Sparse implementation of an upper star filtration.
1124
1125        Parameters
1126        ----------
1127        idx : :obj:`~numpy.array`
1128            Edge indices for each dimension (MxN).
1129        V : :obj:`~numpy.array`
1130            Array of intensity data (Mx1).
1131        Returns
1132        -------
1133        idx : :obj:`~numpy.array`
1134            Index of filtered points (Mx1).
1135        persistence : :obj:`~numpy.array`
1136            Persistence of each filtered point (Mx1).
1137
1138        Notes
1139        -----
1140        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1141        """
1142
1143        # Invert
1144        V = -1 * V.copy().astype(int)
1145
1146        # Embed indices
1147        V = self.embed_unique_indices(V)
1148
1149        # Connectivity matrix
1150        cmat = KDTree(idx)
1151        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1152        cmat.setdiag(1)
1153        cmat = sparse.triu(cmat)
1154
 1155        # Pairwise minimums of intensity (computed as maxima of the inverted values)
1156        I, J = cmat.nonzero()
1157        d = np.maximum(V[I], V[J])
1158
 1159        # Delete connectivity matrix
1160        cmat_shape = cmat.shape
1161        del cmat
1162
1163        # Sparse distance matrix
1164        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1165
1166        # Delete pairwise mins
1167        del d, I, J
1168
 1169        # Persistent homology
1170        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1171
1172        # Bound death values
1173        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1174
1175        # Construct tree to query against
1176        tree = KDTree(V.reshape((-1, 1)))
1177
1178        # Get the indexes of the first nearest neighbor by birth
1179        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1180
1181        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1182
1183    def check_if_grid(self, data):
1184        """Check if the data are gridded in mz space.
1185
1186        Parameters
1187        ----------
1188        data : DataFrame
1189            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1190
1191        Returns
1192        -------
1193        bool
1194            True if the data is gridded in the mz direction, False otherwise.
1195
1196        Notes
1197        -----
1198        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1199        """
1200        # Calculate the difference between consecutive mz values in a single scan
1201        dat_check = data.copy().reset_index(drop=True)
1202        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1203        mz_diff_min = (
1204            dat_check.groupby("scan")["mz_diff"].min().min()
1205        )  # within each scan, what is the smallest mz difference between consecutive mz values
1206
 1207        # Find the minimum mz difference between mz values in the data, regardless of scan
1208        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1209        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1210        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1211
1212        # Get minimum mz_diff between mz values in the data
1213        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1214
1215        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1216        if mz_diff_min_raw < mz_diff_min:
1217            return False
1218        else:
1219            return True
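
    # Illustrative sketch (not part of the library source): a made-up two-scan table in
    # which both scans share the same mz axis, so the data count as gridded. Here "lcms"
    # is a hypothetical LCMSBase object.
    # >>> demo = pd.DataFrame({
    # ...     "scan": [1, 1, 2, 2],
    # ...     "mz": [100.000, 100.002, 100.000, 100.002],
    # ...     "intensity": [5.0, 7.0, 6.0, 8.0],
    # ... })
    # >>> lcms.check_if_grid(demo)
    # True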
1220
1221    def grid_data(self, data):
1222        """Grid the data in the mz dimension.
1223
1224        Data must be gridded prior to persistent homology calculations.
1225
1226        Parameters
1227        ----------
1228        data : DataFrame
1229            The input data containing mz, scan, scan_time, and intensity columns.
1230
1231        Returns
1232        -------
1233        DataFrame
1234            The gridded data with mz, scan, scan_time, and intensity columns.
1235
1236        Raises
1237        ------
1238        ValueError
1239            If gridding fails.
1240        """
1241
1242        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1243        data_w = data.copy().reset_index(drop=True)
1244        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1245        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1246
1247        # Need high intensity mz values first so they are parents in the output pairs stack
1248        dat_mz = data_w[["mz", "intensity"]].sort_values(
1249            by=["intensity"], ascending=False
1250        )
1251        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1252
1253        # Construct KD tree
1254        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1255        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1256        sdm = sparse.triu(sdm, k=1)
1257        sdm.data = np.ones_like(sdm.data)
1258        distances = sdm.tocoo()
1259        pairs = np.stack((distances.row, distances.col), axis=1)
1260
1261        # Cull pairs to just get root
1262        to_drop = []
1263        while len(pairs) > 0:
1264            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1265            id_root_parents = np.isin(pairs[:, 0], root_parents)
1266            children_of_roots = np.unique(pairs[id_root_parents, 1])
1267            to_drop = np.append(to_drop, children_of_roots)
1268
1269            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1270            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1271            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1272        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1273        mz_dat_np = (
1274            dat_mz[["mz"]]
1275            .sort_values(by=["mz"])
1276            .reset_index(drop=True)
1277            .values.flatten()
1278        )
1279
1280        # Sort data by mz and recast mz to nearest value in mz_dat_np
1281        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1282        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1283        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1284
1285        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1286        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1287        new_data_w = (
1288            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1289            .groupby(["scan", "mz"])["intensity"]
1290            .sum()
1291            .reset_index()
1292        )
1293        new_data_w = (
1294            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1295            .reset_index(drop=True)
1296            .copy()
1297        )
1298
1299        # Check if grid worked and return
1300        if self.check_if_grid(new_data_w):
1301            return new_data_w
1302        else:
1303            raise ValueError("Gridding failed")
1304
1305    def find_mass_features_ph(self, ms_level=1, grid=True):
1306        """Find mass features within an LCMSBase object using persistent homology.
1307
1308        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1309
1310        Parameters
1311        ----------
1312        ms_level : int, optional
1313            The MS level to use. Default is 1.
1314        grid : bool, optional
1315            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1316
1317        Raises
1318        ------
1319        ValueError
1320            If no MS level data is found on the object.
1321            If data is not gridded and grid is False.
1322
1323        Returns
1324        -------
1325        None, but assigns the mass_features attribute to the object.
1326
1327        Notes
1328        -----
1329        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1330        """
1331        # Check that ms_level is a key in self._ms_unprocessed
1332        if ms_level not in self._ms_unprocessed.keys():
1333            raise ValueError(
1334                "No MS level "
1335                + str(ms_level)
1336                + " data found, did you instantiate with a parser specific to this MS level?"
1337            )
1338
1339        # Get ms data
1340        data = self._ms_unprocessed[ms_level].copy()
1341
1342        # Drop rows with missing intensity values and reset index
1343        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1344
1345        # Threshold data
1346        dims = ["mz", "scan_time"]
1347        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1348        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1349
1350        # Check if gridded, if not, grid
1351        gridded_mz = self.check_if_grid(data_thres)
1352        if gridded_mz is False:
1353            if grid is False:
1354                raise ValueError(
1355                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1356                )
1357            else:
1358                data_thres = self.grid_data(data_thres)
1359
1360        # Add build factors and add scan_time
1361        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1362        factors = {
1363            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1364            for dim in dims
1365        }  # pd.factorize returns a float64 index, so cast to float32
1366
1367        # Build indexes
1368        index = {
1369            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1370            for dim in factors
1371        }
1372
1373        # Smooth data
1374        iterations = self.parameters.lc_ms.ph_smooth_it
1375        smooth_radius = [
1376            self.parameters.lc_ms.ph_smooth_radius_mz,
1377            self.parameters.lc_ms.ph_smooth_radius_scan,
1378        ]  # mz, scan_time smoothing radius (in steps)
1379
1380        index = np.vstack([index[dim] for dim in dims]).T
1381        V = data_thres["intensity"].values
1382        resid = np.inf
1383        for i in range(iterations):
1384            # Previous iteration
1385            V_prev = V.copy()
1386            resid_prev = resid
1387            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1388
1389            # Calculate residual with previous iteration
1390            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1391
1392            # Evaluate convergence
1393            if i > 0:
1394                # Percent change in residual
1395                test = np.abs(resid - resid_prev) / resid_prev
1396
1397                # Exit criteria
1398                if test <= 0:
1399                    break
1400
1401        # Overwrite values
1402        data_thres["intensity"] = V
1403
1404        # Use persistent homology to find regions of interest
1405        pidx, pers = self.sparse_upper_star(index, V)
1406        pidx = pidx[pers > 1]
1407        pers = pers[pers > 1]
1408
1409        # Get peaks
1410        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1411
1412        # Add persistence column
1413        peaks["persistence"] = pers
1414        mass_features = peaks.sort_values(
1415            by="persistence", ascending=False
1416        ).reset_index(drop=True)
1417
1418        # Filter by persistence threshold
1419        persistence_threshold = (
1420            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1421        )
1422        mass_features = mass_features.loc[
1423            mass_features["persistence"] > persistence_threshold, :
1424        ].reset_index(drop=True)
1425
1426        # Rename scan column to apex_scan
1427        mass_features = mass_features.rename(
1428            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1429        )
1430
1431        # Populate mass_features attribute
1432        self.mass_features = {}
1433        for row in mass_features.itertuples():
1434            row_dict = mass_features.iloc[row.Index].to_dict()
1435            lcms_feature = LCMSMassFeature(self, **row_dict)
1436            self.mass_features[lcms_feature.id] = lcms_feature
1437
1438        if self.parameters.lc_ms.verbose_processing:
1439            print("Found " + str(len(mass_features)) + " initial mass features")
1440
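Before smoothing and the persistence calculation, find_mass_features_ph converts each (mz, scan_time) observation into a position along the sorted unique values of each dimension, so the downstream helpers work on compact grid coordinates. A small sketch of that factorize-plus-searchsorted indexing on a synthetic DataFrame (not CoreMS data):

import numpy as np
import pandas as pd

# Four synthetic observations on a 3 x 2 (mz x scan_time) grid
df = pd.DataFrame({"mz": [100.0, 100.0, 100.5, 101.0], "scan_time": [1.0, 1.5, 1.0, 1.5]})
dims = ["mz", "scan_time"]

# Sorted unique values along each dimension (the "factors")
factors = {dim: pd.factorize(df[dim], sort=True)[1].astype(np.float32) for dim in dims}

# Position of each observation along its dimension's axis
index = {dim: np.searchsorted(factors[dim], df[dim]).astype(np.float32) for dim in dims}

print(np.vstack([index[dim] for dim in dims]).T)
# [[0. 0.]
#  [0. 1.]
#  [1. 0.]
#  [2. 1.]]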
1441    def find_mass_features_ph_centroid(self, ms_level=1):
1442        """Find mass features within an LCMSBase object using a persistent homology-type approach on centroided data.
1443        
1444        Parameters
1445        ----------
1446        ms_level : int, optional
1447            The MS level to use. Default is 1.
1448        
1449        Raises
1450        ------
1451        ValueError
1452            If no MS level data is found on the object.
1453
1454        Returns
1455        -------
1456        None, but assigns the mass_features attribute to the object.        
1457        """
1458        # Check that ms_level is a key in self._ms_unprocessed
1459        if ms_level not in self._ms_unprocessed.keys():
1460            raise ValueError(
1461                "No MS level "
1462                + str(ms_level)
1463                + " data found, did you instantiate with a parser specific to this MS level?"
1464            )
1465        
1466        # Get ms data
1467        data = self._ms_unprocessed[ms_level].copy()
1468
1469        # Drop rows with missing intensity values and reset index
1470        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1471        
1472        # Threshold data
1473        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1474        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1475        data_thres["persistence"] = data_thres["intensity"]
1476
1477        # Use this as the starting point for the mass features, adding scan_time
1478        mf_df = data_thres
1479        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1480
1481        # Define tolerances and dimensions for rolling up
1482        tol = [
1483            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1484            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1485        ]
1486        relative = [True, False]    
1487        dims = ["mz", "scan_time"]
1488        # Roll up co-eluting features that fall within the tolerances in both dimensions
1489        mf_df = self.roll_up_dataframe(
1490            df=mf_df,
1491            sort_by="persistence",
1492            dims=dims,
1493            tol=tol,
1494            relative=relative
1495        )
1496
1497        # Rename scan column to apex_scan
1498        mass_features = mf_df.rename(
1499            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1500        )
1501        # Sort by persistence and reset index
1502        mass_features = mass_features.sort_values(
1503            by="persistence", ascending=False
1504        ).reset_index(drop=True)
1505
1506        # Populate mass_features attribute
1507        self.mass_features = {}
1508        for row in mass_features.itertuples():
1509            row_dict = mass_features.iloc[row.Index].to_dict()
1510            lcms_feature = LCMSMassFeature(self, **row_dict)
1511            self.mass_features[lcms_feature.id] = lcms_feature
1512
1513        if self.parameters.lc_ms.verbose_processing:
1514            print("Found " + str(len(mass_features)) + " initial mass features")
1515    
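Both this centroided variant and cluster_mass_features below hand roll_up_dataframe a mixed pair of tolerances: the m/z tolerance is relative (scaled by the m/z value itself), while the scan_time tolerance is an absolute window in minutes. A hedged sketch of that distinction, with hypothetical tolerance values standing in for the lc_ms parameters:

def within_tolerance(a, b, tol, relative):
    # Relative tolerances scale with the value (m/z-like); absolute ones do not (time-like)
    diff = abs(a - b)
    limit = tol * abs(a) if relative else tol
    return diff <= limit

# Two hypothetical features: (mz, scan_time in minutes)
f1, f2 = (500.2001, 12.30), (500.2021, 12.45)
mz_ok = within_tolerance(f1[0], f2[0], tol=5e-6, relative=True)   # ~5 ppm, hypothetical
rt_ok = within_tolerance(f1[1], f2[1], tol=0.2, relative=False)   # 0.2 min, hypothetical
print(mz_ok and rt_ok)  # True -> this pair would be rolled up into one feature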
1516    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1517        """Cluster mass features
1518
1519        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
1520
1521        Parameters
1522        ----------
1523        drop_children : bool, optional
1524            Whether to drop the mass features that are not cluster parents. Default is True.
1525        sort_by : str, optional
1526            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1527
1528        Raises
1529        ------
1530        ValueError
1531            If no mass features are found.
1532            If too many mass features are found.
1533
1534        Returns
1535        -------
1536        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1537        """
1538        verbose = self.parameters.lc_ms.verbose_processing
1539
1540        if self.mass_features is None:
1541            raise ValueError("No mass features found, run find_mass_features() first")
1542        if len(self.mass_features) > 400000:
1543            raise ValueError(
1544                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1545            )
1546        dims = ["mz", "scan_time"]
1547        mf_df_og = self.mass_features_to_df()
1548        mf_df = mf_df_og.copy()
1549
1550        tol = [
1551            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1552            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1553        ]  # mz, in relative; scan_time in minutes
1554        relative = [True, False]
1555
1556        # Roll up mass features based on their proximity in the declared dimensions
1557        mf_df_new = self.roll_up_dataframe(
1558            df=mf_df,
1559            sort_by=sort_by,
1560            dims=dims,
1561            tol=tol,
1562            relative=relative
1563        )
1564
1565        mf_df["cluster_parent"] = np.where(
1566            np.isin(mf_df.index, mf_df_new.index), True, False
1567        )
1568
1569        # get mass feature ids of features that are not cluster parents
1570        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1571        if drop_children is True:
1572            # Drop mass features that are not cluster parents from self
1573            self.mass_features = {
1574                k: v
1575                for k, v in self.mass_features.items()
1576                if k not in cluster_daughters
1577            }
1578        else:
1579            return cluster_daughters
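The parent/daughter bookkeeping at the end of cluster_mass_features relies only on index membership: features whose index survives the roll-up are parents, and the rest are daughters that are either dropped from self.mass_features or returned to the caller. A toy illustration with synthetic features standing in for the real mass feature dataframe:

import numpy as np
import pandas as pd

mf_df = pd.DataFrame({"mz": [500.2000, 500.2005, 623.1000], "persistence": [10.0, 2.0, 5.0]})

# Pretend the roll-up kept features 0 and 2 (feature 1 collapsed into feature 0)
mf_df_new = mf_df.loc[[0, 2]]

mf_df["cluster_parent"] = np.isin(mf_df.index, mf_df_new.index)
cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
print(cluster_daughters)  # [1] -> the id that would be popped when drop_children=True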
def find_closest(A, target):
14def find_closest(A, target):
15    """Find the index of closest value in A to each value in target.
16
17    Parameters
18    ----------
19    A : :obj:`~numpy.array`
20        The array to search (blueprint). A must be sorted.
21    target : :obj:`~numpy.array`
22        The array of values to search for. target must be sorted.
23
24    Returns
25    -------
26    :obj:`~numpy.array`
27        The indices of the closest values in A to each value in target.
28    """
29    idx = A.searchsorted(target)
30    idx = np.clip(idx, 1, len(A) - 1)
31    left = A[idx - 1]
32    right = A[idx]
33    idx -= target - left < right - target
34    return idx

Find the index of closest value in A to each value in target.

Parameters
  • A (~numpy.array): The array to search (blueprint). A must be sorted.
  • target (~numpy.array): The array of values to search for. target must be sorted.
Returns
  • ~numpy.array: The indices of the closest values in A to each value in target.
class LCCalculations:
 37class LCCalculations:
 38    """Methods for performing LC calculations on mass spectra data.
 39
 40    Notes
 41    -----
 42    This class is intended to be used as a mixin for the LCMSBase class.
 43
 44    Methods
 45    -------
 46    * get_max_eic(eic_data).
 47        Returns the maximum EIC value from the given EIC data. A static method.
 48    * smooth_tic(tic).
 49        Smooths the TIC data using the specified smoothing method and settings.
 50    * eic_centroid_detector(rt, eic, max_eic).
 51        Performs EIC centroid detection on the given EIC data.
 52    * find_nearest_scan(rt).
 53        Finds the nearest scan to the given retention time.
 54    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
 55        Returns an averaged mass spectrum object.
 56    * find_mass_features(ms_level=1).
 57        Find regions of interest for a given MS level (default is MS1).
 58    * integrate_mass_features(drop_if_fail=False, ms_level=1).
 59        Integrate mass features of interest and extracts EICs.
 60    * find_c13_mass_features().
 61        Evaluate mass features and mark likely C13 isotopes.
 62    * deconvolute_ms1_mass_features().
 63        Deconvolute mass features' ms1 mass spectra.
 64    """
 65
 66    @staticmethod
 67    def get_max_eic(eic_data: dict):
 68        """Returns the maximum EIC value from the given EIC data.
 69
 70        Notes
 71        -----
 72        This is a static method.
 73
 74        Parameters
 75        ----------
 76        eic_data : dict
 77            A dictionary containing EIC data.
 78
 79        Returns
 80        -------
 81        float
 82            The maximum EIC value.
 83        """
 84        max_eic = 0
 85        for eic_data in eic_data.values():
 86            ind_max_eic = max(eic_data.get("EIC"))
 87            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
 88
 89        return max_eic
 90
 91    def smooth_tic(self, tic):
 92        """Smooths the TIC or EIC data using the specified smoothing method and settings.
 93
 94        Parameters
 95        ----------
 96        tic : numpy.ndarray
 97            The TIC (or EIC) data to be smoothed.
 98
 99        Returns
100        -------
101        numpy.ndarray
102            The smoothed TIC data.
103        """
104        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
105
106        pol_order = self.parameters.lc_ms.savgol_pol_order
107
108        window_len = self.parameters.lc_ms.smooth_window
109
110        window = self.parameters.lc_ms.smooth_method
111
112        return sp.smooth_signal(
113            tic, window_len, window, pol_order, implemented_smooth_method
114        )
115
116    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
117        """Performs EIC centroid detection on the given EIC data.
118
119        Parameters
120        ----------
121        rt : numpy.ndarray
122            The retention time data.
123        eic : numpy.ndarray
124            The EIC data.
125        max_eic : float
126            The maximum EIC value.
127        apex_indexes : list, optional
128            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
129
130        Returns
131        -------
132        numpy.ndarray
133            The indexes of left, apex, and right limits as a generator.
134        """
135
136        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
137
138        max_height = self.parameters.lc_ms.peak_height_max_percent
139
140        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
141
142        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
143
144        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
145
146        include_indexes = sp.peak_picking_first_derivative(
147            domain=rt,
148            signal=eic,
149            max_height=max_height,
150            max_prominence=max_prominence,
151            max_signal=max_eic,
152            min_peak_datapoints=min_peak_datapoints,
153            peak_derivative_threshold=peak_derivative_threshold,
154            signal_threshold=signal_threshold,
155            correct_baseline=False,
156            plot_res=False,
157            apex_indexes=apex_indexes,
158        )
159
160        return include_indexes
161
162    def find_nearest_scan(self, rt):
163        """Finds the nearest scan to the given retention time.
164
165        Parameters
166        ----------
167        rt : float
168            The retention time (in minutes) to find the nearest scan for.
169
170        Returns
171        -------
172        int
173            The scan number of the nearest scan.
174        """
175        array_rt = np.array(self.retention_time)
176
177        scan_index = (np.abs(array_rt - rt)).argmin()
178
179        real_scan = self.scans_number[scan_index]
180
181        return real_scan
182
183    def add_peak_metrics(self):
184        """Add peak metrics to the mass features.
185
186        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
187        """
188        # Check that at least some mass features have eic data
189        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
190            raise ValueError(
191                "No mass features have EIC data. Run integrate_mass_features first."
192            )
193
194        for mass_feature in self.mass_features.values():
195            # Check if the mass feature has been integrated
196            if mass_feature._eic_data is not None:
197                # Calculate peak metrics
198                mass_feature.calc_half_height_width()
199                mass_feature.calc_tailing_factor()
200                mass_feature.calc_dispersity_index()
201
202    def get_average_mass_spectrum(
203        self,
204        scan_list,
205        apex_scan,
206        spectrum_mode="profile",
207        ms_level=1,
208        auto_process=True,
209        use_parser=False,
210        perform_checks=True,
211        polarity=None,
212        ms_params=None,
213    ):
214        """Returns an averaged mass spectrum object
215
216        Parameters
217        ----------
218        scan_list : list
219            List of scan numbers to average.
220        apex_scan : int
221            Number of the apex scan
222        spectrum_mode : str, optional
223            The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging.
224        ms_level : int, optional
225            The MS level to use. Defaults to 1.
226        auto_process : bool, optional
227            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
228        use_parser : bool, optional
229            If True, the mass spectra will be obtained from the parser. Defaults to False.
230        perform_checks : bool, optional
231            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary!  ms_unprocessed dictionary also must be indexed on scan
232        polarity : int, optional
233            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
234        ms_params : MSParameters, optional
235            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
236
237        Returns
238        -------
239        MassSpectrumProfile
240            The averaged mass spectrum object.
241
242        Raises
243        ------
244        ValueError
245            If the spectrum mode is not "profile".
246            If the MS level is not found in the unprocessed mass spectra dictionary.
247            If not all scan numbers are found in the unprocessed mass spectra dictionary.
248        """
249        if perform_checks:
250            if spectrum_mode != "profile":
251                raise ValueError("Averaging only supported for profile mode")
252
253        if polarity is None:
254            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
255            if self.polarity == "negative":
256                polarity = -1
257            elif self.polarity == "positive":
258                polarity = 1
259            else:
260                raise ValueError(
261                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
262                )
263
264        # if not using_parser, check that scan numbers are in _ms_unprocessed
265        if not use_parser:
266            if perform_checks:
267                # Set index to scan for faster lookup
268                ms_df = (
269                    self._ms_unprocessed[ms_level]
270                    .copy()
271                    .set_index("scan", drop=False)
272                    .sort_index()
273                )
274                my_ms_df = ms_df.loc[scan_list]
275                # Check that all scan numbers are in the ms_df
276                if not all(np.isin(scan_list, ms_df.index)):
277                    raise ValueError(
278                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
279                    )
280            else:
281                my_ms_df = (
282                    pd.DataFrame({"scan": scan_list})
283                    .set_index("scan")
284                    .join(self._ms_unprocessed[ms_level], how="left")
285                )
286
287        if use_parser:
288            ms_list = [
289                self.spectra_parser.get_mass_spectrum_from_scan(
290                    x, spectrum_mode=spectrum_mode, auto_process=False
291                )
292                for x in scan_list
293            ]
294            ms_mz = [x._mz_exp for x in ms_list]
295            ms_int = [x._abundance for x in ms_list]
296            my_ms_df = []
297            for i in np.arange(len(ms_mz)):
298                my_ms_df.append(
299                    pd.DataFrame(
300                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
301                    )
302                )
303            my_ms_df = pd.concat(my_ms_df)
304
305        if not self.check_if_grid(my_ms_df):
306            my_ms_df = self.grid_data(my_ms_df)
307
308        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
309
310        ms = ms_from_array_profile(
311            my_ms_ave.mz,
312            my_ms_ave.intensity,
313            self.file_location,
314            polarity=polarity,
315            auto_process=False,
316        )
317
318        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
319        if ms is not None:
320            if ms_params is not None:
321                ms.parameters = ms_params
322            ms.scan_number = apex_scan
323            if auto_process:
324                ms.process_mass_spec()
325        return ms
326
327    def find_mass_features(self, ms_level=1, grid=True):
328        """Find mass features within an LCMSBase object
329
330        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
331
332        Parameters
333        ----------
334        ms_level : int, optional
335            The MS level to use for peak picking Default is 1.
336        grid : bool, optional
337            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 
338            used for persistent homology peak picking for profile data only. Default is True.
339
340        Raises
341        ------
342        ValueError
343            If no MS level data is found on the object.
344            If persistent homology peak picking is attempted on non-profile mode data.
345            If data is not gridded and grid is False.
346            If peak picking method is not implemented.
347
348        Returns
349        -------
350        None, but assigns the mass_features and eics attributes to the object.
351
352        """
353        pp_method = self.parameters.lc_ms.peak_picking_method
354
355        if pp_method == "persistent homology":
356            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
357            if all(msx_scan_df["ms_format"] == "profile"):
358                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
359                self.cluster_mass_features(drop_children=True, sort_by="persistence")
360            else:
361                raise ValueError(
362                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
363                        ms_level
364                    )
365                )
366        elif pp_method == "centroided_persistent_homology":
367            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
368            if all(msx_scan_df["ms_format"] == "centroid"):
369                self.find_mass_features_ph_centroid(ms_level=ms_level)
370            else:
371                raise ValueError(
372                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
373                        ms_level
374                    )
375                )
376        else:
377            raise ValueError("Peak picking method not implemented")
378
379    def integrate_mass_features(
380        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
381    ):
382        """Integrate mass features and extract EICs.
383
384        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
385
386        Parameters
387        ----------
388        drop_if_fail : bool, optional
389            Whether to drop mass features if the EIC limit calculations fail.
390            Default is True.
391        drop_duplicates : bool, optional
392            Whether to mass features that appear to be duplicates
393            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
394            Default is True.
395        ms_level : int, optional
396            The MS level to use. Default is 1.
397
398        Raises
399        ------
400        ValueError
401            If no mass features are found.
402            If no MS level data is found for the given MS level (either in data or in the scan data)
403
404        Returns
405        -------
406        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
407
408        Notes
409        -----
410        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
411        """
412        # Check if there is data
413        if ms_level in self._ms_unprocessed.keys():
414            raw_data = self._ms_unprocessed[ms_level].copy()
415        else:
416            raise ValueError("No MS level " + str(ms_level) + " data found")
417        if self.mass_features is not None:
418            mf_df = self.mass_features_to_df().copy()
419        else:
420            raise ValueError(
421                "No mass features found, did you run find_mass_features() first?"
422            )
423        # Check if mass_spectrum exists on each mass feature
424        if not all(
425            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
426        ):
427            raise ValueError(
428                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
429            )
430
431        # Subset scan data to only include correct ms_level
432        scan_df_sub = self.scan_df[
433            self.scan_df["ms_level"] == int(ms_level)
434        ].reset_index(drop=True)
435        if scan_df_sub.empty:
436            raise ValueError("No MS level " + ms_level + " data found in scan data")
437        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
438
439        mzs_to_extract = np.unique(mf_df["mz"].values)
440        mzs_to_extract.sort()
441
442        # Get EICs for each unique mz in mass features list
443        for mz in mzs_to_extract:
444            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
445            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
446            raw_data_sub = raw_data[
447                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
448            ].reset_index(drop=True)
449            raw_data_sub = (
450                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
451            )
452            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
453            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
454            myEIC = EIC_Data(
455                scans=raw_data_sub["scan"].values,
456                time=raw_data_sub["scan_time"].values,
457                eic=raw_data_sub["intensity"].values,
458            )
459            # Smooth EIC
460            smoothed_eic = self.smooth_tic(myEIC.eic)
461            smoothed_eic[smoothed_eic < 0] = 0
462            myEIC.eic_smoothed = smoothed_eic
463            self.eics[mz] = myEIC
464
465        # Get limits of mass features using EIC centroid detector and integrate
466        mf_df["area"] = np.nan
467        for idx, mass_feature in mf_df.iterrows():
468            mz = mass_feature.mz
469            apex_scan = mass_feature.apex_scan
470
471            # Pull EIC data and find apex scan index
472            myEIC = self.eics[mz]
473            self.mass_features[idx]._eic_data = myEIC
474            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
475
476            # Find left and right limits of peak using EIC centroid detector, add to EICData
477            centroid_eics = self.eic_centroid_detector(
478                myEIC.time,
479                myEIC.eic_smoothed,
480                mass_feature.intensity * 1.1,
481                apex_indexes=[int(apex_index)],
482            )
483            l_a_r_scan_idx = [i for i in centroid_eics]
484            if len(l_a_r_scan_idx) > 0:
485                # Add start and final scan to mass_features and EICData
486                left_scan, right_scan = (
487                    myEIC.scans[l_a_r_scan_idx[0][0]],
488                    myEIC.scans[l_a_r_scan_idx[0][2]],
489                )
490                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
491                myEIC.apexes = myEIC.apexes + mf_scan_apex
492                self.mass_features[idx].start_scan = left_scan
493                self.mass_features[idx].final_scan = right_scan
494
495                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
496                area = np.trapz(
497                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
498                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
499                )
500                mf_df.at[idx, "area"] = area
501                myEIC.areas = myEIC.areas + [area]
502                self.eics[mz] = myEIC
503                self.mass_features[idx]._area = area
504            else:
505                if drop_if_fail is True:
506                    self.mass_features.pop(idx)
507
508        if drop_duplicates:
509            # Prepare mass feature dataframe
510            mf_df = self.mass_features_to_df().copy()
511
512            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
513            # Kepp the first mass fea
514            for idx, mass_feature in mf_df.iterrows():
515                mz = mass_feature.mz
516                apex_scan = mass_feature.apex_scan
517
518                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
519                mf_df_sub = mf_df[
520                    mf_df["mz_diff_ppm"]
521                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
522                    * 10**6
523                ].copy()
524
525                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
526                for idx2, mass_feature2 in mf_df_sub.iterrows():
527                    if idx2 != idx:
528                        if (
529                            mass_feature2.start_scan >= mass_feature.start_scan
530                            and mass_feature2.final_scan <= mass_feature.final_scan
531                        ):
532                            if idx2 in self.mass_features.keys():
533                                self.mass_features.pop(idx2)
534
535    def find_c13_mass_features(self):
536        """Mark likely C13 isotopes and connect to monoisoitopic mass features.
537
538        Returns
539        -------
540        None, but populates the monoisotopic_mf_id and isotopologue_type attributes to the indivual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
541
542        Raises
543        ------
544        ValueError
545            If no mass features are found.
546        """
547        verbose = self.parameters.lc_ms.verbose_processing
548        if verbose:
549            print("evaluating mass features for C13 isotopes")
550        if self.mass_features is None:
551            raise ValueError("No mass features found, run find_mass_features() first")
552
553        # Data prep fo sparse distance matrix
554        dims = ["mz", "scan_time"]
555        mf_df = self.mass_features_to_df().copy()
556        # Drop mass features that have no area (these are likely to be noise)
557        mf_df = mf_df[mf_df["area"].notnull()]
558        mf_df["mf_id"] = mf_df.index.values
559        dims = ["mz", "scan_time"]
560
561        # Sort my ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
562        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
563
564        mz_diff = 1.003355  # C13-C12 mass difference
565        tol = [
566            mf_df["mz"].median()
567            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
568            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
569        ]  # mz, in relative; scan_time in minutes
570
571        # Compute inter-feature distances
572        distances = None
573        for i in range(len(dims)):
574            # Construct k-d tree
575            values = mf_df[dims[i]].values
576            tree = KDTree(values.reshape(-1, 1))
577
578            max_tol = tol[i]
579            if dims[i] == "mz":
580                # Maximum absolute tolerance
581                max_tol = mz_diff + tol[i]
582
583            # Compute sparse distance matrix
584            # the larger the max_tol, the slower this operation is
585            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
586
587            # Only consider forward case, exclude diagonal
588            sdm = sparse.triu(sdm, k=1)
589
590            if dims[i] == "mz":
591                min_tol = mz_diff - tol[i]
592                # Get only the ones that are above the min tol
593                idx = sdm.data > min_tol
594
595                # Reconstruct sparse distance matrix
596                sdm = sparse.coo_matrix(
597                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
598                    shape=(len(values), len(values)),
599                )
600
601            # Cast as binary matrix
602            sdm.data = np.ones_like(sdm.data)
603
604            # Stack distances
605            if distances is None:
606                distances = sdm
607            else:
608                distances = distances.multiply(sdm)
609
610        # Extract indices of within-tolerance points
611        distances = distances.tocoo()
612        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
613
614        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
615        pairs_mf = pairs.copy()
616        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
617        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
618
619        # Connect monoisotopic masses with isotopologes within mass_features
620        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
621        for mono in monos:
622            self.mass_features[mono].monoisotopic_mf_id = mono
623        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
624        while not pairs_iso_df.empty:
625            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
626            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
627            for iso in m1_isos:
628                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
629                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
630                if len(parent) > 1:
631                    # Choose the parent that is closest in time to the isotopologue
632                    parent_time = [self.mass_features[p].retention_time for p in parent]
633                    time_diff = [
634                        np.abs(self.mass_features[iso].retention_time - x)
635                        for x in parent_time
636                    ]
637                    parent = parent[np.argmin(time_diff)]
638                else:
639                    parent = parent[0]
640                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
641                    parent
642                ].monoisotopic_mf_id
643                if self.mass_features[iso].monoisotopic_mf_id is not None:
644                    mass_diff = (
645                        self.mass_features[iso].mz
646                        - self.mass_features[
647                            self.mass_features[iso].monoisotopic_mf_id
648                        ].mz
649                    )
650                    self.mass_features[iso].isotopologue_type = "13C" + str(
651                        int(round(mass_diff, 0))
652                    )
653
654            # Drop the mono and iso from the pairs_iso_df
655            pairs_iso_df = pairs_iso_df.drop(
656                index=monos, errors="ignore"
657            )  # Drop pairs where the parent is a child that is a child of a root
658            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
659            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
660
661            if not pairs_iso_df.empty:
662                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
663                monos = np.setdiff1d(
664                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
665                )
666        if verbose:
667            # Report fraction of compounds annotated with isotopes
668            mf_df["c13_flag"] = np.where(
669                np.logical_or(
670                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
671                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
672                ),
673                1,
674                0,
675            )
676            print(
677                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
678                + " of mass features have or are C13 isotopes"
679            )
680
681    def deconvolute_ms1_mass_features(self):
682        """Deconvolute MS1 mass features
683
684        Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features
685
686        Parameters
687        ----------
688        None
689
690        Returns
691        -------
692        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
693        and associated_mass_features_deconvoluted attributes to the mass features in the
694        mass_features attribute of the LCMSBase object.
695
696        Raises
697        ------
698        ValueError
699            If no mass features are found, must run find_mass_features() first.
700            If no EICs are found, did you run integrate_mass_features() first?
701
702        """
703        # Checks for set mass_features and eics
704        if self.mass_features is None:
705            raise ValueError(
706                "No mass features found, did you run find_mass_features() first?"
707            )
708
709        if self.eics == {}:
710            raise ValueError(
711                "No EICs found, did you run integrate_mass_features() first?"
712            )
713
714        if 1 not in self._ms_unprocessed.keys():
715            raise ValueError("No unprocessed MS1 spectra found.")
716
717        # Prep ms1 data
718        ms1_data = self._ms_unprocessed[1].copy()
719        ms1_data = ms1_data.set_index("scan")
720
721        # Prep mass feature summary
722        mass_feature_df = self.mass_features_to_df()
723
724        # Loop through each mass feature
725        for mf_id, mass_feature in self.mass_features.items():
726            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
727            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
728                continue
729
730            # Get the left and right limits of the EIC of the mass feature
731            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
732
733            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
734            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
735            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
736
737            # Get the centroided masses of the mass feature
738            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
739
740            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
741            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
742                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
743            ]
744
745            # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz
746            ms1_data_sub["mz_diff_rel"] = (
747                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
748                / ms1_data_sub["mz"]
749            )
750            ms1_data_sub = ms1_data_sub[
751                ms1_data_sub["mz_diff_rel"]
752                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
753            ].reset_index(drop=True)
754
755            # Group by mass_feature_mz and scan and sum intensity
756            ms1_data_sub_group = (
757                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
758                .sum()
759                .reset_index()
760            )
761
762            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
763            corr = (
764                ms1_data_sub_group.pivot(
765                    index="scan", columns="mass_feature_mz", values="intensity"
766                )
767                .fillna(0)
768                .corr()
769            )
770
771            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8
772            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
773            corr_subset = corr.loc[mass_feature.mz,]
774            corr_subset = corr_subset[corr_subset > decon_corr_min]
775
776            # Get the masses from the mass spectrum that are the result of the deconvolution
777            mzs_decon = corr_subset.index.values
778
779            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
780            mzs_decon_idx = [
781                id
782                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
783                if mz in mzs_decon
784            ]
785            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
786
787            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
788            if (
789                mass_feature.ms1_peak.abundance
790                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
791            ):
792                mass_feature.mass_spectrum_deconvoluted_parent = True
793            else:
794                mass_feature.mass_spectrum_deconvoluted_parent = False
795
796            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
797            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
798            mass_feature_df_sub = mass_feature_df[
799                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
800                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
801            ].copy()
802            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
803            mass_feature_df_sub["mz_diff_ppm"] = [
804                np.abs(mzs_decon - mz).min() / mz * 10**6
805                for mz in mass_feature_df_sub["mz"]
806            ]
807            # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses
808            mfs_associated_decon = mass_feature_df_sub[
809                mass_feature_df_sub["mz_diff_ppm"]
810                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
811            ].index.values
812
813            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Methods for performing LC calculations on mass spectra data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
  • smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
  • eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
  • find_nearest_scan(rt). Finds the nearest scan to the given retention time.
  • get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
  • find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
  • integrate_mass_features(drop_if_fail=False, ms_level=1). Integrate mass features of interest and extracts EICs.
  • find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
  • deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
@staticmethod
def get_max_eic(eic_data: dict):
66    @staticmethod
67    def get_max_eic(eic_data: dict):
68        """Returns the maximum EIC value from the given EIC data.
69
70        Notes
71        -----
72        This is a static method.
73
74        Parameters
75        ----------
76        eic_data : dict
77            A dictionary containing EIC data.
78
79        Returns
80        -------
81        float
82            The maximum EIC value.
83        """
84        max_eic = 0
85        for eic_data in eic_data.values():
86            ind_max_eic = max(eic_data.get("EIC"))
87            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
88
89        return max_eic

Returns the maximum EIC value from the given EIC data.

Notes

This is a static method.

Parameters
  • eic_data (dict): A dictionary containing EIC data.
Returns
  • float: The maximum EIC value.
def smooth_tic(self, tic):
 91    def smooth_tic(self, tic):
 92        """Smooths the TIC or EIC data using the specified smoothing method and settings.
 93
 94        Parameters
 95        ----------
 96        tic : numpy.ndarray
 97            The TIC (or EIC) data to be smoothed.
 98
 99        Returns
100        -------
101        numpy.ndarray
102            The smoothed TIC data.
103        """
104        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
105
106        pol_order = self.parameters.lc_ms.savgol_pol_order
107
108        window_len = self.parameters.lc_ms.smooth_window
109
110        window = self.parameters.lc_ms.smooth_method
111
112        return sp.smooth_signal(
113            tic, window_len, window, pol_order, implemented_smooth_method
114        )

Smooths the TIC or EIC data using the specified smoothing method and settings.

Parameters
  • tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
  • numpy.ndarray: The smoothed TIC data.
def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
116    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
117        """Performs EIC centroid detection on the given EIC data.
118
119        Parameters
120        ----------
121        rt : numpy.ndarray
122            The retention time data.
123        eic : numpy.ndarray
124            The EIC data.
125        max_eic : float
126            The maximum EIC value.
127        apex_indexes : list, optional
128            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
129
130        Returns
131        -------
132        numpy.ndarray
133            The indexes of left, apex, and right limits as a generator.
134        """
135
136        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
137
138        max_height = self.parameters.lc_ms.peak_height_max_percent
139
140        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
141
142        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
143
144        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
145
146        include_indexes = sp.peak_picking_first_derivative(
147            domain=rt,
148            signal=eic,
149            max_height=max_height,
150            max_prominence=max_prominence,
151            max_signal=max_eic,
152            min_peak_datapoints=min_peak_datapoints,
153            peak_derivative_threshold=peak_derivative_threshold,
154            signal_threshold=signal_threshold,
155            correct_baseline=False,
156            plot_res=False,
157            apex_indexes=apex_indexes,
158        )
159
160        return include_indexes

Performs EIC centroid detection on the given EIC data.

Parameters
  • rt (numpy.ndarray): The retention time data.
  • eic (numpy.ndarray): The EIC data.
  • max_eic (float): The maximum EIC value.
  • apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
  • numpy.ndarray: The indexes of left, apex, and right limits as a generator.
def find_nearest_scan(self, rt):
162    def find_nearest_scan(self, rt):
163        """Finds the nearest scan to the given retention time.
164
165        Parameters
166        ----------
167        rt : float
168            The retention time (in minutes) to find the nearest scan for.
169
170        Returns
171        -------
172        int
173            The scan number of the nearest scan.
174        """
175        array_rt = np.array(self.retention_time)
176
177        scan_index = (np.abs(array_rt - rt)).argmin()
178
179        real_scan = self.scans_number[scan_index]
180
181        return real_scan

Finds the nearest scan to the given retention time.

Parameters
  • rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
  • int: The scan number of the nearest scan.
def add_peak_metrics(self):
183    def add_peak_metrics(self):
184        """Add peak metrics to the mass features.
185
186        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
187        """
188        # Check that at least some mass features have eic data
189        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
190            raise ValueError(
191                "No mass features have EIC data. Run integrate_mass_features first."
192            )
193
194        for mass_feature in self.mass_features.values():
195            # Check if the mass feature has been integrated
196            if mass_feature._eic_data is not None:
197                # Calculate peak metrics
198                mass_feature.calc_half_height_width()
199                mass_feature.calc_tailing_factor()
200                mass_feature.calc_dispersity_index()

Add peak metrics to the mass features.

This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.

def get_average_mass_spectrum( self, scan_list, apex_scan, spectrum_mode='profile', ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None):
202    def get_average_mass_spectrum(
203        self,
204        scan_list,
205        apex_scan,
206        spectrum_mode="profile",
207        ms_level=1,
208        auto_process=True,
209        use_parser=False,
210        perform_checks=True,
211        polarity=None,
212        ms_params=None,
213    ):
214        """Returns an averaged mass spectrum object
215
216        Parameters
217        ----------
218        scan_list : list
219            List of scan numbers to average.
220        apex_scan : int
221            Number of the apex scan
222        spectrum_mode : str, optional
223            The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging.
224        ms_level : int, optional
225            The MS level to use. Defaults to 1.
226        auto_process : bool, optional
227            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
228        use_parser : bool, optional
229            If True, the mass spectra will be obtained from the parser. Defaults to False.
230        perform_checks : bool, optional
231            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary!  ms_unprocessed dictionary also must be indexed on scan
232        polarity : int, optional
233            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
234        ms_params : MSParameters, optional
235            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
236
237        Returns
238        -------
239        MassSpectrumProfile
240            The averaged mass spectrum object.
241
242        Raises
243        ------
244        ValueError
245            If the spectrum mode is not "profile".
246            If the MS level is not found in the unprocessed mass spectra dictionary.
247            If not all scan numbers are found in the unprocessed mass spectra dictionary.
248        """
249        if perform_checks:
250            if spectrum_mode != "profile":
251                raise ValueError("Averaging only supported for profile mode")
252
253        if polarity is None:
254            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
255            if self.polarity == "negative":
256                polarity = -1
257            elif self.polarity == "positive":
258                polarity = 1
259            else:
260                raise ValueError(
261                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
262                )
263
264        # if not using_parser, check that scan numbers are in _ms_unprocessed
265        if not use_parser:
266            if perform_checks:
267                # Set index to scan for faster lookup
268                ms_df = (
269                    self._ms_unprocessed[ms_level]
270                    .copy()
271                    .set_index("scan", drop=False)
272                    .sort_index()
273                )
274                # Check that all scan numbers are in the ms_df before subsetting
275                if not all(np.isin(scan_list, ms_df.index)):
276                    raise ValueError(
277                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
278                    )
279                my_ms_df = ms_df.loc[scan_list]
280            else:
281                my_ms_df = (
282                    pd.DataFrame({"scan": scan_list})
283                    .set_index("scan")
284                    .join(self._ms_unprocessed[ms_level], how="left")
285                )
286
287        if use_parser:
288            ms_list = [
289                self.spectra_parser.get_mass_spectrum_from_scan(
290                    x, spectrum_mode=spectrum_mode, auto_process=False
291                )
292                for x in scan_list
293            ]
294            ms_mz = [x._mz_exp for x in ms_list]
295            ms_int = [x._abundance for x in ms_list]
296            my_ms_df = []
297            for i in np.arange(len(ms_mz)):
298                my_ms_df.append(
299                    pd.DataFrame(
300                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
301                    )
302                )
303            my_ms_df = pd.concat(my_ms_df)
304
305        if not self.check_if_grid(my_ms_df):
306            my_ms_df = self.grid_data(my_ms_df)
307
308        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
309
310        ms = ms_from_array_profile(
311            my_ms_ave.mz,
312            my_ms_ave.intensity,
313            self.file_location,
314            polarity=polarity,
315            auto_process=False,
316        )
317
318        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
319        if ms is not None:
320            if ms_params is not None:
321                ms.parameters = ms_params
322            ms.scan_number = apex_scan
323            if auto_process:
324                ms.process_mass_spec()
325        return ms

Returns an averaged mass spectrum object

Parameters
  • scan_list (list): List of scan numbers to average.
  • apex_scan (int): Number of the apex scan
  • spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
  • ms_level (int, optional): The MS level to use. Defaults to 1.
  • auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
  • use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
  • perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are in the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary, which must be indexed on scan.
  • polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset (fastest if set explicitly to -1 or 1). Defaults to None.
  • ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
  • MassSpectrumProfile: The averaged mass spectrum object.
Raises
  • ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
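A minimal usage sketch, assuming lcms is an LCMSBase object whose unprocessed MS1 data are already loaded; the scan numbers below are hypothetical.

    # Average a hypothetical five-scan window centered on an apex scan
    scan_list = [100, 101, 102, 103, 104]
    apex_scan = 102

    avg_ms = lcms.get_average_mass_spectrum(
        scan_list,
        apex_scan,
        spectrum_mode="profile",  # only profile averaging is supported
        ms_level=1,
        auto_process=True,
        polarity=-1,              # setting polarity explicitly skips the dataset lookup
    )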
def find_mass_features(self, ms_level=1, grid=True):
327    def find_mass_features(self, ms_level=1, grid=True):
328        """Find mass features within an LCMSBase object
329
330        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
331
332        Parameters
333        ----------
334        ms_level : int, optional
335            The MS level to use for peak picking. Default is 1.
336        grid : bool, optional
337            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 
338            used for persistent homology peak picking for profile data only. Default is True.
339
340        Raises
341        ------
342        ValueError
343            If no MS level data is found on the object.
344            If persistent homology peak picking is attempted on non-profile mode data.
345            If data is not gridded and grid is False.
346            If peak picking method is not implemented.
347
348        Returns
349        -------
350        None, but assigns the mass_features and eics attributes to the object.
351
352        """
353        pp_method = self.parameters.lc_ms.peak_picking_method
354
355        if pp_method == "persistent homology":
356            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
357            if all(msx_scan_df["ms_format"] == "profile"):
358                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
359                self.cluster_mass_features(drop_children=True, sort_by="persistence")
360            else:
361                raise ValueError(
362                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
363                        ms_level
364                    )
365                )
366        elif pp_method == "centroided_persistent_homology":
367            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
368            if all(msx_scan_df["ms_format"] == "centroid"):
369                self.find_mass_features_ph_centroid(ms_level=ms_level)
370            else:
371                raise ValueError(
372                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
373                        ms_level
374                    )
375                )
376        else:
377            raise ValueError("Peak picking method not implemented")

Find mass features within an LCMSBase object

Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.

Parameters
  • ms_level (int, optional): The MS level to use for peak picking. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
  • None, but assigns the mass_features and eics attributes to the object.
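A minimal usage sketch, assuming lcms is an LCMSBase object with profile-mode MS1 scans; the peak picking method is read from the LC-MS parameters.

    # "persistent homology" requires profile scans;
    # "centroided_persistent_homology" requires centroid scans.
    lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
    lcms.find_mass_features(ms_level=1, grid=True)
    print(len(lcms.mass_features), "initial mass features")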
def integrate_mass_features(self, drop_if_fail=True, drop_duplicates=True, ms_level=1):
379    def integrate_mass_features(
380        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
381    ):
382        """Integrate mass features and extract EICs.
383
384        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
385
386        Parameters
387        ----------
388        drop_if_fail : bool, optional
389            Whether to drop mass features if the EIC limit calculations fail.
390            Default is True.
391        drop_duplicates : bool, optional
392            Whether to drop mass features that appear to be duplicates
393            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
394            Default is True.
395        ms_level : int, optional
396            The MS level to use. Default is 1.
397
398        Raises
399        ------
400        ValueError
401            If no mass features are found.
402            If no MS level data is found for the given MS level (either in data or in the scan data)
403
404        Returns
405        -------
406        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
407
408        Notes
409        -----
410        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
411        """
412        # Check if there is data
413        if ms_level in self._ms_unprocessed.keys():
414            raw_data = self._ms_unprocessed[ms_level].copy()
415        else:
416            raise ValueError("No MS level " + str(ms_level) + " data found")
417        if self.mass_features is not None:
418            mf_df = self.mass_features_to_df().copy()
419        else:
420            raise ValueError(
421                "No mass features found, did you run find_mass_features() first?"
422            )
423        # Check if mass_spectrum exists on each mass feature
424        if not all(
425            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
426        ):
427            raise ValueError(
428                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
429            )
430
431        # Subset scan data to only include correct ms_level
432        scan_df_sub = self.scan_df[
433            self.scan_df["ms_level"] == int(ms_level)
434        ].reset_index(drop=True)
435        if scan_df_sub.empty:
436            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
437        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
438
439        mzs_to_extract = np.unique(mf_df["mz"].values)
440        mzs_to_extract.sort()
441
442        # Get EICs for each unique mz in mass features list
443        for mz in mzs_to_extract:
444            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
445            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
446            raw_data_sub = raw_data[
447                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
448            ].reset_index(drop=True)
449            raw_data_sub = (
450                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
451            )
452            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
453            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
454            myEIC = EIC_Data(
455                scans=raw_data_sub["scan"].values,
456                time=raw_data_sub["scan_time"].values,
457                eic=raw_data_sub["intensity"].values,
458            )
459            # Smooth EIC
460            smoothed_eic = self.smooth_tic(myEIC.eic)
461            smoothed_eic[smoothed_eic < 0] = 0
462            myEIC.eic_smoothed = smoothed_eic
463            self.eics[mz] = myEIC
464
465        # Get limits of mass features using EIC centroid detector and integrate
466        mf_df["area"] = np.nan
467        for idx, mass_feature in mf_df.iterrows():
468            mz = mass_feature.mz
469            apex_scan = mass_feature.apex_scan
470
471            # Pull EIC data and find apex scan index
472            myEIC = self.eics[mz]
473            self.mass_features[idx]._eic_data = myEIC
474            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
475
476            # Find left and right limits of peak using EIC centroid detector, add to EICData
477            centroid_eics = self.eic_centroid_detector(
478                myEIC.time,
479                myEIC.eic_smoothed,
480                mass_feature.intensity * 1.1,
481                apex_indexes=[int(apex_index)],
482            )
483            l_a_r_scan_idx = [i for i in centroid_eics]
484            if len(l_a_r_scan_idx) > 0:
485                # Add start and final scan to mass_features and EICData
486                left_scan, right_scan = (
487                    myEIC.scans[l_a_r_scan_idx[0][0]],
488                    myEIC.scans[l_a_r_scan_idx[0][2]],
489                )
490                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
491                myEIC.apexes = myEIC.apexes + mf_scan_apex
492                self.mass_features[idx].start_scan = left_scan
493                self.mass_features[idx].final_scan = right_scan
494
495                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
496                area = np.trapz(
497                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
498                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
499                )
500                mf_df.at[idx, "area"] = area
501                myEIC.areas = myEIC.areas + [area]
502                self.eics[mz] = myEIC
503                self.mass_features[idx]._area = area
504            else:
505                if drop_if_fail is True:
506                    self.mass_features.pop(idx)
507
508        if drop_duplicates:
509            # Prepare mass feature dataframe
510            mf_df = self.mass_features_to_df().copy()
511
512            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
513            # Keep the first mass feature and drop any duplicates contained within it
514            for idx, mass_feature in mf_df.iterrows():
515                mz = mass_feature.mz
516                apex_scan = mass_feature.apex_scan
517
518                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
519                mf_df_sub = mf_df[
520                    mf_df["mz_diff_ppm"]
521                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
522                    * 10**6
523                ].copy()
524
525                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
526                for idx2, mass_feature2 in mf_df_sub.iterrows():
527                    if idx2 != idx:
528                        if (
529                            mass_feature2.start_scan >= mass_feature.start_scan
530                            and mass_feature2.final_scan <= mass_feature.final_scan
531                        ):
532                            if idx2 in self.mass_features.keys():
533                                self.mass_features.pop(idx2)

Integrate mass features and extract EICs.

Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

Parameters
  • drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
  • drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). Default is True.
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
  • None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes

drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
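A sketch of the ordering implied by the checks in the source above, assuming lcms is an LCMSBase object; the add_associated_ms1() call is taken from the error message above and its arguments are omitted here.

    lcms.find_mass_features(ms_level=1)
    lcms.add_associated_ms1()  # associate an MS1 spectrum with each mass feature (arguments omitted)
    lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)

    # After integration, EICs are keyed by m/z and each retained mass feature carries its area
    first_mz = next(iter(lcms.eics))
    print(first_mz, lcms.eics[first_mz].areas)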

def find_c13_mass_features(self):
535    def find_c13_mass_features(self):
536        """Mark likely C13 isotopes and connect to monoisotopic mass features.
537
538        Returns
539        -------
540        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
541
542        Raises
543        ------
544        ValueError
545            If no mass features are found.
546        """
547        verbose = self.parameters.lc_ms.verbose_processing
548        if verbose:
549            print("evaluating mass features for C13 isotopes")
550        if self.mass_features is None:
551            raise ValueError("No mass features found, run find_mass_features() first")
552
553        # Data prep for sparse distance matrix
554        dims = ["mz", "scan_time"]
555        mf_df = self.mass_features_to_df().copy()
556        # Drop mass features that have no area (these are likely to be noise)
557        mf_df = mf_df[mf_df["area"].notnull()]
558        mf_df["mf_id"] = mf_df.index.values
559        dims = ["mz", "scan_time"]
560
561        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
562        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
563
564        mz_diff = 1.003355  # C13-C12 mass difference
565        tol = [
566            mf_df["mz"].median()
567            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
568            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
569        ]  # mz, in relative; scan_time in minutes
570
571        # Compute inter-feature distances
572        distances = None
573        for i in range(len(dims)):
574            # Construct k-d tree
575            values = mf_df[dims[i]].values
576            tree = KDTree(values.reshape(-1, 1))
577
578            max_tol = tol[i]
579            if dims[i] == "mz":
580                # Maximum absolute tolerance
581                max_tol = mz_diff + tol[i]
582
583            # Compute sparse distance matrix
584            # the larger the max_tol, the slower this operation is
585            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
586
587            # Only consider forward case, exclude diagonal
588            sdm = sparse.triu(sdm, k=1)
589
590            if dims[i] == "mz":
591                min_tol = mz_diff - tol[i]
592                # Get only the ones that are above the min tol
593                idx = sdm.data > min_tol
594
595                # Reconstruct sparse distance matrix
596                sdm = sparse.coo_matrix(
597                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
598                    shape=(len(values), len(values)),
599                )
600
601            # Cast as binary matrix
602            sdm.data = np.ones_like(sdm.data)
603
604            # Stack distances
605            if distances is None:
606                distances = sdm
607            else:
608                distances = distances.multiply(sdm)
609
610        # Extract indices of within-tolerance points
611        distances = distances.tocoo()
612        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
613
614        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
615        pairs_mf = pairs.copy()
616        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
617        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
618
619        # Connect monoisotopic masses with isotopologues within mass_features
620        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
621        for mono in monos:
622            self.mass_features[mono].monoisotopic_mf_id = mono
623        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
624        while not pairs_iso_df.empty:
625            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
626            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
627            for iso in m1_isos:
628                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
629                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
630                if len(parent) > 1:
631                    # Choose the parent that is closest in time to the isotopologue
632                    parent_time = [self.mass_features[p].retention_time for p in parent]
633                    time_diff = [
634                        np.abs(self.mass_features[iso].retention_time - x)
635                        for x in parent_time
636                    ]
637                    parent = parent[np.argmin(time_diff)]
638                else:
639                    parent = parent[0]
640                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
641                    parent
642                ].monoisotopic_mf_id
643                if self.mass_features[iso].monoisotopic_mf_id is not None:
644                    mass_diff = (
645                        self.mass_features[iso].mz
646                        - self.mass_features[
647                            self.mass_features[iso].monoisotopic_mf_id
648                        ].mz
649                    )
650                    self.mass_features[iso].isotopologue_type = "13C" + str(
651                        int(round(mass_diff, 0))
652                    )
653
654            # Drop the mono and iso from the pairs_iso_df
655            pairs_iso_df = pairs_iso_df.drop(
656                index=monos, errors="ignore"
657            )  # Drop pairs where the parent is a child that is a child of a root
658            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
659            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
660
661            if not pairs_iso_df.empty:
662                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
663                monos = np.setdiff1d(
664                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
665                )
666        if verbose:
667            # Report fraction of compounds annotated with isotopes
668            mf_df["c13_flag"] = np.where(
669                np.logical_or(
670                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
671                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
672                ),
673                1,
674                0,
675            )
676            print(
677                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
678                + " of mass features have or are C13 isotopes"
679            )

Mark likely C13 isotopes and connect to monoisotopic mass features.

Returns
  • None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found.
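For intuition, a small sketch of the pairing test applied above: a candidate 13C isotopologue must sit one 13C-12C mass difference above the monoisotopic feature (within the mz cluster tolerance) and elute at nearly the same time. All numbers below are hypothetical.

    mz_diff = 1.003355                      # 13C - 12C mass difference, as above
    mz_mono, mz_iso = 400.2000, 401.2030    # hypothetical feature m/z values
    rt_mono, rt_iso = 5.02, 5.03            # hypothetical scan times (minutes)

    mz_tol = 400.2000 * 5e-6                # e.g. a 5 ppm tolerance expressed in Da
    rt_tol = 0.1 * 0.5                      # half of a hypothetical RT cluster tolerance

    is_c13_pair = (
        abs((mz_iso - mz_mono) - mz_diff) <= mz_tol
        and abs(rt_iso - rt_mono) <= rt_tol
    )
    print(is_c13_pair)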
def deconvolute_ms1_mass_features(self):
681    def deconvolute_ms1_mass_features(self):
682        """Deconvolute MS1 mass features
683
684        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature
685
686        Parameters
687        ----------
688        None
689
690        Returns
691        -------
692        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
693        and associated_mass_features_deconvoluted attributes to the mass features in the
694        mass_features attribute of the LCMSBase object.
695
696        Raises
697        ------
698        ValueError
699            If no mass features are found, must run find_mass_features() first.
700            If no EICs are found, did you run integrate_mass_features() first?
701
702        """
703        # Checks for set mass_features and eics
704        if self.mass_features is None:
705            raise ValueError(
706                "No mass features found, did you run find_mass_features() first?"
707            )
708
709        if self.eics == {}:
710            raise ValueError(
711                "No EICs found, did you run integrate_mass_features() first?"
712            )
713
714        if 1 not in self._ms_unprocessed.keys():
715            raise ValueError("No unprocessed MS1 spectra found.")
716
717        # Prep ms1 data
718        ms1_data = self._ms_unprocessed[1].copy()
719        ms1_data = ms1_data.set_index("scan")
720
721        # Prep mass feature summary
722        mass_feature_df = self.mass_features_to_df()
723
724        # Loop through each mass feature
725        for mf_id, mass_feature in self.mass_features.items():
726            # Skip the mass feature if its mz does not match the mz of its associated MS1 peak
727            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
728                continue
729
730            # Get the left and right limits of the EIC of the mass feature
731            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
732
733            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
734            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
735            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
736
737            # Get the centroided masses of the mass feature
738            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
739
740            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
741            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
742                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
743            ]
744
745            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
746            ms1_data_sub["mz_diff_rel"] = (
747                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
748                / ms1_data_sub["mz"]
749            )
750            ms1_data_sub = ms1_data_sub[
751                ms1_data_sub["mz_diff_rel"]
752                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
753            ].reset_index(drop=True)
754
755            # Group by mass_feature_mz and scan and sum intensity
756            ms1_data_sub_group = (
757                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
758                .sum()
759                .reset_index()
760            )
761
762            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
763            corr = (
764                ms1_data_sub_group.pivot(
765                    index="scan", columns="mass_feature_mz", values="intensity"
766                )
767                .fillna(0)
768                .corr()
769            )
770
771            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above the deconvolution minimum
772            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
773            corr_subset = corr.loc[mass_feature.mz,]
774            corr_subset = corr_subset[corr_subset > decon_corr_min]
775
776            # Get the masses from the mass spectrum that are the result of the deconvolution
777            mzs_decon = corr_subset.index.values
778
779            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
780            mzs_decon_idx = [
781                id
782                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
783                if mz in mzs_decon
784            ]
785            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
786
787            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
788            if (
789                mass_feature.ms1_peak.abundance
790                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
791            ):
792                mass_feature.mass_spectrum_deconvoluted_parent = True
793            else:
794                mass_feature.mass_spectrum_deconvoluted_parent = False
795
796            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
797            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
798            mass_feature_df_sub = mass_feature_df[
799                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
800                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
801            ].copy()
802            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
803            mass_feature_df_sub["mz_diff_ppm"] = [
804                np.abs(mzs_decon - mz).min() / mz * 10**6
805                for mz in mass_feature_df_sub["mz"]
806            ]
807            # Subset mass_feature_df to only include mass features within the cluster mz tolerance of the deconvoluted masses
808            mfs_associated_decon = mass_feature_df_sub[
809                mass_feature_df_sub["mz_diff_ppm"]
810                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
811            ].index.values
812
813            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Deconvolute MS1 mass features

Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.

Parameters
  • None
Returns
  • None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found, must run find_mass_features() first. If no EICs are found, did you run integrate_mass_features() first?
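A toy sketch of the correlation step used above: intensities are pivoted to a scan-by-mz table, masses are correlated across the feature's EIC scans, and masses correlating above the minimum are kept with the feature. The data and the 0.8 threshold below are hypothetical.

    import pandas as pd

    # Three m/z traces across the scans spanning one mass feature's EIC
    ms1_long = pd.DataFrame({
        "scan":      [1, 2, 3] * 3,
        "mz":        [200.1] * 3 + [201.1] * 3 + [350.2] * 3,
        "intensity": [10, 100, 12, 1, 11, 1, 50, 49, 52],
    })

    corr = ms1_long.pivot(index="scan", columns="mz", values="intensity").fillna(0).corr()
    co_eluting = corr.loc[200.1]
    co_eluting = co_eluting[co_eluting > 0.8].index.values  # masses deconvoluted onto this feature
    print(co_eluting)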
class PHCalculations:
 816class PHCalculations:
 817    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 818
 819    Notes
 820    -----
 821    This class is intended to be used as a mixin for the LCMSBase class.
 822
 823    Methods
 824    -------
 825    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 826        Sparse implementation of a mean filter.
 827    * embed_unique_indices(a).
 828        Creates an array of indices, sorted by unique element.
 829    * sparse_upper_star(idx, V).
 830        Sparse implementation of an upper star filtration.
 831    * check_if_grid(data).
 832        Check if the data is gridded in mz space.
 833    * grid_data(data).
 834        Grid the data in the mz dimension.
 835    * find_mass_features_ph(ms_level=1, grid=True).
 836        Find mass features within an LCMSBase object using persistent homology.
 837    * cluster_mass_features(drop_children=True).
 838        Cluster regions of interest.
 839    """
 840
 841    @staticmethod
 842    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 843        """Sparse implementation of a mean filter.
 844
 845        Parameters
 846        ----------
 847        idx : :obj:`~numpy.array`
 848            Edge indices for each dimension (MxN).
 849        V : :obj:`~numpy.array`
 850            Array of intensity data (Mx1).
 851        radius : float or list
 852            Radius of the sparse filter in each dimension. Values less than
 853            one indicate no connectivity in that dimension.
 854
 855        Returns
 856        -------
 857        :obj:`~numpy.array`
 858            Filtered intensities (Mx1).
 859
 860        Notes
 861        -----
 862        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 863        This is a static method.
 864        """
 865
 866        # Copy indices
 867        idx = idx.copy().astype(V.dtype)
 868
 869        # Scale
 870        for i, r in enumerate(radius):
 871            # Increase inter-index distance
 872            if r < 1:
 873                idx[:, i] *= 2
 874
 875            # Do nothing
 876            elif r == 1:
 877                pass
 878
 879            # Decrease inter-index distance
 880            else:
 881                idx[:, i] /= r
 882
 883        # Connectivity matrix
 884        cmat = KDTree(idx)
 885        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 886        cmat.setdiag(1)
 887
 888        # Pair indices
 889        I, J = cmat.nonzero()
 890
 891        # Delete cmat
 892        cmat_shape = cmat.shape
 893        del cmat
 894
 895        # Sum over columns
 896        V_sum = sparse.bsr_matrix(
 897            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 898        ).diagonal(0)
 899
 900        # Count over columns
 901        V_count = sparse.bsr_matrix(
 902            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 903        ).diagonal(0)
 904
 905        return V_sum / V_count
 906
 907    @staticmethod
 908    def embed_unique_indices(a):
 909        """Creates an array of indices, sorted by unique element.
 910
 911        Parameters
 912        ----------
 913        a : :obj:`~numpy.array`
 914            Array of unique elements (Mx1).
 915
 916        Returns
 917        -------
 918        :obj:`~numpy.array`
 919            Array of indices (Mx1).
 920
 921        Notes
 922        -----
 923        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 924        This is a static method.
 925        """
 926
 927        def count_tens(n):
 928            # Count tens
 929            ntens = (n - 1) // 10
 930
 931            while True:
 932                ntens_test = (ntens + n - 1) // 10
 933
 934                if ntens_test == ntens:
 935                    return ntens
 936                else:
 937                    ntens = ntens_test
 938
 939        def arange_exclude_10s(n):
 940            # How many 10s will there be?
 941            ntens = count_tens(n)
 942
 943            # Base array
 944            arr = np.arange(0, n + ntens)
 945
 946            # Exclude 10s
 947            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 948
 949            return arr
 950
 951        # Creates an array of indices, sorted by unique element
 952        idx_sort = np.argsort(a)
 953        idx_unsort = np.argsort(idx_sort)
 954
 955        # Sorts records array so all unique elements are together
 956        sorted_a = a[idx_sort]
 957
 958        # Returns the unique values, the index of the first occurrence,
 959        # and the count for each element
 960        vals, idx_start, count = np.unique(
 961            sorted_a, return_index=True, return_counts=True
 962        )
 963
 964        # Splits the indices into separate arrays
 965        splits = np.split(idx_sort, idx_start[1:])
 966
 967        # Creates unique indices for each split
 968        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 969
 970        # Reorders according to input array
 971        idx_unq = idx_unq[idx_unsort]
 972
 973        # Magnitude of each index
 974        exp = np.log10(
 975            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 976        )
 977        idx_unq_mag = np.power(10, np.floor(exp) + 1)
 978
 979        # Result
 980        return a + idx_unq / idx_unq_mag
 981    
 982    @staticmethod
 983    def roll_up_dataframe(
 984        df : pd.DataFrame,
 985        sort_by : str,
 986        tol : list,
 987        relative : list,
 988        dims : list
 989    ):
 990        """Subset data by rolling up into apex in appropriate dimensions.
 991        
 992        Parameters
 993        ----------
 994        df : pd.DataFrame
 995            The input data containing "dims" columns and the "sort_by" column.
 996        sort_by : str
 997            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 
 998            (i.e., the mass feature with the highest value in the sort_by column).
 999        dims : list
1000            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1001        tol : list
1002            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1003            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1004            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1005            If relative is False, the tolerance will be used as is.
1006        relative : list
1007            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1008
1009        Returns
1010        -------
1011        pd.DataFrame
1012            A DataFrame with only the rolled up mass features, with the original index and columns.
1013
1014            
1015        Raises
1016        ------
1017        ValueError
1018            If the input data is not a pandas DataFrame.
1019            If the input data does not have columns for each of the dimensions in "dims".
1020            If the length of "dims", "tol", and "relative" do not match.
1021        """
1022        og_columns = df.columns.copy()
1023
1024        # Unindex the data, but keep the original index
1025        if df.index.name is not None:
1026            og_index = df.index.name
1027        else:
1028            og_index = "index"
1029        df = df.reset_index(drop=False)
1030
1031        # Sort data by sort_by column, and reindex
1032        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1033
1034        # Check that data is a DataFrame and has columns for each of the dims
1035        if not isinstance(df, pd.DataFrame):
1036            raise ValueError("Data must be a pandas DataFrame")
1037        for dim in dims:
1038            if dim not in df.columns:
1039                raise ValueError(f"Data must have a column for {dim}")
1040        if len(dims) != len(tol) or len(dims) != len(relative):
1041            raise ValueError(
1042                "Dimensions, tolerances, and relative flags must be the same length"
1043            )
1044        
1045        # Compute inter-feature distances
1046        distances = None
1047        for i in range(len(dims)):
1048            # Construct k-d tree
1049            values = df[dims[i]].values
1050            tree = KDTree(values.reshape(-1, 1))
1051
1052            max_tol = tol[i]
1053            if relative[i] is True:
1054                # Maximum absolute tolerance
1055                max_tol = tol[i] * values.max()
1056
1057            # Compute sparse distance matrix
1058            # the larger the max_tol, the slower this operation is
1059            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1060
1061            # Only consider forward case, exclude diagonal
1062            sdm = sparse.triu(sdm, k=1)
1063
1064            # Filter relative distances
1065            if relative[i] is True:
1066                # Compute relative distances
1067                rel_dists = sdm.data / values[sdm.row]  # or col?
1068
1069                # Indices of relative distances less than tolerance
1070                idx = rel_dists <= tol[i]
1071
1072                # Reconstruct sparse distance matrix
1073                sdm = sparse.coo_matrix(
1074                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1075                    shape=(len(values), len(values)),
1076                )
1077
1078            # Cast as binary matrix
1079            sdm.data = np.ones_like(sdm.data)
1080
1081            # Stack distances
1082            if distances is None:
1083                distances = sdm
1084            else:
1085                distances = distances.multiply(sdm)
1086        
1087        # Extract indices of within-tolerance points
1088        distances = distances.tocoo()
1089        pairs = np.stack((distances.row, distances.col), axis=1)
1090        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1091        pairs_df = pairs_df.set_index("parent")
1092
1093        to_drop = []
1094        while not pairs_df.empty:
1095            # Find root_parents and their children
1096            root_parents = np.setdiff1d(
1097                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1098            )
1099            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1100            to_drop = np.append(to_drop, children_of_roots)
1101
1102            # Remove root_children as possible parents from pairs_df for next iteration
1103            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1104            pairs_df = pairs_df.reset_index().set_index("child")
1105            # Remove root_children as possible children from pairs_df for next iteration
1106            pairs_df = pairs_df.drop(index=children_of_roots)
1107
1108            # Prepare for next iteration
1109            pairs_df = pairs_df.reset_index().set_index("parent")
1110
1111        # Drop mass features that are not cluster parents
1112        df_sub = df.drop(index=np.array(to_drop))
1113
1114        # Set index back to og_index and only keep the columns that are in the original dataframe
1115        df_sub = df_sub.set_index(og_index)
1116
1117        # sort the dataframe by the original index
1118        df_sub = df_sub.sort_index()
1119        df_sub = df_sub[og_columns]
1120
1121        return df_sub
1122    
1123    def sparse_upper_star(self, idx, V):
1124        """Sparse implementation of an upper star filtration.
1125
1126        Parameters
1127        ----------
1128        idx : :obj:`~numpy.array`
1129            Edge indices for each dimension (MxN).
1130        V : :obj:`~numpy.array`
1131            Array of intensity data (Mx1).
1132        Returns
1133        -------
1134        idx : :obj:`~numpy.array`
1135            Index of filtered points (Mx1).
1136        persistence : :obj:`~numpy.array`
1137            Persistence of each filtered point (Mx1).
1138
1139        Notes
1140        -----
1141        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1142        """
1143
1144        # Invert
1145        V = -1 * V.copy().astype(int)
1146
1147        # Embed indices
1148        V = self.embed_unique_indices(V)
1149
1150        # Connectivity matrix
1151        cmat = KDTree(idx)
1152        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1153        cmat.setdiag(1)
1154        cmat = sparse.triu(cmat)
1155
1156        # Pairwise minimums
1157        I, J = cmat.nonzero()
1158        d = np.maximum(V[I], V[J])
1159
1160        # Delete connectivity matrix
1161        cmat_shape = cmat.shape
1162        del cmat
1163
1164        # Sparse distance matrix
1165        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1166
1167        # Delete pairwise mins
1168        del d, I, J
1169
1170        # Persistent homology
1171        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1172
1173        # Bound death values
1174        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1175
1176        # Construct tree to query against
1177        tree = KDTree(V.reshape((-1, 1)))
1178
1179        # Get the indexes of the first nearest neighbor by birth
1180        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1181
1182        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1183
1184    def check_if_grid(self, data):
1185        """Check if the data are gridded in mz space.
1186
1187        Parameters
1188        ----------
1189        data : DataFrame
1190            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1191
1192        Returns
1193        -------
1194        bool
1195            True if the data is gridded in the mz direction, False otherwise.
1196
1197        Notes
1198        -----
1199        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1200        """
1201        # Calculate the difference between consecutive mz values in a single scan
1202        dat_check = data.copy().reset_index(drop=True)
1203        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1204        mz_diff_min = (
1205            dat_check.groupby("scan")["mz_diff"].min().min()
1206        )  # within each scan, what is the smallest mz difference between consecutive mz values
1207
1208        # Find the mininum mz difference between mz values in the data; regardless of scan
1209        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1210        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1211        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1212
1213        # Get minimum mz_diff between mz values in the data
1214        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1215
1216        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1217        if mz_diff_min_raw < mz_diff_min:
1218            return False
1219        else:
1220            return True
1221
1222    def grid_data(self, data):
1223        """Grid the data in the mz dimension.
1224
1225        Data must be gridded prior to persistent homology calculations.
1226
1227        Parameters
1228        ----------
1229        data : DataFrame
1230            The input data containing mz, scan, scan_time, and intensity columns.
1231
1232        Returns
1233        -------
1234        DataFrame
1235            The gridded data with mz, scan, scan_time, and intensity columns.
1236
1237        Raises
1238        ------
1239        ValueError
1240            If gridding fails.
1241        """
1242
1243        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1244        data_w = data.copy().reset_index(drop=True)
1245        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1246        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1247
1248        # Need high intensity mz values first so they are parents in the output pairs stack
1249        dat_mz = data_w[["mz", "intensity"]].sort_values(
1250            by=["intensity"], ascending=False
1251        )
1252        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1253
1254        # Construct KD tree
1255        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1256        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1257        sdm = sparse.triu(sdm, k=1)
1258        sdm.data = np.ones_like(sdm.data)
1259        distances = sdm.tocoo()
1260        pairs = np.stack((distances.row, distances.col), axis=1)
1261
1262        # Cull pairs to just get root
1263        to_drop = []
1264        while len(pairs) > 0:
1265            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1266            id_root_parents = np.isin(pairs[:, 0], root_parents)
1267            children_of_roots = np.unique(pairs[id_root_parents, 1])
1268            to_drop = np.append(to_drop, children_of_roots)
1269
1270            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1271            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1272            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1273        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1274        mz_dat_np = (
1275            dat_mz[["mz"]]
1276            .sort_values(by=["mz"])
1277            .reset_index(drop=True)
1278            .values.flatten()
1279        )
1280
1281        # Sort data by mz and recast mz to nearest value in mz_dat_np
1282        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1283        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1284        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1285
1286        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1287        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1288        new_data_w = (
1289            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1290            .groupby(["scan", "mz"])["intensity"]
1291            .sum()
1292            .reset_index()
1293        )
1294        new_data_w = (
1295            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1296            .reset_index(drop=True)
1297            .copy()
1298        )
1299
1300        # Check if grid worked and return
1301        if self.check_if_grid(new_data_w):
1302            return new_data_w
1303        else:
1304            raise ValueError("Gridding failed")
1305
1306    def find_mass_features_ph(self, ms_level=1, grid=True):
1307        """Find mass features within an LCMSBase object using persistent homology.
1308
1309        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1310
1311        Parameters
1312        ----------
1313        ms_level : int, optional
1314            The MS level to use. Default is 1.
1315        grid : bool, optional
1316            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1317
1318        Raises
1319        ------
1320        ValueError
1321            If no MS level data is found on the object.
1322            If data is not gridded and grid is False.
1323
1324        Returns
1325        -------
1326        None, but assigns the mass_features attribute to the object.
1327
1328        Notes
1329        -----
1330        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1331        """
1332        # Check that ms_level is a key in self._ms_unprocessed
1333        if ms_level not in self._ms_unprocessed.keys():
1334            raise ValueError(
1335                "No MS level "
1336                + str(ms_level)
1337                + " data found, did you instantiate with parser specific to MS level?"
1338            )
1339
1340        # Get ms data
1341        data = self._ms_unprocessed[ms_level].copy()
1342
1343        # Drop rows with missing intensity values and reset index
1344        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1345
1346        # Threshold data
1347        dims = ["mz", "scan_time"]
1348        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1349        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1350
1351        # Check if gridded, if not, grid
1352        gridded_mz = self.check_if_grid(data_thres)
1353        if gridded_mz is False:
1354            if grid is False:
1355                raise ValueError(
1356                    "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function"
1357                )
1358            else:
1359                data_thres = self.grid_data(data_thres)
1360
1361        # Add build factors and add scan_time
1362        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1363        factors = {
1364            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1365            for dim in dims
1366        }  # pd.factorize returns a float64 index, hence the cast to float32
1367
1368        # Build indexes
1369        index = {
1370            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1371            for dim in factors
1372        }
1373
1374        # Smooth data
1375        iterations = self.parameters.lc_ms.ph_smooth_it
1376        smooth_radius = [
1377            self.parameters.lc_ms.ph_smooth_radius_mz,
1378            self.parameters.lc_ms.ph_smooth_radius_scan,
1379        ]  # mz, scan_time smoothing radius (in steps)
1380
1381        index = np.vstack([index[dim] for dim in dims]).T
1382        V = data_thres["intensity"].values
1383        resid = np.inf
1384        for i in range(iterations):
1385            # Previous iteration
1386            V_prev = V.copy()
1387            resid_prev = resid
1388            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1389
1390            # Calculate residual with previous iteration
1391            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1392
1393            # Evaluate convergence
1394            if i > 0:
1395                # Percent change in residual
1396                test = np.abs(resid - resid_prev) / resid_prev
1397
1398                # Exit criteria
1399                if test <= 0:
1400                    break
1401
1402        # Overwrite values
1403        data_thres["intensity"] = V
1404
1405        # Use persistent homology to find regions of interest
1406        pidx, pers = self.sparse_upper_star(index, V)
1407        pidx = pidx[pers > 1]
1408        pers = pers[pers > 1]
1409
1410        # Get peaks
1411        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1412
1413        # Add persistence column
1414        peaks["persistence"] = pers
1415        mass_features = peaks.sort_values(
1416            by="persistence", ascending=False
1417        ).reset_index(drop=True)
1418
1419        # Filter by persistence threshold
1420        persistence_threshold = (
1421            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1422        )
1423        mass_features = mass_features.loc[
1424            mass_features["persistence"] > persistence_threshold, :
1425        ].reset_index(drop=True)
1426
1427        # Rename scan column to apex_scan
1428        mass_features = mass_features.rename(
1429            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1430        )
1431
1432        # Populate mass_features attribute
1433        self.mass_features = {}
1434        for row in mass_features.itertuples():
1435            row_dict = mass_features.iloc[row.Index].to_dict()
1436            lcms_feature = LCMSMassFeature(self, **row_dict)
1437            self.mass_features[lcms_feature.id] = lcms_feature
1438
1439        if self.parameters.lc_ms.verbose_processing:
1440            print("Found " + str(len(mass_features)) + " initial mass features")
1441
1442    def find_mass_features_ph_centroid(self, ms_level=1):
1443        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1444        
1445        Parameters
1446        ----------
1447        ms_level : int, optional
1448            The MS level to use. Default is 1.
1449        
1450        Raises
1451        ------
1452        ValueError
1453            If no MS level data is found on the object.
1454
1455        Returns
1456        -------
1457        None, but assigns the mass_features attribute to the object.        
1458        """
1459        # Check that ms_level is a key in self._ms_unprocessed
1460        if ms_level not in self._ms_unprocessed.keys():
1461            raise ValueError(
1462                "No MS level "
1463                + str(ms_level)
1464                + " data found, did you instantiate with parser specific to MS level?"
1465            )
1466        
1467        # Get ms data
1468        data = self._ms_unprocessed[ms_level].copy()
1469
1470        # Drop rows with missing intensity values and reset index
1471        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1472        
1473        # Threshold data
1474        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1475        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1476        data_thres['persistence'] = data_thres['intensity']
1477
1478        # Use this as the starting point for the mass features, adding scan_time
1479        mf_df = data_thres
1480        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1481
1482        # Define tolerances and dimensions for rolling up
1483        tol = [
1484            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1485            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1486        ]
1487        relative = [True, False]    
1488        dims = ["mz", "scan_time"]
1489
1490        mf_df = self.roll_up_dataframe(
1491            df=mf_df,
1492            sort_by="persistence",
1493            dims=dims,
1494            tol=tol,
1495            relative=relative
1496        )
1497
1498        # Rename scan column to apex_scan
1499        mass_features = mf_df.rename(
1500            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1501        )
1502        # Sort by persistence and reset index
1503        mass_features = mass_features.sort_values(
1504            by="persistence", ascending=False
1505        ).reset_index(drop=True)
1506
1507        # Populate mass_features attribute
1508        self.mass_features = {}
1509        for row in mass_features.itertuples():
1510            row_dict = mass_features.iloc[row.Index].to_dict()
1511            lcms_feature = LCMSMassFeature(self, **row_dict)
1512            self.mass_features[lcms_feature.id] = lcms_feature
1513
1514        if self.parameters.lc_ms.verbose_processing:
1515            print("Found " + str(len(mass_features)) + " initial mass features")
1516    
1517    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1518        """Cluster mass features
1519
1520        Mass features are clustered based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence as cluster parents.
1521
1522        Parameters
1523        ----------
1524        drop_children : bool, optional
1525            Whether to drop the mass features that are not cluster parents. Default is True.
1526        sort_by : str, optional
1527            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
1528
1529        Raises
1530        ------
1531        ValueError
1532            If no mass features are found.
1533            If too many mass features are found.
1534
1535        Returns
1536        -------
1537        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1538        """
1539        verbose = self.parameters.lc_ms.verbose_processing
1540
1541        if self.mass_features is None:
1542            raise ValueError("No mass features found, run find_mass_features() first")
1543        if len(self.mass_features) > 400000:
1544            raise ValueError(
1545                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1546            )
1547        dims = ["mz", "scan_time"]
1548        mf_df_og = self.mass_features_to_df()
1549        mf_df = mf_df_og.copy()
1550
1551        tol = [
1552            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1553            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1554        ]  # mz, in relative; scan_time in minutes
1555        relative = [True, False]
1556
1557        # Roll up mass features based on their proximity in the declared dimensions
1558        mf_df_new = self.roll_up_dataframe(
1559            df=mf_df,
1560            sort_by=sort_by,
1561            dims=dims,
1562            tol=tol,
1563            relative=relative
1564        )
1565
1566        mf_df["cluster_parent"] = np.where(
1567            np.isin(mf_df.index, mf_df_new.index), True, False
1568        )
1569
1570        # get mass feature ids of features that are not cluster parents
1571        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1572        if drop_children is True:
1573            # Drop mass features that are not cluster parents from self
1574            self.mass_features = {
1575                k: v
1576                for k, v in self.mass_features.items()
1577                if k not in cluster_daughters
1578            }
1579        else:
1580            return cluster_daughters

Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
  • embed_unique_indices(a). Creates an array of indices, sorted by unique element.
  • sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
  • check_if_grid(data). Check if the data is gridded in mz space.
  • grid_data(data). Grid the data in the mz dimension.
  • find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
  • cluster_mass_features(drop_children=True). Cluster regions of interest.
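
For orientation, a minimal workflow sketch is given below. Here `lcms_obj` is a hypothetical, already-parsed LCMSBase-derived object that carries this mixin; the call order and argument values are illustrative only.

    # Hypothetical usage sketch; `lcms_obj` is assumed to be an LCMSBase-derived
    # object with MS1 data already loaded.
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)   # 2D peak picking
    lcms_obj.cluster_mass_features(drop_children=True)      # roll up nearby features
    print(len(lcms_obj.mass_features), "mass features retained")
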
@staticmethod
def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
841    @staticmethod
842    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
843        """Sparse implementation of a mean filter.
844
845        Parameters
846        ----------
847        idx : :obj:`~numpy.array`
848            Edge indices for each dimension (MxN).
849        V : :obj:`~numpy.array`
850            Array of intensity data (Mx1).
851        radius : float or list
852            Radius of the sparse filter in each dimension. Values less than
853            zero indicate no connectivity in that dimension.
854
855        Returns
856        -------
857        :obj:`~numpy.array`
858            Filtered intensities (Mx1).
859
860        Notes
861        -----
862        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
863        This is a static method.
864        """
865
866        # Copy indices
867        idx = idx.copy().astype(V.dtype)
868
869        # Scale
870        for i, r in enumerate(radius):
871            # Increase inter-index distance
872            if r < 1:
873                idx[:, i] *= 2
874
875            # Do nothing
876            elif r == 1:
877                pass
878
879            # Decrease inter-index distance
880            else:
881                idx[:, i] /= r
882
883        # Connectivity matrix
884        cmat = KDTree(idx)
885        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
886        cmat.setdiag(1)
887
888        # Pair indices
889        I, J = cmat.nonzero()
890
891        # Delete cmat
892        cmat_shape = cmat.shape
893        del cmat
894
895        # Sum over columns
896        V_sum = sparse.bsr_matrix(
897            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
898        ).diagonal(0)
899
900        # Count over columns
901        V_count = sparse.bsr_matrix(
902            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
903        ).diagonal(0)
904
905        return V_sum / V_count

Sparse implementation of a mean filter.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
  • radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
  • ~numpy.array: Filtered intensities (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
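
A standalone sketch of the neighbourhood-mean idea behind this filter is shown below, using synthetic grid positions and intensities and a radius of one grid step in each dimension under the Chebyshev metric (matching the connectivity used above):

    import numpy as np
    from scipy.spatial import KDTree

    # Three points on a tiny grid with one intensity spike in the middle
    idx = np.array([[0, 0], [0, 1], [0, 2]], dtype=float)
    V = np.array([10.0, 100.0, 10.0])

    tree = KDTree(idx)
    neighbors = tree.query_ball_tree(tree, r=1, p=np.inf)   # each point plus its grid neighbours
    V_filtered = np.array([V[nb].mean() for nb in neighbors])
    print(V_filtered)   # [55. 40. 55.] - the spike is pulled toward its neighbourhood mean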

@staticmethod
def embed_unique_indices(a):
907    @staticmethod
908    def embed_unique_indices(a):
909        """Creates an array of indices, sorted by unique element.
910
911        Parameters
912        ----------
913        a : :obj:`~numpy.array`
914            Array of unique elements (Mx1).
915
916        Returns
917        -------
918        :obj:`~numpy.array`
919            Array of indices (Mx1).
920
921        Notes
922        -----
923        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
924        This is a static method.
925        """
926
927        def count_tens(n):
928            # Count tens
929            ntens = (n - 1) // 10
930
931            while True:
932                ntens_test = (ntens + n - 1) // 10
933
934                if ntens_test == ntens:
935                    return ntens
936                else:
937                    ntens = ntens_test
938
939        def arange_exclude_10s(n):
940            # How many 10s will there be?
941            ntens = count_tens(n)
942
943            # Base array
944            arr = np.arange(0, n + ntens)
945
946            # Exclude 10s
947            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
948
949            return arr
950
951        # Creates an array of indices, sorted by unique element
952        idx_sort = np.argsort(a)
953        idx_unsort = np.argsort(idx_sort)
954
955        # Sorts records array so all unique elements are together
956        sorted_a = a[idx_sort]
957
958        # Returns the unique values, the index of the first occurrence,
959        # and the count for each element
960        vals, idx_start, count = np.unique(
961            sorted_a, return_index=True, return_counts=True
962        )
963
964        # Splits the indices into separate arrays
965        splits = np.split(idx_sort, idx_start[1:])
966
967        # Creates unique indices for each split
968        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
969
970        # Reorders according to input array
971        idx_unq = idx_unq[idx_unsort]
972
973        # Magnitude of each index
974        exp = np.log10(
975            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
976        )
977        idx_unq_mag = np.power(10, np.floor(exp) + 1)
978
979        # Result
980        return a + idx_unq / idx_unq_mag

Creates an array of indices, sorted by unique element.

Parameters
  • a (~numpy.array): Array of unique elements (Mx1).
Returns
  • ~numpy.array: Array of indices (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
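
As a small illustration (hypothetical values, with `lcms_obj` standing in for an LCMSBase-derived object that carries this mixin), duplicate values gain a fractional suffix so that downstream persistence calculations see no ties:

    import numpy as np

    a = np.array([5, 5, 7])
    # The two 5s become 5.0 and 5.1, while the unique 7 stays 7.0
    print(lcms_obj.embed_unique_indices(a))   # expected: [5.  5.1 7. ]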

@staticmethod
def roll_up_dataframe( df: pandas.core.frame.DataFrame, sort_by: str, tol: list, relative: list, dims: list):
 982    @staticmethod
 983    def roll_up_dataframe(
 984        df : pd.DataFrame,
 985        sort_by : str,
 986        tol : list,
 987        relative : list,
 988        dims : list
 989    ):
 990        """Subset data by rolling up into apex in appropriate dimensions.
 991        
 992        Parameters
 993        ----------
 994        df : pd.DataFrame
 995            The input data containing "dims" columns and the "sort_by" column.
 996        sort_by : str
 997            The column to sort the data by; this determines which mass features get rolled up into a parent mass feature
 998            (i.e., the mass feature with the highest value in the sort_by column).
 999        dims : list
1000            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1001        tol : list
1002            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1003            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1004            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1005            If relative is False, the tolerance will be used as is.
1006        relative : list
1007            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1008
1009        Returns
1010        -------
1011        pd.DataFrame
1012            A DataFrame with only the rolled up mass features, with the original index and columns.
1013
1014            
1015        Raises
1016        ------
1017        ValueError
1018            If the input data is not a pandas DataFrame.
1019            If the input data does not have columns for each of the dimensions in "dims".
1020            If the length of "dims", "tol", and "relative" do not match.
1021        """
1022        og_columns = df.columns.copy()
1023
1024        # Unindex the data, but keep the original index
1025        if df.index.name is not None:
1026            og_index = df.index.name
1027        else:
1028            og_index = "index"
1029        df = df.reset_index(drop=False)
1030
1031        # Sort data by sort_by column, and reindex
1032        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1033
1034        # Check that data is a DataFrame and has columns for each of the dims
1035        if not isinstance(df, pd.DataFrame):
1036            raise ValueError("Data must be a pandas DataFrame")
1037        for dim in dims:
1038            if dim not in df.columns:
1039                raise ValueError(f"Data must have a column for {dim}")
1040        if len(dims) != len(tol) or len(dims) != len(relative):
1041            raise ValueError(
1042                "Dimensions, tolerances, and relative flags must be the same length"
1043            )
1044        
1045        # Compute inter-feature distances
1046        distances = None
1047        for i in range(len(dims)):
1048            # Construct k-d tree
1049            values = df[dims[i]].values
1050            tree = KDTree(values.reshape(-1, 1))
1051
1052            max_tol = tol[i]
1053            if relative[i] is True:
1054                # Maximum absolute tolerance
1055                max_tol = tol[i] * values.max()
1056
1057            # Compute sparse distance matrix
1058            # the larger the max_tol, the slower this operation is
1059            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1060
1061            # Only consider forward case, exclude diagonal
1062            sdm = sparse.triu(sdm, k=1)
1063
1064            # Filter relative distances
1065            if relative[i] is True:
1066                # Compute relative distances
1067                rel_dists = sdm.data / values[sdm.row]  # or col?
1068
1069                # Indices of relative distances less than tolerance
1070                idx = rel_dists <= tol[i]
1071
1072                # Reconstruct sparse distance matrix
1073                sdm = sparse.coo_matrix(
1074                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1075                    shape=(len(values), len(values)),
1076                )
1077
1078            # Cast as binary matrix
1079            sdm.data = np.ones_like(sdm.data)
1080
1081            # Stack distances
1082            if distances is None:
1083                distances = sdm
1084            else:
1085                distances = distances.multiply(sdm)
1086        
1087        # Extract indices of within-tolerance points
1088        distances = distances.tocoo()
1089        pairs = np.stack((distances.row, distances.col), axis=1)
1090        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1091        pairs_df = pairs_df.set_index("parent")
1092
1093        to_drop = []
1094        while not pairs_df.empty:
1095            # Find root_parents and their children
1096            root_parents = np.setdiff1d(
1097                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1098            )
1099            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1100            to_drop = np.append(to_drop, children_of_roots)
1101
1102            # Remove root_children as possible parents from pairs_df for next iteration
1103            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1104            pairs_df = pairs_df.reset_index().set_index("child")
1105            # Remove root_children as possible children from pairs_df for next iteration
1106            pairs_df = pairs_df.drop(index=children_of_roots)
1107
1108            # Prepare for next iteration
1109            pairs_df = pairs_df.reset_index().set_index("parent")
1110
1111        # Drop mass features that are not cluster parents
1112        df_sub = df.drop(index=np.array(to_drop))
1113
1114        # Set index back to og_index and only keep the columns that are in the original dataframe
1115        df_sub = df_sub.set_index(og_index)
1116
1117        # sort the dataframe by the original index
1118        df_sub = df_sub.sort_index()
1119        df_sub = df_sub[og_columns]
1120
1121        return df_sub

Subset data by rolling up into apex in appropriate dimensions.

Parameters
  • df (pd.DataFrame): The input data containing "dims" columns and the "sort_by" column.
  • sort_by (str): The column to sort the data by; this determines which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
  • dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
  • tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance will be multiplied by the maximum value in that dimension. If relative is False, the tolerance will be used as is.
  • relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
Returns
  • pd.DataFrame: A DataFrame with only the rolled up mass features, with the original index and columns.
Raises
  • ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
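
A hedged usage sketch follows; the feature table, the tolerance values, and the `lcms_obj` handle (an LCMSBase-derived object carrying this mixin) are all illustrative assumptions:

    import pandas as pd

    # Two features within ~2.5 ppm and 0.05 min of each other, plus one isolated feature
    df = pd.DataFrame(
        {
            "mz": [200.0000, 200.0005, 350.0000],
            "scan_time": [5.00, 5.05, 9.00],
            "persistence": [1000.0, 400.0, 800.0],
        }
    )
    rolled = lcms_obj.roll_up_dataframe(
        df=df,
        sort_by="persistence",
        dims=["mz", "scan_time"],
        tol=[5e-6, 0.2],          # 5 ppm (relative) in mz, 0.2 min (absolute) in scan_time
        relative=[True, False],
    )
    # Expected: only the apex rows at mz 200.0000 and 350.0000 remain
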
def sparse_upper_star(self, idx, V):
1123    def sparse_upper_star(self, idx, V):
1124        """Sparse implementation of an upper star filtration.
1125
1126        Parameters
1127        ----------
1128        idx : :obj:`~numpy.array`
1129            Edge indices for each dimension (MxN).
1130        V : :obj:`~numpy.array`
1131            Array of intensity data (Mx1).
1132        Returns
1133        -------
1134        idx : :obj:`~numpy.array`
1135            Index of filtered points (Mx1).
1136        persistence : :obj:`~numpy.array`
1137            Persistence of each filtered point (Mx1).
1138
1139        Notes
1140        -----
1141        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1142        """
1143
1144        # Invert
1145        V = -1 * V.copy().astype(int)
1146
1147        # Embed indices
1148        V = self.embed_unique_indices(V)
1149
1150        # Connectivity matrix
1151        cmat = KDTree(idx)
1152        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1153        cmat.setdiag(1)
1154        cmat = sparse.triu(cmat)
1155
1156        # Pairwise maxima of the inverted intensities (i.e., the less intense point of each pair)
1157        I, J = cmat.nonzero()
1158        d = np.maximum(V[I], V[J])
1159
1160        # Delete connectivity matrix
1161        cmat_shape = cmat.shape
1162        del cmat
1163
1164        # Sparse distance matrix
1165        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1166
1167        # Delete pairwise mins
1168        del d, I, J
1169
1170        # Persistence homology
1171        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1172
1173        # Bound death values
1174        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1175
1176        # Construct tree to query against
1177        tree = KDTree(V.reshape((-1, 1)))
1178
1179        # Get the indexes of the first nearest neighbor by birth
1180        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1181
1182        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

Sparse implementation of an upper star filtration.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
Returns
  • idx (~numpy.array): Index of filtered points (Mx1).
  • persistence (~numpy.array): Persistence of each filtered point (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
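
A small hedged sketch of the intended behaviour on a synthetic one-dimensional trace; `lcms_obj` is a hypothetical LCMSBase-derived object carrying this mixin:

    import numpy as np

    # One clear apex (intensity 200) at position 2 in a five-point trace
    idx = np.array([[0], [1], [2], [3], [4]], dtype=np.float32)
    V = np.array([10, 50, 200, 40, 5])

    pidx, pers = lcms_obj.sparse_upper_star(idx, V)
    print(pidx[pers > 1])   # expected to recover the apex index, i.e. [2]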

def check_if_grid(self, data):
1184    def check_if_grid(self, data):
1185        """Check if the data are gridded in mz space.
1186
1187        Parameters
1188        ----------
1189        data : DataFrame
1190            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1191
1192        Returns
1193        -------
1194        bool
1195            True if the data is gridded in the mz direction, False otherwise.
1196
1197        Notes
1198        -----
1199        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1200        """
1201        # Calculate the difference between consecutive mz values in a single scan
1202        dat_check = data.copy().reset_index(drop=True)
1203        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1204        mz_diff_min = (
1205            dat_check.groupby("scan")["mz_diff"].min().min()
1206        )  # within each scan, what is the smallest mz difference between consecutive mz values
1207
1208        # Find the minimum mz difference between mz values in the data, regardless of scan
1209        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1210        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1211        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1212
1213        # Get minimum mz_diff between mz values in the data
1214        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1215
1216        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1217        if mz_diff_min_raw < mz_diff_min:
1218            return False
1219        else:
1220            return True

Check if the data are gridded in mz space.

Parameters
  • data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
  • bool: True if the data is gridded in the mz direction, False otherwise.
Notes

This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
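
A minimal illustration with hypothetical data; `lcms_obj` stands in for an LCMSBase-derived object carrying this mixin:

    import pandas as pd

    # Both scans report the same m/z axis, so the data are gridded
    data = pd.DataFrame(
        {
            "scan": [1, 1, 1, 2, 2, 2],
            "mz": [100.00, 100.01, 100.02, 100.00, 100.01, 100.02],
            "intensity": [5.0, 50.0, 5.0, 4.0, 40.0, 4.0],
        }
    )
    print(lcms_obj.check_if_grid(data))   # expected: True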

def grid_data(self, data):
1222    def grid_data(self, data):
1223        """Grid the data in the mz dimension.
1224
1225        Data must be gridded prior to persistent homology calculations.
1226
1227        Parameters
1228        ----------
1229        data : DataFrame
1230            The input data containing mz, scan, scan_time, and intensity columns.
1231
1232        Returns
1233        -------
1234        DataFrame
1235            The gridded data with mz, scan, scan_time, and intensity columns.
1236
1237        Raises
1238        ------
1239        ValueError
1240            If gridding fails.
1241        """
1242
1243        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1244        data_w = data.copy().reset_index(drop=True)
1245        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1246        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1247
1248        # Need high intensity mz values first so they are parents in the output pairs stack
1249        dat_mz = data_w[["mz", "intensity"]].sort_values(
1250            by=["intensity"], ascending=False
1251        )
1252        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1253
1254        # Construct KD tree
1255        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1256        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1257        sdm = sparse.triu(sdm, k=1)
1258        sdm.data = np.ones_like(sdm.data)
1259        distances = sdm.tocoo()
1260        pairs = np.stack((distances.row, distances.col), axis=1)
1261
1262        # Cull pairs to just get root
1263        to_drop = []
1264        while len(pairs) > 0:
1265            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1266            id_root_parents = np.isin(pairs[:, 0], root_parents)
1267            children_of_roots = np.unique(pairs[id_root_parents, 1])
1268            to_drop = np.append(to_drop, children_of_roots)
1269
1270            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1271            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1272            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1273        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1274        mz_dat_np = (
1275            dat_mz[["mz"]]
1276            .sort_values(by=["mz"])
1277            .reset_index(drop=True)
1278            .values.flatten()
1279        )
1280
1281        # Sort data by mz and recast mz to nearest value in mz_dat_np
1282        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1283        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1284        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1285
1286        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1287        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1288        new_data_w = (
1289            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1290            .groupby(["scan", "mz"])["intensity"]
1291            .sum()
1292            .reset_index()
1293        )
1294        new_data_w = (
1295            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1296            .reset_index(drop=True)
1297            .copy()
1298        )
1299
1300        # Check if grid worked and return
1301        if self.check_if_grid(new_data_w):
1302            return new_data_w
1303        else:
1304            raise ValueError("Gridding failed")

Grid the data in the mz dimension.

Data must be gridded prior to persistent homology calculations.

Parameters
  • data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
Returns
  • DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
  • ValueError: If gridding fails.
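
A hedged sketch of what gridding does, with hypothetical values; `lcms_obj` is an assumed LCMSBase-derived object carrying this mixin:

    import pandas as pd

    # The two scans report slightly different m/z values for the same signals
    data = pd.DataFrame(
        {
            "scan": [1, 1, 2, 2],
            "scan_time": [1.0, 1.0, 2.0, 2.0],
            "mz": [100.0000, 100.0100, 100.0001, 100.0101],
            "intensity": [50.0, 5.0, 40.0, 4.0],
        }
    )
    gridded = lcms_obj.grid_data(data)
    # Expected: the scan-2 rows are snapped onto the higher-intensity m/z values
    # (100.0000 and 100.0100), giving one shared m/z axis across both scans
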
def find_mass_features_ph(self, ms_level=1, grid=True):
1306    def find_mass_features_ph(self, ms_level=1, grid=True):
1307        """Find mass features within an LCMSBase object using persistent homology.
1308
1309        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1310
1311        Parameters
1312        ----------
1313        ms_level : int, optional
1314            The MS level to use. Default is 1.
1315        grid : bool, optional
1316            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1317
1318        Raises
1319        ------
1320        ValueError
1321            If no MS level data is found on the object.
1322            If data is not gridded and grid is False.
1323
1324        Returns
1325        -------
1326        None, but assigns the mass_features attribute to the object.
1327
1328        Notes
1329        -----
1330        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1331        """
1332        # Check that ms_level is a key in self._ms_unprocessed
1333        if ms_level not in self._ms_unprocessed.keys():
1334            raise ValueError(
1335                "No MS level "
1336                + str(ms_level)
1337                + " data found, did you instantiate with parser specific to MS level?"
1338            )
1339
1340        # Get ms data
1341        data = self._ms_unprocessed[ms_level].copy()
1342
1343        # Drop rows with missing intensity values and reset index
1344        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1345
1346        # Threshold data
1347        dims = ["mz", "scan_time"]
1348        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1349        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1350
1351        # Check if gridded, if not, grid
1352        gridded_mz = self.check_if_grid(data_thres)
1353        if gridded_mz is False:
1354            if grid is False:
1355                raise ValueError(
1356                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1357                )
1358            else:
1359                data_thres = self.grid_data(data_thres)
1360
1361        # Add build factors and add scan_time
1362        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1363        factors = {
1364            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1365            for dim in dims
1366        }  # this returns a float64 index
1367
1368        # Build indexes
1369        index = {
1370            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1371            for dim in factors
1372        }
1373
1374        # Smooth data
1375        iterations = self.parameters.lc_ms.ph_smooth_it
1376        smooth_radius = [
1377            self.parameters.lc_ms.ph_smooth_radius_mz,
1378            self.parameters.lc_ms.ph_smooth_radius_scan,
1379        ]  # mz, scan_time smoothing radius (in steps)
1380
1381        index = np.vstack([index[dim] for dim in dims]).T
1382        V = data_thres["intensity"].values
1383        resid = np.inf
1384        for i in range(iterations):
1385            # Previous iteration
1386            V_prev = V.copy()
1387            resid_prev = resid
1388            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1389
1390            # Calculate residual with previous iteration
1391            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1392
1393            # Evaluate convergence
1394            if i > 0:
1395                # Percent change in residual
1396                test = np.abs(resid - resid_prev) / resid_prev
1397
1398                # Exit criteria
1399                if test <= 0:
1400                    break
1401
1402        # Overwrite values
1403        data_thres["intensity"] = V
1404
1405        # Use persistent homology to find regions of interest
1406        pidx, pers = self.sparse_upper_star(index, V)
1407        pidx = pidx[pers > 1]
1408        pers = pers[pers > 1]
1409
1410        # Get peaks
1411        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1412
1413        # Add persistence column
1414        peaks["persistence"] = pers
1415        mass_features = peaks.sort_values(
1416            by="persistence", ascending=False
1417        ).reset_index(drop=True)
1418
1419        # Filter by persistence threshold
1420        persistence_threshold = (
1421            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1422        )
1423        mass_features = mass_features.loc[
1424            mass_features["persistence"] > persistence_threshold, :
1425        ].reset_index(drop=True)
1426
1427        # Rename scan column to apex_scan
1428        mass_features = mass_features.rename(
1429            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1430        )
1431
1432        # Populate mass_features attribute
1433        self.mass_features = {}
1434        for row in mass_features.itertuples():
1435            row_dict = mass_features.iloc[row.Index].to_dict()
1436            lcms_feature = LCMSMassFeature(self, **row_dict)
1437            self.mass_features[lcms_feature.id] = lcms_feature
1438
1439        if self.parameters.lc_ms.verbose_processing:
1440            print("Found " + str(len(mass_features)) + " initial mass features")

Find mass features within an LCMSBase object using persistent homology.

Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
  • None, but assigns the mass_features attribute to the object.
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
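
The relevant processing parameters live under parameters.lc_ms. A hedged tuning sketch follows, with illustrative values on a hypothetical `lcms_obj` (an LCMSBase-derived object carrying this mixin):

    # Illustrative values only - tune against your own data
    lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001    # relative intensity floor before peak picking
    lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001   # relative persistence floor for kept features
    lcms_obj.parameters.lc_ms.ph_smooth_it = 7            # maximum smoothing iterations
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
    print(len(lcms_obj.mass_features), "initial mass features")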

def find_mass_features_ph_centroid(self, ms_level=1):
1442    def find_mass_features_ph_centroid(self, ms_level=1):
1443        """Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.
1444        
1445        Parameters
1446        ----------
1447        ms_level : int, optional
1448            The MS level to use. Default is 1.
1449        
1450        Raises
1451        ------
1452        ValueError
1453            If no MS level data is found on the object.
1454
1455        Returns
1456        -------
1457        None, but assigns the mass_features attribute to the object.        
1458        """
1459        # Check that ms_level is a key in self._ms_unprocessed
1460        if ms_level not in self._ms_unprocessed.keys():
1461            raise ValueError(
1462                "No MS level "
1463                + str(ms_level)
1464                + " data found, did you instantiate with parser specific to MS level?"
1465            )
1466        
1467        # Get ms data
1468        data = self._ms_unprocessed[ms_level].copy()
1469
1470        # Drop rows with missing intensity values and reset index
1471        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1472        
1473        # Threshold data
1474        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1475        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1476        data_thres['persistence'] = data_thres['intensity']
1477
1478        # Use this as the starting point for the mass features, adding scan_time
1479        mf_df = data_thres
1480        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1481
1482        # Define tolerances and dimensions for rolling up
1483        tol = [
1484            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1485            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1486        ]
1487        relative = [True, False]    
1488        dims = ["mz", "scan_time"]
1489
1490        mf_df = self.roll_up_dataframe(
1491            df=mf_df,
1492            sort_by="persistence",
1493            dims=dims,
1494            tol=tol,
1495            relative=relative
1496        )
1497
1498        # Rename scan column to apex_scan
1499        mass_features = mf_df.rename(
1500            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1501        )
1502        # Sort by persistence and reset index
1503        mass_features = mass_features.sort_values(
1504            by="persistence", ascending=False
1505        ).reset_index(drop=True)
1506
1507        # Populate mass_features attribute
1508        self.mass_features = {}
1509        for row in mass_features.itertuples():
1510            row_dict = mass_features.iloc[row.Index].to_dict()
1511            lcms_feature = LCMSMassFeature(self, **row_dict)
1512            self.mass_features[lcms_feature.id] = lcms_feature
1513
1514        if self.parameters.lc_ms.verbose_processing:
1515            print("Found " + str(len(mass_features)) + " initial mass features")

Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no MS level data is found on the object.
Returns
  • None, but assigns the mass_features attribute to the object.
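
A hedged sketch for centroided data; the clustering tolerances below are the same ones used by roll_up_dataframe, and the values are shown for illustration only on a hypothetical `lcms_obj`:

    # Illustrative values only
    lcms_obj.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel = 5e-6   # relative m/z tolerance
    lcms_obj.parameters.lc_ms.mass_feature_cluster_rt_tolerance = 0.2        # scan_time tolerance, minutes
    lcms_obj.find_mass_features_ph_centroid(ms_level=1)
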
def cluster_mass_features(self, drop_children=True, sort_by='persistence'):
1517    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1518        """Cluster mass features
1519
1520        Mass features are clustered based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence as cluster parents.
1521
1522        Parameters
1523        ----------
1524        drop_children : bool, optional
1525            Whether to drop the mass features that are not cluster parents. Default is True.
1526        sort_by : str, optional
1527            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
1528
1529        Raises
1530        ------
1531        ValueError
1532            If no mass features are found.
1533            If too many mass features are found.
1534
1535        Returns
1536        -------
1537        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1538        """
1539        verbose = self.parameters.lc_ms.verbose_processing
1540
1541        if self.mass_features is None:
1542            raise ValueError("No mass features found, run find_mass_features() first")
1543        if len(self.mass_features) > 400000:
1544            raise ValueError(
1545                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1546            )
1547        dims = ["mz", "scan_time"]
1548        mf_df_og = self.mass_features_to_df()
1549        mf_df = mf_df_og.copy()
1550
1551        tol = [
1552            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1553            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1554        ]  # mz, in relative; scan_time in minutes
1555        relative = [True, False]
1556
1557        # Roll up mass features based on their proximity in the declared dimensions
1558        mf_df_new = self.roll_up_dataframe(
1559            df=mf_df,
1560            sort_by=sort_by,
1561            dims=dims,
1562            tol=tol,
1563            relative=relative
1564        )
1565
1566        mf_df["cluster_parent"] = np.where(
1567            np.isin(mf_df.index, mf_df_new.index), True, False
1568        )
1569
1570        # get mass feature ids of features that are not cluster parents
1571        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1572        if drop_children is True:
1573            # Drop mass features that are not cluster parents from self
1574            self.mass_features = {
1575                k: v
1576                for k, v in self.mass_features.items()
1577                if k not in cluster_daughters
1578            }
1579        else:
1580            return cluster_daughters

Cluster mass features

Mass features are clustered based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence as cluster parents.

Parameters
  • drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
  • sort_by (str, optional): The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
  • ValueError: If no mass features are found. If too many mass features are found.
Returns
  • None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
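
A hedged sketch of the non-destructive path; `lcms_obj` is a hypothetical LCMSBase-derived object that already has mass features assigned:

    # Inspect which features would be rolled up before committing to dropping them
    child_ids = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")
    print(len(child_ids), "mass features are cluster children")
    # Then drop them from the object
    lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")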