corems.mass_spectra.calc.lc_calc

   1import numpy as np
   2import pandas as pd
   3from ripser import ripser
   4from scipy import sparse
   5from scipy.spatial import KDTree
   6
   7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
   8from corems.mass_spectra.calc import SignalProcessing as sp
   9from corems.mass_spectra.factory.chromat_data import EIC_Data
  10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  11
  12
  13def find_closest(A, target):
   14    """Find the index of the closest value in A to each value in target.
  15
  16    Parameters
  17    ----------
  18    A : :obj:`~numpy.array`
  19        The array to search (blueprint). A must be sorted.
  20    target : :obj:`~numpy.array`
  21        The array of values to search for. target must be sorted.
  22
  23    Returns
  24    -------
  25    :obj:`~numpy.array`
  26        The indices of the closest values in A to each value in target.
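
    Examples
    --------
    A minimal sketch with small illustrative arrays (both sorted, as required):

    >>> import numpy as np
    >>> from corems.mass_spectra.calc.lc_calc import find_closest
    >>> A = np.array([100.0, 200.0, 300.0])
    >>> target = np.array([105.0, 290.0])
    >>> find_closest(A, target)
    array([0, 2])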
  27    """
  28    idx = A.searchsorted(target)
  29    idx = np.clip(idx, 1, len(A) - 1)
  30    left = A[idx - 1]
  31    right = A[idx]
  32    idx -= target - left < right - target
  33    return idx
  34
  35
  36class LCCalculations:
  37    """Methods for performing LC calculations on mass spectra data.
  38
  39    Notes
  40    -----
  41    This class is intended to be used as a mixin for the LCMSBase class.
  42
  43    Methods
  44    -------
  45    * get_max_eic(eic_data).
  46        Returns the maximum EIC value from the given EIC data. A static method.
  47    * smooth_tic(tic).
  48        Smooths the TIC data using the specified smoothing method and settings.
  49    * eic_centroid_detector(rt, eic, max_eic).
  50        Performs EIC centroid detection on the given EIC data.
  51    * find_nearest_scan(rt).
  52        Finds the nearest scan to the given retention time.
  53    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
  54        Returns an averaged mass spectrum object.
  55    * find_mass_features(ms_level=1).
  56        Find regions of interest for a given MS level (default is MS1).
   57    * integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1).
   58        Integrate mass features of interest and extract EICs.
  59    * find_c13_mass_features().
  60        Evaluate mass features and mark likely C13 isotopes.
  61    * deconvolute_ms1_mass_features().
  62        Deconvolute mass features' ms1 mass spectra.
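
    Examples
    --------
    A usage sketch of a typical processing order, assuming ``lcms`` is an
    LCMSBase object with profile MS1 data already loaded; the
    add_associated_ms1 step is shown schematically (see its own
    documentation for the required arguments):

    >>> lcms.find_mass_features(ms_level=1)
    >>> lcms.add_associated_ms1()  # associate an MS1 spectrum with each mass feature
    >>> lcms.integrate_mass_features(drop_if_fail=True, ms_level=1)
    >>> lcms.add_peak_metrics()
    >>> lcms.find_c13_mass_features()
    >>> lcms.deconvolute_ms1_mass_features()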
  63    """
  64
  65    @staticmethod
  66    def get_max_eic(eic_data: dict):
  67        """Returns the maximum EIC value from the given EIC data.
  68
  69        Notes
  70        -----
  71        This is a static method.
  72
  73        Parameters
  74        ----------
  75        eic_data : dict
  76            A dictionary containing EIC data.
  77
  78        Returns
  79        -------
  80        float
  81            The maximum EIC value.
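
        Examples
        --------
        A minimal sketch, assuming each value in the dictionary is a dict-like
        object with an "EIC" entry:

        >>> from corems.mass_spectra.calc.lc_calc import LCCalculations
        >>> eics = {100.0: {"EIC": [0, 5, 2]}, 200.0: {"EIC": [1, 3, 9]}}
        >>> LCCalculations.get_max_eic(eics)
        9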
  82        """
  83        max_eic = 0
  84        for eic_data in eic_data.values():
  85            ind_max_eic = max(eic_data.get("EIC"))
  86            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  87
  88        return max_eic
  89
  90    def smooth_tic(self, tic):
  91        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  92
  93        Parameters
  94        ----------
  95        tic : numpy.ndarray
  96            The TIC (or EIC) data to be smoothed.
  97
  98        Returns
  99        -------
 100        numpy.ndarray
 101            The smoothed TIC data.
 102        """
 103        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 104
 105        pol_order = self.parameters.lc_ms.savgol_pol_order
 106
 107        window_len = self.parameters.lc_ms.smooth_window
 108
 109        window = self.parameters.lc_ms.smooth_method
 110
 111        return sp.smooth_signal(
 112            tic, window_len, window, pol_order, implemented_smooth_method
 113        )
 114
 115    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 116        """Performs EIC centroid detection on the given EIC data.
 117
 118        Parameters
 119        ----------
 120        rt : numpy.ndarray
 121            The retention time data.
 122        eic : numpy.ndarray
 123            The EIC data.
 124        max_eic : float
 125            The maximum EIC value.
 126        apex_indexes : list, optional
 127            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 128
 129        Returns
 130        -------
  131        generator
  132            The indexes of the left limit, apex, and right limit for each detected peak.
 133        """
 134
 135        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 136
 137        max_height = self.parameters.lc_ms.peak_height_max_percent
 138
 139        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 140
 141        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 142
 143        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 144
 145        include_indexes = sp.peak_picking_first_derivative(
 146            domain=rt,
 147            signal=eic,
 148            max_height=max_height,
 149            max_prominence=max_prominence,
 150            max_signal=max_eic,
 151            min_peak_datapoints=min_peak_datapoints,
 152            peak_derivative_threshold=peak_derivative_threshold,
 153            signal_threshold=signal_threshold,
 154            correct_baseline=False,
 155            plot_res=False,
 156            apex_indexes=apex_indexes,
 157        )
 158
 159        return include_indexes
 160
 161    def find_nearest_scan(self, rt):
 162        """Finds the nearest scan to the given retention time.
 163
 164        Parameters
 165        ----------
 166        rt : float
 167            The retention time (in minutes) to find the nearest scan for.
 168
 169        Returns
 170        -------
 171        int
 172            The scan number of the nearest scan.
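
        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object with scan data
        loaded:

        >>> scan = lcms.find_nearest_scan(12.5)  # scan number closest to 12.5 minutes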
 173        """
 174        array_rt = np.array(self.retention_time)
 175
 176        scan_index = (np.abs(array_rt - rt)).argmin()
 177
 178        real_scan = self.scans_number[scan_index]
 179
 180        return real_scan
 181
 182    def add_peak_metrics(self):
 183        """Add peak metrics to the mass features.
 184
 185        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
 186        """
 187        # Check that at least some mass features have eic data
 188        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 189            raise ValueError(
 190                "No mass features have EIC data. Run integrate_mass_features first."
 191            )
 192
 193        for mass_feature in self.mass_features.values():
 194            # Check if the mass feature has been integrated
 195            if mass_feature._eic_data is not None:
 196                # Calculate peak metrics
 197                mass_feature.calc_half_height_width()
 198                mass_feature.calc_tailing_factor()
 199                mass_feature.calc_dispersity_index()
 200
 201    def get_average_mass_spectrum(
 202        self,
 203        scan_list,
 204        apex_scan,
 205        spectrum_mode="profile",
 206        ms_level=1,
 207        auto_process=True,
 208        use_parser=False,
 209        perform_checks=True,
 210        polarity=None,
 211        ms_params=None,
 212    ):
 213        """Returns an averaged mass spectrum object
 214
 215        Parameters
 216        ----------
 217        scan_list : list
 218            List of scan numbers to average.
 219        apex_scan : int
 220            Number of the apex scan
 221        spectrum_mode : str, optional
  222            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
 223        ms_level : int, optional
 224            The MS level to use. Defaults to 1.
 225        auto_process : bool, optional
 226            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 227        use_parser : bool, optional
 228            If True, the mass spectra will be obtained from the parser. Defaults to False.
 229        perform_checks : bool, optional
  230            If True, the function will check that the data are present in the ms_unprocessed dictionary and are in the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary, which must also be indexed by scan.
 231        polarity : int, optional
  232            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None (fastest if set explicitly to -1 or 1).
 233        ms_params : MSParameters, optional
 234            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 235
 236        Returns
 237        -------
 238        MassSpectrumProfile
 239            The averaged mass spectrum object.
 240
 241        Raises
 242        ------
 243        ValueError
 244            If the spectrum mode is not "profile".
 245            If the MS level is not found in the unprocessed mass spectra dictionary.
 246            If not all scan numbers are found in the unprocessed mass spectra dictionary.
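
        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object whose unprocessed
        profile MS1 data are available in the _ms_unprocessed dictionary (scan
        numbers are illustrative):

        >>> ms_avg = lcms.get_average_mass_spectrum(
        ...     scan_list=[100, 101, 102],
        ...     apex_scan=101,
        ...     spectrum_mode="profile",
        ...     ms_level=1,
        ... )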
 247        """
 248        if perform_checks:
 249            if spectrum_mode != "profile":
 250                raise ValueError("Averaging only supported for profile mode")
 251
 252        if polarity is None:
 253            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 254            if self.polarity == "negative":
 255                polarity = -1
 256            elif self.polarity == "positive":
 257                polarity = 1
 258            else:
 259                raise ValueError(
  260                    "Polarity not set for dataset, must be set to either 'positive' or 'negative'"
 261                )
 262
 263        # if not using_parser, check that scan numbers are in _ms_unprocessed
 264        if not use_parser:
 265            if perform_checks:
 266                # Set index to scan for faster lookup
 267                ms_df = (
 268                    self._ms_unprocessed[ms_level]
 269                    .copy()
 270                    .set_index("scan", drop=False)
 271                    .sort_index()
 272                )
  273                # Check that all scan numbers are in the ms_df before subsetting
  274                if not all(np.isin(scan_list, ms_df.index)):
  275                    raise ValueError(
  276                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
  277                    )
  278                my_ms_df = ms_df.loc[scan_list]
 279            else:
 280                my_ms_df = (
 281                    pd.DataFrame({"scan": scan_list})
 282                    .set_index("scan")
 283                    .join(self._ms_unprocessed[ms_level], how="left")
 284                )
 285
 286        if use_parser:
 287            ms_list = [
 288                self.spectra_parser.get_mass_spectrum_from_scan(
 289                    x, spectrum_mode=spectrum_mode, auto_process=False
 290                )
 291                for x in scan_list
 292            ]
 293            ms_mz = [x._mz_exp for x in ms_list]
 294            ms_int = [x._abundance for x in ms_list]
 295            my_ms_df = []
 296            for i in np.arange(len(ms_mz)):
 297                my_ms_df.append(
 298                    pd.DataFrame(
 299                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 300                    )
 301                )
 302            my_ms_df = pd.concat(my_ms_df)
 303
 304        if not self.check_if_grid(my_ms_df):
 305            my_ms_df = self.grid_data(my_ms_df)
 306
 307        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 308
 309        ms = ms_from_array_profile(
 310            my_ms_ave.mz,
 311            my_ms_ave.intensity,
 312            self.file_location,
 313            polarity=polarity,
 314            auto_process=False,
 315        )
 316
 317        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 318        if ms is not None:
 319            if ms_params is not None:
 320                ms.parameters = ms_params
 321            ms.scan_number = apex_scan
 322            if auto_process:
 323                ms.process_mass_spec()
 324        return ms
 325
 326    def find_mass_features(self, ms_level=1, grid=True):
 327        """Find mass features within an LCMSBase object
 328
 329        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
 330
 331        Parameters
 332        ----------
 333        ms_level : int, optional
  334            The MS level to use for peak picking. Default is 1.
 335        grid : bool, optional
  336            If True, will regrid the data (after checking whether it is already gridded) before running the persistent homology calculations. Used for persistent homology peak picking. Default is True.
 337
 338        Raises
 339        ------
 340        ValueError
 341            If no MS level data is found on the object.
 342            If persistent homology peak picking is attempted on non-profile mode data.
 343            If data is not gridded and grid is False.
 344            If peak picking method is not implemented.
 345
 346        Returns
 347        -------
 348        None, but assigns the mass_features and eics attributes to the object.
 349
 350        """
 351        pp_method = self.parameters.lc_ms.peak_picking_method
 352
 353        if pp_method == "persistent homology":
 354            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 355            if all(msx_scan_df["ms_format"] == "profile"):
 356                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 357                self.cluster_mass_features(drop_children=True, sort_by="persistence")
 358            else:
 359                raise ValueError(
 360                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 361                        ms_level
 362                    )
 363                )
 364        else:
 365            raise ValueError("Peak picking method not implemented")
 366
 367    def integrate_mass_features(
 368        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 369    ):
 370        """Integrate mass features and extract EICs.
 371
  372        Populates the eics attribute on the LCMSBase object with an EIC for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 373
 374        Parameters
 375        ----------
 376        drop_if_fail : bool, optional
 377            Whether to drop mass features if the EIC limit calculations fail.
 378            Default is True.
 379        drop_duplicates : bool, optional
  380            Whether to drop mass features that appear to be duplicates
 381            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 382            Default is True.
 383        ms_level : int, optional
 384            The MS level to use. Default is 1.
 385
 386        Raises
 387        ------
 388        ValueError
 389            If no mass features are found.
 390            If no MS level data is found for the given MS level (either in data or in the scan data)
 391
 392        Returns
 393        -------
 394        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 395
 396        Notes
 397        -----
 398        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
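
        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object with mass
        features found and an MS1 spectrum associated with each of them:

        >>> lcms.integrate_mass_features(drop_if_fail=True, ms_level=1)
        >>> mz, eic = next(iter(lcms.eics.items()))  # EIC_Data objects are keyed by mass feature m/z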
 399        """
 400        # Check if there is data
 401        if ms_level in self._ms_unprocessed.keys():
 402            raw_data = self._ms_unprocessed[ms_level].copy()
 403        else:
 404            raise ValueError("No MS level " + str(ms_level) + " data found")
 405        if self.mass_features is not None:
 406            mf_df = self.mass_features_to_df().copy()
 407        else:
 408            raise ValueError(
 409                "No mass features found, did you run find_mass_features() first?"
 410            )
 411        # Check if mass_spectrum exists on each mass feature
 412        if not all(
 413            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
 414        ):
 415            raise ValueError(
 416                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
 417            )
 418
 419        # Subset scan data to only include correct ms_level
 420        scan_df_sub = self.scan_df[
 421            self.scan_df["ms_level"] == int(ms_level)
 422        ].reset_index(drop=True)
 423        if scan_df_sub.empty:
  424            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
 425        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 426
 427        mzs_to_extract = np.unique(mf_df["mz"].values)
 428        mzs_to_extract.sort()
 429
 430        # Get EICs for each unique mz in mass features list
 431        for mz in mzs_to_extract:
 432            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 433            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 434            raw_data_sub = raw_data[
 435                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
 436            ].reset_index(drop=True)
 437            raw_data_sub = (
 438                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 439            )
 440            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 441            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 442            myEIC = EIC_Data(
 443                scans=raw_data_sub["scan"].values,
 444                time=raw_data_sub["scan_time"].values,
 445                eic=raw_data_sub["intensity"].values,
 446            )
 447            # Smooth EIC
 448            smoothed_eic = self.smooth_tic(myEIC.eic)
 449            smoothed_eic[smoothed_eic < 0] = 0
 450            myEIC.eic_smoothed = smoothed_eic
 451            self.eics[mz] = myEIC
 452
 453        # Get limits of mass features using EIC centroid detector and integrate
 454        mf_df["area"] = np.nan
 455        for idx, mass_feature in mf_df.iterrows():
 456            mz = mass_feature.mz
 457            apex_scan = mass_feature.apex_scan
 458
 459            # Pull EIC data and find apex scan index
 460            myEIC = self.eics[mz]
 461            self.mass_features[idx]._eic_data = myEIC
 462            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 463
 464            # Find left and right limits of peak using EIC centroid detector, add to EICData
 465            centroid_eics = self.eic_centroid_detector(
 466                myEIC.time,
 467                myEIC.eic_smoothed,
 468                mass_feature.intensity * 1.1,
 469                apex_indexes=[int(apex_index)],
 470            )
 471            l_a_r_scan_idx = [i for i in centroid_eics]
 472            if len(l_a_r_scan_idx) > 0:
 473                # Add start and final scan to mass_features and EICData
 474                left_scan, right_scan = (
 475                    myEIC.scans[l_a_r_scan_idx[0][0]],
 476                    myEIC.scans[l_a_r_scan_idx[0][2]],
 477                )
 478                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 479                myEIC.apexes = myEIC.apexes + mf_scan_apex
 480                self.mass_features[idx].start_scan = left_scan
 481                self.mass_features[idx].final_scan = right_scan
 482
 483                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 484                area = np.trapz(
 485                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 486                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 487                )
 488                mf_df.at[idx, "area"] = area
 489                myEIC.areas = myEIC.areas + [area]
 490                self.eics[mz] = myEIC
 491                self.mass_features[idx]._area = area
 492            else:
 493                if drop_if_fail is True:
 494                    self.mass_features.pop(idx)
 495
 496        if drop_duplicates:
 497            # Prepare mass feature dataframe
 498            mf_df = self.mass_features_to_df().copy()
 499
  500            # For each mass feature, find all other mass features within the clustering tolerance (in ppm) and drop any whose start and end scans fall within those of the current mass feature
  501            # Keep the first mass feature encountered
 502            for idx, mass_feature in mf_df.iterrows():
 503                mz = mass_feature.mz
 504                apex_scan = mass_feature.apex_scan
 505
 506                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 507                mf_df_sub = mf_df[
 508                    mf_df["mz_diff_ppm"]
 509                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 510                    * 10**6
 511                ].copy()
 512
 513                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 514                for idx2, mass_feature2 in mf_df_sub.iterrows():
 515                    if idx2 != idx:
 516                        if (
 517                            mass_feature2.start_scan >= mass_feature.start_scan
 518                            and mass_feature2.final_scan <= mass_feature.final_scan
 519                        ):
 520                            if idx2 in self.mass_features.keys():
 521                                self.mass_features.pop(idx2)
 522
 523    def find_c13_mass_features(self):
  524        """Mark likely C13 isotopes and connect to monoisotopic mass features.
 525
 526        Returns
 527        -------
  528        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 529
 530        Raises
 531        ------
 532        ValueError
 533            If no mass features are found.
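
        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object with integrated
        mass features:

        >>> lcms.find_c13_mass_features()
        >>> mf = next(iter(lcms.mass_features.values()))
        >>> iso = mf.isotopologue_type  # e.g. "13C1" if flagged as a C13 isotopologue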
 534        """
 535        verbose = self.parameters.lc_ms.verbose_processing
 536        if verbose:
 537            print("evaluating mass features for C13 isotopes")
 538        if self.mass_features is None:
 539            raise ValueError("No mass features found, run find_mass_features() first")
 540
  541        # Data prep for sparse distance matrix
 542        dims = ["mz", "scan_time"]
 543        mf_df = self.mass_features_to_df().copy()
 544        # Drop mass features that have no area (these are likely to be noise)
 545        mf_df = mf_df[mf_df["area"].notnull()]
 546        mf_df["mf_id"] = mf_df.index.values
 547        dims = ["mz", "scan_time"]
 548
  549        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 550        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 551
 552        mz_diff = 1.003355  # C13-C12 mass difference
 553        tol = [
 554            mf_df["mz"].median()
 555            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 556            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
 557        ]  # mz, in relative; scan_time in minutes
 558
 559        # Compute inter-feature distances
 560        distances = None
 561        for i in range(len(dims)):
 562            # Construct k-d tree
 563            values = mf_df[dims[i]].values
 564            tree = KDTree(values.reshape(-1, 1))
 565
 566            max_tol = tol[i]
 567            if dims[i] == "mz":
 568                # Maximum absolute tolerance
 569                max_tol = mz_diff + tol[i]
 570
 571            # Compute sparse distance matrix
 572            # the larger the max_tol, the slower this operation is
 573            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 574
 575            # Only consider forward case, exclude diagonal
 576            sdm = sparse.triu(sdm, k=1)
 577
 578            if dims[i] == "mz":
 579                min_tol = mz_diff - tol[i]
 580                # Get only the ones that are above the min tol
 581                idx = sdm.data > min_tol
 582
 583                # Reconstruct sparse distance matrix
 584                sdm = sparse.coo_matrix(
 585                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 586                    shape=(len(values), len(values)),
 587                )
 588
 589            # Cast as binary matrix
 590            sdm.data = np.ones_like(sdm.data)
 591
 592            # Stack distances
 593            if distances is None:
 594                distances = sdm
 595            else:
 596                distances = distances.multiply(sdm)
 597
 598        # Extract indices of within-tolerance points
 599        distances = distances.tocoo()
 600        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 601
 602        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 603        pairs_mf = pairs.copy()
 604        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 605        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 606
  607        # Connect monoisotopic masses with isotopologues within mass_features
 608        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 609        for mono in monos:
 610            self.mass_features[mono].monoisotopic_mf_id = mono
 611        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 612        while not pairs_iso_df.empty:
 613            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 614            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 615            for iso in m1_isos:
 616                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 617                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 618                if len(parent) > 1:
 619                    # Choose the parent that is closest in time to the isotopologue
 620                    parent_time = [self.mass_features[p].retention_time for p in parent]
 621                    time_diff = [
 622                        np.abs(self.mass_features[iso].retention_time - x)
 623                        for x in parent_time
 624                    ]
 625                    parent = parent[np.argmin(time_diff)]
 626                else:
 627                    parent = parent[0]
 628                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 629                    parent
 630                ].monoisotopic_mf_id
 631                if self.mass_features[iso].monoisotopic_mf_id is not None:
 632                    mass_diff = (
 633                        self.mass_features[iso].mz
 634                        - self.mass_features[
 635                            self.mass_features[iso].monoisotopic_mf_id
 636                        ].mz
 637                    )
 638                    self.mass_features[iso].isotopologue_type = "13C" + str(
 639                        int(round(mass_diff, 0))
 640                    )
 641
 642            # Drop the mono and iso from the pairs_iso_df
 643            pairs_iso_df = pairs_iso_df.drop(
 644                index=monos, errors="ignore"
  645            )  # Drop pairs whose parent is one of the monos just processed
 646            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 647            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 648
 649            if not pairs_iso_df.empty:
 650                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 651                monos = np.setdiff1d(
 652                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 653                )
 654        if verbose:
 655            # Report fraction of compounds annotated with isotopes
 656            mf_df["c13_flag"] = np.where(
 657                np.logical_or(
 658                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 659                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 660                ),
 661                1,
 662                0,
 663            )
 664            print(
 665                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 666                + " of mass features have or are C13 isotopes"
 667            )
 668
 669    def deconvolute_ms1_mass_features(self):
 670        """Deconvolute MS1 mass features
 671
  672        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum across the EIC of the mass feature.
 673
 674        Parameters
 675        ----------
 676        None
 677
 678        Returns
 679        -------
 680        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 681        and associated_mass_features_deconvoluted attributes to the mass features in the
 682        mass_features attribute of the LCMSBase object.
 683
 684        Raises
 685        ------
 686        ValueError
  687            If no mass features are found (run find_mass_features() first).
  688            If no EICs are found (run integrate_mass_features() first).
 689
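        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object with integrated
        mass features and unprocessed MS1 data:

        >>> lcms.deconvolute_ms1_mass_features()
        >>> mf = next(iter(lcms.mass_features.values()))
        >>> idx = mf._ms_deconvoluted_idx  # indices of deconvoluted peaks (for features that were deconvoluted)
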
 690        """
 691        # Checks for set mass_features and eics
 692        if self.mass_features is None:
 693            raise ValueError(
 694                "No mass features found, did you run find_mass_features() first?"
 695            )
 696
 697        if self.eics == {}:
 698            raise ValueError(
 699                "No EICs found, did you run integrate_mass_features() first?"
 700            )
 701
 702        if 1 not in self._ms_unprocessed.keys():
 703            raise ValueError("No unprocessed MS1 spectra found.")
 704
 705        # Prep ms1 data
 706        ms1_data = self._ms_unprocessed[1].copy()
 707        ms1_data = ms1_data.set_index("scan")
 708
 709        # Prep mass feature summary
 710        mass_feature_df = self.mass_features_to_df()
 711
 712        # Loop through each mass feature
 713        for mf_id, mass_feature in self.mass_features.items():
 714            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
 715            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 716                continue
 717
 718            # Get the left and right limits of the EIC of the mass feature
 719            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 720
 721            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
 722            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 723            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 724
 725            # Get the centroided masses of the mass feature
 726            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 727
 728            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 729            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 730                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 731            ]
 732
  733            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
 734            ms1_data_sub["mz_diff_rel"] = (
 735                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 736                / ms1_data_sub["mz"]
 737            )
 738            ms1_data_sub = ms1_data_sub[
 739                ms1_data_sub["mz_diff_rel"]
 740                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 741            ].reset_index(drop=True)
 742
 743            # Group by mass_feature_mz and scan and sum intensity
 744            ms1_data_sub_group = (
 745                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 746                .sum()
 747                .reset_index()
 748            )
 749
 750            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 751            corr = (
 752                ms1_data_sub_group.pivot(
 753                    index="scan", columns="mass_feature_mz", values="intensity"
 754                )
 755                .fillna(0)
 756                .corr()
 757            )
 758
  759            # Subset the correlation matrix to only include the masses of the mass feature and those above the minimum deconvolution correlation
 760            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 761            corr_subset = corr.loc[mass_feature.mz,]
 762            corr_subset = corr_subset[corr_subset > decon_corr_min]
 763
 764            # Get the masses from the mass spectrum that are the result of the deconvolution
 765            mzs_decon = corr_subset.index.values
 766
 767            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 768            mzs_decon_idx = [
 769                id
 770                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 771                if mz in mzs_decon
 772            ]
 773            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 774
 775            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 776            if (
 777                mass_feature.ms1_peak.abundance
 778                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 779            ):
 780                mass_feature.mass_spectrum_deconvoluted_parent = True
 781            else:
 782                mass_feature.mass_spectrum_deconvoluted_parent = False
 783
 784            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 785            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 786            mass_feature_df_sub = mass_feature_df[
 787                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 788                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 789            ].copy()
 790            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 791            mass_feature_df_sub["mz_diff_ppm"] = [
 792                np.abs(mzs_decon - mz).min() / mz * 10**6
 793                for mz in mass_feature_df_sub["mz"]
 794            ]
  795            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
 796            mfs_associated_decon = mass_feature_df_sub[
 797                mass_feature_df_sub["mz_diff_ppm"]
 798                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 799            ].index.values
 800
 801            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
 802
 803
 804class PHCalculations:
 805    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 806
 807    Notes
 808    -----
 809    This class is intended to be used as a mixin for the LCMSBase class.
 810
 811    Methods
 812    -------
 813    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 814        Sparse implementation of a mean filter.
 815    * embed_unique_indices(a).
 816        Creates an array of indices, sorted by unique element.
 817    * sparse_upper_star(idx, V).
 818        Sparse implementation of an upper star filtration.
 819    * check_if_grid(data).
 820        Check if the data is gridded in mz space.
 821    * grid_data(data).
 822        Grid the data in the mz dimension.
 823    * find_mass_features_ph(ms_level=1, grid=True).
 824        Find mass features within an LCMSBase object using persistent homology.
 825    * cluster_mass_features(drop_children=True).
 826        Cluster regions of interest.
 827    """
 828
 829    @staticmethod
 830    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 831        """Sparse implementation of a mean filter.
 832
 833        Parameters
 834        ----------
 835        idx : :obj:`~numpy.array`
 836            Edge indices for each dimension (MxN).
 837        V : :obj:`~numpy.array`
 838            Array of intensity data (Mx1).
 839        radius : float or list
  840            Radius of the sparse filter in each dimension. Values less than
  841            one indicate no connectivity in that dimension.
 842
 843        Returns
 844        -------
 845        :obj:`~numpy.array`
 846            Filtered intensities (Mx1).
 847
 848        Notes
 849        -----
 850        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 851        This is a static method.
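
        Examples
        --------
        A minimal sketch on a 2 x 2 grid of indices, where every point is within
        a Chebyshev distance of 1 of every other point, so each filtered value is
        the mean of all four intensities:

        >>> import numpy as np
        >>> from corems.mass_spectra.calc.lc_calc import PHCalculations
        >>> idx = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        >>> V = np.array([1.0, 2.0, 3.0, 4.0])
        >>> PHCalculations.sparse_mean_filter(idx, V, radius=[1, 1])
        array([2.5, 2.5, 2.5, 2.5])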
 852        """
 853
 854        # Copy indices
 855        idx = idx.copy().astype(V.dtype)
 856
 857        # Scale
 858        for i, r in enumerate(radius):
 859            # Increase inter-index distance
 860            if r < 1:
 861                idx[:, i] *= 2
 862
 863            # Do nothing
 864            elif r == 1:
 865                pass
 866
 867            # Decrease inter-index distance
 868            else:
 869                idx[:, i] /= r
 870
 871        # Connectivity matrix
 872        cmat = KDTree(idx)
 873        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 874        cmat.setdiag(1)
 875
 876        # Pair indices
 877        I, J = cmat.nonzero()
 878
 879        # Delete cmat
 880        cmat_shape = cmat.shape
 881        del cmat
 882
 883        # Sum over columns
 884        V_sum = sparse.bsr_matrix(
 885            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 886        ).diagonal(0)
 887
 888        # Count over columns
 889        V_count = sparse.bsr_matrix(
 890            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 891        ).diagonal(0)
 892
 893        return V_sum / V_count
 894
 895    @staticmethod
 896    def embed_unique_indices(a):
 897        """Creates an array of indices, sorted by unique element.
 898
 899        Parameters
 900        ----------
 901        a : :obj:`~numpy.array`
 902            Array of unique elements (Mx1).
 903
 904        Returns
 905        -------
 906        :obj:`~numpy.array`
 907            Array of indices (Mx1).
 908
 909        Notes
 910        -----
 911        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 912        This is a static method.
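
        Examples
        --------
        A minimal sketch: duplicate values receive distinct fractional offsets so
        that every element becomes unique.

        >>> import numpy as np
        >>> from corems.mass_spectra.calc.lc_calc import PHCalculations
        >>> out = PHCalculations.embed_unique_indices(np.array([5.0, 5.0, 7.0]))
        >>> np.round(out, 1).tolist()
        [5.0, 5.1, 7.0]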
 913        """
 914
 915        def count_tens(n):
 916            # Count tens
 917            ntens = (n - 1) // 10
 918
 919            while True:
 920                ntens_test = (ntens + n - 1) // 10
 921
 922                if ntens_test == ntens:
 923                    return ntens
 924                else:
 925                    ntens = ntens_test
 926
 927        def arange_exclude_10s(n):
 928            # How many 10s will there be?
 929            ntens = count_tens(n)
 930
 931            # Base array
 932            arr = np.arange(0, n + ntens)
 933
 934            # Exclude 10s
 935            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 936
 937            return arr
 938
 939        # Creates an array of indices, sorted by unique element
 940        idx_sort = np.argsort(a)
 941        idx_unsort = np.argsort(idx_sort)
 942
 943        # Sorts records array so all unique elements are together
 944        sorted_a = a[idx_sort]
 945
 946        # Returns the unique values, the index of the first occurrence,
 947        # and the count for each element
 948        vals, idx_start, count = np.unique(
 949            sorted_a, return_index=True, return_counts=True
 950        )
 951
 952        # Splits the indices into separate arrays
 953        splits = np.split(idx_sort, idx_start[1:])
 954
 955        # Creates unique indices for each split
 956        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 957
 958        # Reorders according to input array
 959        idx_unq = idx_unq[idx_unsort]
 960
 961        # Magnitude of each index
 962        exp = np.log10(
 963            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 964        )
 965        idx_unq_mag = np.power(10, np.floor(exp) + 1)
 966
 967        # Result
 968        return a + idx_unq / idx_unq_mag
 969
 970    def sparse_upper_star(self, idx, V):
 971        """Sparse implementation of an upper star filtration.
 972
 973        Parameters
 974        ----------
 975        idx : :obj:`~numpy.array`
 976            Edge indices for each dimension (MxN).
 977        V : :obj:`~numpy.array`
  978            Array of intensity data (Mx1).

  979        Returns
 980        -------
 981        idx : :obj:`~numpy.array`
 982            Index of filtered points (Mx1).
 983        persistence : :obj:`~numpy.array`
 984            Persistence of each filtered point (Mx1).
 985
 986        Notes
 987        -----
 988        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 989        """
 990
 991        # Invert
 992        V = -1 * V.copy().astype(int)
 993
 994        # Embed indices
 995        V = self.embed_unique_indices(V)
 996
 997        # Connectivity matrix
 998        cmat = KDTree(idx)
 999        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1000        cmat.setdiag(1)
1001        cmat = sparse.triu(cmat)
1002
1003        # Pairwise minimums
1004        I, J = cmat.nonzero()
1005        d = np.maximum(V[I], V[J])
1006
 1007        # Delete connectivity matrix
1008        cmat_shape = cmat.shape
1009        del cmat
1010
1011        # Sparse distance matrix
1012        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1013
1014        # Delete pairwise mins
1015        del d, I, J
1016
 1017        # Persistent homology
1018        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1019
1020        # Bound death values
1021        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1022
1023        # Construct tree to query against
1024        tree = KDTree(V.reshape((-1, 1)))
1025
1026        # Get the indexes of the first nearest neighbor by birth
1027        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1028
1029        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1030
1031    def check_if_grid(self, data):
1032        """Check if the data are gridded in mz space.
1033
1034        Parameters
1035        ----------
1036        data : DataFrame
1037            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1038
1039        Returns
1040        -------
1041        bool
1042            True if the data is gridded in the mz direction, False otherwise.
1043
1044        Notes
1045        -----
1046        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
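
        Examples
        --------
        A minimal sketch, assuming ``lcms`` is an LCMSBase object (the method
        does not use any instance state):

        >>> import pandas as pd
        >>> df = pd.DataFrame(
        ...     {"scan": [1, 1, 2, 2], "mz": [100.0, 100.1, 100.0, 100.1]}
        ... )
        >>> lcms.check_if_grid(df)
        True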
1047        """
1048        # Calculate the difference between consecutive mz values in a single scan
1049        dat_check = data.copy().reset_index(drop=True)
1050        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1051        mz_diff_min = (
1052            dat_check.groupby("scan")["mz_diff"].min().min()
1053        )  # within each scan, what is the smallest mz difference between consecutive mz values
1054
 1055        # Find the minimum mz difference between mz values in the data, regardless of scan
1056        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1057        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1058        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1059
1060        # Get minimum mz_diff between mz values in the data
1061        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1062
1063        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1064        if mz_diff_min_raw < mz_diff_min:
1065            return False
1066        else:
1067            return True
1068
1069    def grid_data(self, data):
1070        """Grid the data in the mz dimension.
1071
1072        Data must be gridded prior to persistent homology calculations.
1073
1074        Parameters
1075        ----------
1076        data : DataFrame
1077            The input data containing mz, scan, scan_time, and intensity columns.
1078
1079        Returns
1080        -------
1081        DataFrame
1082            The gridded data with mz, scan, scan_time, and intensity columns.
1083
1084        Raises
1085        ------
1086        ValueError
1087            If gridding fails.
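
        Examples
        --------
        A usage sketch, assuming ``lcms`` is an LCMSBase object and ``df`` is a
        DataFrame of profile data with scan, scan_time, mz, and intensity
        columns:

        >>> gridded = lcms.grid_data(df)
        >>> lcms.check_if_grid(gridded)
        True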
1088        """
1089
1090        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1091        data_w = data.copy().reset_index(drop=True)
1092        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1093        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1094
1095        # Need high intensity mz values first so they are parents in the output pairs stack
1096        dat_mz = data_w[["mz", "intensity"]].sort_values(
1097            by=["intensity"], ascending=False
1098        )
1099        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1100
1101        # Construct KD tree
1102        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1103        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1104        sdm = sparse.triu(sdm, k=1)
1105        sdm.data = np.ones_like(sdm.data)
1106        distances = sdm.tocoo()
1107        pairs = np.stack((distances.row, distances.col), axis=1)
1108
1109        # Cull pairs to just get root
1110        to_drop = []
1111        while len(pairs) > 0:
1112            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1113            id_root_parents = np.isin(pairs[:, 0], root_parents)
1114            children_of_roots = np.unique(pairs[id_root_parents, 1])
1115            to_drop = np.append(to_drop, children_of_roots)
1116
1117            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1118            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1119            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1120        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1121        mz_dat_np = (
1122            dat_mz[["mz"]]
1123            .sort_values(by=["mz"])
1124            .reset_index(drop=True)
1125            .values.flatten()
1126        )
1127
1128        # Sort data by mz and recast mz to nearest value in mz_dat_np
1129        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1130        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1131        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1132
1133        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1134        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1135        new_data_w = (
1136            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1137            .groupby(["scan", "mz"])["intensity"]
1138            .sum()
1139            .reset_index()
1140        )
1141        new_data_w = (
1142            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1143            .reset_index(drop=True)
1144            .copy()
1145        )
1146
1147        # Check if grid worked and return
1148        if self.check_if_grid(new_data_w):
1149            return new_data_w
1150        else:
1151            raise ValueError("Gridding failed")
1152
1153    def find_mass_features_ph(self, ms_level=1, grid=True):
1154        """Find mass features within an LCMSBase object using persistent homology.
1155
1156        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1157
1158        Parameters
1159        ----------
1160        ms_level : int, optional
1161            The MS level to use. Default is 1.
1162        grid : bool, optional
1163            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1164
1165        Raises
1166        ------
1167        ValueError
1168            If no MS level data is found on the object.
1169            If data is not gridded and grid is False.
1170
1171        Returns
1172        -------
1173        None, but assigns the mass_features attribute to the object.
1174
1175        Notes
1176        -----
1177        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1178        """
 1179        # Check that ms_level is a key in self._ms_unprocessed
1180        if ms_level not in self._ms_unprocessed.keys():
1181            raise ValueError(
1182                "No MS level "
1183                + str(ms_level)
1184                + " data found, did you instantiate with parser specific to MS level?"
1185            )
1186
1187        # Get ms data
1188        data = self._ms_unprocessed[ms_level].copy()
1189
1190        # Drop rows with missing intensity values and reset index
1191        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1192
1193        # Threshold data
1194        dims = ["mz", "scan_time"]
1195        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1196        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1197
1198        # Check if gridded, if not, grid
1199        gridded_mz = self.check_if_grid(data_thres)
1200        if gridded_mz is False:
1201            if grid is False:
1202                raise ValueError(
 1203                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1204                )
1205            else:
1206                data_thres = self.grid_data(data_thres)
1207
 1208        # Add scan_time and build factors
1209        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1210        factors = {
1211            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1212            for dim in dims
 1213        }  # this returns a float64 index
1214
1215        # Build indexes
1216        index = {
1217            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1218            for dim in factors
1219        }
1220
1221        # Smooth data
1222        iterations = self.parameters.lc_ms.ph_smooth_it
1223        smooth_radius = [
1224            self.parameters.lc_ms.ph_smooth_radius_mz,
1225            self.parameters.lc_ms.ph_smooth_radius_scan,
1226        ]  # mz, scan_time smoothing radius (in steps)
1227
1228        index = np.vstack([index[dim] for dim in dims]).T
1229        V = data_thres["intensity"].values
1230        resid = np.inf
1231        for i in range(iterations):
1232            # Previous iteration
1233            V_prev = V.copy()
1234            resid_prev = resid
1235            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1236
1237            # Calculate residual with previous iteration
1238            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1239
1240            # Evaluate convergence
1241            if i > 0:
1242                # Percent change in residual
1243                test = np.abs(resid - resid_prev) / resid_prev
1244
1245                # Exit criteria
1246                if test <= 0:
1247                    break
1248
1249        # Overwrite values
1250        data_thres["intensity"] = V
1251
1252        # Use persistent homology to find regions of interest
1253        pidx, pers = self.sparse_upper_star(index, V)
1254        pidx = pidx[pers > 1]
1255        pers = pers[pers > 1]
1256
1257        # Get peaks
1258        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1259
1260        # Add persistence column
1261        peaks["persistence"] = pers
1262        mass_features = peaks.sort_values(
1263            by="persistence", ascending=False
1264        ).reset_index(drop=True)
1265
1266        # Filter by persistence threshold
1267        persistence_threshold = (
1268            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1269        )
1270        mass_features = mass_features.loc[
1271            mass_features["persistence"] > persistence_threshold, :
1272        ].reset_index(drop=True)
1273
1274        # Rename scan column to apex_scan
1275        mass_features = mass_features.rename(
1276            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1277        )
1278
1279        # Populate mass_features attribute
1280        self.mass_features = {}
1281        for row in mass_features.itertuples():
1282            row_dict = mass_features.iloc[row.Index].to_dict()
1283            lcms_feature = LCMSMassFeature(self, **row_dict)
1284            self.mass_features[lcms_feature.id] = lcms_feature
1285
1286        if self.parameters.lc_ms.verbose_processing:
1287            print("Found " + str(len(mass_features)) + " initial mass features")
1288
1289    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1290        """Cluster mass features
1291
 1292        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
1293
1294        Parameters
1295        ----------
1296        drop_children : bool, optional
1297            Whether to drop the mass features that are not cluster parents. Default is True.
1298        sort_by : str, optional
1299            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1300
1301        Raises
1302        ------
1303        ValueError
1304            If no mass features are found.
1305            If too many mass features are found.
1306
1307        Returns
1308        -------
1309        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1310        """
1311        verbose = self.parameters.lc_ms.verbose_processing
1312
1313        if self.mass_features is None:
1314            raise ValueError("No mass features found, run find_mass_features() first")
1315        if len(self.mass_features) > 400000:
1316            raise ValueError(
 1317                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1318            )
1319        dims = ["mz", "scan_time"]
1320        mf_df_og = self.mass_features_to_df()
1321        mf_df = mf_df_og.copy()
1322
1323        # Sort mass features by sort_by column, make mf_id its own column for easier bookkeeping
1324        mf_df = mf_df.sort_values(by=sort_by, ascending=False).reset_index(drop=False)
1325
1326        tol = [
1327            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1328            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1329        ]  # mz, in relative; scan_time in minutes
1330        relative = [True, False]
1331
1332        # Compute inter-feature distances
1333        distances = None
1334        for i in range(len(dims)):
1335            # Construct k-d tree
1336            values = mf_df[dims[i]].values
1337            tree = KDTree(values.reshape(-1, 1))
1338
1339            max_tol = tol[i]
1340            if relative[i] is True:
1341                # Maximum absolute tolerance
1342                max_tol = tol[i] * values.max()
1343
1344            # Compute sparse distance matrix
1345            # the larger the max_tol, the slower this operation is
1346            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1347
1348            # Only consider forward case, exclude diagonal
1349            sdm = sparse.triu(sdm, k=1)
1350
1351            # Filter relative distances
1352            if relative[i] is True:
1353                # Compute relative distances
1354                rel_dists = sdm.data / values[sdm.row]  # or col?
1355
1356                # Indices of relative distances less than tolerance
1357                idx = rel_dists <= tol[i]
1358
1359                # Reconstruct sparse distance matrix
1360                sdm = sparse.coo_matrix(
1361                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1362                    shape=(len(values), len(values)),
1363                )
1364
1365            # Cast as binary matrix
1366            sdm.data = np.ones_like(sdm.data)
1367
1368            # Stack distances
1369            if distances is None:
1370                distances = sdm
1371            else:
1372                distances = distances.multiply(sdm)
1373
1374        # Extract indices of within-tolerance points
1375        distances = distances.tocoo()
1376        pairs = np.stack((distances.row, distances.col), axis=1)
1377        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1378        pairs_df = pairs_df.set_index("parent")
1379
1380        to_drop = []
1381        while not pairs_df.empty:
1382            # Find root_parents and their children
1383            root_parents = np.setdiff1d(
1384                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1385            )
1386            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1387            to_drop = np.append(to_drop, children_of_roots)
1388
1389            # Remove root_children as possible parents from pairs_df for next iteration
1390            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1391            pairs_df = pairs_df.reset_index().set_index("child")
1392            # Remove root_children as possible children from pairs_df for next iteration
1393            pairs_df = pairs_df.drop(index=children_of_roots)
1394
1395            # Prepare for next iteration
1396            pairs_df = pairs_df.reset_index().set_index("parent")
1397
1398        # Drop mass features that are not cluster parents
1399        mf_df = mf_df.drop(index=np.array(to_drop))
1400
1401        # Set index back to mf_id
1402        mf_df = mf_df.set_index("mf_id")
1403        if verbose:
1404            print(str(len(mf_df)) + " mass features remaining")
1405
1406        mf_df_new = mf_df_og.copy()
1407        mf_df_new["cluster_parent"] = np.where(
1408            np.isin(mf_df_new.index, mf_df.index), True, False
1409        )
1410
1411        # get mass feature ids of features that are not cluster parents
1412        cluster_daughters = mf_df_new[mf_df_new["cluster_parent"] == False].index.values
1413        if drop_children is True:
1414            # Drop mass features that are not cluster parents from self
1415            self.mass_features = {
1416                k: v
1417                for k, v in self.mass_features.items()
1418                if k not in cluster_daughters
1419            }
1420        else:
1421            return cluster_daughters
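
As a minimal standalone sketch of the within-tolerance pairing step used by cluster_mass_features above (toy feature values and tolerances chosen only for illustration, not taken from the default parameters):

import numpy as np
from scipy import sparse
from scipy.spatial import KDTree

# Toy mass features: three (mz, scan_time) points; only the first two should pair up
mz = np.array([200.1000, 200.1005, 350.2000])
scan_time = np.array([5.00, 5.01, 9.50])  # minutes

dims = [mz, scan_time]
tol = [5e-6, 0.1]            # mz tolerance is relative, scan_time tolerance is absolute (minutes)
relative = [True, False]

distances = None
for values, t, rel in zip(dims, tol, relative):
    tree = KDTree(values.reshape(-1, 1))
    max_tol = t * values.max() if rel else t
    sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
    sdm = sparse.triu(sdm, k=1)                      # forward pairs only, exclude diagonal
    if rel:
        keep = sdm.data / values[sdm.row] <= t       # filter on relative distance
        sdm = sparse.coo_matrix(
            (sdm.data[keep], (sdm.row[keep], sdm.col[keep])),
            shape=(len(values), len(values)),
        )
    sdm.data = np.ones_like(sdm.data)                # binary "within tolerance" mask
    distances = sdm if distances is None else distances.multiply(sdm)

distances = distances.tocoo()
pairs = np.stack((distances.row, distances.col), axis=1)
print(pairs)   # expected: [[0 1]] -- only features 0 and 1 are within both tolerances
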
def find_closest(A, target):

Find the index of closest value in A to each value in target.

Parameters
  • A (~numpy.array): The array to search (blueprint). A must be sorted.
  • target (~numpy.array): The array of values to search for. target must be sorted.
Returns
  • ~numpy.array: The indices of the closest values in A to each value in target.
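
For instance, with small made-up sorted arrays:

import numpy as np
from corems.mass_spectra.calc.lc_calc import find_closest

A = np.array([100.0, 100.5, 101.0, 102.0])    # sorted "blueprint" axis
target = np.array([100.1, 100.9, 101.8])      # sorted query values

idx = find_closest(A, target)
print(idx)      # [0 2 3]
print(A[idx])   # [100. 101. 102.]
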
class LCCalculations:

Methods for performing LC calculations on mass spectra data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
  • smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
  • eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
  • find_nearest_scan(rt). Finds the nearest scan to the given retention time.
  • get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
  • find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
  • integrate_mass_features(drop_if_fail=False, ms_level=1). Integrate mass features of interest and extracts EICs.
  • find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
  • deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
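
Hedged sketch of a typical call order for these methods on an LCMSBase-derived object (the `lcms` argument and its prior setup, including file parsing and MS1 association via add_associated_ms1(), are outside this module and assumed here):

def process_lcms_features(lcms, ms_level=1):
    """Illustrative only: run the feature-finding methods in the documented order."""
    lcms.find_mass_features(ms_level=ms_level)       # peak picking + clustering
    # integrate_mass_features expects each mass feature to already have an associated
    # MS1 spectrum (see its error message referencing add_associated_ms1()).
    lcms.integrate_mass_features(drop_if_fail=True, ms_level=ms_level)
    lcms.add_peak_metrics()                          # half-height width, tailing, dispersity
    lcms.find_c13_mass_features()                    # flag likely C13 isotopologues
    lcms.deconvolute_ms1_mass_features()             # correlate co-eluting MS1 masses
    return lcms.mass_features
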
@staticmethod
def get_max_eic(eic_data: dict):

Returns the maximum EIC value from the given EIC data.

Notes

This is a static method.

Parameters
  • eic_data (dict): A dictionary containing EIC data.
Returns
  • float: The maximum EIC value.
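
For example, assuming each value in the dictionary carries an "EIC" intensity trace (made-up numbers):

from corems.mass_spectra.calc.lc_calc import LCCalculations

eic_data = {
    "mz_200.1": {"EIC": [0.0, 4.0e5, 1.2e6, 3.0e5]},
    "mz_350.2": {"EIC": [0.0, 2.5e6, 8.0e5]},
}
print(LCCalculations.get_max_eic(eic_data))   # 2500000.0
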
def smooth_tic(self, tic):

Smooths the TIC or EIC data using the specified smoothing method and settings.

Parameters
  • tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
  • numpy.ndarray: The smoothed TIC data.
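
A rough standalone illustration of this kind of smoothing, using scipy's Savitzky-Golay filter directly as a stand-in (the method itself routes through SignalProcessing.smooth_signal with the lc_ms smoothing parameters, e.g. smooth_method, smooth_window, and savgol_pol_order):

import numpy as np
from scipy.signal import savgol_filter

rng = np.random.default_rng(0)
rt = np.linspace(0, 10, 200)                                        # minutes
tic = 1e6 * np.exp(-((rt - 5.0) ** 2) / 0.5) + rng.normal(0, 2e4, rt.size)

tic_smoothed = savgol_filter(tic, window_length=11, polyorder=2)    # noise-damped trace
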
def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):

Performs EIC centroid detection on the given EIC data.

Parameters
  • rt (numpy.ndarray): The retention time data.
  • eic (numpy.ndarray): The EIC data.
  • max_eic (float): The maximum EIC value.
  • apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
  • numpy.ndarray: The indexes of left, apex, and right limits as a generator.
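
The (left, apex, right) index triplets it yields can be integrated directly, which is how integrate_mass_features computes peak areas; a standalone sketch with made-up data:

import numpy as np

time = np.linspace(4.0, 6.0, 50)                              # minutes
eic_smoothed = 1e6 * np.exp(-((time - 5.0) ** 2) / 0.01)      # synthetic smoothed EIC
include_indexes = [(10, 25, 40)]                              # as returned by the detector

left, apex, right = include_indexes[0]
area = np.trapz(eic_smoothed[left:right + 1], time[left:right + 1])
print(f"apex at {time[apex]:.2f} min, area {area:.3g}")
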
def find_nearest_scan(self, rt):

Finds the nearest scan to the given retention time.

Parameters
  • rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
  • int: The scan number of the nearest scan.
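
The same lookup written standalone (retention times and scan numbers are illustrative):

import numpy as np

retention_time = np.array([0.10, 0.35, 0.61, 0.88])   # minutes, one entry per scan
scans_number = np.array([1, 2, 3, 4])

rt = 0.60
nearest_scan = scans_number[np.abs(retention_time - rt).argmin()]
print(nearest_scan)   # 3
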
def add_peak_metrics(self):

Add peak metrics to the mass features.

This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.

def get_average_mass_spectrum( self, scan_list, apex_scan, spectrum_mode='profile', ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None):

Returns an averaged mass spectrum object

Parameters
  • scan_list (list): List of scan numbers to average.
  • apex_scan (int): Number of the apex scan
  • spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
  • ms_level (int, optional): The MS level to use. Defaults to 1.
  • auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
  • use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
  • perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
  • polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
  • ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
  • MassSpectrumProfile: The averaged mass spectrum object.
Raises
  • ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
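
Hedged usage sketch (the `lcms` object and the choice of scans are assumptions; scan_list should contain only MS1 scans present in the unprocessed data):

def average_around_apex(lcms, apex_scan, flanking_scans):
    """Illustrative only: average the apex scan with neighbouring MS1 scans."""
    scan_list = sorted(set(flanking_scans) | {apex_scan})
    return lcms.get_average_mass_spectrum(
        scan_list=scan_list,
        apex_scan=apex_scan,
        spectrum_mode="profile",
        ms_level=1,
        auto_process=True,
    )
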
def find_mass_features(self, ms_level=1, grid=True):

Find mass features within an LCMSBase object

Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.

Parameters
  • ms_level (int, optional): The MS level to use for peak picking. Default is 1.
  • grid (bool, optional): If True, will regrid the data (after checking whether it is already gridded) before running the persistent homology calculations. Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
  • None, but assigns the mass_features and eics attributes to the object.
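
Hedged usage sketch (`lcms` is an LCMSBase-derived object with profile-mode MS1 data; the peak picking method name comes from this module):

def pick_ms1_features(lcms):
    """Illustrative only: persistent-homology peak picking on MS1 profile data."""
    lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
    lcms.find_mass_features(ms_level=1, grid=True)
    return lcms.mass_features
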
def integrate_mass_features(self, drop_if_fail=True, drop_duplicates=True, ms_level=1):
368    def integrate_mass_features(
369        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
370    ):
371        """Integrate mass features and extract EICs.
372
373        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
374
375        Parameters
376        ----------
377        drop_if_fail : bool, optional
378            Whether to drop mass features if the EIC limit calculations fail.
379            Default is True.
380        drop_duplicates : bool, optional
381            Whether to drop mass features that appear to be duplicates
382            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
383            Default is True.
384        ms_level : int, optional
385            The MS level to use. Default is 1.
386
387        Raises
388        ------
389        ValueError
390            If no mass features are found.
391            If no MS level data is found for the given MS level (either in data or in the scan data)
392
393        Returns
394        -------
395        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
396
397        Notes
398        -----
399        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
400        """
401        # Check if there is data
402        if ms_level in self._ms_unprocessed.keys():
403            raw_data = self._ms_unprocessed[ms_level].copy()
404        else:
405            raise ValueError("No MS level " + str(ms_level) + " data found")
406        if self.mass_features is not None:
407            mf_df = self.mass_features_to_df().copy()
408        else:
409            raise ValueError(
410                "No mass features found, did you run find_mass_features() first?"
411            )
412        # Check if mass_spectrum exists on each mass feature
413        if not all(
414            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
415        ):
416            raise ValueError(
417                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
418            )
419
420        # Subset scan data to only include correct ms_level
421        scan_df_sub = self.scan_df[
422            self.scan_df["ms_level"] == int(ms_level)
423        ].reset_index(drop=True)
424        if scan_df_sub.empty:
425            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
426        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
427
428        mzs_to_extract = np.unique(mf_df["mz"].values)
429        mzs_to_extract.sort()
430
431        # Get EICs for each unique mz in mass features list
432        for mz in mzs_to_extract:
433            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
434            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
435            raw_data_sub = raw_data[
436                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
437            ].reset_index(drop=True)
438            raw_data_sub = (
439                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
440            )
441            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
442            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
443            myEIC = EIC_Data(
444                scans=raw_data_sub["scan"].values,
445                time=raw_data_sub["scan_time"].values,
446                eic=raw_data_sub["intensity"].values,
447            )
448            # Smooth EIC
449            smoothed_eic = self.smooth_tic(myEIC.eic)
450            smoothed_eic[smoothed_eic < 0] = 0
451            myEIC.eic_smoothed = smoothed_eic
452            self.eics[mz] = myEIC
453
454        # Get limits of mass features using EIC centroid detector and integrate
455        mf_df["area"] = np.nan
456        for idx, mass_feature in mf_df.iterrows():
457            mz = mass_feature.mz
458            apex_scan = mass_feature.apex_scan
459
460            # Pull EIC data and find apex scan index
461            myEIC = self.eics[mz]
462            self.mass_features[idx]._eic_data = myEIC
463            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
464
465            # Find left and right limits of peak using EIC centroid detector, add to EICData
466            centroid_eics = self.eic_centroid_detector(
467                myEIC.time,
468                myEIC.eic_smoothed,
469                mass_feature.intensity * 1.1,
470                apex_indexes=[int(apex_index)],
471            )
472            l_a_r_scan_idx = [i for i in centroid_eics]
473            if len(l_a_r_scan_idx) > 0:
474                # Add start and final scan to mass_features and EICData
475                left_scan, right_scan = (
476                    myEIC.scans[l_a_r_scan_idx[0][0]],
477                    myEIC.scans[l_a_r_scan_idx[0][2]],
478                )
479                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
480                myEIC.apexes = myEIC.apexes + mf_scan_apex
481                self.mass_features[idx].start_scan = left_scan
482                self.mass_features[idx].final_scan = right_scan
483
484                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
485                area = np.trapz(
486                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
487                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
488                )
489                mf_df.at[idx, "area"] = area
490                myEIC.areas = myEIC.areas + [area]
491                self.eics[mz] = myEIC
492                self.mass_features[idx]._area = area
493            else:
494                if drop_if_fail is True:
495                    self.mass_features.pop(idx)
496
497        if drop_duplicates:
498            # Prepare mass feature dataframe
499            mf_df = self.mass_features_to_df().copy()
500
501            # For each mass feature, find all mass features within the clustering tolerance (ppm) and drop them if their start and end scans fall within those of another mass feature
502            # Keep the first mass feature
503            for idx, mass_feature in mf_df.iterrows():
504                mz = mass_feature.mz
505                apex_scan = mass_feature.apex_scan
506
507                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
508                mf_df_sub = mf_df[
509                    mf_df["mz_diff_ppm"]
510                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
511                    * 10**6
512                ].copy()
513
514                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
515                for idx2, mass_feature2 in mf_df_sub.iterrows():
516                    if idx2 != idx:
517                        if (
518                            mass_feature2.start_scan >= mass_feature.start_scan
519                            and mass_feature2.final_scan <= mass_feature.final_scan
520                        ):
521                            if idx2 in self.mass_features.keys():
522                                self.mass_features.pop(idx2)

Integrate mass features and extract EICs.

Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

Parameters
  • drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
  • drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). Default is True.
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
  • None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes

drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
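
A hedged usage sketch, assuming `lcms` already holds mass features with an associated MS1 spectrum on each feature:

# Hypothetical sketch: integrate an EIC for every unique mass feature mz
lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)

# Each surviving mass feature now carries start_scan, final_scan, and area;
# lcms.eics maps each unique mz to its EIC_Data object
for mz, eic in lcms.eics.items():
    print(mz, eic.eic.max())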

def find_c13_mass_features(self):
524    def find_c13_mass_features(self):
525        """Mark likely C13 isotopes and connect to monoisotopic mass features.
526
527        Returns
528        -------
529        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
530
531        Raises
532        ------
533        ValueError
534            If no mass features are found.
535        """
536        verbose = self.parameters.lc_ms.verbose_processing
537        if verbose:
538            print("evaluating mass features for C13 isotopes")
539        if self.mass_features is None:
540            raise ValueError("No mass features found, run find_mass_features() first")
541
542        # Data prep for sparse distance matrix
543        dims = ["mz", "scan_time"]
544        mf_df = self.mass_features_to_df().copy()
545        # Drop mass features that have no area (these are likely to be noise)
546        mf_df = mf_df[mf_df["area"].notnull()]
547        mf_df["mf_id"] = mf_df.index.values
548        dims = ["mz", "scan_time"]
549
550        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
551        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
552
553        mz_diff = 1.003355  # C13-C12 mass difference
554        tol = [
555            mf_df["mz"].median()
556            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
557            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
558        ]  # mz, in relative; scan_time in minutes
559
560        # Compute inter-feature distances
561        distances = None
562        for i in range(len(dims)):
563            # Construct k-d tree
564            values = mf_df[dims[i]].values
565            tree = KDTree(values.reshape(-1, 1))
566
567            max_tol = tol[i]
568            if dims[i] == "mz":
569                # Maximum absolute tolerance
570                max_tol = mz_diff + tol[i]
571
572            # Compute sparse distance matrix
573            # the larger the max_tol, the slower this operation is
574            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
575
576            # Only consider forward case, exclude diagonal
577            sdm = sparse.triu(sdm, k=1)
578
579            if dims[i] == "mz":
580                min_tol = mz_diff - tol[i]
581                # Get only the ones that are above the min tol
582                idx = sdm.data > min_tol
583
584                # Reconstruct sparse distance matrix
585                sdm = sparse.coo_matrix(
586                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
587                    shape=(len(values), len(values)),
588                )
589
590            # Cast as binary matrix
591            sdm.data = np.ones_like(sdm.data)
592
593            # Stack distances
594            if distances is None:
595                distances = sdm
596            else:
597                distances = distances.multiply(sdm)
598
599        # Extract indices of within-tolerance points
600        distances = distances.tocoo()
601        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
602
603        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
604        pairs_mf = pairs.copy()
605        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
606        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
607
608        # Connect monoisotopic masses with isotopologues within mass_features
609        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
610        for mono in monos:
611            self.mass_features[mono].monoisotopic_mf_id = mono
612        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
613        while not pairs_iso_df.empty:
614            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
615            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
616            for iso in m1_isos:
617                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
618                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
619                if len(parent) > 1:
620                    # Choose the parent that is closest in time to the isotopologue
621                    parent_time = [self.mass_features[p].retention_time for p in parent]
622                    time_diff = [
623                        np.abs(self.mass_features[iso].retention_time - x)
624                        for x in parent_time
625                    ]
626                    parent = parent[np.argmin(time_diff)]
627                else:
628                    parent = parent[0]
629                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
630                    parent
631                ].monoisotopic_mf_id
632                if self.mass_features[iso].monoisotopic_mf_id is not None:
633                    mass_diff = (
634                        self.mass_features[iso].mz
635                        - self.mass_features[
636                            self.mass_features[iso].monoisotopic_mf_id
637                        ].mz
638                    )
639                    self.mass_features[iso].isotopologue_type = "13C" + str(
640                        int(round(mass_diff, 0))
641                    )
642
643            # Drop the mono and iso from the pairs_iso_df
644            pairs_iso_df = pairs_iso_df.drop(
645                index=monos, errors="ignore"
646            )  # Drop pairs where the parent is a child that is a child of a root
647            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
648            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
649
650            if not pairs_iso_df.empty:
651                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
652                monos = np.setdiff1d(
653                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
654                )
655        if verbose:
656            # Report fraction of compounds annotated with isotopes
657            mf_df["c13_flag"] = np.where(
658                np.logical_or(
659                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
660                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
661                ),
662                1,
663                0,
664            )
665            print(
666                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
667                + " of mass features have or are C13 isotopes"
668            )

Mark likely C13 isotopes and connect to monoisotopic mass features.

Returns
  • None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found.
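
A hedged usage sketch, assuming `lcms` contains integrated mass features (features without an area are excluded from the isotope search):

# Hypothetical sketch: flag 13C isotopologues and link them to their monoisotopic parents
lcms.find_c13_mass_features()

for mf_id, mf in lcms.mass_features.items():
    if getattr(mf, "isotopologue_type", None):
        print(mf_id, mf.isotopologue_type, "-> parent", mf.monoisotopic_mf_id)
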
def deconvolute_ms1_mass_features(self):
670    def deconvolute_ms1_mass_features(self):
671        """Deconvolute MS1 mass features
672
673        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum across the EIC of the mass feature
674
675        Parameters
676        ----------
677        None
678
679        Returns
680        -------
681        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
682        and associated_mass_features_deconvoluted attributes to the mass features in the
683        mass_features attribute of the LCMSBase object.
684
685        Raises
686        ------
687        ValueError
688            If no mass features are found, must run find_mass_features() first.
689            If no EICs are found, did you run integrate_mass_features() first?
690
691        """
692        # Checks for set mass_features and eics
693        if self.mass_features is None:
694            raise ValueError(
695                "No mass features found, did you run find_mass_features() first?"
696            )
697
698        if self.eics == {}:
699            raise ValueError(
700                "No EICs found, did you run integrate_mass_features() first?"
701            )
702
703        if 1 not in self._ms_unprocessed.keys():
704            raise ValueError("No unprocessed MS1 spectra found.")
705
706        # Prep ms1 data
707        ms1_data = self._ms_unprocessed[1].copy()
708        ms1_data = ms1_data.set_index("scan")
709
710        # Prep mass feature summary
711        mass_feature_df = self.mass_features_to_df()
712
713        # Loop through each mass feature
714        for mf_id, mass_feature in self.mass_features.items():
715            # Skip the mass feature if its mz does not match the mz of its ms1_peak
716            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
717                continue
718
719            # Get the left and right limits of the EIC of the mass feature
720            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
721
722            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
723            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
724            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
725
726            # Get the centroided masses of the mass feature
727            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
728
729            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
730            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
731                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
732            ]
733
734            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster mz tolerance
735            ms1_data_sub["mz_diff_rel"] = (
736                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
737                / ms1_data_sub["mz"]
738            )
739            ms1_data_sub = ms1_data_sub[
740                ms1_data_sub["mz_diff_rel"]
741                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
742            ].reset_index(drop=True)
743
744            # Group by mass_feature_mz and scan and sum intensity
745            ms1_data_sub_group = (
746                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
747                .sum()
748                .reset_index()
749            )
750
751            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
752            corr = (
753                ms1_data_sub_group.pivot(
754                    index="scan", columns="mass_feature_mz", values="intensity"
755                )
756                .fillna(0)
757                .corr()
758            )
759
760            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above the deconvolution minimum
761            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
762            corr_subset = corr.loc[mass_feature.mz,]
763            corr_subset = corr_subset[corr_subset > decon_corr_min]
764
765            # Get the masses from the mass spectrum that are the result of the deconvolution
766            mzs_decon = corr_subset.index.values
767
768            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
769            mzs_decon_idx = [
770                id
771                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
772                if mz in mzs_decon
773            ]
774            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
775
776            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
777            if (
778                mass_feature.ms1_peak.abundance
779                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
780            ):
781                mass_feature.mass_spectrum_deconvoluted_parent = True
782            else:
783                mass_feature.mass_spectrum_deconvoluted_parent = False
784
785            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
786            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
787            mass_feature_df_sub = mass_feature_df[
788                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
789                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
790            ].copy()
791            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
792            mass_feature_df_sub["mz_diff_ppm"] = [
793                np.abs(mzs_decon - mz).min() / mz * 10**6
794                for mz in mass_feature_df_sub["mz"]
795            ]
796            # Subset mass_feature_df to only include mass features within the cluster mz tolerance of the deconvoluted masses
797            mfs_associated_decon = mass_feature_df_sub[
798                mass_feature_df_sub["mz_diff_ppm"]
799                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
800            ].index.values
801
802            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Deconvolute MS1 mass features

Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum across the EIC of the mass feature

Parameters
  • None
Returns
  • None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found, must run find_mass_features() first. If no EICs are found, did you run integrate_mass_features() first?
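
A hedged usage sketch, assuming mass features and EICs have already been populated on `lcms`:

# Hypothetical sketch: deconvolute each mass feature's MS1 spectrum
lcms.parameters.lc_ms.ms1_deconvolution_corr_min = 0.8  # assumed correlation cutoff
lcms.deconvolute_ms1_mass_features()

for mf_id, mf in lcms.mass_features.items():
    if getattr(mf, "mass_spectrum_deconvoluted_parent", False):
        print(mf_id, "is parent to:", mf.associated_mass_features_deconvoluted)
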
class PHCalculations:
 805class PHCalculations:
 806    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 807
 808    Notes
 809    -----
 810    This class is intended to be used as a mixin for the LCMSBase class.
 811
 812    Methods
 813    -------
 814    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 815        Sparse implementation of a mean filter.
 816    * embed_unique_indices(a).
 817        Creates an array of indices, sorted by unique element.
 818    * sparse_upper_star(idx, V).
 819        Sparse implementation of an upper star filtration.
 820    * check_if_grid(data).
 821        Check if the data is gridded in mz space.
 822    * grid_data(data).
 823        Grid the data in the mz dimension.
 824    * find_mass_features_ph(ms_level=1, grid=True).
 825        Find mass features within an LCMSBase object using persistent homology.
 826    * cluster_mass_features(drop_children=True).
 827        Cluster regions of interest.
 828    """
 829
 830    @staticmethod
 831    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 832        """Sparse implementation of a mean filter.
 833
 834        Parameters
 835        ----------
 836        idx : :obj:`~numpy.array`
 837            Edge indices for each dimension (MxN).
 838        V : :obj:`~numpy.array`
 839            Array of intensity data (Mx1).
 840        radius : float or list
 841            Radius of the sparse filter in each dimension. Values less than
 842            zero indicate no connectivity in that dimension.
 843
 844        Returns
 845        -------
 846        :obj:`~numpy.array`
 847            Filtered intensities (Mx1).
 848
 849        Notes
 850        -----
 851        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 852        This is a static method.
 853        """
 854
 855        # Copy indices
 856        idx = idx.copy().astype(V.dtype)
 857
 858        # Scale
 859        for i, r in enumerate(radius):
 860            # Increase inter-index distance
 861            if r < 1:
 862                idx[:, i] *= 2
 863
 864            # Do nothing
 865            elif r == 1:
 866                pass
 867
 868            # Decrease inter-index distance
 869            else:
 870                idx[:, i] /= r
 871
 872        # Connectivity matrix
 873        cmat = KDTree(idx)
 874        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 875        cmat.setdiag(1)
 876
 877        # Pair indices
 878        I, J = cmat.nonzero()
 879
 880        # Delete cmat
 881        cmat_shape = cmat.shape
 882        del cmat
 883
 884        # Sum over columns
 885        V_sum = sparse.bsr_matrix(
 886            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 887        ).diagonal(0)
 888
 889        # Count over columns
 890        V_count = sparse.bsr_matrix(
 891            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 892        ).diagonal(0)
 893
 894        return V_sum / V_count
 895
 896    @staticmethod
 897    def embed_unique_indices(a):
 898        """Creates an array of indices, sorted by unique element.
 899
 900        Parameters
 901        ----------
 902        a : :obj:`~numpy.array`
 903            Array of unique elements (Mx1).
 904
 905        Returns
 906        -------
 907        :obj:`~numpy.array`
 908            Array of indices (Mx1).
 909
 910        Notes
 911        -----
 912        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 913        This is a static method.
 914        """
 915
 916        def count_tens(n):
 917            # Count tens
 918            ntens = (n - 1) // 10
 919
 920            while True:
 921                ntens_test = (ntens + n - 1) // 10
 922
 923                if ntens_test == ntens:
 924                    return ntens
 925                else:
 926                    ntens = ntens_test
 927
 928        def arange_exclude_10s(n):
 929            # How many 10s will there be?
 930            ntens = count_tens(n)
 931
 932            # Base array
 933            arr = np.arange(0, n + ntens)
 934
 935            # Exclude 10s
 936            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 937
 938            return arr
 939
 940        # Creates an array of indices, sorted by unique element
 941        idx_sort = np.argsort(a)
 942        idx_unsort = np.argsort(idx_sort)
 943
 944        # Sorts records array so all unique elements are together
 945        sorted_a = a[idx_sort]
 946
 947        # Returns the unique values, the index of the first occurrence,
 948        # and the count for each element
 949        vals, idx_start, count = np.unique(
 950            sorted_a, return_index=True, return_counts=True
 951        )
 952
 953        # Splits the indices into separate arrays
 954        splits = np.split(idx_sort, idx_start[1:])
 955
 956        # Creates unique indices for each split
 957        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 958
 959        # Reorders according to input array
 960        idx_unq = idx_unq[idx_unsort]
 961
 962        # Magnitude of each index
 963        exp = np.log10(
 964            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 965        )
 966        idx_unq_mag = np.power(10, np.floor(exp) + 1)
 967
 968        # Result
 969        return a + idx_unq / idx_unq_mag
 970
 971    def sparse_upper_star(self, idx, V):
 972        """Sparse implementation of an upper star filtration.
 973
 974        Parameters
 975        ----------
 976        idx : :obj:`~numpy.array`
 977            Edge indices for each dimension (MxN).
 978        V : :obj:`~numpy.array`
 979            Array of intensity data (Mx1).
 980        Returns
 981        -------
 982        idx : :obj:`~numpy.array`
 983            Index of filtered points (Mx1).
 984        persistence : :obj:`~numpy.array`
 985            Persistence of each filtered point (Mx1).
 986
 987        Notes
 988        -----
 989        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 990        """
 991
 992        # Invert
 993        V = -1 * V.copy().astype(int)
 994
 995        # Embed indices
 996        V = self.embed_unique_indices(V)
 997
 998        # Connectivity matrix
 999        cmat = KDTree(idx)
1000        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1001        cmat.setdiag(1)
1002        cmat = sparse.triu(cmat)
1003
1004        # Pairwise minimums
1005        I, J = cmat.nonzero()
1006        d = np.maximum(V[I], V[J])
1007
1008        # Delete connectivity matrix
1009        cmat_shape = cmat.shape
1010        del cmat
1011
1012        # Sparse distance matrix
1013        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1014
1015        # Delete pairwise mins
1016        del d, I, J
1017
1018        # Persistence homology
1019        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1020
1021        # Bound death values
1022        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1023
1024        # Construct tree to query against
1025        tree = KDTree(V.reshape((-1, 1)))
1026
1027        # Get the indexes of the first nearest neighbor by birth
1028        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1029
1030        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1031
1032    def check_if_grid(self, data):
1033        """Check if the data are gridded in mz space.
1034
1035        Parameters
1036        ----------
1037        data : DataFrame
1038            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1039
1040        Returns
1041        -------
1042        bool
1043            True if the data is gridded in the mz direction, False otherwise.
1044
1045        Notes
1046        -----
1047        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1048        """
1049        # Calculate the difference between consecutive mz values in a single scan
1050        dat_check = data.copy().reset_index(drop=True)
1051        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1052        mz_diff_min = (
1053            dat_check.groupby("scan")["mz_diff"].min().min()
1054        )  # within each scan, what is the smallest mz difference between consecutive mz values
1055
1056        # Find the minimum mz difference between mz values in the data, regardless of scan
1057        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1058        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1059        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1060
1061        # Get minimum mz_diff between mz values in the data
1062        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1063
1064        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1065        if mz_diff_min_raw < mz_diff_min:
1066            return False
1067        else:
1068            return True
1069
1070    def grid_data(self, data):
1071        """Grid the data in the mz dimension.
1072
1073        Data must be gridded prior to persistent homology calculations.
1074
1075        Parameters
1076        ----------
1077        data : DataFrame
1078            The input data containing mz, scan, scan_time, and intensity columns.
1079
1080        Returns
1081        -------
1082        DataFrame
1083            The gridded data with mz, scan, scan_time, and intensity columns.
1084
1085        Raises
1086        ------
1087        ValueError
1088            If gridding fails.
1089        """
1090
1091        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1092        data_w = data.copy().reset_index(drop=True)
1093        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1094        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1095
1096        # Need high intensity mz values first so they are parents in the output pairs stack
1097        dat_mz = data_w[["mz", "intensity"]].sort_values(
1098            by=["intensity"], ascending=False
1099        )
1100        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1101
1102        # Construct KD tree
1103        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1104        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1105        sdm = sparse.triu(sdm, k=1)
1106        sdm.data = np.ones_like(sdm.data)
1107        distances = sdm.tocoo()
1108        pairs = np.stack((distances.row, distances.col), axis=1)
1109
1110        # Cull pairs to just get root
1111        to_drop = []
1112        while len(pairs) > 0:
1113            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1114            id_root_parents = np.isin(pairs[:, 0], root_parents)
1115            children_of_roots = np.unique(pairs[id_root_parents, 1])
1116            to_drop = np.append(to_drop, children_of_roots)
1117
1118            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1119            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1120            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1121        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1122        mz_dat_np = (
1123            dat_mz[["mz"]]
1124            .sort_values(by=["mz"])
1125            .reset_index(drop=True)
1126            .values.flatten()
1127        )
1128
1129        # Sort data by mz and recast mz to nearest value in mz_dat_np
1130        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1131        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1132        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1133
1134        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1135        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1136        new_data_w = (
1137            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1138            .groupby(["scan", "mz"])["intensity"]
1139            .sum()
1140            .reset_index()
1141        )
1142        new_data_w = (
1143            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1144            .reset_index(drop=True)
1145            .copy()
1146        )
1147
1148        # Check if grid worked and return
1149        if self.check_if_grid(new_data_w):
1150            return new_data_w
1151        else:
1152            raise ValueError("Gridding failed")
1153
1154    def find_mass_features_ph(self, ms_level=1, grid=True):
1155        """Find mass features within an LCMSBase object using persistent homology.
1156
1157        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1158
1159        Parameters
1160        ----------
1161        ms_level : int, optional
1162            The MS level to use. Default is 1.
1163        grid : bool, optional
1164            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1165
1166        Raises
1167        ------
1168        ValueError
1169            If no MS level data is found on the object.
1170            If data is not gridded and grid is False.
1171
1172        Returns
1173        -------
1174        None, but assigns the mass_features attribute to the object.
1175
1176        Notes
1177        -----
1178        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1179        """
1180        # Check that ms_level is a key in self._ms_unprocessed
1181        if ms_level not in self._ms_unprocessed.keys():
1182            raise ValueError(
1183                "No MS level "
1184                + str(ms_level)
1185                + " data found, did you instantiate with parser specific to MS level?"
1186            )
1187
1188        # Get ms data
1189        data = self._ms_unprocessed[ms_level].copy()
1190
1191        # Drop rows with missing intensity values and reset index
1192        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1193
1194        # Threshold data
1195        dims = ["mz", "scan_time"]
1196        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1197        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1198
1199        # Check if gridded, if not, grid
1200        gridded_mz = self.check_if_grid(data_thres)
1201        if gridded_mz is False:
1202            if grid is False:
1203                raise ValueError(
1204                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1205                )
1206            else:
1207                data_thres = self.grid_data(data_thres)
1208
1209        # Add scan_time and build factors
1210        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1211        factors = {
1212            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1213            for dim in dims
1214        }  # factorize otherwise returns a float64 index
1215
1216        # Build indexes
1217        index = {
1218            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1219            for dim in factors
1220        }
1221
1222        # Smooth data
1223        iterations = self.parameters.lc_ms.ph_smooth_it
1224        smooth_radius = [
1225            self.parameters.lc_ms.ph_smooth_radius_mz,
1226            self.parameters.lc_ms.ph_smooth_radius_scan,
1227        ]  # mz, scan_time smoothing radius (in steps)
1228
1229        index = np.vstack([index[dim] for dim in dims]).T
1230        V = data_thres["intensity"].values
1231        resid = np.inf
1232        for i in range(iterations):
1233            # Previous iteration
1234            V_prev = V.copy()
1235            resid_prev = resid
1236            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1237
1238            # Calculate residual with previous iteration
1239            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1240
1241            # Evaluate convergence
1242            if i > 0:
1243                # Percent change in residual
1244                test = np.abs(resid - resid_prev) / resid_prev
1245
1246                # Exit criteria
1247                if test <= 0:
1248                    break
1249
1250        # Overwrite values
1251        data_thres["intensity"] = V
1252
1253        # Use persistent homology to find regions of interest
1254        pidx, pers = self.sparse_upper_star(index, V)
1255        pidx = pidx[pers > 1]
1256        pers = pers[pers > 1]
1257
1258        # Get peaks
1259        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1260
1261        # Add persistence column
1262        peaks["persistence"] = pers
1263        mass_features = peaks.sort_values(
1264            by="persistence", ascending=False
1265        ).reset_index(drop=True)
1266
1267        # Filter by persistence threshold
1268        persistence_threshold = (
1269            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1270        )
1271        mass_features = mass_features.loc[
1272            mass_features["persistence"] > persistence_threshold, :
1273        ].reset_index(drop=True)
1274
1275        # Rename scan column to apex_scan and scan_time to retention_time
1276        mass_features = mass_features.rename(
1277            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1278        )
1279
1280        # Populate mass_features attribute
1281        self.mass_features = {}
1282        for row in mass_features.itertuples():
1283            row_dict = mass_features.iloc[row.Index].to_dict()
1284            lcms_feature = LCMSMassFeature(self, **row_dict)
1285            self.mass_features[lcms_feature.id] = lcms_feature
1286
1287        if self.parameters.lc_ms.verbose_processing:
1288            print("Found " + str(len(mass_features)) + " initial mass features")
1289
1290    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1291        """Cluster mass features
1292
1293        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence.
1294
1295        Parameters
1296        ----------
1297        drop_children : bool, optional
1298            Whether to drop the mass features that are not cluster parents. Default is True.
1299        sort_by : str, optional
1300            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1301
1302        Raises
1303        ------
1304        ValueError
1305            If no mass features are found.
1306            If too many mass features are found.
1307
1308        Returns
1309        -------
1310        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1311        """
1312        verbose = self.parameters.lc_ms.verbose_processing
1313
1314        if self.mass_features is None:
1315            raise ValueError("No mass features found, run find_mass_features() first")
1316        if len(self.mass_features) > 400000:
1317            raise ValueError(
1318                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1319            )
1320        dims = ["mz", "scan_time"]
1321        mf_df_og = self.mass_features_to_df()
1322        mf_df = mf_df_og.copy()
1323
1324        # Sort mass features by sort_by column, make mf_id its own column for easier bookkeeping
1325        mf_df = mf_df.sort_values(by=sort_by, ascending=False).reset_index(drop=False)
1326
1327        tol = [
1328            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1329            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1330        ]  # mz, in relative; scan_time in minutes
1331        relative = [True, False]
1332
1333        # Compute inter-feature distances
1334        distances = None
1335        for i in range(len(dims)):
1336            # Construct k-d tree
1337            values = mf_df[dims[i]].values
1338            tree = KDTree(values.reshape(-1, 1))
1339
1340            max_tol = tol[i]
1341            if relative[i] is True:
1342                # Maximum absolute tolerance
1343                max_tol = tol[i] * values.max()
1344
1345            # Compute sparse distance matrix
1346            # the larger the max_tol, the slower this operation is
1347            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1348
1349            # Only consider forward case, exclude diagonal
1350            sdm = sparse.triu(sdm, k=1)
1351
1352            # Filter relative distances
1353            if relative[i] is True:
1354                # Compute relative distances
1355                rel_dists = sdm.data / values[sdm.row]  # or col?
1356
1357                # Indices of relative distances less than tolerance
1358                idx = rel_dists <= tol[i]
1359
1360                # Reconstruct sparse distance matrix
1361                sdm = sparse.coo_matrix(
1362                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1363                    shape=(len(values), len(values)),
1364                )
1365
1366            # Cast as binary matrix
1367            sdm.data = np.ones_like(sdm.data)
1368
1369            # Stack distances
1370            if distances is None:
1371                distances = sdm
1372            else:
1373                distances = distances.multiply(sdm)
1374
1375        # Extract indices of within-tolerance points
1376        distances = distances.tocoo()
1377        pairs = np.stack((distances.row, distances.col), axis=1)
1378        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1379        pairs_df = pairs_df.set_index("parent")
1380
1381        to_drop = []
1382        while not pairs_df.empty:
1383            # Find root_parents and their children
1384            root_parents = np.setdiff1d(
1385                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1386            )
1387            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1388            to_drop = np.append(to_drop, children_of_roots)
1389
1390            # Remove root_children as possible parents from pairs_df for next iteration
1391            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1392            pairs_df = pairs_df.reset_index().set_index("child")
1393            # Remove root_children as possible children from pairs_df for next iteration
1394            pairs_df = pairs_df.drop(index=children_of_roots)
1395
1396            # Prepare for next iteration
1397            pairs_df = pairs_df.reset_index().set_index("parent")
1398
1399        # Drop mass features that are not cluster parents
1400        mf_df = mf_df.drop(index=np.array(to_drop))
1401
1402        # Set index back to mf_id
1403        mf_df = mf_df.set_index("mf_id")
1404        if verbose:
1405            print(str(len(mf_df)) + " mass features remaining")
1406
1407        mf_df_new = mf_df_og.copy()
1408        mf_df_new["cluster_parent"] = np.where(
1409            np.isin(mf_df_new.index, mf_df.index), True, False
1410        )
1411
1412        # get mass feature ids of features that are not cluster parents
1413        cluster_daughters = mf_df_new[mf_df_new["cluster_parent"] == False].index.values
1414        if drop_children is True:
1415            # Drop mass features that are not cluster parents from self
1416            self.mass_features = {
1417                k: v
1418                for k, v in self.mass_features.items()
1419                if k not in cluster_daughters
1420            }
1421        else:
1422            return cluster_daughters

Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
  • embed_unique_indices(a). Creates an array of indices, sorted by unique element.
  • sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
  • check_if_grid(data). Check if the data is gridded in mz space.
  • grid_data(data). Grid the data in the mz dimension.
  • find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
  • cluster_mass_features(drop_children=True). Cluster regions of interest.
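
A hedged sketch of how these methods are typically combined on an LCMSBase object (which mixes in this class), assuming profile-mode MS1 data has been loaded:

# Hypothetical sketch of the persistent homology workflow
lcms.find_mass_features_ph(ms_level=1, grid=True)                      # detect candidate features
lcms.cluster_mass_features(drop_children=True, sort_by="persistence")  # keep only cluster parents
print(str(len(lcms.mass_features)) + " mass features after clustering")
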
@staticmethod
def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
830    @staticmethod
831    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
832        """Sparse implementation of a mean filter.
833
834        Parameters
835        ----------
836        idx : :obj:`~numpy.array`
837            Edge indices for each dimension (MxN).
838        V : :obj:`~numpy.array`
839            Array of intensity data (Mx1).
840        radius : float or list
841            Radius of the sparse filter in each dimension. Values less than
842            zero indicate no connectivity in that dimension.
843
844        Returns
845        -------
846        :obj:`~numpy.array`
847            Filtered intensities (Mx1).
848
849        Notes
850        -----
851        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
852        This is a static method.
853        """
854
855        # Copy indices
856        idx = idx.copy().astype(V.dtype)
857
858        # Scale
859        for i, r in enumerate(radius):
860            # Increase inter-index distance
861            if r < 1:
862                idx[:, i] *= 2
863
864            # Do nothing
865            elif r == 1:
866                pass
867
868            # Decrease inter-index distance
869            else:
870                idx[:, i] /= r
871
872        # Connectivity matrix
873        cmat = KDTree(idx)
874        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
875        cmat.setdiag(1)
876
877        # Pair indices
878        I, J = cmat.nonzero()
879
880        # Delete cmat
881        cmat_shape = cmat.shape
882        del cmat
883
884        # Sum over columns
885        V_sum = sparse.bsr_matrix(
886            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
887        ).diagonal(0)
888
889        # Count over columns
890        V_count = sparse.bsr_matrix(
891            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
892        ).diagonal(0)
893
894        return V_sum / V_count

Sparse implementation of a mean filter.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
  • radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
  • ~numpy.array: Filtered intensities (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
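
Because this is a static method, it can be exercised directly on a small synthetic grid. A minimal sketch (the 3x3 grid and intensities are illustrative only):

import numpy as np

from corems.mass_spectra.calc.lc_calc import PHCalculations

# 3x3 grid of (mz index, scan index) pairs with one intense centre point
idx = np.array([[i, j] for i in range(3) for j in range(3)], dtype=float)
V = np.ones(9)
V[4] = 10.0

smoothed = PHCalculations.sparse_mean_filter(idx, V, radius=[1, 1])
print(smoothed[4])  # centre value averaged with its 8 neighbours: (10 + 8) / 9 = 2.0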

@staticmethod
def embed_unique_indices(a):
896    @staticmethod
897    def embed_unique_indices(a):
898        """Creates an array of indices, sorted by unique element.
899
900        Parameters
901        ----------
902        a : :obj:`~numpy.array`
903            Array of unique elements (Mx1).
904
905        Returns
906        -------
907        :obj:`~numpy.array`
908            Array of indices (Mx1).
909
910        Notes
911        -----
912        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
913        This is a static method.
914        """
915
916        def count_tens(n):
917            # Count tens
918            ntens = (n - 1) // 10
919
920            while True:
921                ntens_test = (ntens + n - 1) // 10
922
923                if ntens_test == ntens:
924                    return ntens
925                else:
926                    ntens = ntens_test
927
928        def arange_exclude_10s(n):
929            # How many 10s will there be?
930            ntens = count_tens(n)
931
932            # Base array
933            arr = np.arange(0, n + ntens)
934
935            # Exclude 10s
936            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
937
938            return arr
939
940        # Creates an array of indices, sorted by unique element
941        idx_sort = np.argsort(a)
942        idx_unsort = np.argsort(idx_sort)
943
944        # Sorts records array so all unique elements are together
945        sorted_a = a[idx_sort]
946
947        # Returns the unique values, the index of the first occurrence,
948        # and the count for each element
949        vals, idx_start, count = np.unique(
950            sorted_a, return_index=True, return_counts=True
951        )
952
953        # Splits the indices into separate arrays
954        splits = np.split(idx_sort, idx_start[1:])
955
956        # Creates unique indices for each split
957        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
958
959        # Reorders according to input array
960        idx_unq = idx_unq[idx_unsort]
961
962        # Magnitude of each index
963        exp = np.log10(
964            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
965        )
966        idx_unq_mag = np.power(10, np.floor(exp) + 1)
967
968        # Result
969        return a + idx_unq / idx_unq_mag

Creates an array of indices, sorted by unique element.

Parameters
  • a (~numpy.array): Array of unique elements (Mx1).
Returns
  • ~numpy.array: Array of indices (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos This is a static method.
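
A small illustration of the embedding (static method, callable on the class directly):

import numpy as np

from corems.mass_spectra.calc.lc_calc import PHCalculations

a = np.array([5, 5, 5, 7])
print(PHCalculations.embed_unique_indices(a))
# Duplicate values receive distinct fractional offsets: [5.0, 5.1, 5.2, 7.0]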

def sparse_upper_star(self, idx, V):
 971    def sparse_upper_star(self, idx, V):
 972        """Sparse implementation of an upper star filtration.
 973
 974        Parameters
 975        ----------
 976        idx : :obj:`~numpy.array`
 977            Edge indices for each dimension (MxN).
 978        V : :obj:`~numpy.array`
 979            Array of intensity data (Mx1).
 980        Returns
 981        -------
 982        idx : :obj:`~numpy.array`
 983            Index of filtered points (Mx1).
 984        persistence : :obj:`~numpy.array`
 985            Persistence of each filtered point (Mx1).
 986
 987        Notes
 988        -----
 989        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 990        """
 991
 992        # Invert
 993        V = -1 * V.copy().astype(int)
 994
 995        # Embed indices
 996        V = self.embed_unique_indices(V)
 997
 998        # Connectivity matrix
 999        cmat = KDTree(idx)
1000        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1001        cmat.setdiag(1)
1002        cmat = sparse.triu(cmat)
1003
1004        # Pairwise minimums
1005        I, J = cmat.nonzero()
1006        d = np.maximum(V[I], V[J])
1007
1008        # Delete connectivity matrix
1009        cmat_shape = cmat.shape
1010        del cmat
1011
1012        # Sparse distance matrix
1013        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1014
1015        # Delete pairwise mins
1016        del d, I, J
1017
1018        # Persistent homology
1019        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1020
1021        # Bound death values
1022        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1023
1024        # Construct tree to query against
1025        tree = KDTree(V.reshape((-1, 1)))
1026
1027        # Get the indexes of the first nearest neighbor by birth
1028        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1029
1030        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

Sparse implementation of an upper star filtration.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
Returns
  • idx (~numpy.array): Index of filtered points (Mx1).
  • persistence (~numpy.array): Persistence of each filtered point (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
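
A hedged sketch on a tiny synthetic grid (requires the ripser dependency; `lcms` is assumed to be an object that mixes in PHCalculations):

import numpy as np

# 3x3 grid with a single intense centre point
idx = np.array([[i, j] for i in range(3) for j in range(3)])
V = np.array([1, 1, 1, 1, 10, 1, 1, 1, 1], dtype=float)

pidx, pers = lcms.sparse_upper_star(idx, V)
# pidx indexes the rows that seed connected components; the intense centre point
# should come back with a persistence of roughly 9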

def check_if_grid(self, data):
1032    def check_if_grid(self, data):
1033        """Check if the data are gridded in mz space.
1034
1035        Parameters
1036        ----------
1037        data : DataFrame
1038            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1039
1040        Returns
1041        -------
1042        bool
1043            True if the data is gridded in the mz direction, False otherwise.
1044
1045        Notes
1046        -----
1047        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1048        """
1049        # Calculate the difference between consecutive mz values in a single scan
1050        dat_check = data.copy().reset_index(drop=True)
1051        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1052        mz_diff_min = (
1053            dat_check.groupby("scan")["mz_diff"].min().min()
1054        )  # within each scan, what is the smallest mz difference between consecutive mz values
1055
1056        # Find the minimum mz difference between mz values in the data, regardless of scan
1057        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1058        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1059        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1060
1061        # Get minimum mz_diff between mz values in the data
1062        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1063
1064        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1065        if mz_diff_min_raw < mz_diff_min:
1066            return False
1067        else:
1068            return True
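
To make the check concrete, here is a toy example (hypothetical m/z values) of gridded versus non-gridded data. On an object exposing this mixin, check_if_grid would be expected to return True for the first frame and False for the second, because the second frame's global minimum m/z spacing (0.005) is finer than any within-scan spacing (0.01).

import pandas as pd

# Gridded: both scans draw their m/z values from one shared axis
gridded = pd.DataFrame(
    {"scan": [1, 1, 1, 2, 2], "mz": [100.00, 100.01, 100.02, 100.00, 100.02]}
)

# Not gridded: scan 2 is offset from scan 1's axis, so unique m/z values
# interleave and the global minimum spacing drops below the within-scan spacing
ungridded = pd.DataFrame(
    {"scan": [1, 1, 1, 2, 2], "mz": [100.00, 100.01, 100.02, 100.005, 100.015]}
)

# e.g., lcms_obj.check_if_grid(gridded) -> True; lcms_obj.check_if_grid(ungridded) -> False
# (lcms_obj stands for any LCMSBase-derived object that uses this mixin)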

1070    def grid_data(self, data):
1071        """Grid the data in the mz dimension.
1072
1073        Data must be gridded prior to persistent homology calculations.
1074
1075        Parameters
1076        ----------
1077        data : DataFrame
1078            The input data containing mz, scan, scan_time, and intensity columns.
1079
1080        Returns
1081        -------
1082        DataFrame
1083            The gridded data with mz, scan, scan_time, and intensity columns.
1084
1085        Raises
1086        ------
1087        ValueError
1088            If gridding fails.
1089        """
1090
1091        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1092        data_w = data.copy().reset_index(drop=True)
1093        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1094        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1095
1096        # Need high intensity mz values first so they are parents in the output pairs stack
1097        dat_mz = data_w[["mz", "intensity"]].sort_values(
1098            by=["intensity"], ascending=False
1099        )
1100        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1101
1102        # Construct KD tree
1103        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1104        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1105        sdm = sparse.triu(sdm, k=1)
1106        sdm.data = np.ones_like(sdm.data)
1107        distances = sdm.tocoo()
1108        pairs = np.stack((distances.row, distances.col), axis=1)
1109
1110        # Cull pairs to just get root
1111        to_drop = []
1112        while len(pairs) > 0:
1113            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1114            id_root_parents = np.isin(pairs[:, 0], root_parents)
1115            children_of_roots = np.unique(pairs[id_root_parents, 1])
1116            to_drop = np.append(to_drop, children_of_roots)
1117
1118            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1119            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1120            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1121        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1122        mz_dat_np = (
1123            dat_mz[["mz"]]
1124            .sort_values(by=["mz"])
1125            .reset_index(drop=True)
1126            .values.flatten()
1127        )
1128
1129        # Sort data by mz and recast mz to nearest value in mz_dat_np
1130        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1131        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1132        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1133
1134        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1135        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1136        new_data_w = (
1137            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1138            .groupby(["scan", "mz"])["intensity"]
1139            .sum()
1140            .reset_index()
1141        )
1142        new_data_w = (
1143            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1144            .reset_index(drop=True)
1145            .copy()
1146        )
1147
1148        # Check if grid worked and return
1149        if self.check_if_grid(new_data_w):
1150            return new_data_w
1151        else:
1152            raise ValueError("Gridding failed")
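
The snap-to-grid step above leans on the module-level find_closest helper; a small standalone sketch of that recasting operation on toy values (both arrays must be sorted ascending, and the import path assumes the module shown in the header):

import numpy as np

from corems.mass_spectra.calc.lc_calc import find_closest

# Reduced m/z axis produced by the culling step, and raw observations to recast
mz_axis = np.array([100.000, 100.010, 100.020])
raw_mz = np.array([100.001, 100.009, 100.019])

# Each raw m/z is replaced by the nearest value on the axis
snapped = mz_axis[find_closest(mz_axis, raw_mz)]
# snapped -> array([100.00, 100.01, 100.02])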

1154    def find_mass_features_ph(self, ms_level=1, grid=True):
1155        """Find mass features within an LCMSBase object using persistent homology.
1156
1157        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1158
1159        Parameters
1160        ----------
1161        ms_level : int, optional
1162            The MS level to use. Default is 1.
1163        grid : bool, optional
1164            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1165
1166        Raises
1167        ------
1168        ValueError
1169            If no MS level data is found on the object.
1170            If data is not gridded and grid is False.
1171
1172        Returns
1173        -------
1174        None, but assigns the mass_features attribute to the object.
1175
1176        Notes
1177        -----
1178        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1179        """
1180        # Check that ms_level is a key in self._ms_unprocessed
1181        if ms_level not in self._ms_unprocessed.keys():
1182            raise ValueError(
1183                "No MS level "
1184                + str(ms_level)
1185                + " data found, did you instantiate with parser specific to MS level?"
1186            )
1187
1188        # Get ms data
1189        data = self._ms_unprocessed[ms_level].copy()
1190
1191        # Drop rows with missing intensity values and reset index
1192        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1193
1194        # Threshold data
1195        dims = ["mz", "scan_time"]
1196        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1197        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1198
1199        # Check if gridded, if not, grid
1200        gridded_mz = self.check_if_grid(data_thres)
1201        if gridded_mz is False:
1202            if grid is False:
1203                raise ValueError(
1204                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1205                )
1206            else:
1207                data_thres = self.grid_data(data_thres)
1208
1209        # Add scan_time and build dimension factors
1210        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1211        factors = {
1212            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1213            for dim in dims
1214        }  # factorize returns a float64 index, so cast to float32
1215
1216        # Build indexes
1217        index = {
1218            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1219            for dim in factors
1220        }
1221
1222        # Smooth data
1223        iterations = self.parameters.lc_ms.ph_smooth_it
1224        smooth_radius = [
1225            self.parameters.lc_ms.ph_smooth_radius_mz,
1226            self.parameters.lc_ms.ph_smooth_radius_scan,
1227        ]  # mz, scan_time smoothing radius (in steps)
1228
1229        index = np.vstack([index[dim] for dim in dims]).T
1230        V = data_thres["intensity"].values
1231        resid = np.inf
1232        for i in range(iterations):
1233            # Previous iteration
1234            V_prev = V.copy()
1235            resid_prev = resid
1236            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1237
1238            # Calculate residual with previous iteration
1239            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1240
1241            # Evaluate convergence
1242            if i > 0:
1243                # Percent change in residual
1244                test = np.abs(resid - resid_prev) / resid_prev
1245
1246                # Exit criteria
1247                if test <= 0:
1248                    break
1249
1250        # Overwrite values
1251        data_thres["intensity"] = V
1252
1253        # Use persistent homology to find regions of interest
1254        pidx, pers = self.sparse_upper_star(index, V)
1255        pidx = pidx[pers > 1]
1256        pers = pers[pers > 1]
1257
1258        # Get peaks
1259        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1260
1261        # Add persistence column
1262        peaks["persistence"] = pers
1263        mass_features = peaks.sort_values(
1264            by="persistence", ascending=False
1265        ).reset_index(drop=True)
1266
1267        # Filter by persistence threshold
1268        persistence_threshold = (
1269            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1270        )
1271        mass_features = mass_features.loc[
1272            mass_features["persistence"] > persistence_threshold, :
1273        ].reset_index(drop=True)
1274
1275        # Rename scan column to apex_scan
1276        mass_features = mass_features.rename(
1277            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1278        )
1279
1280        # Populate mass_features attribute
1281        self.mass_features = {}
1282        for row in mass_features.itertuples():
1283            row_dict = mass_features.iloc[row.Index].to_dict()
1284            lcms_feature = LCMSMassFeature(self, **row_dict)
1285            self.mass_features[lcms_feature.id] = lcms_feature
1286
1287        if self.parameters.lc_ms.verbose_processing:
1288            print("Found " + str(len(mass_features)) + " initial mass features")
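
A usage sketch (assuming lcms_obj is an LCMSBase-derived object whose _ms_unprocessed, scan_df, and parameters attributes have already been populated by an upstream parser; the parameter values are purely illustrative, not recommendations):

# Relative thresholds used by the persistent homology search (illustrative values)
lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001   # keep points above 0.1% of max intensity
lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001  # keep features above 0.1% persistence
lcms_obj.parameters.lc_ms.ph_smooth_it = 7           # maximum smoothing iterations

# Detect MS1 mass features, regridding the m/z axis first if necessary
lcms_obj.find_mass_features_ph(ms_level=1, grid=True)

# mass_features is a dict of LCMSMassFeature objects keyed by feature id
print(len(lcms_obj.mass_features), "mass features found")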

1290    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1291        """Cluster mass features
1292
1293        Clusters features based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence.
1294
1295        Parameters
1296        ----------
1297        drop_children : bool, optional
1298            Whether to drop the mass features that are not cluster parents. Default is True.
1299        sort_by : str, optional
1300            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
1301
1302        Raises
1303        ------
1304        ValueError
1305            If no mass features are found.
1306            If too many mass features are found.
1307
1308        Returns
1309        -------
1310        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1311        """
1312        verbose = self.parameters.lc_ms.verbose_processing
1313
1314        if self.mass_features is None:
1315            raise ValueError("No mass features found, run find_mass_features() first")
1316        if len(self.mass_features) > 400000:
1317            raise ValueError(
1318                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1319            )
1320        dims = ["mz", "scan_time"]
1321        mf_df_og = self.mass_features_to_df()
1322        mf_df = mf_df_og.copy()
1323
1324        # Sort mass features by sort_by column, make mf_id its own column for easier bookkeeping
1325        mf_df = mf_df.sort_values(by=sort_by, ascending=False).reset_index(drop=False)
1326
1327        tol = [
1328            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1329            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1330        ]  # mz tolerance is relative; scan_time tolerance is in minutes
1331        relative = [True, False]
1332
1333        # Compute inter-feature distances
1334        distances = None
1335        for i in range(len(dims)):
1336            # Construct k-d tree
1337            values = mf_df[dims[i]].values
1338            tree = KDTree(values.reshape(-1, 1))
1339
1340            max_tol = tol[i]
1341            if relative[i] is True:
1342                # Maximum absolute tolerance
1343                max_tol = tol[i] * values.max()
1344
1345            # Compute sparse distance matrix
1346            # the larger the max_tol, the slower this operation is
1347            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1348
1349            # Only consider forward case, exclude diagonal
1350            sdm = sparse.triu(sdm, k=1)
1351
1352            # Filter relative distances
1353            if relative[i] is True:
1354                # Compute relative distances
1355                rel_dists = sdm.data / values[sdm.row]  # or col?
1356
1357                # Indices of relative distances less than tolerance
1358                idx = rel_dists <= tol[i]
1359
1360                # Reconstruct sparse distance matrix
1361                sdm = sparse.coo_matrix(
1362                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1363                    shape=(len(values), len(values)),
1364                )
1365
1366            # Cast as binary matrix
1367            sdm.data = np.ones_like(sdm.data)
1368
1369            # Stack distances
1370            if distances is None:
1371                distances = sdm
1372            else:
1373                distances = distances.multiply(sdm)
1374
1375        # Extract indices of within-tolerance points
1376        distances = distances.tocoo()
1377        pairs = np.stack((distances.row, distances.col), axis=1)
1378        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1379        pairs_df = pairs_df.set_index("parent")
1380
1381        to_drop = []
1382        while not pairs_df.empty:
1383            # Find root_parents and their children
1384            root_parents = np.setdiff1d(
1385                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1386            )
1387            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1388            to_drop = np.append(to_drop, children_of_roots)
1389
1390            # Remove root_children as possible parents from pairs_df for next iteration
1391            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1392            pairs_df = pairs_df.reset_index().set_index("child")
1393            # Remove root_children as possible children from pairs_df for next iteration
1394            pairs_df = pairs_df.drop(index=children_of_roots)
1395
1396            # Prepare for next iteration
1397            pairs_df = pairs_df.reset_index().set_index("parent")
1398
1399        # Drop mass features that are not cluster parents
1400        mf_df = mf_df.drop(index=np.array(to_drop))
1401
1402        # Set index back to mf_id
1403        mf_df = mf_df.set_index("mf_id")
1404        if verbose:
1405            print(str(len(mf_df)) + " mass features remaining")
1406
1407        mf_df_new = mf_df_og.copy()
1408        mf_df_new["cluster_parent"] = np.where(
1409            np.isin(mf_df_new.index, mf_df.index), True, False
1410        )
1411
1412        # get mass feature ids of features that are not cluster parents
1413        cluster_daughters = mf_df_new[mf_df_new["cluster_parent"] == False].index.values
1414        if drop_children is True:
1415            # Drop mass features that are not cluster parents from self
1416            self.mass_features = {
1417                k: v
1418                for k, v in self.mass_features.items()
1419                if k not in cluster_daughters
1420            }
1421        else:
1422            return cluster_daughters
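
And a follow-on sketch for the clustering step (same hypothetical lcms_obj; the tolerance values are illustrative):

# Clustering tolerances: relative in m/z, absolute (minutes) in scan time (illustrative values)
lcms_obj.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel = 5e-6
lcms_obj.parameters.lc_ms.mass_feature_cluster_rt_tolerance = 0.2

# Roll up co-eluting features, keeping only the most persistent parent of each cluster
lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")

With drop_children=False the call instead returns the ids of the non-parent features, so they can be inspected before deciding to remove them.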
