corems.mass_spectra.calc.lc_calc

import numpy as np
import pandas as pd
from ripser import ripser
from scipy import sparse
from scipy.spatial import KDTree

from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
from corems.mass_spectra.calc import SignalProcessing as sp
from corems.mass_spectra.factory.chromat_data import EIC_Data
from corems.mass_spectrum.input.numpyArray import ms_from_array_profile


def find_closest(A, target):
    """Find the index of the closest value in A to each value in target.

    Parameters
    ----------
    A : :obj:`~numpy.array`
        The array to search (blueprint). A must be sorted.
    target : :obj:`~numpy.array`
        The array of values to search for. target must be sorted.

    Returns
    -------
    :obj:`~numpy.array`
        The indices of the closest values in A to each value in target.
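
    Examples
    --------
    A minimal sketch with toy arrays (values are hypothetical):

    >>> A = np.array([1.0, 2.0, 3.0])
    >>> find_closest(A, np.array([1.2, 2.9]))
    array([0, 2])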
    """
    idx = A.searchsorted(target)
    idx = np.clip(idx, 1, len(A) - 1)
    left = A[idx - 1]
    right = A[idx]
    idx -= target - left < right - target
    return idx


class LCCalculations:
    """Methods for performing LC calculations on mass spectra data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * get_max_eic(eic_data).
        Returns the maximum EIC value from the given EIC data. A static method.
    * smooth_tic(tic).
        Smooths the TIC data using the specified smoothing method and settings.
    * eic_centroid_detector(rt, eic, max_eic).
        Performs EIC centroid detection on the given EIC data.
    * find_nearest_scan(rt).
        Finds the nearest scan to the given retention time.
    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
        Returns an averaged mass spectrum object.
    * find_mass_features(ms_level=1).
        Find regions of interest for a given MS level (default is MS1).
    * integrate_mass_features(drop_if_fail=False, ms_level=1).
        Integrate mass features of interest and extract EICs.
    * find_c13_mass_features().
        Evaluate mass features and mark likely C13 isotopes.
    * deconvolute_ms1_mass_features().
        Deconvolute mass features' MS1 mass spectra.
    """

    @staticmethod
    def get_max_eic(eic_data: dict):
        """Returns the maximum EIC value from the given EIC data.

        Notes
        -----
        This is a static method.

        Parameters
        ----------
        eic_data : dict
            A dictionary containing EIC data.

        Returns
        -------
        float
            The maximum EIC value.
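
        Examples
        --------
        A minimal sketch using plain dicts that mimic the expected mapping
        (any value with a ``get("EIC")`` accessor works; values are hypothetical):

        >>> LCCalculations.get_max_eic(
        ...     {"mz_1": {"EIC": [0, 5, 2]}, "mz_2": {"EIC": [1, 9, 3]}}
        ... )
        9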
        """
        max_eic = 0
        for eic in eic_data.values():
            ind_max_eic = max(eic.get("EIC"))
            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic

        return max_eic

    def smooth_tic(self, tic):
        """Smooths the TIC or EIC data using the specified smoothing method and settings.

        Parameters
        ----------
        tic : numpy.ndarray
            The TIC (or EIC) data to be smoothed.

        Returns
        -------
        numpy.ndarray
            The smoothed TIC data.
        """
        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method

        pol_order = self.parameters.lc_ms.savgol_pol_order

        window_len = self.parameters.lc_ms.smooth_window

        window = self.parameters.lc_ms.smooth_method

        return sp.smooth_signal(
            tic, window_len, window, pol_order, implemented_smooth_method
        )

    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
        """Performs EIC centroid detection on the given EIC data.

        Parameters
        ----------
        rt : numpy.ndarray
            The retention time data.
        eic : numpy.ndarray
            The EIC data.
        max_eic : float
            The maximum EIC value.
        apex_indexes : list, optional
            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.

        Returns
        -------
        numpy.ndarray
            The indexes of left, apex, and right limits as a generator.
        """

        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent

        max_height = self.parameters.lc_ms.peak_height_max_percent

        signal_threshold = self.parameters.lc_ms.eic_signal_threshold

        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints

        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold

        include_indexes = sp.peak_picking_first_derivative(
            domain=rt,
            signal=eic,
            max_height=max_height,
            max_prominence=max_prominence,
            max_signal=max_eic,
            min_peak_datapoints=min_peak_datapoints,
            peak_derivative_threshold=peak_derivative_threshold,
            signal_threshold=signal_threshold,
            correct_baseline=False,
            plot_res=False,
            apex_indexes=apex_indexes,
        )

        return include_indexes

    def find_nearest_scan(self, rt):
        """Finds the nearest scan to the given retention time.

        Parameters
        ----------
        rt : float
            The retention time (in minutes) to find the nearest scan for.

        Returns
        -------
        int
            The scan number of the nearest scan.
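
        Examples
        --------
        A hypothetical usage sketch, assuming ``lcms_obj`` is an LCMSBase
        instance with retention times loaded (the value 5.2 is arbitrary):

        >>> lcms_obj.find_nearest_scan(5.2)  # doctest: +SKIP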
        """
        array_rt = np.array(self.retention_time)

        scan_index = (np.abs(array_rt - rt)).argmin()

        real_scan = self.scans_number[scan_index]

        return real_scan

    def add_peak_metrics(self):
        """Add peak metrics to the mass features.

        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
        """
        # Check that at least some mass features have EIC data
        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
            raise ValueError(
                "No mass features have EIC data. Run integrate_mass_features first."
            )

        for mass_feature in self.mass_features.values():
            # Check if the mass feature has been integrated
            if mass_feature._eic_data is not None:
                # Calculate peak metrics
                mass_feature.calc_half_height_width()
                mass_feature.calc_tailing_factor()
                mass_feature.calc_dispersity_index()

    def get_average_mass_spectrum(
        self,
        scan_list,
        apex_scan,
        spectrum_mode="profile",
        ms_level=1,
        auto_process=True,
        use_parser=False,
        perform_checks=True,
        polarity=None,
        ms_params=None,
    ):
        """Returns an averaged mass spectrum object.

        Parameters
        ----------
        scan_list : list
            List of scan numbers to average.
        apex_scan : int
            Number of the apex scan.
        spectrum_mode : str, optional
            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
        ms_level : int, optional
            The MS level to use. Defaults to 1.
        auto_process : bool, optional
            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
        use_parser : bool, optional
            If True, the mass spectra will be obtained from the parser. Defaults to False.
        perform_checks : bool, optional
            If True, the function will check that the data are within the ms_unprocessed dictionary and are in the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed by scan.
        polarity : int, optional
            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
        ms_params : MSParameters, optional
            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.

        Returns
        -------
        MassSpectrumProfile
            The averaged mass spectrum object.

        Raises
        ------
        ValueError
            If the spectrum mode is not "profile".
            If the MS level is not found in the unprocessed mass spectra dictionary.
            If not all scan numbers are found in the unprocessed mass spectra dictionary.
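
        Examples
        --------
        A hypothetical usage sketch, assuming ``lcms_obj`` is an LCMSBase
        instance with profile MS1 data in its _ms_unprocessed dictionary
        (scan numbers are arbitrary):

        >>> ms = lcms_obj.get_average_mass_spectrum(
        ...     scan_list=[10, 11, 12], apex_scan=11
        ... )  # doctest: +SKIP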
        """
        if perform_checks:
            if spectrum_mode != "profile":
                raise ValueError("Averaging only supported for profile mode")

        if polarity is None:
            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
            if self.polarity == "negative":
                polarity = -1
            elif self.polarity == "positive":
                polarity = 1
            else:
                raise ValueError(
                    "Polarity not set for dataset; must be either 'positive' or 'negative'"
                )

        # if not use_parser, check that scan numbers are in _ms_unprocessed
        if not use_parser:
            if perform_checks:
                # Set index to scan for faster lookup
                ms_df = (
                    self._ms_unprocessed[ms_level]
                    .copy()
                    .set_index("scan", drop=False)
                    .sort_index()
                )
                # Check that all scan numbers are in the ms_df before subsetting
                if not all(np.isin(scan_list, ms_df.index)):
                    raise ValueError(
                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
                    )
                my_ms_df = ms_df.loc[scan_list]
            else:
                my_ms_df = (
                    pd.DataFrame({"scan": scan_list})
                    .set_index("scan")
                    .join(self._ms_unprocessed[ms_level], how="left")
                )

        if use_parser:
            ms_list = [
                self.spectra_parser.get_mass_spectrum_from_scan(
                    x, spectrum_mode=spectrum_mode, auto_process=False
                )
                for x in scan_list
            ]
            ms_mz = [x._mz_exp for x in ms_list]
            ms_int = [x._abundance for x in ms_list]
            my_ms_df = []
            for i in np.arange(len(ms_mz)):
                my_ms_df.append(
                    pd.DataFrame(
                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
                    )
                )
            my_ms_df = pd.concat(my_ms_df)

        if not self.check_if_grid(my_ms_df):
            my_ms_df = self.grid_data(my_ms_df)

        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()

        ms = ms_from_array_profile(
            my_ms_ave.mz,
            my_ms_ave.intensity,
            self.file_location,
            polarity=polarity,
            auto_process=False,
        )

        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
        if ms is not None:
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = apex_scan
            if auto_process:
                ms.process_mass_spec()
        return ms

    def find_mass_features(self, ms_level=1, grid=True):
        """Find mass features within an LCMSBase object

        Note that this is a wrapper function that calls the find_mass_features_ph function, but it can be extended to support other peak picking methods in the future.

        Parameters
        ----------
        ms_level : int, optional
            The MS level to use for peak picking. Default is 1.
        grid : bool, optional
            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
            used for persistent homology peak picking for profile data only. Default is True.

        Raises
        ------
        ValueError
            If no MS level data is found on the object.
            If persistent homology peak picking is attempted on non-profile mode data.
            If data is not gridded and grid is False.
            If peak picking method is not implemented.

        Returns
        -------
        None, but assigns the mass_features and eics attributes to the object.

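        Examples
        --------
        A hypothetical usage sketch, assuming ``lcms_obj`` is an LCMSBase
        instance with profile MS1 data loaded:

        >>> lcms_obj.parameters.lc_ms.peak_picking_method = "persistent homology"  # doctest: +SKIP
        >>> lcms_obj.find_mass_features(ms_level=1)  # doctest: +SKIP
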
        """
        pp_method = self.parameters.lc_ms.peak_picking_method

        if pp_method == "persistent homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "profile"):
                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
                self.cluster_mass_features(drop_children=True, sort_by="persistence")
            else:
                raise ValueError(
                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
                        ms_level
                    )
                )
        elif pp_method == "centroided_persistent_homology":
            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
            if all(msx_scan_df["ms_format"] == "centroid"):
                self.find_mass_features_ph_centroid(ms_level=ms_level)
            else:
                raise ValueError(
                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
                        ms_level
                    )
                )
        else:
            raise ValueError("Peak picking method not implemented")

    def integrate_mass_features(
        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
    ):
        """Integrate mass features and extract EICs.

        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Parameters
        ----------
        drop_if_fail : bool, optional
            Whether to drop mass features if the EIC limit calculations fail.
            Default is True.
        drop_duplicates : bool, optional
            Whether to drop mass features that appear to be duplicates
            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
            Default is True.
        ms_level : int, optional
            The MS level to use. Default is 1.

        Raises
        ------
        ValueError
            If no mass features are found.
            If no MS level data is found for the given MS level (either in data or in the scan data)

        Returns
        -------
        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.

        Notes
        -----
        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
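
        Examples
        --------
        A hypothetical usage sketch, assuming ``lcms_obj`` is an LCMSBase
        instance on which find_mass_features() has already been run:

        >>> lcms_obj.integrate_mass_features(drop_if_fail=True, ms_level=1)  # doctest: +SKIP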
        """
        # Check if there is data
        if ms_level in self._ms_unprocessed.keys():
            raw_data = self._ms_unprocessed[ms_level].copy()
        else:
            raise ValueError("No MS level " + str(ms_level) + " data found")
        if self.mass_features is not None:
            mf_df = self.mass_features_to_df().copy()
        else:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )
        # Check if mass_spectrum exists on each mass feature
        if not all(
            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
        ):
            raise ValueError(
                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
            )

        # Subset scan data to only include correct ms_level
        scan_df_sub = self.scan_df[
            self.scan_df["ms_level"] == int(ms_level)
        ].reset_index(drop=True)
        if scan_df_sub.empty:
            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()

        mzs_to_extract = np.unique(mf_df["mz"].values)
        mzs_to_extract.sort()

        # Get EICs for each unique mz in mass features list
        for mz in mzs_to_extract:
            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
            raw_data_sub = raw_data[
                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
            ].reset_index(drop=True)
            raw_data_sub = (
                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
            )
            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
            myEIC = EIC_Data(
                scans=raw_data_sub["scan"].values,
                time=raw_data_sub["scan_time"].values,
                eic=raw_data_sub["intensity"].values,
            )
            # Smooth EIC
            smoothed_eic = self.smooth_tic(myEIC.eic)
            smoothed_eic[smoothed_eic < 0] = 0
            myEIC.eic_smoothed = smoothed_eic
            self.eics[mz] = myEIC

        # Get limits of mass features using EIC centroid detector and integrate
        mf_df["area"] = np.nan
        for idx, mass_feature in mf_df.iterrows():
            mz = mass_feature.mz
            apex_scan = mass_feature.apex_scan

            # Pull EIC data and find apex scan index
            myEIC = self.eics[mz]
            self.mass_features[idx]._eic_data = myEIC
            apex_index = np.where(myEIC.scans == apex_scan)[0][0]

            # Find left and right limits of peak using EIC centroid detector, add to EICData
            centroid_eics = self.eic_centroid_detector(
                myEIC.time,
                myEIC.eic_smoothed,
                mass_feature.intensity * 1.1,
                apex_indexes=[int(apex_index)],
            )
            l_a_r_scan_idx = [i for i in centroid_eics]
            if len(l_a_r_scan_idx) > 0:
                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
                # Find the number of consecutive non-zero values in the EIC segment
                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
                # Find the longest run of consecutive True values
                if np.any(mask):
                    # Find indices where mask changes value
                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
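                    # e.g., mask [1, 1, 0, 1] -> padded diff [1, 0, -1, 1, -1]:
                    # a run starts where diff == 1 and ends where diff == -1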
                    starts = np.where(diff == 1)[0]
                    ends = np.where(diff == -1)[0]
                    consecutive_scans = (ends - starts).max()
                else:
                    consecutive_scans = 0
                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
                    self.mass_features.pop(idx)
                    continue
                # Add start and final scan to mass_features and EICData
                left_scan, right_scan = (
                    myEIC.scans[l_a_r_scan_idx[0][0]],
                    myEIC.scans[l_a_r_scan_idx[0][2]],
                )
                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
                myEIC.apexes = myEIC.apexes + mf_scan_apex
                self.mass_features[idx].start_scan = left_scan
                self.mass_features[idx].final_scan = right_scan

                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
                area = np.trapz(
                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
                )
                mf_df.at[idx, "area"] = area
                myEIC.areas = myEIC.areas + [area]
                self.eics[mz] = myEIC
                self.mass_features[idx]._area = area
            else:
                if drop_if_fail is True:
                    self.mass_features.pop(idx)

        if drop_duplicates:
            # Prepare mass feature dataframe
            mf_df = self.mass_features_to_df().copy()

            # For each mass feature, find all mass features within the clustering tolerance and drop them if their start and end times fall within another mass feature
            # Keep the first mass feature
            for idx, mass_feature in mf_df.iterrows():
                mz = mass_feature.mz
                apex_scan = mass_feature.apex_scan

                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
                mf_df_sub = mf_df[
                    mf_df["mz_diff_ppm"]
                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
                    * 10**6
                ].copy()

                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
                for idx2, mass_feature2 in mf_df_sub.iterrows():
                    if idx2 != idx:
                        if (
                            mass_feature2.start_scan >= mass_feature.start_scan
                            and mass_feature2.final_scan <= mass_feature.final_scan
                        ):
                            if idx2 in self.mass_features.keys():
                                self.mass_features.pop(idx2)

    def find_c13_mass_features(self):
        """Mark likely C13 isotopes and connect them to monoisotopic mass features.

        Returns
        -------
        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found.
        """
        verbose = self.parameters.lc_ms.verbose_processing
        if verbose:
            print("evaluating mass features for C13 isotopes")
        if self.mass_features is None:
            raise ValueError("No mass features found, run find_mass_features() first")

        # Data prep for sparse distance matrix
        dims = ["mz", "scan_time"]
        mf_df = self.mass_features_to_df().copy()
        # Drop mass features that have no area (these are likely to be noise)
        mf_df = mf_df[mf_df["area"].notnull()]
        mf_df["mf_id"] = mf_df.index.values

        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()

        mz_diff = 1.003355  # C13-C12 mass difference
        tol = [
            mf_df["mz"].median()
            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
        ]  # mz, in relative; scan_time in minutes
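        # e.g., a 13C1 isotopologue of a feature at m/z 400.0000 is expected near
        # m/z 401.0034, within tol[0] in mz and tol[1] in scan_time of its parent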

        # Compute inter-feature distances
        distances = None
        for i in range(len(dims)):
            # Construct k-d tree
            values = mf_df[dims[i]].values
            tree = KDTree(values.reshape(-1, 1))

            max_tol = tol[i]
            if dims[i] == "mz":
                # Maximum absolute tolerance
                max_tol = mz_diff + tol[i]

            # Compute sparse distance matrix
            # the larger the max_tol, the slower this operation is
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            if dims[i] == "mz":
                min_tol = mz_diff - tol[i]
                # Get only the ones that are above the min tol
                idx = sdm.data > min_tol

                # Reconstruct sparse distance matrix
                sdm = sparse.coo_matrix(
                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
                    shape=(len(values), len(values)),
                )

            # Cast as binary matrix
            sdm.data = np.ones_like(sdm.data)

            # Stack distances
            if distances is None:
                distances = sdm
            else:
                distances = distances.multiply(sdm)

        # Extract indices of within-tolerance points
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs

        # Turn pairs (which are indexes of mf_df) into mf_id and then into two dataframes to join to mf_df
        pairs_mf = pairs.copy()
        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values

        # Connect monoisotopic masses with isotopologues within mass_features
        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
        for mono in monos:
            self.mass_features[mono].monoisotopic_mf_id = mono
        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
        while not pairs_iso_df.empty:
            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
            for iso in m1_isos:
                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
                if len(parent) > 1:
                    # Choose the parent that is closest in time to the isotopologue
                    parent_time = [self.mass_features[p].retention_time for p in parent]
                    time_diff = [
                        np.abs(self.mass_features[iso].retention_time - x)
                        for x in parent_time
                    ]
                    parent = parent[np.argmin(time_diff)]
                else:
                    parent = parent[0]
                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
                    parent
                ].monoisotopic_mf_id
                if self.mass_features[iso].monoisotopic_mf_id is not None:
                    mass_diff = (
                        self.mass_features[iso].mz
                        - self.mass_features[
                            self.mass_features[iso].monoisotopic_mf_id
                        ].mz
                    )
                    self.mass_features[iso].isotopologue_type = "13C" + str(
                        int(round(mass_diff, 0))
                    )

            # Drop the mono and iso from the pairs_iso_df
            pairs_iso_df = pairs_iso_df.drop(
                index=monos, errors="ignore"
            )  # Drop pairs whose parent is itself a child of a root
            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")

            if not pairs_iso_df.empty:
                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues, and repeat the process
                monos = np.setdiff1d(
                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
                )
        if verbose:
            # Report fraction of compounds annotated with isotopes
            mf_df["c13_flag"] = np.where(
                np.logical_or(
                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
                ),
                1,
                0,
            )
            print(
                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
                + " of mass features have or are C13 isotopes"
            )

    def deconvolute_ms1_mass_features(self):
        """Deconvolute MS1 mass features

        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.

        Parameters
        ----------
        None

        Returns
        -------
        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
        and associated_mass_features_deconvoluted attributes to the mass features in the
        mass_features attribute of the LCMSBase object.

        Raises
        ------
        ValueError
            If no mass features are found, must run find_mass_features() first.
            If no EICs are found, must run integrate_mass_features() first.

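        Examples
        --------
        A hypothetical usage sketch, assuming ``lcms_obj`` is an LCMSBase
        instance with integrated mass features and associated MS1 spectra:

        >>> lcms_obj.deconvolute_ms1_mass_features()  # doctest: +SKIP
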
        """
        # Checks for set mass_features and eics
        if self.mass_features is None:
            raise ValueError(
                "No mass features found, did you run find_mass_features() first?"
            )

        if self.eics == {}:
            raise ValueError(
                "No EICs found, did you run integrate_mass_features() first?"
            )

        if 1 not in self._ms_unprocessed.keys():
            raise ValueError("No unprocessed MS1 spectra found.")

        # Prep ms1 data
        ms1_data = self._ms_unprocessed[1].copy()
        ms1_data = ms1_data.set_index("scan")

        # Prep mass feature summary
        mass_feature_df = self.mass_features_to_df()

        # Loop through each mass feature
        for mf_id, mass_feature in self.mass_features.items():
            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
                continue

            # Get the left and right limits of the EIC of the mass feature
            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]

            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)

            # Get the centroided masses of the mass feature
            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp

            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
            ]

            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
            ms1_data_sub["mz_diff_rel"] = (
                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
                / ms1_data_sub["mz"]
            )
            ms1_data_sub = ms1_data_sub[
                ms1_data_sub["mz_diff_rel"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
            ].reset_index(drop=True)

            # Group by mass_feature_mz and scan and sum intensity
            ms1_data_sub_group = (
                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
                .sum()
                .reset_index()
            )

            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
            corr = (
                ms1_data_sub_group.pivot(
                    index="scan", columns="mass_feature_mz", values="intensity"
                )
                .fillna(0)
                .corr()
            )

            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above the deconvolution minimum
            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min

            # Try/except for KeyError in case the mass feature mz is not in the correlation matrix
            try:
                corr_subset = corr.loc[mass_feature.mz,]
            except KeyError:
                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
                continue

            corr_subset = corr_subset[corr_subset > decon_corr_min]

            # Get the masses from the mass spectrum that are the result of the deconvolution
            mzs_decon = corr_subset.index.values

            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
            mzs_decon_idx = [
                i
                for i, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
                if mz in mzs_decon
            ]
            mass_feature._ms_deconvoluted_idx = mzs_decon_idx

            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
            if (
                mass_feature.ms1_peak.abundance
                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
            ):
                mass_feature.mass_spectrum_deconvoluted_parent = True
            else:
                mass_feature.mass_spectrum_deconvoluted_parent = False

            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
            mass_feature_df_sub = mass_feature_df[
                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
            ].copy()
            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
            mass_feature_df_sub["mz_diff_ppm"] = [
                np.abs(mzs_decon - mz).min() / mz * 10**6
                for mz in mass_feature_df_sub["mz"]
            ]
            # Subset mass_feature_df to only include mass features that are within the tolerance of the deconvoluted masses
            mfs_associated_decon = mass_feature_df_sub[
                mass_feature_df_sub["mz_diff_ppm"]
                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
            ].index.values

            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon


class PHCalculations:
    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

    Notes
    -----
    This class is intended to be used as a mixin for the LCMSBase class.

    Methods
    -------
    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
        Sparse implementation of a mean filter.
    * embed_unique_indices(a).
        Creates an array of indices, sorted by unique element.
    * roll_up_dataframe(df, sort_by, tol, relative, dims).
        Subset data by rolling up into apex in appropriate dimensions.
    * sparse_upper_star(idx, V).
        Sparse implementation of an upper star filtration.
    * check_if_grid(data).
        Check if the data is gridded in mz space.
    * grid_data(data).
        Grid the data in the mz dimension.
    * find_mass_features_ph(ms_level=1, grid=True).
        Find mass features within an LCMSBase object using persistent homology.
    * cluster_mass_features(drop_children=True).
        Cluster regions of interest.
    """

    @staticmethod
    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
        """Sparse implementation of a mean filter.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).
        radius : float or list
            Radius of the sparse filter in each dimension. Values less than
            one indicate no connectivity in that dimension.

        Returns
        -------
        :obj:`~numpy.array`
            Filtered intensities (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
        This is a static method.
        """

        # Copy indices
        idx = idx.copy().astype(V.dtype)

        # Scale
        for i, r in enumerate(radius):
            # Increase inter-index distance
            if r < 1:
                idx[:, i] *= 2

            # Do nothing
            elif r == 1:
                pass

            # Decrease inter-index distance
            else:
                idx[:, i] /= r

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)

        # Pair indices
        I, J = cmat.nonzero()

        # Delete cmat
        cmat_shape = cmat.shape
        del cmat

        # Sum over columns
        V_sum = sparse.bsr_matrix(
            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        # Count over columns
        V_count = sparse.bsr_matrix(
            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
        ).diagonal(0)

        return V_sum / V_count

    @staticmethod
    def embed_unique_indices(a):
        """Creates an array of indices, sorted by unique element.

        Parameters
        ----------
        a : :obj:`~numpy.array`
            Array of unique elements (Mx1).

        Returns
        -------
        :obj:`~numpy.array`
            Array of indices (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        This is a static method.
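
        Examples
        --------
        A minimal sketch: duplicate values are disambiguated by appending a
        unique decimal index to each repeat (values are hypothetical):

        >>> PHCalculations.embed_unique_indices(np.array([5, 5, 7]))
        array([5. , 5.1, 7. ])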
        """

        def count_tens(n):
            # Count tens
            ntens = (n - 1) // 10

            while True:
                ntens_test = (ntens + n - 1) // 10

                if ntens_test == ntens:
                    return ntens
                else:
                    ntens = ntens_test

        def arange_exclude_10s(n):
            # How many 10s will there be?
            ntens = count_tens(n)

            # Base array
            arr = np.arange(0, n + ntens)

            # Exclude 10s
            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]

            return arr

        # Creates an array of indices, sorted by unique element
        idx_sort = np.argsort(a)
        idx_unsort = np.argsort(idx_sort)

        # Sorts records array so all unique elements are together
        sorted_a = a[idx_sort]

        # Returns the unique values, the index of the first occurrence,
        # and the count for each element
        vals, idx_start, count = np.unique(
            sorted_a, return_index=True, return_counts=True
        )

        # Splits the indices into separate arrays
        splits = np.split(idx_sort, idx_start[1:])

        # Creates unique indices for each split
        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])

        # Reorders according to input array
        idx_unq = idx_unq[idx_unsort]

        # Magnitude of each index
        exp = np.log10(
            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
        )
        idx_unq_mag = np.power(10, np.floor(exp) + 1)

        # Result
        return a + idx_unq / idx_unq_mag

    @staticmethod
    def roll_up_dataframe(
        df: pd.DataFrame,
        sort_by: str,
        tol: list,
        relative: list,
        dims: list,
    ):
        """Subset data by rolling up into apex in appropriate dimensions.

        Parameters
        ----------
        df : pd.DataFrame
            The input data containing the "dims" columns and the "sort_by" column.
        sort_by : str
            The column to sort the data by; this will determine which mass features get rolled up into a parent mass feature
            (i.e., the mass feature with the highest value in the sort_by column).
        tol : list
            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
            If relative is False, the tolerance will be used as is.
        relative : list
            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
        dims : list
            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.

        Returns
        -------
        pd.DataFrame
            A DataFrame with only the rolled up mass features, with the original index and columns.

        Raises
        ------
        ValueError
            If the input data is not a pandas DataFrame.
            If the input data does not have columns for each of the dimensions in "dims".
            If the length of "dims", "tol", and "relative" do not match.
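
        Examples
        --------
        A minimal sketch with a toy feature table (values are hypothetical);
        the lower-intensity feature within tolerance of the first is rolled up:

        >>> df = pd.DataFrame(
        ...     {"mz": [100.0000, 100.0001, 200.0],
        ...      "scan_time": [1.00, 1.01, 5.00],
        ...      "intensity": [10.0, 1.0, 5.0]}
        ... )
        >>> PHCalculations.roll_up_dataframe(
        ...     df, sort_by="intensity", tol=[5e-6, 0.1],
        ...     relative=[True, False], dims=["mz", "scan_time"]
        ... ).index.tolist()
        [0, 2]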
        """
        # Check that data is a DataFrame and has columns for each of the dims
        if not isinstance(df, pd.DataFrame):
            raise ValueError("Data must be a pandas DataFrame")
        for dim in dims:
            if dim not in df.columns:
                raise ValueError(f"Data must have a column for {dim}")
        if len(dims) != len(tol) or len(dims) != len(relative):
            raise ValueError(
                "Dimensions, tolerances, and relative flags must be the same length"
            )

        og_columns = df.columns.copy()

        # Unindex the data, but keep the original index
        if df.index.name is not None:
            og_index = df.index.name
        else:
            og_index = "index"
        df = df.reset_index(drop=False)

        # Sort data by sort_by column, and reindex
        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

        # Compute inter-feature distances
        distances = None
        for i in range(len(dims)):
            # Construct k-d tree
            values = df[dims[i]].values
            tree = KDTree(values.reshape(-1, 1))

            max_tol = tol[i]
            if relative[i] is True:
                # Maximum absolute tolerance
                max_tol = tol[i] * values.max()

            # Compute sparse distance matrix
            # the larger the max_tol, the slower this operation is
            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")

            # Only consider forward case, exclude diagonal
            sdm = sparse.triu(sdm, k=1)

            # Filter relative distances
            if relative[i] is True:
                # Compute relative distances
                rel_dists = sdm.data / values[sdm.row]  # or col?

                # Indices of relative distances less than tolerance
                idx = rel_dists <= tol[i]

                # Reconstruct sparse distance matrix
                sdm = sparse.coo_matrix(
                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
                    shape=(len(values), len(values)),
                )

            # Cast as binary matrix
            sdm.data = np.ones_like(sdm.data)

            # Stack distances
            if distances is None:
                distances = sdm
            else:
                distances = distances.multiply(sdm)

        # Extract indices of within-tolerance points
        distances = distances.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)
        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
        pairs_df = pairs_df.set_index("parent")

        to_drop = []
        while not pairs_df.empty:
            # Find root_parents and their children
            root_parents = np.setdiff1d(
                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
            )
            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
            to_drop = np.append(to_drop, children_of_roots)

            # Remove root_children as possible parents from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
            pairs_df = pairs_df.reset_index().set_index("child")
            # Remove root_children as possible children from pairs_df for next iteration
            pairs_df = pairs_df.drop(index=children_of_roots)

            # Prepare for next iteration
            pairs_df = pairs_df.reset_index().set_index("parent")

        # Drop mass features that are not cluster parents
        df_sub = df.drop(index=np.array(to_drop))

        # Set index back to og_index and only keep the columns that are in the original dataframe
        df_sub = df_sub.set_index(og_index)

        # Sort the dataframe by the original index
        df_sub = df_sub.sort_index()
        df_sub = df_sub[og_columns]

        return df_sub

    def sparse_upper_star(self, idx, V):
        """Sparse implementation of an upper star filtration.

        Parameters
        ----------
        idx : :obj:`~numpy.array`
            Edge indices for each dimension (MxN).
        V : :obj:`~numpy.array`
            Array of intensity data (Mx1).

        Returns
        -------
        idx : :obj:`~numpy.array`
            Index of filtered points (Mx1).
        persistence : :obj:`~numpy.array`
            Persistence of each filtered point (Mx1).

        Notes
        -----
        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
        """

        # Invert
        V = -1 * V.copy().astype(int)

        # Embed indices
        V = self.embed_unique_indices(V)

        # Connectivity matrix
        cmat = KDTree(idx)
        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
        cmat.setdiag(1)
        cmat = sparse.triu(cmat)

        # Pairwise minimums
        I, J = cmat.nonzero()
        d = np.maximum(V[I], V[J])

        # Delete connectivity matrix
        cmat_shape = cmat.shape
        del cmat

        # Sparse distance matrix
        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)

        # Delete pairwise mins
        del d, I, J

        # Persistent homology
        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]

        # Bound death values
        ph[ph[:, 1] == np.inf, 1] = np.max(V)

        # Construct tree to query against
        tree = KDTree(V.reshape((-1, 1)))

        # Get the indexes of the first nearest neighbor by birth
        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)

        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

    def check_if_grid(self, data):
        """Check if the data are gridded in mz space.

        Parameters
        ----------
        data : DataFrame
            DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.

        Returns
        -------
        bool
            True if the data is gridded in the mz direction, False otherwise.

        Notes
        -----
        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
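
        Examples
        --------
        A minimal sketch with a toy gridded frame (values are hypothetical);
        both scans share the same mz axis, so the data are gridded:

        >>> df = pd.DataFrame(
        ...     {"scan": [1, 1, 2, 2], "mz": [100.0, 100.1, 100.0, 100.1],
        ...      "intensity": [1, 2, 3, 4]}
        ... )
        >>> PHCalculations().check_if_grid(df)
        True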
        """
        # Calculate the difference between consecutive mz values in a single scan
        dat_check = data.copy().reset_index(drop=True)
        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
        mz_diff_min = (
            dat_check.groupby("scan")["mz_diff"].min().min()
        )  # within each scan, what is the smallest mz difference between consecutive mz values

        # Find the minimum mz difference between mz values in the data, regardless of scan
        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())

        # Get minimum mz_diff between mz values in the data
        mz_diff_min_raw = dat_check_mz["mz_diff"].min()

        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
        if mz_diff_min_raw < mz_diff_min:
            return False
        else:
            return True

    def grid_data(self, data, attempts=5):
        """Grid the data in the mz dimension.

        Data must be gridded prior to persistent homology calculations and computing the average mass spectrum.

        Parameters
        ----------
        data : DataFrame
            The input data containing mz, scan, scan_time, and intensity columns.
        attempts : int, optional
            The number of attempts to grid the data. Default is 5.

        Returns
        -------
        DataFrame
            The gridded data with mz, scan, scan_time, and intensity columns.

        Raises
        ------
        ValueError
            If gridding fails after the specified number of attempts.
        """
        attempt_i = 0
        while attempt_i < attempts:
            attempt_i += 1
            data = self._grid_data(data)

            if self.check_if_grid(data):
                return data

        if not self.check_if_grid(data):
            raise ValueError(
                "Gridding failed after "
                + str(attempt_i)
                + " attempts. Please check the data."
            )
        else:
            return data

    def _grid_data(self, data):
        """Internal method to grid the data in the mz dimension.

        Notes
        -----
        This method is called by the grid_data method and should not be called directly.
        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
        but it does not check if the data is already gridded or if the gridding is successful.

        Parameters
        ----------
        data : pd.DataFrame or pl.DataFrame
            The input data to grid.

        Returns
        -------
        pd.DataFrame or pl.DataFrame
            The data after attempting to grid it in the mz dimension.
        """
        # Calculate the difference between consecutive mz values in a single scan for grid spacing
        data_w = data.copy().reset_index(drop=True)
        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999

        # Need high intensity mz values first so they are parents in the output pairs stack
        dat_mz = data_w[["mz", "intensity"]].sort_values(
            by=["intensity"], ascending=False
        )
        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()

        # Construct KD tree
        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
        sdm = sparse.triu(sdm, k=1)
        sdm.data = np.ones_like(sdm.data)
        distances = sdm.tocoo()
        pairs = np.stack((distances.row, distances.col), axis=1)

        # Cull pairs to just get root
        to_drop = []
        while len(pairs) > 0:
            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
            id_root_parents = np.isin(pairs[:, 0], root_parents)
            children_of_roots = np.unique(pairs[id_root_parents, 1])
            to_drop = np.append(to_drop, children_of_roots)

            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
        mz_dat_np = (
            dat_mz[["mz"]]
            .sort_values(by=["mz"])
            .reset_index(drop=True)
            .values.flatten()
        )

        # Sort data by mz and recast mz to nearest value in mz_dat_np
        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])

        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
        new_data_w = (
            new_data_w.drop(columns=["mz_diff", "mz_orig"])
            .groupby(["scan", "mz"])["intensity"]
            .sum()
            .reset_index()
        )
        new_data_w = (
            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
            .reset_index(drop=True)
            .copy()
        )

        return new_data_w

1360    def find_mass_features_ph(self, ms_level=1, grid=True):
1361        """Find mass features within an LCMSBase object using persistent homology.
1362
1363        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1364
1365        Parameters
1366        ----------
1367        ms_level : int, optional
1368            The MS level to use. Default is 1.
1369        grid : bool, optional
1370            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1371
1372        Raises
1373        ------
1374        ValueError
1375            If no MS level data is found on the object.
1376            If data is not gridded and grid is False.
1377
1378        Returns
1379        -------
1380        None, but assigns the mass_features attribute to the object.
1381
1382        Notes
1383        -----
1384        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1385        """
1386        # Check that ms_level is a key in self._ms_unprocessed
1387        if ms_level not in self._ms_unprocessed.keys():
1388            raise ValueError(
1389                "No MS level "
1390                + str(ms_level)
1391                + " data found, did you instantiate with parser specific to MS level?"
1392            )
1393
1394        # Get ms data
1395        data = self._ms_unprocessed[ms_level].copy()
1396
1397        # Drop rows with missing intensity values and reset index
1398        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1399
1400        # Threshold data
1401        dims = ["mz", "scan_time"]
1402        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1403        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1404
1405        # Check if gridded, if not, grid
1406        gridded_mz = self.check_if_grid(data_thres)
1407        if gridded_mz is False:
1408            if grid is False:
1409                raise ValueError(
1410                    "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function"
1411                )
1412            else:
1413                data_thres = self.grid_data(data_thres)
1414
1415        # Add scan_time and build factors for each dimension
1416        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1417        factors = {
1418            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1419            for dim in dims
1420        }  # pd.factorize returns a float64 index, so cast to float32
1421
1422        # Build indexes
1423        index = {
1424            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1425            for dim in factors
1426        }
1427
1428        # Smooth data
1429        iterations = self.parameters.lc_ms.ph_smooth_it
1430        smooth_radius = [
1431            self.parameters.lc_ms.ph_smooth_radius_mz,
1432            self.parameters.lc_ms.ph_smooth_radius_scan,
1433        ]  # mz, scan_time smoothing radius (in steps)
1434
1435        index = np.vstack([index[dim] for dim in dims]).T
1436        V = data_thres["intensity"].values
1437        resid = np.inf
1438        for i in range(iterations):
1439            # Previous iteration
1440            V_prev = V.copy()
1441            resid_prev = resid
1442            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1443
1444            # Calculate residual with previous iteration
1445            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1446
1447            # Evaluate convergence
1448            if i > 0:
1449                # Percent change in residual
1450                test = np.abs(resid - resid_prev) / resid_prev
1451
1452                # Exit criteria
1453                if test <= 0:
1454                    break
1455
1456        # Overwrite values
1457        data_thres["intensity"] = V
1458
1459        # Use persistent homology to find regions of interest
1460        pidx, pers = self.sparse_upper_star(index, V)
1461        pidx = pidx[pers > 1]
1462        pers = pers[pers > 1]
1463
1464        # Get peaks
1465        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1466
1467        # Add persistence column
1468        peaks["persistence"] = pers
1469        mass_features = peaks.sort_values(
1470            by="persistence", ascending=False
1471        ).reset_index(drop=True)
1472
1473        # Filter by persistence threshold
1474        persistence_threshold = (
1475            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1476        )
1477        mass_features = mass_features.loc[
1478            mass_features["persistence"] > persistence_threshold, :
1479        ].reset_index(drop=True)
1480
1481        # Rename scan column to apex_scan
1482        mass_features = mass_features.rename(
1483            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1484        )
1485
1486        # Populate mass_features attribute
1487        self.mass_features = {}
1488        for row in mass_features.itertuples():
1489            row_dict = mass_features.iloc[row.Index].to_dict()
1490            lcms_feature = LCMSMassFeature(self, **row_dict)
1491            self.mass_features[lcms_feature.id] = lcms_feature
1492
1493        if self.parameters.lc_ms.verbose_processing:
1494            print("Found " + str(len(mass_features)) + " initial mass features")
1495
1496    def find_mass_features_ph_centroid(self, ms_level=1):
1497        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1498        
1499        Parameters
1500        ----------
1501        ms_level : int, optional
1502            The MS level to use. Default is 1.
1503        
1504        Raises
1505        ------
1506        ValueError
1507            If no MS level data is found on the object.
1508
1509        Returns
1510        -------
1511        None, but assigns the mass_features attribute to the object.        
1512        """
1513        # Check that ms_level is a key in self._ms_unprocessed
1514        if ms_level not in self._ms_unprocessed.keys():
1515            raise ValueError(
1516                "No MS level "
1517                + str(ms_level)
1518                + " data found, did you instantiate with parser specific to MS level?"
1519            )
1520        
1521        # Get ms data
1522        data = self._ms_unprocessed[ms_level].copy()
1523
1524        # Drop rows with missing intensity values and reset index
1525        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1526        
1527        # Threshold data
1528        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1529        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1530        data_thres["persistence"] = data_thres["intensity"]
1531
1532        # Use this as the starting point for the mass features, adding scan_time
1533        mf_df = data_thres
1534        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1535
1536        # Define tolerances and dimensions for rolling up
1537        tol = [
1538            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1539            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1540        ]
1541        relative = [True, False]
1542        dims = ["mz", "scan_time"]
1543        # Roll up features that fall within tolerance of a more persistent feature
1544        mf_df = self.roll_up_dataframe(
1545            df=mf_df,
1546            sort_by="persistence",
1547            dims=dims,
1548            tol=tol,
1549            relative=relative
1550        )
1551
1552        # Rename scan column to apex_scan
1553        mass_features = mf_df.rename(
1554            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1555        )
1556        # Sort by persistence and reset index
1557        mass_features = mass_features.sort_values(
1558            by="persistence", ascending=False
1559        ).reset_index(drop=True)
1560
1561        # Populate mass_features attribute
1562        self.mass_features = {}
1563        for row in mass_features.itertuples():
1564            row_dict = mass_features.iloc[row.Index].to_dict()
1565            lcms_feature = LCMSMassFeature(self, **row_dict)
1566            self.mass_features[lcms_feature.id] = lcms_feature
1567
1568        if self.parameters.lc_ms.verbose_processing:
1569            print("Found " + str(len(mass_features)) + " initial mass features")
1570    
1571    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1572        """Cluster mass features
1573
1574        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence.
1575
1576        Parameters
1577        ----------
1578        drop_children : bool, optional
1579            Whether to drop the mass features that are not cluster parents. Default is True.
1580        sort_by : str, optional
1581            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1582
1583        Raises
1584        ------
1585        ValueError
1586            If no mass features are found.
1587            If too many mass features are found.
1588
1589        Returns
1590        -------
1591        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1592        """
1593        verbose = self.parameters.lc_ms.verbose_processing
1594
1595        if self.mass_features is None:
1596            raise ValueError("No mass features found, run find_mass_features() first")
1597        if len(self.mass_features) > 400000:
1598            raise ValueError(
1599                "Too many mass featuers of interest found, run find_mass_features() with a higher intensity threshold"
1600            )
1601        dims = ["mz", "scan_time"]
1602        mf_df_og = self.mass_features_to_df()
1603        mf_df = mf_df_og.copy()
1604
1605        tol = [
1606            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1607            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1608        ]  # mz, in relative; scan_time in minutes
1609        relative = [True, False]
1610
1611        # Roll up mass features based on their proximity in the declared dimensions
1612        mf_df_new = self.roll_up_dataframe(
1613            df=mf_df,
1614            sort_by=sort_by,
1615            dims=dims,
1616            tol=tol,
1617            relative=relative
1618        )
1619
1620        mf_df["cluster_parent"] = np.where(
1621            np.isin(mf_df.index, mf_df_new.index), True, False
1622        )
1623
1624        # get mass feature ids of features that are not cluster parents
1625        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1626        if drop_children is True:
1627            # Drop mass features that are not cluster parents from self
1628            self.mass_features = {
1629                k: v
1630                for k, v in self.mass_features.items()
1631                if k not in cluster_daughters
1632            }
1633        else:
1634            return cluster_daughters
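
The pair-culling loop in _grid_data above repeatedly keeps "root" parents (points that are not any other point's child) and drops their children until no within-tolerance pairs remain. A minimal standalone sketch of that step, using toy index pairs in place of real KDTree output:

    import numpy as np

    # Toy (parent, child) index pairs standing in for the sparse KDTree output
    # in _grid_data; indices point into an intensity-sorted mz list, so parents
    # always carry the higher intensity.
    pairs = np.array([[0, 1], [0, 2], [1, 3], [4, 5]])

    to_drop = np.array([], dtype=int)
    while len(pairs) > 0:
        # Roots: parents that are not anyone's child
        root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
        children_of_roots = np.unique(pairs[np.isin(pairs[:, 0], root_parents), 1])
        to_drop = np.append(to_drop, children_of_roots)
        # Remove resolved pairs before the next pass
        pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
        pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]

    print(np.sort(to_drop))  # [1 2 5]: these mz candidates collapse into their roots

Note that index 3 survives: its only parent (1) was itself dropped, so 3 is never rolled up, which matches the behavior of the loop above.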
def find_closest(A, target):

Find the index of closest value in A to each value in target.

Parameters
  • A (numpy.array): The array to search (blueprint). A must be sorted.
  • target (numpy.array): The array of values to search for. target must be sorted.
Returns
  • numpy.array: The indices of the closest values in A to each value in target.
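
A minimal usage sketch (values illustrative); because both arrays must be sorted, this is how _grid_data recasts raw mz values onto the gridded blueprint:

    import numpy as np

    A = np.array([100.0, 100.5, 101.0])       # sorted blueprint of gridded mz values
    target = np.array([100.1, 100.4, 100.9])  # sorted raw mz values to recast
    idx = find_closest(A, target)
    print(idx)     # [0 1 2]
    print(A[idx])  # [100.  100.5 101. ]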
class LCCalculations:

Methods for performing LC calculations on mass spectra data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
  • smooth_tic(tic). Smooths the TIC (or EIC) data using the specified smoothing method and settings.
  • eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
  • find_nearest_scan(rt). Finds the nearest scan to the given retention time.
  • get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
  • find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
  • integrate_mass_features(drop_if_fail=True, ms_level=1). Integrate mass features of interest and extract EICs.
  • find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
  • deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
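
The method list above implies a processing order. A hedged sketch of one pass through it, assuming lcms is an LCMSBase instance already loaded with unprocessed scan data; the add_associated_ms1() call is inferred from the check inside integrate_mass_features, and the exact sequence may vary:

    lcms.find_mass_features(ms_level=1)       # pick regions of interest
    lcms.add_associated_ms1()                 # attach an MS1 spectrum to each feature
    lcms.integrate_mass_features(ms_level=1)  # extract EICs, set limits and areas
    lcms.add_peak_metrics()                   # width, tailing, dispersity per feature
    lcms.find_c13_mass_features()             # flag likely 13C isotopologues
    lcms.deconvolute_ms1_mass_features()      # correlate co-eluting MS1 masses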
@staticmethod
def get_max_eic(eic_data: dict):

Returns the maximum EIC value from the given EIC data.

Notes

This is a static method.

Parameters
  • eic_data (dict): A dictionary containing EIC data.
Returns
  • float: The maximum EIC value.
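
A minimal sketch of the expected input shape (keys and EIC values are illustrative); each value must be a mapping with an "EIC" entry:

    eic_data = {
        "mz_100.0": {"EIC": [0.0, 5.0, 2.0]},
        "mz_200.0": {"EIC": [1.0, 9.0, 3.0]},
    }
    print(LCCalculations.get_max_eic(eic_data))  # 9.0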
def smooth_tic(self, tic):

Smooths the TIC or EIC data using the specified smoothing method and settings.

Parameters
  • tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
  • numpy.ndarray: The smoothed TIC data.
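
A hedged sketch, assuming lcms is an initialized LCMSBase object so the lc_ms smoothing parameters (smooth_method, smooth_window, savgol_pol_order) are set:

    import numpy as np

    raw_tic = np.array([0.0, 2.0, 9.0, 3.0, 1.0, 8.0, 2.0, 0.0])
    smoothed = lcms.smooth_tic(raw_tic)
    smoothed[smoothed < 0] = 0  # clip negatives, as integrate_mass_features does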
def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):

Performs EIC centroid detection on the given EIC data.

Parameters
  • rt (numpy.ndarray): The retention time data.
  • eic (numpy.ndarray): The EIC data.
  • max_eic (float): The maximum EIC value.
  • apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
  • generator: Yields the left, apex, and right limit indexes for each detected peak.
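
A hedged sketch of consuming the detector output, mirroring integrate_mass_features; here lcms is an assumed LCMSBase instance and myEIC stands for an EIC_Data object with time, eic, and eic_smoothed arrays:

    centroids = lcms.eic_centroid_detector(
        myEIC.time,             # retention times
        myEIC.eic_smoothed,     # smoothed EIC signal
        myEIC.eic.max() * 1.1,  # max_eic, padded by 10% as integrate_mass_features does
    )
    limits = [i for i in centroids]  # [(left_idx, apex_idx, right_idx), ...]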
def find_nearest_scan(self, rt):

Finds the nearest scan to the given retention time.

Parameters
  • rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
  • int: The scan number of the nearest scan.
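
For example (assuming lcms is an LCMSBase instance with retention_time and scans_number populated):

    apex_scan = lcms.find_nearest_scan(rt=5.2)  # scan number nearest to 5.2 minutes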
def add_peak_metrics(self):

Add peak metrics to the mass features.

This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
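
A sketch of the expected call order (`lcms_obj` is illustrative; an MS1 spectrum must be associated with each mass feature, via add_associated_ms1, before integration):

    lcms_obj.find_mass_features()       # populate the mass_features attribute
    # ... associate an MS1 spectrum with each mass feature ...
    lcms_obj.integrate_mass_features()  # attach EIC data to each mass feature
    lcms_obj.add_peak_metrics()         # half-height width, tailing factor, dispersity index
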

def get_average_mass_spectrum(self, scan_list, apex_scan, spectrum_mode='profile', ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None):
202    def get_average_mass_spectrum(
203        self,
204        scan_list,
205        apex_scan,
206        spectrum_mode="profile",
207        ms_level=1,
208        auto_process=True,
209        use_parser=False,
210        perform_checks=True,
211        polarity=None,
212        ms_params=None,
213    ):
214        """Returns an averaged mass spectrum object
215
216        Parameters
217        ----------
218        scan_list : list
219            List of scan numbers to average.
220        apex_scan : int
221            Number of the apex scan
222        spectrum_mode : str, optional
223            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
224        ms_level : int, optional
225            The MS level to use. Defaults to 1.
226        auto_process : bool, optional
227            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
228        use_parser : bool, optional
229            If True, the mass spectra will be obtained from the parser. Defaults to False.
230        perform_checks : bool, optional
231            If True, the function will check whether the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
232        polarity : int, optional
233            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
234        ms_params : MSParameters, optional
235            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
236
237        Returns
238        -------
239        MassSpectrumProfile
240            The averaged mass spectrum object.
241
242        Raises
243        ------
244        ValueError
245            If the spectrum mode is not "profile".
246            If the MS level is not found in the unprocessed mass spectra dictionary.
247            If not all scan numbers are found in the unprocessed mass spectra dictionary.
248        """
249        if perform_checks:
250            if spectrum_mode != "profile":
251                raise ValueError("Averaging only supported for profile mode")
252
253        if polarity is None:
254            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
255            if self.polarity == "negative":
256                polarity = -1
257            elif self.polarity == "positive":
258                polarity = 1
259            else:
260                raise ValueError(
261                    "Polarity not set for dataset, must be either 'positive' or 'negative'"
262                )
263
264        # if not using_parser, check that scan numbers are in _ms_unprocessed
265        if not use_parser:
266            if perform_checks:
267                # Set index to scan for faster lookup
268                ms_df = (
269                    self._ms_unprocessed[ms_level]
270                    .copy()
271                    .set_index("scan", drop=False)
272                    .sort_index()
273                )
274                # Check that all scan numbers are in the ms_df before subsetting
275                if not all(np.isin(scan_list, ms_df.index)):
276                    raise ValueError(
277                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
278                    )
279                my_ms_df = ms_df.loc[scan_list]
280            else:
281                my_ms_df = (
282                    pd.DataFrame({"scan": scan_list})
283                    .set_index("scan")
284                    .join(self._ms_unprocessed[ms_level], how="left")
285                )
286
287        if use_parser:
288            ms_list = [
289                self.spectra_parser.get_mass_spectrum_from_scan(
290                    x, spectrum_mode=spectrum_mode, auto_process=False
291                )
292                for x in scan_list
293            ]
294            ms_mz = [x._mz_exp for x in ms_list]
295            ms_int = [x._abundance for x in ms_list]
296            my_ms_df = []
297            for i in np.arange(len(ms_mz)):
298                my_ms_df.append(
299                    pd.DataFrame(
300                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
301                    )
302                )
303            my_ms_df = pd.concat(my_ms_df)
304
305        if not self.check_if_grid(my_ms_df):
306            my_ms_df = self.grid_data(my_ms_df)
307
308        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
309
310        ms = ms_from_array_profile(
311            my_ms_ave.mz,
312            my_ms_ave.intensity,
313            self.file_location,
314            polarity=polarity,
315            auto_process=False,
316        )
317
318        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
319        if ms is not None:
320            if ms_params is not None:
321                ms.parameters = ms_params
322            ms.scan_number = apex_scan
323            if auto_process:
324                ms.process_mass_spec()
325        return ms

Returns an averaged mass spectrum object

Parameters
  • scan_list (list): List of scan numbers to average.
  • apex_scan (int): Number of the apex scan
  • spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
  • ms_level (int, optional): The MS level to use. Defaults to 1.
  • auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
  • use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
  • perform_checks (bool, optional): If True, the function will check whether the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
  • polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
  • ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
  • MassSpectrumProfile: The averaged mass spectrum object.
Raises
  • ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
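
A usage sketch (the scan numbers are illustrative; in practice they come from a mass feature's EIC limits or from find_nearest_scan, and MS1 scans are not necessarily consecutive integers):

    apex_scan = lcms_obj.find_nearest_scan(rt=12.5)
    scan_list = [apex_scan - 2, apex_scan, apex_scan + 2]  # replace with real MS1 scan numbers
    ms_avg = lcms_obj.get_average_mass_spectrum(scan_list, apex_scan, polarity=-1)
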
def find_mass_features(self, ms_level=1, grid=True):
327    def find_mass_features(self, ms_level=1, grid=True):
328        """Find mass features within an LCMSBase object
329
330        Note that this is a wrapper function that calls the find_mass_features_ph or find_mass_features_ph_centroid functions, but it can be extended to support other peak picking methods in the future.
331
332        Parameters
333        ----------
334        ms_level : int, optional
335            The MS level to use for peak picking. Default is 1.
336        grid : bool, optional
337            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), 
338            used for persistent homology peak picking for profile data only. Default is True.
339
340        Raises
341        ------
342        ValueError
343            If no MS level data is found on the object.
344            If persistent homology peak picking is attempted on non-profile mode data.
345            If data is not gridded and grid is False.
346            If peak picking method is not implemented.
347
348        Returns
349        -------
350        None, but assigns the mass_features and eics attributes to the object.
351
352        """
353        pp_method = self.parameters.lc_ms.peak_picking_method
354
355        if pp_method == "persistent homology":
356            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
357            if all(msx_scan_df["ms_format"] == "profile"):
358                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
359                self.cluster_mass_features(drop_children=True, sort_by="persistence")
360            else:
361                raise ValueError(
362                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
363                        ms_level
364                    )
365                )
366        elif pp_method == "centroided_persistent_homology":
367            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
368            if all(msx_scan_df["ms_format"] == "centroid"):
369                self.find_mass_features_ph_centroid(ms_level=ms_level)
370            else:
371                raise ValueError(
372                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
373                        ms_level
374                    )
375                )
376        else:
377            raise ValueError("Peak picking method not implemented")

Find mass features within an LCMSBase object

Note that this is a wrapper function that calls the find_mass_features_ph or find_mass_features_ph_centroid functions, but it can be extended to support other peak picking methods in the future.

Parameters
  • ms_level (int, optional): The MS level to use for peak picking. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
  • None, but assigns the mass_features and eics attributes to the object.
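
A sketch of a typical call (parameter attribute names as used in this module; `lcms_obj` is illustrative):

    lcms_obj.parameters.lc_ms.peak_picking_method = "persistent homology"
    lcms_obj.find_mass_features(ms_level=1)
    print(str(len(lcms_obj.mass_features)) + " mass features found")
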
def integrate_mass_features(self, drop_if_fail=True, drop_duplicates=True, ms_level=1):
379    def integrate_mass_features(
380        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
381    ):
382        """Integrate mass features and extract EICs.
383
384        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
385
386        Parameters
387        ----------
388        drop_if_fail : bool, optional
389            Whether to drop mass features if the EIC limit calculations fail.
390            Default is True.
391        drop_duplicates : bool, optional
392            Whether to drop mass features that appear to be duplicates
393            (i.e., the mz is similar to that of another mass feature and the limits of the EIC are similar or encapsulating).
394            Default is True.
395        ms_level : int, optional
396            The MS level to use. Default is 1.
397
398        Raises
399        ------
400        ValueError
401            If no mass features are found.
402            If no MS level data is found for the given MS level (either in data or in the scan data)
403
404        Returns
405        -------
406        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
407
408        Notes
409        -----
410        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
411        """
412        # Check if there is data
413        if ms_level in self._ms_unprocessed.keys():
414            raw_data = self._ms_unprocessed[ms_level].copy()
415        else:
416            raise ValueError("No MS level " + str(ms_level) + " data found")
417        if self.mass_features is not None:
418            mf_df = self.mass_features_to_df().copy()
419        else:
420            raise ValueError(
421                "No mass features found, did you run find_mass_features() first?"
422            )
423        # Check if mass_spectrum exists on each mass feature
424        if not all(
425            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
426        ):
427            raise ValueError(
428                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
429            )
430
431        # Subset scan data to only include correct ms_level
432        scan_df_sub = self.scan_df[
433            self.scan_df["ms_level"] == int(ms_level)
434        ].reset_index(drop=True)
435        if scan_df_sub.empty:
436            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
437        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
438
439        mzs_to_extract = np.unique(mf_df["mz"].values)
440        mzs_to_extract.sort()
441
442        # Get EICs for each unique mz in mass features list
443        for mz in mzs_to_extract:
444            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
445            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
446            raw_data_sub = raw_data[
447                (raw_data["mz"] >= mz_min) & (raw_data["mz"] <= mz_max)
448            ].reset_index(drop=True)
449            raw_data_sub = (
450                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
451            )
452            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
453            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
454            myEIC = EIC_Data(
455                scans=raw_data_sub["scan"].values,
456                time=raw_data_sub["scan_time"].values,
457                eic=raw_data_sub["intensity"].values,
458            )
459            # Smooth EIC
460            smoothed_eic = self.smooth_tic(myEIC.eic)
461            smoothed_eic[smoothed_eic < 0] = 0
462            myEIC.eic_smoothed = smoothed_eic
463            self.eics[mz] = myEIC
464
465        # Get limits of mass features using EIC centroid detector and integrate
466        mf_df["area"] = np.nan
467        for idx, mass_feature in mf_df.iterrows():
468            mz = mass_feature.mz
469            apex_scan = mass_feature.apex_scan
470
471            # Pull EIC data and find apex scan index
472            myEIC = self.eics[mz]
473            self.mass_features[idx]._eic_data = myEIC
474            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
475
476            # Find left and right limits of peak using EIC centroid detector, add to EICData
477            centroid_eics = self.eic_centroid_detector(
478                myEIC.time,
479                myEIC.eic_smoothed,
480                mass_feature.intensity * 1.1,
481                apex_indexes=[int(apex_index)],
482            )
483            l_a_r_scan_idx = [i for i in centroid_eics]
484            if len(l_a_r_scan_idx) > 0:
485                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
486                # Find the number of consecutive non-zero values in the EIC segment
487                mask = myEIC.eic[l_a_r_scan_idx[0][0]:l_a_r_scan_idx[0][2] + 1] > 0
488                # Find the longest run of consecutive True values
489                if np.any(mask):
490                    # Find indices where mask changes value
491                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
492                    starts = np.where(diff == 1)[0]
493                    ends = np.where(diff == -1)[0]
494                    consecutive_scans = (ends - starts).max()
495                else:
496                    consecutive_scans = 0
497                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
498                    self.mass_features.pop(idx)
499                    continue
500                # Add start and final scan to mass_features and EICData
501                left_scan, right_scan = (
502                    myEIC.scans[l_a_r_scan_idx[0][0]],
503                    myEIC.scans[l_a_r_scan_idx[0][2]],
504                )
505                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
506                myEIC.apexes = myEIC.apexes + mf_scan_apex
507                self.mass_features[idx].start_scan = left_scan
508                self.mass_features[idx].final_scan = right_scan
509
510                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
511                area = np.trapz(
512                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
513                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
514                )
515                mf_df.at[idx, "area"] = area
516                myEIC.areas = myEIC.areas + [area]
517                self.eics[mz] = myEIC
518                self.mass_features[idx]._area = area
519            else:
520                if drop_if_fail is True:
521                    self.mass_features.pop(idx)
522
523        if drop_duplicates:
524            # Prepare mass feature dataframe
525            mf_df = self.mass_features_to_df().copy()
526
527            # For each mass feature, find all mass features within the clustering tolerance ppm and drop them if their start and end scans fall within those of another mass feature
528            # Keep the first mass feature and drop encapsulated duplicates
529            for idx, mass_feature in mf_df.iterrows():
530                mz = mass_feature.mz
531                apex_scan = mass_feature.apex_scan
532
533                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
534                mf_df_sub = mf_df[
535                    mf_df["mz_diff_ppm"]
536                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
537                    * 10**6
538                ].copy()
539
540                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
541                for idx2, mass_feature2 in mf_df_sub.iterrows():
542                    if idx2 != idx:
543                        if (
544                            mass_feature2.start_scan >= mass_feature.start_scan
545                            and mass_feature2.final_scan <= mass_feature.final_scan
546                        ):
547                            if idx2 in self.mass_features.keys():
548                                self.mass_features.pop(idx2)

Integrate mass features and extract EICs.

Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

Parameters
  • drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
  • drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., the mz is similar to that of another mass feature and the limits of the EIC are similar or encapsulating). Default is True.
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
  • None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes

drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
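
A usage sketch continuing from find_mass_features (assumes an MS1 spectrum has already been associated with each mass feature):

    lcms_obj.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
    for mz, eic in lcms_obj.eics.items():  # one EIC_Data object per unique mass feature mz
        print(mz, eic.areas)
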

def find_c13_mass_features(self):
550    def find_c13_mass_features(self):
551        """Mark likely C13 isotopes and connect them to monoisotopic mass features.
552
553        Returns
554        -------
555        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
556
557        Raises
558        ------
559        ValueError
560            If no mass features are found.
561        """
562        verbose = self.parameters.lc_ms.verbose_processing
563        if verbose:
564            print("evaluating mass features for C13 isotopes")
565        if self.mass_features is None:
566            raise ValueError("No mass features found, run find_mass_features() first")
567
568        # Data prep for sparse distance matrix
569        dims = ["mz", "scan_time"]
570        mf_df = self.mass_features_to_df().copy()
571        # Drop mass features that have no area (these are likely to be noise)
572        mf_df = mf_df[mf_df["area"].notnull()]
573        mf_df["mf_id"] = mf_df.index.values
575
576        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
577        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
578
579        mz_diff = 1.003355  # C13-C12 mass difference
580        tol = [
581            mf_df["mz"].median()
582            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
583            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
584        ]  # mz, in relative; scan_time in minutes
585
586        # Compute inter-feature distances
587        distances = None
588        for i in range(len(dims)):
589            # Construct k-d tree
590            values = mf_df[dims[i]].values
591            tree = KDTree(values.reshape(-1, 1))
592
593            max_tol = tol[i]
594            if dims[i] == "mz":
595                # Maximum absolute tolerance
596                max_tol = mz_diff + tol[i]
597
598            # Compute sparse distance matrix
599            # the larger the max_tol, the slower this operation is
600            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
601
602            # Only consider forward case, exclude diagonal
603            sdm = sparse.triu(sdm, k=1)
604
605            if dims[i] == "mz":
606                min_tol = mz_diff - tol[i]
607                # Get only the ones that are above the min tol
608                idx = sdm.data > min_tol
609
610                # Reconstruct sparse distance matrix
611                sdm = sparse.coo_matrix(
612                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
613                    shape=(len(values), len(values)),
614                )
615
616            # Cast as binary matrix
617            sdm.data = np.ones_like(sdm.data)
618
619            # Stack distances
620            if distances is None:
621                distances = sdm
622            else:
623                distances = distances.multiply(sdm)
624
625        # Extract indices of within-tolerance points
626        distances = distances.tocoo()
627        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
628
629        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
630        pairs_mf = pairs.copy()
631        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
632        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
633
634        # Connect monoisotopic masses with isotopologues within mass_features
635        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
636        for mono in monos:
637            self.mass_features[mono].monoisotopic_mf_id = mono
638        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
639        while not pairs_iso_df.empty:
640            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
641            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
642            for iso in m1_isos:
643                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
644                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
645                if len(parent) > 1:
646                    # Choose the parent that is closest in time to the isotopologue
647                    parent_time = [self.mass_features[p].retention_time for p in parent]
648                    time_diff = [
649                        np.abs(self.mass_features[iso].retention_time - x)
650                        for x in parent_time
651                    ]
652                    parent = parent[np.argmin(time_diff)]
653                else:
654                    parent = parent[0]
655                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
656                    parent
657                ].monoisotopic_mf_id
658                if self.mass_features[iso].monoisotopic_mf_id is not None:
659                    mass_diff = (
660                        self.mass_features[iso].mz
661                        - self.mass_features[
662                            self.mass_features[iso].monoisotopic_mf_id
663                        ].mz
664                    )
665                    self.mass_features[iso].isotopologue_type = "13C" + str(
666                        int(round(mass_diff, 0))
667                    )
668
669            # Drop the mono and iso from the pairs_iso_df
670            pairs_iso_df = pairs_iso_df.drop(
671                index=monos, errors="ignore"
672            )  # Drop pairs whose parent was just processed as a monoisotopic root
673            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
674            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
675
676            if not pairs_iso_df.empty:
677                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
678                monos = np.setdiff1d(
679                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
680                )
681        if verbose:
682            # Report fraction of compounds annotated with isotopes
683            mf_df["c13_flag"] = np.where(
684                np.logical_or(
685                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
686                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
687                ),
688                1,
689                0,
690            )
691            print(
692                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
693                + " of mass features have or are C13 isotopes"
694            )

Mark likely C13 isotopes and connect them to monoisotopic mass features.

Returns
  • None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found.
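
A usage sketch (run after integrate_mass_features so that areas are populated; the isotopologue_type attribute may be unset for non-isotopes, hence the getattr guard):

    lcms_obj.find_c13_mass_features()
    for mf_id, mf in lcms_obj.mass_features.items():
        if getattr(mf, "isotopologue_type", None) is not None:  # e.g., "13C1", "13C2"
            print(mf_id, mf.isotopologue_type, "monoisotopic parent:", mf.monoisotopic_mf_id)
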
def deconvolute_ms1_mass_features(self):
696    def deconvolute_ms1_mass_features(self):
697        """Deconvolute MS1 mass features
698
699        Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features
700
701        Parameters
702        ----------
703        None
704
705        Returns
706        -------
707        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
708        and associated_mass_features_deconvoluted attributes to the mass features in the
709        mass_features attribute of the LCMSBase object.
710
711        Raises
712        ------
713        ValueError
714            If no mass features are found (run find_mass_features() first).
715            If no EICs are found (run integrate_mass_features() first).
716
717        """
718        # Checks for set mass_features and eics
719        if self.mass_features is None:
720            raise ValueError(
721                "No mass features found, did you run find_mass_features() first?"
722            )
723
724        if self.eics == {}:
725            raise ValueError(
726                "No EICs found, did you run integrate_mass_features() first?"
727            )
728
729        if 1 not in self._ms_unprocessed.keys():
730            raise ValueError("No unprocessed MS1 spectra found.")
731
732        # Prep ms1 data
733        ms1_data = self._ms_unprocessed[1].copy()
734        ms1_data = ms1_data.set_index("scan")
735
736        # Prep mass feature summary
737        mass_feature_df = self.mass_features_to_df()
738
739        # Loop through each mass feature
740        for mf_id, mass_feature in self.mass_features.items():
741            # Skip the mass feature if its mz does not match the mz of its ms1_peak
742            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
743                continue
744
745            # Get the left and right limits of the EIC of the mass feature
746            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
747
748            # Pull the scan range of interest from _ms_unprocessed[1] and sort by mz
749            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
750            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
751
752            # Get the centroided masses of the mass feature
753            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
754
755            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
756            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
757                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
758            ]
759
760            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
761            ms1_data_sub["mz_diff_rel"] = (
762                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
763                / ms1_data_sub["mz"]
764            )
765            ms1_data_sub = ms1_data_sub[
766                ms1_data_sub["mz_diff_rel"]
767                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
768            ].reset_index(drop=True)
769
770            # Group by mass_feature_mz and scan and sum intensity
771            ms1_data_sub_group = (
772                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
773                .sum()
774                .reset_index()
775            )
776
777            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
778            corr = (
779                ms1_data_sub_group.pivot(
780                    index="scan", columns="mass_feature_mz", values="intensity"
781                )
782                .fillna(0)
783                .corr()
784            )
785
786            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above the ms1_deconvolution_corr_min threshold
787            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
788
789            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
790            try:
791                corr_subset = corr.loc[mass_feature.mz,]
792            except KeyError:
793                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
794                continue
795
796            corr_subset = corr_subset[corr_subset > decon_corr_min]
797
798            # Get the masses from the mass spectrum that are the result of the deconvolution
799            mzs_decon = corr_subset.index.values
800
801            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
802            mzs_decon_idx = [
803                id
804                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
805                if mz in mzs_decon
806            ]
807            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
808
809            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
810            if (
811                mass_feature.ms1_peak.abundance
812                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
813            ):
814                mass_feature.mass_spectrum_deconvoluted_parent = True
815            else:
816                mass_feature.mass_spectrum_deconvoluted_parent = False
817
818            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
819            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
820            mass_feature_df_sub = mass_feature_df[
821                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
822                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
823            ].copy()
824            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
825            mass_feature_df_sub["mz_diff_ppm"] = [
826                np.abs(mzs_decon - mz).min() / mz * 10**6
827                for mz in mass_feature_df_sub["mz"]
828            ]
829            # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses
830            mfs_associated_decon = mass_feature_df_sub[
831                mass_feature_df_sub["mz_diff_ppm"]
832                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
833            ].index.values
834
835            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Deconvolute MS1 mass features

Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.

Parameters
  • None
Returns
  • None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found (run find_mass_features() first). If no EICs are found (run integrate_mass_features() first).
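
A usage sketch (the attribute names are those assigned by this method; mass_spectrum_deconvoluted_parent may be unset for skipped features, hence the getattr guard):

    lcms_obj.deconvolute_ms1_mass_features()
    parents = [
        mf
        for mf in lcms_obj.mass_features.values()
        if getattr(mf, "mass_spectrum_deconvoluted_parent", False)
    ]
    print(str(len(parents)) + " mass features are deconvoluted parents")
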
class PHCalculations:
 838class PHCalculations:
 839    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 840
 841    Notes
 842    -----
 843    This class is intended to be used as a mixin for the LCMSBase class.
 844
 845    Methods
 846    -------
 847    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 848        Sparse implementation of a mean filter.
 849    * embed_unique_indices(a).
 850        Creates an array of indices, sorted by unique element.
 851    * sparse_upper_star(idx, V).
 852        Sparse implementation of an upper star filtration.
 853    * check_if_grid(data).
 854        Check if the data is gridded in mz space.
 855    * grid_data(data).
 856        Grid the data in the mz dimension.
 857    * find_mass_features_ph(ms_level=1, grid=True).
 858        Find mass features within an LCMSBase object using persistent homology.
 859    * cluster_mass_features(drop_children=True).
 860        Cluster regions of interest.
 861    """
 862
 863    @staticmethod
 864    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 865        """Sparse implementation of a mean filter.
 866
 867        Parameters
 868        ----------
 869        idx : :obj:`~numpy.array`
 870            Edge indices for each dimension (MxN).
 871        V : :obj:`~numpy.array`
 872            Array of intensity data (Mx1).
 873        radius : float or list
 874            Radius of the sparse filter in each dimension. Values less than
 875            zero indicate no connectivity in that dimension.
 876
 877        Returns
 878        -------
 879        :obj:`~numpy.array`
 880            Filtered intensities (Mx1).
 881
 882        Notes
 883        -----
 884        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 885        This is a static method.
 886        """
 887
 888        # Copy indices
 889        idx = idx.copy().astype(V.dtype)
 890
 891        # Scale
 892        for i, r in enumerate(radius):
 893            # Increase inter-index distance
 894            if r < 1:
 895                idx[:, i] *= 2
 896
 897            # Do nothing
 898            elif r == 1:
 899                pass
 900
 901            # Decrease inter-index distance
 902            else:
 903                idx[:, i] /= r
 904
 905        # Connectivity matrix
 906        cmat = KDTree(idx)
 907        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 908        cmat.setdiag(1)
 909
 910        # Pair indices
 911        I, J = cmat.nonzero()
 912
 913        # Delete cmat
 914        cmat_shape = cmat.shape
 915        del cmat
 916
 917        # Sum over columns
 918        V_sum = sparse.bsr_matrix(
 919            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 920        ).diagonal(0)
 921
 922        # Count over columns
 923        V_count = sparse.bsr_matrix(
 924            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 925        ).diagonal(0)
 926
 927        return V_sum / V_count
 928
 929    @staticmethod
 930    def embed_unique_indices(a):
 931        """Creates an array of indices, sorted by unique element.
 932
 933        Parameters
 934        ----------
 935        a : :obj:`~numpy.array`
 936            Array of unique elements (Mx1).
 937
 938        Returns
 939        -------
 940        :obj:`~numpy.array`
 941            Array of indices (Mx1).
 942
 943        Notes
 944        -----
 945        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 946        This is a static method.
 947        """
 948
 949        def count_tens(n):
 950            # Count tens
 951            ntens = (n - 1) // 10
 952
 953            while True:
 954                ntens_test = (ntens + n - 1) // 10
 955
 956                if ntens_test == ntens:
 957                    return ntens
 958                else:
 959                    ntens = ntens_test
 960
 961        def arange_exclude_10s(n):
 962            # How many 10s will there be?
 963            ntens = count_tens(n)
 964
 965            # Base array
 966            arr = np.arange(0, n + ntens)
 967
 968            # Exclude 10s
 969            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 970
 971            return arr
 972
 973        # Creates an array of indices, sorted by unique element
 974        idx_sort = np.argsort(a)
 975        idx_unsort = np.argsort(idx_sort)
 976
 977        # Sorts records array so all unique elements are together
 978        sorted_a = a[idx_sort]
 979
 980        # Returns the unique values, the index of the first occurrence,
 981        # and the count for each element
 982        vals, idx_start, count = np.unique(
 983            sorted_a, return_index=True, return_counts=True
 984        )
 985
 986        # Splits the indices into separate arrays
 987        splits = np.split(idx_sort, idx_start[1:])
 988
 989        # Creates unique indices for each split
 990        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 991
 992        # Reorders according to input array
 993        idx_unq = idx_unq[idx_unsort]
 994
 995        # Magnitude of each index
 996        exp = np.log10(
 997            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 998        )
 999        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1000
1001        # Result
1002        return a + idx_unq / idx_unq_mag
1003    
1004    @staticmethod
1005    def roll_up_dataframe(
1006        df : pd.DataFrame,
1007        sort_by : str,
1008        tol : list,
1009        relative : list,
1010        dims : list
1011    ):
1012        """Subset data by rolling up into apex in appropriate dimensions.
1013        
1014        Parameters
1015        ----------
1016        df : pd.DataFrame
1017            The input data containing the "dims" columns and the "sort_by" column.
1018        sort_by : str
1019            The column to sort the data by; this determines which mass features get rolled up into a parent mass feature
1020            (i.e., the mass feature with the highest value in the sort_by column).
1021        tol : list
1022            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1023            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1024            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1025            If relative is False, the tolerance will be used as is.
1026        relative : list
1027            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1028        dims : list
1029            A list of dimension names (column names in the df DataFrame) to roll up the mass features by.
1030
1031        Returns
1032        -------
1033        pd.DataFrame
1034            A DataFrame with only the rolled up mass features, with the original index and columns.
1035
1036            
1037        Raises
1038        ------
1039        ValueError
1040            If the input data is not a pandas DataFrame.
1041            If the input data does not have columns for each of the dimensions in "dims".
1042            If the length of "dims", "tol", and "relative" do not match.
1043        """
1044        # Check that data is a DataFrame and has columns for each of the dims (before using df below)
1045        if not isinstance(df, pd.DataFrame):
1046            raise ValueError("Data must be a pandas DataFrame")
1047        for dim in dims:
1048            if dim not in df.columns:
1049                raise ValueError(f"Data must have a column for {dim}")
1050        if len(dims) != len(tol) or len(dims) != len(relative):
1051            raise ValueError(
1052                "Dimensions, tolerances, and relative flags must be the same length"
1053            )
1054
1055        og_columns = df.columns.copy()
1056
1057        # Unindex the data, but keep the original index
1058        if df.index.name is not None:
1059            og_index = df.index.name
1060        else:
1061            og_index = "index"
1062        df = df.reset_index(drop=False)
1063
1064        # Sort data by sort_by column, and reindex
1065        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1066        
1067        # Compute inter-feature distances
1068        distances = None
1069        for i in range(len(dims)):
1070            # Construct k-d tree
1071            values = df[dims[i]].values
1072            tree = KDTree(values.reshape(-1, 1))
1073
1074            max_tol = tol[i]
1075            if relative[i] is True:
1076                # Maximum absolute tolerance
1077                max_tol = tol[i] * values.max()
1078
1079            # Compute sparse distance matrix
1080            # the larger the max_tol, the slower this operation is
1081            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1082
1083            # Only consider forward case, exclude diagonal
1084            sdm = sparse.triu(sdm, k=1)
1085
1086            # Filter relative distances
1087            if relative[i] is True:
1088                # Compute relative distances
1089                rel_dists = sdm.data / values[sdm.row]  # or col?
1090
1091                # Indices of relative distances less than tolerance
1092                idx = rel_dists <= tol[i]
1093
1094                # Reconstruct sparse distance matrix
1095                sdm = sparse.coo_matrix(
1096                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1097                    shape=(len(values), len(values)),
1098                )
1099
1100            # Cast as binary matrix
1101            sdm.data = np.ones_like(sdm.data)
1102
1103            # Stack distances
1104            if distances is None:
1105                distances = sdm
1106            else:
1107                distances = distances.multiply(sdm)
1108        
1109        # Extract indices of within-tolerance points
1110        distances = distances.tocoo()
1111        pairs = np.stack((distances.row, distances.col), axis=1)
1112        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1113        pairs_df = pairs_df.set_index("parent")
1114
1115        to_drop = []
1116        while not pairs_df.empty:
1117            # Find root_parents and their children
1118            root_parents = np.setdiff1d(
1119                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1120            )
1121            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1122            to_drop = np.append(to_drop, children_of_roots)
1123
1124            # Remove root_children as possible parents from pairs_df for next iteration
1125            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1126            pairs_df = pairs_df.reset_index().set_index("child")
1127            # Remove root_children as possible children from pairs_df for next iteration
1128            pairs_df = pairs_df.drop(index=children_of_roots)
1129
1130            # Prepare for next iteration
1131            pairs_df = pairs_df.reset_index().set_index("parent")
1132
1133        # Drop mass features that are not cluster parents
1134        df_sub = df.drop(index=np.array(to_drop))
1135
1136        # Set index back to og_index and only keep the columns that are in the original dataframe
1137        df_sub = df_sub.set_index(og_index)
1138
1139        # sort the dataframe by the original index
1140        df_sub = df_sub.sort_index()
1141        df_sub = df_sub[og_columns]
1142
1143        return df_sub
1144    
1145    def sparse_upper_star(self, idx, V):
1146        """Sparse implementation of an upper star filtration.
1147
1148        Parameters
1149        ----------
1150        idx : :obj:`~numpy.array`
1151            Edge indices for each dimension (MxN).
1152        V : :obj:`~numpy.array`
1153            Array of intensity data (Mx1).
1154        Returns
1155        -------
1156        idx : :obj:`~numpy.array`
1157            Index of filtered points (Mx1).
1158        persistence : :obj:`~numpy.array`
1159            Persistence of each filtered point (Mx1).
1160
1161        Notes
1162        -----
1163        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1164        """
1165
1166        # Invert
1167        V = -1 * V.copy().astype(int)
1168
1169        # Embed indices
1170        V = self.embed_unique_indices(V)
1171
1172        # Connectivity matrix
1173        cmat = KDTree(idx)
1174        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1175        cmat.setdiag(1)
1176        cmat = sparse.triu(cmat)
1177
1178        # Pairwise minimums (maximum of the inverted values = minimum of the original intensities)
1179        I, J = cmat.nonzero()
1180        d = np.maximum(V[I], V[J])
1181
1182        # Delete connectivity matrix
1183        cmat_shape = cmat.shape
1184        del cmat
1185
1186        # Sparse distance matrix
1187        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1188
1189        # Delete pairwise mins
1190        del d, I, J
1191
1192        # Persistence homology
1193        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1194
1195        # Bound death values
1196        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1197
1198        # Construct tree to query against
1199        tree = KDTree(V.reshape((-1, 1)))
1200
1201        # Get the indexes of the first nearest neighbor by birth
1202        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1203
1204        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1205
1206    def check_if_grid(self, data):
1207        """Check if the data are gridded in mz space.
1208
1209        Parameters
1210        ----------
1211        data : DataFrame
1212            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1213
1214        Returns
1215        -------
1216        bool
1217            True if the data is gridded in the mz direction, False otherwise.
1218
1219        Notes
1220        -----
1221        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1222        """
1223        # Calculate the difference between consecutive mz values in a single scan
1224        dat_check = data.copy().reset_index(drop=True)
1225        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1226        mz_diff_min = (
1227            dat_check.groupby("scan")["mz_diff"].min().min()
1228        )  # within each scan, what is the smallest mz difference between consecutive mz values
1229
1230        # Find the mininum mz difference between mz values in the data; regardless of scan
1231        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1232        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1233        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1234
1235        # Get minimum mz_diff between mz values in the data
1236        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1237
1238        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1239        if mz_diff_min_raw < mz_diff_min:
1240            return False
1241        else:
1242            return True
1243
1244    def grid_data(self, data, attempts=5):
1245        """Grid the data in the mz dimension.
1246
1247        Data must be gridded prior to persistent homology calculations and computing average mass spectrum
1248
1249        Parameters
1250        ----------
1251        data : DataFrame
1252            The input data containing mz, scan, scan_time, and intensity columns.
1253        attempts : int, optional
1254            The number of attempts to grid the data. Default is 5.
1255
1256        Returns
1257        -------
1258        DataFrame
1259            The gridded data with mz, scan, scan_time, and intensity columns.
1260
1261        Raises
1262        ------
1263        ValueError
1264            If gridding fails after the specified number of attempts.
1265        """
1266        attempt_i = 0
1267        while attempt_i < attempts:
1268            attempt_i += 1
1269            data = self._grid_data(data)
1270
1271            if self.check_if_grid(data):
1272                return data
1273        
1274        if not self.check_if_grid(data):
1275            raise ValueError(
1276                "Gridding failed after "
1277                + str(attempt_i)
1278                + " attempts. Please check the data."
1279            )
1280        else:
1281            return data
1282    
1283    def _grid_data(self, data):
1284        """Internal method to grid the data in the mz dimension.
1285        
1286        Notes
1287        -----
1288        This method is called by the grid_data method and should not be called directly.
1289        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
1290        but it does not check if the data is already gridded or if the gridding is successful.
1291
1292        Parameters
1293        ----------
1294        data : pd.DataFrame or pl.DataFrame
1295            The input data to grid.
1296
1297        Returns
1298        -------
1299        pd.DataFrame or pl.DataFrame
1300            The data after attempting to grid it in the mz dimension.
1301        """
1302        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1303        data_w = data.copy().reset_index(drop=True)
1304        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1305        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1306
1307        # Need high intensity mz values first so they are parents in the output pairs stack
1308        dat_mz = data_w[["mz", "intensity"]].sort_values(
1309            by=["intensity"], ascending=False
1310        )
1311        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1312
1313        # Construct KD tree
1314        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1315        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1316        sdm = sparse.triu(sdm, k=1)
1317        sdm.data = np.ones_like(sdm.data)
1318        distances = sdm.tocoo()
1319        pairs = np.stack((distances.row, distances.col), axis=1)
1320
1321        # Cull pairs to just get root
1322        to_drop = []
1323        while len(pairs) > 0:
1324            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1325            id_root_parents = np.isin(pairs[:, 0], root_parents)
1326            children_of_roots = np.unique(pairs[id_root_parents, 1])
1327            to_drop = np.append(to_drop, children_of_roots)
1328
1329            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1330            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1331            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1332        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1333        mz_dat_np = (
1334            dat_mz[["mz"]]
1335            .sort_values(by=["mz"])
1336            .reset_index(drop=True)
1337            .values.flatten()
1338        )
1339
1340        # Sort data by mz and recast mz to nearest value in mz_dat_np
1341        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1342        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1343        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1344
1345        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1346        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1347        new_data_w = (
1348            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1349            .groupby(["scan", "mz"])["intensity"]
1350            .sum()
1351            .reset_index()
1352        )
1353        new_data_w = (
1354            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1355            .reset_index(drop=True)
1356            .copy()
1357        )
1358
1359        return new_data_w
1360    
1361    def find_mass_features_ph(self, ms_level=1, grid=True):
1362        """Find mass features within an LCMSBase object using persistent homology.
1363
1364        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1365
1366        Parameters
1367        ----------
1368        ms_level : int, optional
1369            The MS level to use. Default is 1.
1370        grid : bool, optional
1371            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1372
1373        Raises
1374        ------
1375        ValueError
1376            If no MS level data is found on the object.
1377            If data is not gridded and grid is False.
1378
1379        Returns
1380        -------
1381        None, but assigns the mass_features attribute to the object.
1382
1383        Notes
1384        -----
1385        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1386        """
1387        # Check that ms_level is a key in self._ms_unprocessed
1388        if ms_level not in self._ms_unprocessed.keys():
1389            raise ValueError(
1390                "No MS level "
1391                + str(ms_level)
1392                + " data found; did you instantiate the parser for this MS level?"
1393            )
1394
1395        # Get ms data
1396        data = self._ms_unprocessed[ms_level].copy()
1397
1398        # Drop rows with missing intensity values and reset index
1399        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1400
1401        # Threshold data
1402        dims = ["mz", "scan_time"]
1403        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1404        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1405
1406        # Check if gridded, if not, grid
1407        gridded_mz = self.check_if_grid(data_thres)
1408        if gridded_mz is False:
1409            if grid is False:
1410                raise ValueError(
1411                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1412                )
1413            else:
1414                data_thres = self.grid_data(data_thres)
1415
1416        # Add build factors and add scan_time
1417        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1418        factors = {
1419            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1420            for dim in dims
1421        }  # pd.factorize returns a float64 index; cast to float32
1422
1423        # Build indexes
1424        index = {
1425            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1426            for dim in factors
1427        }
1428
1429        # Smooth data
1430        iterations = self.parameters.lc_ms.ph_smooth_it
1431        smooth_radius = [
1432            self.parameters.lc_ms.ph_smooth_radius_mz,
1433            self.parameters.lc_ms.ph_smooth_radius_scan,
1434        ]  # mz, scan_time smoothing radius (in steps)
1435
1436        index = np.vstack([index[dim] for dim in dims]).T
1437        V = data_thres["intensity"].values
1438        resid = np.inf
1439        for i in range(iterations):
1440            # Previous iteration
1441            V_prev = V.copy()
1442            resid_prev = resid
1443            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1444
1445            # Calculate residual with previous iteration
1446            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1447
1448            # Evaluate convergence
1449            if i > 0:
1450                # Percent change in residual
1451                test = np.abs(resid - resid_prev) / resid_prev
1452
1453                # Exit criteria
1454                if test <= 0:
1455                    break
1456
1457        # Overwrite values
1458        data_thres["intensity"] = V
1459
1460        # Use persistent homology to find regions of interest
1461        pidx, pers = self.sparse_upper_star(index, V)
1462        pidx = pidx[pers > 1]
1463        pers = pers[pers > 1]
1464
1465        # Get peaks
1466        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1467
1468        # Add persistence column
1469        peaks["persistence"] = pers
1470        mass_features = peaks.sort_values(
1471            by="persistence", ascending=False
1472        ).reset_index(drop=True)
1473
1474        # Filter by persistence threshold
1475        persistence_threshold = (
1476            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1477        )
1478        mass_features = mass_features.loc[
1479            mass_features["persistence"] > persistence_threshold, :
1480        ].reset_index(drop=True)
1481
1482        # Rename scan column to apex_scan
1483        mass_features = mass_features.rename(
1484            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1485        )
1486
1487        # Populate mass_features attribute
1488        self.mass_features = {}
1489        for row in mass_features.itertuples():
1490            row_dict = mass_features.iloc[row.Index].to_dict()
1491            lcms_feature = LCMSMassFeature(self, **row_dict)
1492            self.mass_features[lcms_feature.id] = lcms_feature
1493
1494        if self.parameters.lc_ms.verbose_processing:
1495            print("Found " + str(len(mass_features)) + " initial mass features")
1496
1497    def find_mass_features_ph_centroid(self, ms_level=1):
1498        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1499        
1500        Parameters
1501        ----------
1502        ms_level : int, optional
1503            The MS level to use. Default is 1.
1504        
1505        Raises
1506        ------
1507        ValueError
1508            If no MS level data is found on the object.
1509
1510        Returns
1511        -------
1512        None, but assigns the mass_features attribute to the object.        
1513        """
1514        # Check that ms_level is a key in self._ms_unprocessed
1515        if ms_level not in self._ms_unprocessed.keys():
1516            raise ValueError(
1517                "No MS level "
1518                + str(ms_level)
1519                + " data found; did you instantiate the parser for this MS level?"
1520            )
1521        
1522        # Get ms data
1523        data = self._ms_unprocessed[ms_level].copy()
1524
1525        # Drop rows with missing intensity values and reset index
1526        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1527        
1528        # Threshold data
1529        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1530        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1531        data_thres['persistence'] = data_thres['intensity']
1532
1533        # Use this as the starting point for the mass features, adding scan_time
1534        mf_df = data_thres
1535        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1536
1537        # Define tolerances and dimensions for rolling up
1538        tol = [
1539            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1540            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1541        ]
1542        relative = [True, False]    
1543        dims = ["mz", "scan_time"]
1544
1545        mf_df = self.roll_up_dataframe(
1546            df=mf_df,
1547            sort_by="persistence",
1548            dims=dims,
1549            tol=tol,
1550            relative=relative
1551        )
1552
1553        # Rename scan column to apex_scan
1554        mass_features = mf_df.rename(
1555            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1556        )
1557        # Sort by persistence and reset index
1558        mass_features = mass_features.sort_values(
1559            by="persistence", ascending=False
1560        ).reset_index(drop=True)
1561
1562        # Populate mass_features attribute
1563        self.mass_features = {}
1564        for row in mass_features.itertuples():
1565            row_dict = mass_features.iloc[row.Index].to_dict()
1566            lcms_feature = LCMSMassFeature(self, **row_dict)
1567            self.mass_features[lcms_feature.id] = lcms_feature
1568
1569        if self.parameters.lc_ms.verbose_processing:
1570            print("Found " + str(len(mass_features)) + " initial mass features")
1571    
1572    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1573        """Cluster mass features
1574
1575        Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.
1576
1577        Parameters
1578        ----------
1579        drop_children : bool, optional
1580            Whether to drop the mass features that are not cluster parents. Default is True.
1581        sort_by : str, optional
1582            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1583
1584        Raises
1585        ------
1586        ValueError
1587            If no mass features are found.
1588            If too many mass features are found.
1589
1590        Returns
1591        -------
1592        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1593        """
1594        verbose = self.parameters.lc_ms.verbose_processing
1595
1596        if self.mass_features is None:
1597            raise ValueError("No mass features found, run find_mass_features() first")
1598        if len(self.mass_features) > 400000:
1599            raise ValueError(
1600                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1601            )
1602        dims = ["mz", "scan_time"]
1603        mf_df_og = self.mass_features_to_df()
1604        mf_df = mf_df_og.copy()
1605
1606        tol = [
1607            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1608            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1609        ]  # mz tolerance is relative; scan_time tolerance is in minutes
1610        relative = [True, False]
1611
1612        # Roll up mass features based on their proximity in the declared dimensions
1613        mf_df_new = self.roll_up_dataframe(
1614            df=mf_df,
1615            sort_by=sort_by,
1616            dims=dims,
1617            tol=tol,
1618            relative=relative
1619        )
1620
1621        mf_df["cluster_parent"] = np.where(
1622            np.isin(mf_df.index, mf_df_new.index), True, False
1623        )
1624
1625        # get mass feature ids of features that are not cluster parents
1626        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1627        if drop_children is True:
1628            # Drop mass features that are not cluster parents from self
1629            self.mass_features = {
1630                k: v
1631                for k, v in self.mass_features.items()
1632                if k not in cluster_daughters
1633            }
1634        else:
1635            return cluster_daughters

Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
  • embed_unique_indices(a). Creates an array of indices, sorted by unique element.
  • roll_up_dataframe(df, sort_by, tol, relative, dims). Subset data by rolling up into apexes in the given dimensions.
  • sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
  • check_if_grid(data). Check if the data are gridded in mz space.
  • grid_data(data, attempts=5). Grid the data in the mz dimension.
  • find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
  • find_mass_features_ph_centroid(ms_level=1). Find mass features in centroided data using a persistent-homology-style roll-up.
  • cluster_mass_features(drop_children=True, sort_by="persistence"). Cluster mass features by proximity.
@staticmethod
def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
863    @staticmethod
864    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
865        """Sparse implementation of a mean filter.
866
867        Parameters
868        ----------
869        idx : :obj:`~numpy.array`
870            Edge indices for each dimension (MxN).
871        V : :obj:`~numpy.array`
872            Array of intensity data (Mx1).
873        radius : float or list
874            Radius of the sparse filter in each dimension. Values less than
875            zero indicate no connectivity in that dimension.
876
877        Returns
878        -------
879        :obj:`~numpy.array`
880            Filtered intensities (Mx1).
881
882        Notes
883        -----
884        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
885        This is a static method.
886        """
887
888        # Copy indices
889        idx = idx.copy().astype(V.dtype)
890
891        # Scale
892        for i, r in enumerate(radius):
893            # Increase inter-index distance
894            if r < 1:
895                idx[:, i] *= 2
896
897            # Do nothing
898            elif r == 1:
899                pass
900
901            # Decrease inter-index distance
902            else:
903                idx[:, i] /= r
904
905        # Connectivity matrix
906        cmat = KDTree(idx)
907        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
908        cmat.setdiag(1)
909
910        # Pair indices
911        I, J = cmat.nonzero()
912
913        # Delete cmat
914        cmat_shape = cmat.shape
915        del cmat
916
917        # Sum over columns
918        V_sum = sparse.bsr_matrix(
919            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
920        ).diagonal(0)
921
922        # Count over columns
923        V_count = sparse.bsr_matrix(
924            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
925        ).diagonal(0)
926
927        return V_sum / V_count

Sparse implementation of a mean filter.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
  • radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
  • ~numpy.array: Filtered intensities (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
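To make the windowed-mean idea concrete, here is a minimal standalone sketch (mine, not corems code) that smooths intensities on a small integer index grid. It uses KDTree.query_ball_point with a Chebyshev radius of 1, which gathers the same neighborhood the sparse connectivity matrix above encodes; the production method instead builds the mean via sparse matrix sums to scale to millions of points.

    import numpy as np
    from scipy.spatial import KDTree

    # Toy 5 x 5 grid of (mz step, scan step) indices, flattened to (25, 2)
    ii, jj = np.meshgrid(np.arange(5.0), np.arange(5.0), indexing="ij")
    idx = np.column_stack((ii.ravel(), jj.ravel()))
    V = np.random.default_rng(0).random(idx.shape[0])

    # Neighbors within one step in every dimension (Chebyshev ball), self included
    tree = KDTree(idx)
    neighbors = tree.query_ball_point(idx, r=1, p=np.inf)

    # Mean filter: average each point with its neighborhood
    V_smooth = np.array([V[n].mean() for n in neighbors])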

@staticmethod
def embed_unique_indices(a):
 929    @staticmethod
 930    def embed_unique_indices(a):
 931        """Creates an array of indices, sorted by unique element.
 932
 933        Parameters
 934        ----------
 935        a : :obj:`~numpy.array`
 936            Array of unique elements (Mx1).
 937
 938        Returns
 939        -------
 940        :obj:`~numpy.array`
 941            Array of indices (Mx1).
 942
 943        Notes
 944        -----
 945        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 946        This is a static method.
 947        """
 948
 949        def count_tens(n):
 950            # Count tens
 951            ntens = (n - 1) // 10
 952
 953            while True:
 954                ntens_test = (ntens + n - 1) // 10
 955
 956                if ntens_test == ntens:
 957                    return ntens
 958                else:
 959                    ntens = ntens_test
 960
 961        def arange_exclude_10s(n):
 962            # How many 10s will there be?
 963            ntens = count_tens(n)
 964
 965            # Base array
 966            arr = np.arange(0, n + ntens)
 967
 968            # Exclude 10s
 969            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 970
 971            return arr
 972
 973        # Creates an array of indices, sorted by unique element
 974        idx_sort = np.argsort(a)
 975        idx_unsort = np.argsort(idx_sort)
 976
 977        # Sorts records array so all unique elements are together
 978        sorted_a = a[idx_sort]
 979
 980        # Returns the unique values, the index of the first occurrence,
 981        # and the count for each element
 982        vals, idx_start, count = np.unique(
 983            sorted_a, return_index=True, return_counts=True
 984        )
 985
 986        # Splits the indices into separate arrays
 987        splits = np.split(idx_sort, idx_start[1:])
 988
 989        # Creates unique indices for each split
 990        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 991
 992        # Reorders according to input array
 993        idx_unq = idx_unq[idx_unsort]
 994
 995        # Magnitude of each index
 996        exp = np.log10(
 997            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
 998        )
 999        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1000
1001        # Result
1002        return a + idx_unq / idx_unq_mag

Creates an array of indices, sorted by unique element.

Parameters
  • a (~numpy.array): Array of unique elements (Mx1).
Returns
  • ~numpy.array: Array of indices (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos This is a static method.
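The decimal-embedding trick can be illustrated with a simplified sketch (again mine, not the library's): each duplicate value gets a rank among its copies, and the rank is pushed into the decimal places, so ties become unique while ordering is preserved. This simplification omits the multiples-of-10 exclusion above, which exists precisely because rank 10 would otherwise collide with rank 1 (x.10 == x.1).

    import numpy as np

    def embed_unique_sketch(a):
        # Rank each value among its duplicates, then append the rank as decimals
        seen = {}
        out = np.empty(len(a), dtype=np.float64)
        for i, v in enumerate(a):
            r = seen.get(v, 0)  # how many copies of v seen so far
            seen[v] = r + 1
            mag = 10.0 ** (np.floor(np.log10(max(r, 1))) + 1)
            out[i] = v + r / mag
        return out

    print(embed_unique_sketch(np.array([5, 5, 5, 7])))  # [5.  5.1 5.2 7. ]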

@staticmethod
def roll_up_dataframe(df: pd.DataFrame, sort_by: str, tol: list, relative: list, dims: list):
1004    @staticmethod
1005    def roll_up_dataframe(
1006        df : pd.DataFrame,
1007        sort_by : str,
1008        tol : list,
1009        relative : list,
1010        dims : list
1011    ):
1012        """Subset data by rolling up into apex in appropriate dimensions.
1013        
1014        Parameters
1015        ----------
1016        df : pd.DataFrame
1017            The input data containing "dims" columns and the "sort_by" column.
1018        sort_by : str
1019            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature 
1020            (i.e., the mass feature with the highest value in the sort_by column).
1021        dims : list
1022            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1023        tol : list
1024            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1025            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1026            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1027            If relative is False, the tolerance will be used as is.
1028        relative : list
1029            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1030
1031        Returns
1032        -------
1033        pd.DataFrame
1034            A DataFrame with only the rolled up mass features, with the original index and columns.
1035
1036            
1037        Raises
1038        ------
1039        ValueError
1040            If the input data is not a pandas DataFrame.
1041            If the input data does not have columns for each of the dimensions in "dims".
1042            If the length of "dims", "tol", and "relative" do not match.
1043        """
1044        og_columns = df.columns.copy()
1045
1046        # Unindex the data, but keep the original index
1047        if df.index.name is not None:
1048            og_index = df.index.name
1049        else:
1050            og_index = "index"
1051        df = df.reset_index(drop=False)
1052
1053        # Sort data by sort_by column, and reindex
1054        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1055
1056        # Check that data is a DataFrame and has columns for each of the dims
1057        if not isinstance(df, pd.DataFrame):
1058            raise ValueError("Data must be a pandas DataFrame")
1059        for dim in dims:
1060            if dim not in df.columns:
1061                raise ValueError(f"Data must have a column for {dim}")
1062        if len(dims) != len(tol) or len(dims) != len(relative):
1063            raise ValueError(
1064                "Dimensions, tolerances, and relative flags must be the same length"
1065            )
1066        
1067        # Compute inter-feature distances
1068        distances = None
1069        for i in range(len(dims)):
1070            # Construct k-d tree
1071            values = df[dims[i]].values
1072            tree = KDTree(values.reshape(-1, 1))
1073
1074            max_tol = tol[i]
1075            if relative[i] is True:
1076                # Maximum absolute tolerance
1077                max_tol = tol[i] * values.max()
1078
1079            # Compute sparse distance matrix
1080            # the larger the max_tol, the slower this operation is
1081            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1082
1083            # Only consider forward case, exclude diagonal
1084            sdm = sparse.triu(sdm, k=1)
1085
1086            # Filter relative distances
1087            if relative[i] is True:
1088                # Compute relative distances
1089                rel_dists = sdm.data / values[sdm.row]  # relative to the parent (row) value
1090
1091                # Indices of relative distances less than tolerance
1092                idx = rel_dists <= tol[i]
1093
1094                # Reconstruct sparse distance matrix
1095                sdm = sparse.coo_matrix(
1096                    (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
1097                    shape=(len(values), len(values)),
1098                )
1099
1100            # Cast as binary matrix
1101            sdm.data = np.ones_like(sdm.data)
1102
1103            # Stack distances
1104            if distances is None:
1105                distances = sdm
1106            else:
1107                distances = distances.multiply(sdm)
1108        
1109        # Extract indices of within-tolerance points
1110        distances = distances.tocoo()
1111        pairs = np.stack((distances.row, distances.col), axis=1)
1112        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"])
1113        pairs_df = pairs_df.set_index("parent")
1114
1115        to_drop = []
1116        while not pairs_df.empty:
1117            # Find root_parents and their children
1118            root_parents = np.setdiff1d(
1119                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1120            )
1121            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1122            to_drop = np.append(to_drop, children_of_roots)
1123
1124            # Remove root_children as possible parents from pairs_df for next iteration
1125            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1126            pairs_df = pairs_df.reset_index().set_index("child")
1127            # Remove root_children as possible children from pairs_df for next iteration
1128            pairs_df = pairs_df.drop(index=children_of_roots)
1129
1130            # Prepare for next iteration
1131            pairs_df = pairs_df.reset_index().set_index("parent")
1132
1133        # Drop mass features that are not cluster parents
1134        df_sub = df.drop(index=np.array(to_drop))
1135
1136        # Set index back to og_index and only keep the columns that are in the original dataframe
1137        df_sub = df_sub.set_index(og_index)
1138
1139        # sort the dataframe by the original index
1140        df_sub = df_sub.sort_index()
1141        df_sub = df_sub[og_columns]
1142
1143        return df_sub

Subset data by rolling up into apex in appropriate dimensions.

Parameters
  • df (pd.DataFrame): The input data containing "dims" columns and the "sort_by" column.
  • sort_by (str): The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
  • dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
  • tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance will be multiplied by the maximum value in that dimension. If relative is False, the tolerance will be used as is.
  • relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
Returns
  • pd.DataFrame: A DataFrame with only the rolled up mass features, with the original index and columns.
Raises
  • ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
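A small self-contained sketch of the roll-up idea on toy data (single pass here; the real method iterates so that a feature dropped as a child can never survive as a parent): features within mz tolerance of a higher-persistence feature are culled.

    import numpy as np
    import pandas as pd
    from scipy import sparse
    from scipy.spatial import KDTree

    # Toy features, already sorted by persistence so row 0 is the strongest parent
    df = pd.DataFrame({
        "mz": [100.000, 200.000, 100.005, 100.008],
        "persistence": [50.0, 20.0, 10.0, 5.0],
    })

    # Pairs of features within an absolute mz tolerance of 0.01
    tree = KDTree(df[["mz"]].values)
    sdm = tree.sparse_distance_matrix(tree, 0.01, output_type="coo_matrix")
    sdm = sparse.triu(sdm, k=1)  # forward pairs only: (parent, child)
    pairs = np.stack((sdm.row, sdm.col), axis=1)

    # Single-pass cull: drop every feature that appears as a child
    to_drop = np.unique(pairs[:, 1])
    print(df.drop(index=to_drop))  # rows for mz 100.000 and 200.000 survive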
def sparse_upper_star(self, idx, V):
1145    def sparse_upper_star(self, idx, V):
1146        """Sparse implementation of an upper star filtration.
1147
1148        Parameters
1149        ----------
1150        idx : :obj:`~numpy.array`
1151            Edge indices for each dimension (MxN).
1152        V : :obj:`~numpy.array`
1153            Array of intensity data (Mx1).
1154        Returns
1155        -------
1156        idx : :obj:`~numpy.array`
1157            Index of filtered points (Mx1).
1158        persistence : :obj:`~numpy.array`
1159            Persistence of each filtered point (Mx1).
1160
1161        Notes
1162        -----
1163        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1164        """
1165
1166        # Invert
1167        V = -1 * V.copy().astype(int)
1168
1169        # Embed indices
1170        V = self.embed_unique_indices(V)
1171
1172        # Connectivity matrix
1173        cmat = KDTree(idx)
1174        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1175        cmat.setdiag(1)
1176        cmat = sparse.triu(cmat)
1177
1178        # Pairwise minimums
1179        I, J = cmat.nonzero()
1180        d = np.maximum(V[I], V[J])
1181
1182        # Delete connectivity matrix
1183        cmat_shape = cmat.shape
1184        del cmat
1185
1186        # Sparse distance matrix
1187        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1188
1189        # Delete pairwise mins
1190        del d, I, J
1191
1192        # Persistence homology
1193        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1194
1195        # Bound death values
1196        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1197
1198        # Construct tree to query against
1199        tree = KDTree(V.reshape((-1, 1)))
1200
1201        # Get the indexes of the first nearest neighbor by birth
1202        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1203
1204        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

Sparse implementation of an upper star filtration.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
Returns
  • idx (~numpy.array): Index of filtered points (Mx1).
  • persistence (~numpy.array): Persistence of each filtered point (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
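As a toy illustration of the upper-star idea (my sketch, not library code), take a five-sample signal with two peaks. Negating the signal turns maxima into minima; vertices enter the filtration at their own value, edges at the larger of their two endpoints, and the 0-dimensional persistence diagram then pairs each peak with the saddle where its component merges into a taller one.

    import numpy as np
    from scipy import sparse
    from ripser import ripser

    signal = np.array([1.0, 3.0, 1.0, 5.0, 1.0])
    V = -signal  # upper star: negate so peaks become minima
    n = len(V)

    # Vertices born at their own value; edges weighted by the larger endpoint
    I = np.concatenate((np.arange(n), np.arange(n - 1)))
    J = np.concatenate((np.arange(n), np.arange(1, n)))
    d = np.maximum(V[I], V[J])
    sdm = sparse.coo_matrix((d, (I, J)), shape=(n, n))

    dgm = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
    print(dgm)  # expect [[-3., -1.], [-5., inf]]: peaks of height 3 and 5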

def check_if_grid(self, data):
1206    def check_if_grid(self, data):
1207        """Check if the data are gridded in mz space.
1208
1209        Parameters
1210        ----------
1211        data : DataFrame
1212            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1213
1214        Returns
1215        -------
1216        bool
1217            True if the data is gridded in the mz direction, False otherwise.
1218
1219        Notes
1220        -----
1221        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1222        """
1223        # Calculate the difference between consecutive mz values in a single scan
1224        dat_check = data.copy().reset_index(drop=True)
1225        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1226        mz_diff_min = (
1227            dat_check.groupby("scan")["mz_diff"].min().min()
1228        )  # within each scan, what is the smallest mz difference between consecutive mz values
1229
1230        # Find the mininum mz difference between mz values in the data; regardless of scan
1231        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1232        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1233        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1234
1235        # Get minimum mz_diff between mz values in the data
1236        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1237
1238        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1239        if mz_diff_min_raw < mz_diff_min:
1240            return False
1241        else:
1242            return True

Check if the data are gridded in mz space.

Parameters
  • data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
  • bool: True if the data is gridded in the mz direction, False otherwise.
Notes

This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
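The check boils down to comparing two minima: the smallest mz gap between consecutive points within scans versus the smallest gap across the deduplicated mz axis. A quick illustrative sketch on toy data, with the logic mirrored from the method above:

    import pandas as pd

    def is_gridded(df):
        d = df.copy().reset_index(drop=True)
        d["mz_diff"] = d["mz"].diff().abs()
        within_scan_min = d.groupby("scan")["mz_diff"].min().min()
        unique_mz = d[["mz"]].drop_duplicates().sort_values("mz")
        across_min = unique_mz["mz"].diff().abs().min()
        return not (across_min < within_scan_min)

    # Shared mz axis across scans -> gridded
    gridded = pd.DataFrame({"scan": [1, 1, 1, 2, 2],
                            "mz": [100.00, 100.01, 100.02, 100.00, 100.02]})
    # Scan 2 mz values jittered off scan 1's axis -> not gridded
    jittered = pd.DataFrame({"scan": [1, 1, 2, 2],
                             "mz": [100.00, 100.02, 100.007, 100.027]})
    print(is_gridded(gridded), is_gridded(jittered))  # True False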

def grid_data(self, data, attempts=5):
1244    def grid_data(self, data, attempts=5):
1245        """Grid the data in the mz dimension.
1246
1247        Data must be gridded prior to persistent homology calculations and computing average mass spectrum
1248
1249        Parameters
1250        ----------
1251        data : DataFrame
1252            The input data containing mz, scan, scan_time, and intensity columns.
1253        attempts : int, optional
1254            The number of attempts to grid the data. Default is 5.
1255
1256        Returns
1257        -------
1258        DataFrame
1259            The gridded data with mz, scan, scan_time, and intensity columns.
1260
1261        Raises
1262        ------
1263        ValueError
1264            If gridding fails after the specified number of attempts.
1265        """
1266        attempt_i = 0
1267        while attempt_i < attempts:
1268            attempt_i += 1
1269            data = self._grid_data(data)
1270
1271            if self.check_if_grid(data):
1272                return data
1273        
1274        if not self.check_if_grid(data):
1275            raise ValueError(
1276                "Gridding failed after "
1277                + str(attempt_i)
1278                + " attempts. Please check the data."
1279            )
1280        else:
1281            return data

Grid the data in the mz dimension.

Data must be gridded prior to persistent homology calculations and computing average mass spectrum

Parameters
  • data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
  • attempts (int, optional): The number of attempts to grid the data. Default is 5.
Returns
  • DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
  • ValueError: If gridding fails after the specified number of attempts.
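Internally, the gridding step consolidates the mz axis and then recasts every observed mz to its nearest consolidated value using this module's find_closest helper. For example:

    import numpy as np
    from corems.mass_spectra.calc.lc_calc import find_closest

    grid = np.array([100.000, 100.002, 100.004])  # consolidated mz axis (sorted)
    obs = np.array([100.0005, 100.0031])          # observed mz values (sorted)
    snapped = grid[find_closest(grid, obs)]
    print(snapped)  # [100.    100.004]: each observed mz snaps to the nearest grid value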
def find_mass_features_ph(self, ms_level=1, grid=True):
1361    def find_mass_features_ph(self, ms_level=1, grid=True):
1362        """Find mass features within an LCMSBase object using persistent homology.
1363
1364        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1365
1366        Parameters
1367        ----------
1368        ms_level : int, optional
1369            The MS level to use. Default is 1.
1370        grid : bool, optional
1371            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1372
1373        Raises
1374        ------
1375        ValueError
1376            If no MS level data is found on the object.
1377            If data is not gridded and grid is False.
1378
1379        Returns
1380        -------
1381        None, but assigns the mass_features attribute to the object.
1382
1383        Notes
1384        -----
1385        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1386        """
1387        # Check that ms_level is a key in self._ms_unprocessed
1388        if ms_level not in self._ms_unprocessed.keys():
1389            raise ValueError(
1390                "No MS level "
1391                + str(ms_level)
1392                + " data found; did you instantiate the parser for this MS level?"
1393            )
1394
1395        # Get ms data
1396        data = self._ms_unprocessed[ms_level].copy()
1397
1398        # Drop rows with missing intensity values and reset index
1399        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1400
1401        # Threshold data
1402        dims = ["mz", "scan_time"]
1403        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1404        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1405
1406        # Check if gridded, if not, grid
1407        gridded_mz = self.check_if_grid(data_thres)
1408        if gridded_mz is False:
1409            if grid is False:
1410                raise ValueError(
1411                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1412                )
1413            else:
1414                data_thres = self.grid_data(data_thres)
1415
1416        # Add build factors and add scan_time
1417        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1418        factors = {
1419            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1420            for dim in dims
1421        }  # pd.factorize returns a float64 index; cast to float32
1422
1423        # Build indexes
1424        index = {
1425            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1426            for dim in factors
1427        }
1428
1429        # Smooth data
1430        iterations = self.parameters.lc_ms.ph_smooth_it
1431        smooth_radius = [
1432            self.parameters.lc_ms.ph_smooth_radius_mz,
1433            self.parameters.lc_ms.ph_smooth_radius_scan,
1434        ]  # mz, scan_time smoothing radius (in steps)
1435
1436        index = np.vstack([index[dim] for dim in dims]).T
1437        V = data_thres["intensity"].values
1438        resid = np.inf
1439        for i in range(iterations):
1440            # Previous iteration
1441            V_prev = V.copy()
1442            resid_prev = resid
1443            V = self.sparse_mean_filter(index, V, radius=smooth_radius)
1444
1445            # Calculate residual with previous iteration
1446            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1447
1448            # Evaluate convergence
1449            if i > 0:
1450                # Percent change in residual
1451                test = np.abs(resid - resid_prev) / resid_prev
1452
1453                # Exit criteria
1454                if test <= 0:
1455                    break
1456
1457        # Overwrite values
1458        data_thres["intensity"] = V
1459
1460        # Use persistent homology to find regions of interest
1461        pidx, pers = self.sparse_upper_star(index, V)
1462        pidx = pidx[pers > 1]
1463        pers = pers[pers > 1]
1464
1465        # Get peaks
1466        peaks = data_thres.iloc[pidx, :].reset_index(drop=True)
1467
1468        # Add persistence column
1469        peaks["persistence"] = pers
1470        mass_features = peaks.sort_values(
1471            by="persistence", ascending=False
1472        ).reset_index(drop=True)
1473
1474        # Filter by persistence threshold
1475        persistence_threshold = (
1476            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1477        )
1478        mass_features = mass_features.loc[
1479            mass_features["persistence"] > persistence_threshold, :
1480        ].reset_index(drop=True)
1481
1482        # Rename scan column to apex_scan
1483        mass_features = mass_features.rename(
1484            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1485        )
1486
1487        # Populate mass_features attribute
1488        self.mass_features = {}
1489        for row in mass_features.itertuples():
1490            row_dict = mass_features.iloc[row.Index].to_dict()
1491            lcms_feature = LCMSMassFeature(self, **row_dict)
1492            self.mass_features[lcms_feature.id] = lcms_feature
1493
1494        if self.parameters.lc_ms.verbose_processing:
1495            print("Found " + str(len(mass_features)) + " initial mass features")

Find mass features within an LCMSBase object using persistent homology.

Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
  • None, but assigns the mass_features attribute to the object.
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
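A hypothetical end-to-end call sequence (object construction omitted; lcms stands for any LCMSBase-derived object whose parser has populated _ms_unprocessed, and the parameter names are those referenced in the source above):

    # Thresholds are relative to the maximum MS1 intensity in the run
    lcms.parameters.lc_ms.ph_inten_min_rel = 0.001
    lcms.parameters.lc_ms.ph_persis_min_rel = 0.001

    lcms.find_mass_features_ph(ms_level=1, grid=True)
    lcms.cluster_mass_features(drop_children=True, sort_by="persistence")
    print(len(lcms.mass_features))  # number of surviving mass features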

def find_mass_features_ph_centroid(self, ms_level=1):
1497    def find_mass_features_ph_centroid(self, ms_level=1):
1498        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1499        
1500        Parameters
1501        ----------
1502        ms_level : int, optional
1503            The MS level to use. Default is 1.
1504        
1505        Raises
1506        ------
1507        ValueError
1508            If no MS level data is found on the object.
1509
1510        Returns
1511        -------
1512        None, but assigns the mass_features attribute to the object.        
1513        """
1514        # Check that ms_level is a key in self._ms_unprocessed
1515        if ms_level not in self._ms_unprocessed.keys():
1516            raise ValueError(
1517                "No MS level "
1518                + str(ms_level)
1519                + " data found; did you instantiate the parser for this MS level?"
1520            )
1521        
1522        # Get ms data
1523        data = self._ms_unprocessed[ms_level].copy()
1524
1525        # Drop rows with missing intensity values and reset index
1526        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1527        
1528        # Threshold data
1529        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1530        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1531        data_thres['persistence'] = data_thres['intensity']
1532
1533        # Use this as the starting point for the mass features, adding scan_time
1534        mf_df = data_thres
1535        mf_df = mf_df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1536
1537        # Define tolerances and dimensions for rolling up
1538        tol = [
1539            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1540            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1541        ]
1542        relative = [True, False]    
1543        dims = ["mz", "scan_time"]
1544
1545        mf_df = self.roll_up_dataframe(
1546            df=mf_df,
1547            sort_by="persistence",
1548            dims=dims,
1549            tol=tol,
1550            relative=relative
1551        )
1552
1553        # Rename scan column to apex_scan
1554        mass_features = mf_df.rename(
1555            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1556        )
1557        # Sort by persistence and reset index
1558        mass_features = mass_features.sort_values(
1559            by="persistence", ascending=False
1560        ).reset_index(drop=True)
1561
1562        # Populate mass_features attribute
1563        self.mass_features = {}
1564        for row in mass_features.itertuples():
1565            row_dict = mass_features.iloc[row.Index].to_dict()
1566            lcms_feature = LCMSMassFeature(self, **row_dict)
1567            self.mass_features[lcms_feature.id] = lcms_feature
1568
1569        if self.parameters.lc_ms.verbose_processing:
1570            print("Found " + str(len(mass_features)) + " initial mass features")

Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no MS level data is found on the object.
Returns
  • None, but assigns the mass_features attribute to the object.
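For centroided data, the equivalent hypothetical entry point on the same lcms object is a single call; note that here each centroid's intensity stands in for its persistence, per the source above:

    lcms.find_mass_features_ph_centroid(ms_level=1)  # persistence := intensity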
def cluster_mass_features(self, drop_children=True, sort_by='persistence'):
1572    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1573        """Cluster mass features
1574
1575        Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.
1576
1577        Parameters
1578        ----------
1579        drop_children : bool, optional
1580            Whether to drop the mass features that are not cluster parents. Default is True.
1581        sort_by : str, optional
1582            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1583
1584        Raises
1585        ------
1586        ValueError
1587            If no mass features are found.
1588            If too many mass features are found.
1589
1590        Returns
1591        -------
1592        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1593        """
1594        verbose = self.parameters.lc_ms.verbose_processing
1595
1596        if self.mass_features is None:
1597            raise ValueError("No mass features found, run find_mass_features() first")
1598        if len(self.mass_features) > 400000:
1599            raise ValueError(
1600                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1601            )
1602        dims = ["mz", "scan_time"]
1603        mf_df_og = self.mass_features_to_df()
1604        mf_df = mf_df_og.copy()
1605
1606        tol = [
1607            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1608            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1609        ]  # mz tolerance is relative; scan_time tolerance is in minutes
1610        relative = [True, False]
1611
1612        # Roll up mass features based on their proximity in the declared dimensions
1613        mf_df_new = self.roll_up_dataframe(
1614            df=mf_df,
1615            sort_by=sort_by,
1616            dims=dims,
1617            tol=tol,
1618            relative=relative
1619        )
1620
1621        mf_df["cluster_parent"] = np.where(
1622            np.isin(mf_df.index, mf_df_new.index), True, False
1623        )
1624
1625        # get mass feature ids of features that are not cluster parents
1626        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1627        if drop_children is True:
1628            # Drop mass features that are not cluster parents from self
1629            self.mass_features = {
1630                k: v
1631                for k, v in self.mass_features.items()
1632                if k not in cluster_daughters
1633            }
1634        else:
1635            return cluster_daughters

Cluster mass features

Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.

Parameters
  • drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
  • sort_by (str, optional): The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
  • ValueError: If no mass features are found. If too many mass features are found.
Returns
  • None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
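To inspect which features would be rolled up without removing anything, the same hypothetical lcms object can be asked for the child ids instead:

    # Returns ids of features that are NOT cluster parents; nothing is dropped
    daughters = lcms.cluster_mass_features(drop_children=False, sort_by="persistence")
    print(len(daughters))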