corems.mass_spectra.calc.lc_calc

   1import numpy as np
   2import pandas as pd
   3from ripser import ripser
   4from scipy import sparse
   5from scipy.spatial import KDTree
   6
   7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
   8from corems.mass_spectra.calc import SignalProcessing as sp
   9from corems.mass_spectra.factory.chromat_data import EIC_Data
  10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  11
  12
  13def find_closest(A, target):
  14    """Find the index of closest value in A to each value in target.
  15
  16    Parameters
  17    ----------
  18    A : :obj:`~numpy.array`
  19        The array to search (blueprint). A must be sorted.
  20    target : :obj:`~numpy.array`
  21        The array of values to search for. target must be sorted.
  22
  23    Returns
  24    -------
  25    :obj:`~numpy.array`
  26        The indices of the closest values in A to each value in target.
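
        Examples
        --------
        A minimal sketch with small, sorted toy arrays:

        >>> import numpy as np
        >>> find_closest(np.array([1.0, 2.0, 4.0, 8.0]), np.array([1.9, 5.0]))
        array([1, 2])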
  27    """
  28    idx = A.searchsorted(target)
  29    idx = np.clip(idx, 1, len(A) - 1)
  30    left = A[idx - 1]
  31    right = A[idx]
  32    idx -= target - left < right - target
  33    return idx
  34
  35
  36class LCCalculations:
  37    """Methods for performing LC calculations on mass spectra data.
  38
  39    Notes
  40    -----
  41    This class is intended to be used as a mixin for the LCMSBase class.
  42
  43    Methods
  44    -------
  45    * get_max_eic(eic_data).
  46        Returns the maximum EIC value from the given EIC data. A static method.
  47    * smooth_tic(tic).
  48        Smooths the TIC data using the specified smoothing method and settings.
  49    * eic_centroid_detector(rt, eic, max_eic).
  50        Performs EIC centroid detection on the given EIC data.
  51    * find_nearest_scan(rt).
  52        Finds the nearest scan to the given retention time.
  53    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None).
  54        Returns an averaged mass spectrum object.
  55    * find_mass_features(ms_level=1, grid=True).
  56        Find regions of interest for a given MS level (default is MS1).
  57    * integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1).
  58        Integrate mass features of interest and extract EICs.
  59    * find_c13_mass_features().
  60        Evaluate mass features and mark likely C13 isotopes.
  61    * deconvolute_ms1_mass_features().
  62        Deconvolute mass features' ms1 mass spectra.
  63    """
  64
  65    @staticmethod
  66    def get_max_eic(eic_data: dict):
  67        """Returns the maximum EIC value from the given EIC data.
  68
  69        Notes
  70        -----
  71        This is a static method.
  72
  73        Parameters
  74        ----------
  75        eic_data : dict
  76            A dictionary containing EIC data.
  77
  78        Returns
  79        -------
  80        float
  81            The maximum EIC value.
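
            Examples
            --------
            A toy illustration with a minimal EIC dictionary of the expected shape:

            >>> LCCalculations.get_max_eic({100.0: {"EIC": [1, 5, 2]}, 200.0: {"EIC": [3, 4]}})
            5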
  82        """
  83        max_eic = 0
  84        for eic in eic_data.values():
  85            ind_max_eic = max(eic.get("EIC"))
  86            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  87
  88        return max_eic
  89
  90    def smooth_tic(self, tic):
  91        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  92
  93        Parameters
  94        ----------
  95        tic : numpy.ndarray
  96            The TIC (or EIC) data to be smoothed.
  97
  98        Returns
  99        -------
 100        numpy.ndarray
 101            The smoothed TIC data.
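
            Examples
            --------
            A hedged usage sketch, assuming ``lcms`` is an LCMSBase-derived object and ``tic`` is a 1D numpy array of intensities:

            >>> smoothed = lcms.smooth_tic(tic)  # doctest: +SKIP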
 102        """
 103        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 104
 105        pol_order = self.parameters.lc_ms.savgol_pol_order
 106
 107        window_len = self.parameters.lc_ms.smooth_window
 108
 109        window = self.parameters.lc_ms.smooth_method
 110
 111        return sp.smooth_signal(
 112            tic, window_len, window, pol_order, implemented_smooth_method
 113        )
 114
 115    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 116        """Performs EIC centroid detection on the given EIC data.
 117
 118        Parameters
 119        ----------
 120        rt : numpy.ndarray
 121            The retention time data.
 122        eic : numpy.ndarray
 123            The EIC data.
 124        max_eic : float
 125            The maximum EIC value.
 126        apex_indexes : list, optional
 127            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 128
 129        Returns
 130        -------
 131        generator
 132            The indexes of the left, apex, and right limits of each detected peak.
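
            Examples
            --------
            A hedged sketch, assuming ``lcms`` is an LCMSBase-derived object and ``eic`` is an EIC_Data object with a smoothed EIC:

            >>> limits = list(lcms.eic_centroid_detector(eic.time, eic.eic_smoothed, eic.eic_smoothed.max()))  # doctest: +SKIP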
 133        """
 134
 135        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 136
 137        max_height = self.parameters.lc_ms.peak_height_max_percent
 138
 139        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 140
 141        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 142
 143        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 144
 145        include_indexes = sp.peak_picking_first_derivative(
 146            domain=rt,
 147            signal=eic,
 148            max_height=max_height,
 149            max_prominence=max_prominence,
 150            max_signal=max_eic,
 151            min_peak_datapoints=min_peak_datapoints,
 152            peak_derivative_threshold=peak_derivative_threshold,
 153            signal_threshold=signal_threshold,
 154            correct_baseline=False,
 155            plot_res=False,
 156            apex_indexes=apex_indexes,
 157        )
 158
 159        return include_indexes
 160
 161    def find_nearest_scan(self, rt):
 162        """Finds the nearest scan to the given retention time.
 163
 164        Parameters
 165        ----------
 166        rt : float
 167            The retention time (in minutes) to find the nearest scan for.
 168
 169        Returns
 170        -------
 171        int
 172            The scan number of the nearest scan.
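
            Examples
            --------
            A hedged sketch, assuming ``lcms`` is an LCMSBase-derived object with retention times and scan numbers populated; the call returns the scan number closest to 5.2 minutes:

            >>> scan = lcms.find_nearest_scan(5.2)  # doctest: +SKIP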
 173        """
 174        array_rt = np.array(self.retention_time)
 175
 176        scan_index = (np.abs(array_rt - rt)).argmin()
 177
 178        real_scan = self.scans_number[scan_index]
 179
 180        return real_scan
 181
 182    def add_peak_metrics(self):
 183        """Add peak metrics to the mass features.
 184
 185        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
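
            Examples
            --------
            A hedged sketch; the mass features must already have EIC data from integrate_mass_features():

            >>> lcms.add_peak_metrics()  # doctest: +SKIP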
 186        """
 187        # Check that at least some mass features have eic data
 188        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 189            raise ValueError(
 190                "No mass features have EIC data. Run integrate_mass_features first."
 191            )
 192
 193        for mass_feature in self.mass_features.values():
 194            # Check if the mass feature has been integrated
 195            if mass_feature._eic_data is not None:
 196                # Calculate peak metrics
 197                mass_feature.calc_half_height_width()
 198                mass_feature.calc_tailing_factor()
 199                mass_feature.calc_dispersity_index()
 200
 201    def get_average_mass_spectrum(
 202        self,
 203        scan_list,
 204        apex_scan,
 205        spectrum_mode="profile",
 206        ms_level=1,
 207        auto_process=True,
 208        use_parser=False,
 209        perform_checks=True,
 210        polarity=None,
 211        ms_params=None,
 212    ):
 213        """Returns an averaged mass spectrum object
 214
 215        Parameters
 216        ----------
 217        scan_list : list
 218            List of scan numbers to average.
 219        apex_scan : int
 220            Number of the apex scan
 221        spectrum_mode : str, optional
 222            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
 223        ms_level : int, optional
 224            The MS level to use. Defaults to 1.
 225        auto_process : bool, optional
 226            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 227        use_parser : bool, optional
 228            If True, the mass spectra will be obtained from the parser. Defaults to False.
 229        perform_checks : bool, optional
 230            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are present in the ms_unprocessed dictionary, which must also be indexed on scan.
 231        polarity : int, optional
 232            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
 233        ms_params : MSParameters, optional
 234            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 235
 236        Returns
 237        -------
 238        MassSpectrumProfile
 239            The averaged mass spectrum object.
 240
 241        Raises
 242        ------
 243        ValueError
 244            If the spectrum mode is not "profile".
 245            If the MS level is not found in the unprocessed mass spectra dictionary.
 246            If not all scan numbers are found in the unprocessed mass spectra dictionary.
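
            Examples
            --------
            A hedged sketch, assuming ``lcms`` is an LCMSBase-derived object with unprocessed MS1 data and ``scans`` is a sorted list of profile-mode MS1 scan numbers:

            >>> ms_avg = lcms.get_average_mass_spectrum(scans, apex_scan=scans[len(scans) // 2])  # doctest: +SKIP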
 247        """
 248        if perform_checks:
 249            if spectrum_mode != "profile":
 250                raise ValueError("Averaging only supported for profile mode")
 251
 252        if polarity is None:
 253            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 254            if self.polarity == "negative":
 255                polarity = -1
 256            elif self.polarity == "positive":
 257                polarity = 1
 258            else:
 259                raise ValueError(
 260                    "Polarity not set for dataset, must be set to either 'positive' or 'negative'"
 261                )
 262
 263        # if not using_parser, check that scan numbers are in _ms_unprocessed
 264        if not use_parser:
 265            if perform_checks:
 266                # Set index to scan for faster lookup
 267                ms_df = (
 268                    self._ms_unprocessed[ms_level]
 269                    .copy()
 270                    .set_index("scan", drop=False)
 271                    .sort_index()
 272                )
 273                # Check that all scan numbers are in the ms_df before indexing
 274                if not all(np.isin(scan_list, ms_df.index)):
 275                    raise ValueError(
 276                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
 277                    )
 278                my_ms_df = ms_df.loc[scan_list]
 279            else:
 280                my_ms_df = (
 281                    pd.DataFrame({"scan": scan_list})
 282                    .set_index("scan")
 283                    .join(self._ms_unprocessed[ms_level], how="left")
 284                )
 285
 286        if use_parser:
 287            ms_list = [
 288                self.spectra_parser.get_mass_spectrum_from_scan(
 289                    x, spectrum_mode=spectrum_mode, auto_process=False
 290                )
 291                for x in scan_list
 292            ]
 293            ms_mz = [x._mz_exp for x in ms_list]
 294            ms_int = [x._abundance for x in ms_list]
 295            my_ms_df = []
 296            for i in np.arange(len(ms_mz)):
 297                my_ms_df.append(
 298                    pd.DataFrame(
 299                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 300                    )
 301                )
 302            my_ms_df = pd.concat(my_ms_df)
 303
 304        if not self.check_if_grid(my_ms_df):
 305            my_ms_df = self.grid_data(my_ms_df)
 306
 307        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 308
 309        ms = ms_from_array_profile(
 310            my_ms_ave.mz,
 311            my_ms_ave.intensity,
 312            self.file_location,
 313            polarity=polarity,
 314            auto_process=False,
 315        )
 316
 317        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 318        if ms is not None:
 319            if ms_params is not None:
 320                ms.parameters = ms_params
 321            ms.scan_number = apex_scan
 322            if auto_process:
 323                ms.process_mass_spec()
 324        return ms
 325
 326    def find_mass_features(self, ms_level=1, grid=True):
 327        """Find mass features within an LCMSBase object
 328
 329        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
 330
 331        Parameters
 332        ----------
 333        ms_level : int, optional
 334            The MS level to use for peak picking. Default is 1.
 335        grid : bool, optional
 336            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
 337            used for persistent homology peak picking for profile data only. Default is True.
 338
 339        Raises
 340        ------
 341        ValueError
 342            If no MS level data is found on the object.
 343            If persistent homology peak picking is attempted on non-profile mode data.
 344            If data is not gridded and grid is False.
 345            If peak picking method is not implemented.
 346
 347        Returns
 348        -------
 349        None, but assigns the mass_features and eics attributes to the object.
 350
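            Examples
            --------
            A hedged sketch; the peak picking method is read from the parameters ("persistent homology" requires profile data, "centroided_persistent_homology" requires centroid data):

            >>> lcms.parameters.lc_ms.peak_picking_method = "persistent homology"  # doctest: +SKIP
            >>> lcms.find_mass_features(ms_level=1)  # doctest: +SKIP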
 351        """
 352        pp_method = self.parameters.lc_ms.peak_picking_method
 353
 354        if pp_method == "persistent homology":
 355            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 356            if all(msx_scan_df["ms_format"] == "profile"):
 357                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 358            else:
 359                raise ValueError(
 360                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 361                        ms_level
 362                    )
 363                )
 364        elif pp_method == "centroided_persistent_homology":
 365            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 366            if all(msx_scan_df["ms_format"] == "centroid"):
 367                self.find_mass_features_ph_centroid(ms_level=ms_level)
 368            else:
 369                raise ValueError(
 370                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
 371                        ms_level
 372                    )
 373                )
 374        else:
 375            raise ValueError("Peak picking method not implemented")
 376
 377    def integrate_mass_features(
 378        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 379    ):
 380        """Integrate mass features and extract EICs.
 381
 382        Populates the eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 383
 384        Parameters
 385        ----------
 386        drop_if_fail : bool, optional
 387            Whether to drop mass features if the EIC limit calculations fail.
 388            Default is True.
 389        drop_duplicates : bool, optional
 390            Whether to drop mass features that appear to be duplicates
 391            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 392            Default is True.
 393        ms_level : int, optional
 394            The MS level to use. Default is 1.
 395
 396        Raises
 397        ------
 398        ValueError
 399            If no mass features are found.
 400            If no MS level data is found for the given MS level (either in data or in the scan data)
 401
 402        Returns
 403        -------
 404        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 405
 406        Notes
 407        -----
 408        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
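
            Examples
            --------
            A hedged sketch of the expected call order (mass features and their associated MS1 spectra must already exist):

            >>> lcms.find_mass_features()  # doctest: +SKIP
            >>> lcms.add_associated_ms1()  # doctest: +SKIP
            >>> lcms.integrate_mass_features(drop_if_fail=True, ms_level=1)  # doctest: +SKIP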
 409        """
 410        # Check if there is data
 411        if ms_level in self._ms_unprocessed.keys():
 412            raw_data = self._ms_unprocessed[ms_level].copy()
 413        else:
 414            raise ValueError("No MS level " + str(ms_level) + " data found")
 415        if self.mass_features is not None:
 416            mf_df = self.mass_features_to_df().copy()
 417        else:
 418            raise ValueError(
 419                "No mass features found, did you run find_mass_features() first?"
 420            )
 421        # Check if mass_spectrum exists on each mass feature
 422        if not all(
 423            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
 424        ):
 425            raise ValueError(
 426                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
 427            )
 428
 429        # Subset scan data to only include correct ms_level
 430        scan_df_sub = self.scan_df[
 431            self.scan_df["ms_level"] == int(ms_level)
 432        ].reset_index(drop=True)
 433        if scan_df_sub.empty:
 434            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
 435        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 436
 437        mzs_to_extract = np.unique(mf_df["mz"].values)
 438        mzs_to_extract.sort()
 439
 440        # Pre-sort raw_data by mz for faster filtering
 441        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
 442        raw_data_mz = raw_data_sorted["mz"].values
 443
 444        # Get EICs for each unique mz in mass features list
 445        for mz in mzs_to_extract:
 446            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 447            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 448
 449            # Use binary search for faster mz range filtering
 450            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
 451            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
 452            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
 453
 454            raw_data_sub = (
 455                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 456            )
 457            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 458            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 459            myEIC = EIC_Data(
 460                scans=raw_data_sub["scan"].values,
 461                time=raw_data_sub["scan_time"].values,
 462                eic=raw_data_sub["intensity"].values,
 463            )
 464            # Smooth EIC
 465            smoothed_eic = self.smooth_tic(myEIC.eic)
 466            smoothed_eic[smoothed_eic < 0] = 0
 467            myEIC.eic_smoothed = smoothed_eic
 468            self.eics[mz] = myEIC
 469
 470        # Get limits of mass features using EIC centroid detector and integrate
 471        mf_df["area"] = np.nan
 472        for idx, mass_feature in mf_df.iterrows():
 473            mz = mass_feature.mz
 474            apex_scan = mass_feature.apex_scan
 475
 476            # Pull EIC data and find apex scan index
 477            myEIC = self.eics[mz]
 478            self.mass_features[idx]._eic_data = myEIC
 479            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 480
 481            # Find left and right limits of peak using EIC centroid detector, add to EICData
 482            centroid_eics = self.eic_centroid_detector(
 483                myEIC.time,
 484                myEIC.eic_smoothed,
 485                mass_feature.intensity * 1.1,
 486                apex_indexes=[int(apex_index)],
 487            )
 488            l_a_r_scan_idx = [i for i in centroid_eics]
 489            if len(l_a_r_scan_idx) > 0:
 490                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
 491                # Find the number of consecutive non-zero values in the EIC segment
 492                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
 493                # Find the longest run of consecutive True values
 494                if np.any(mask):
 495                    # Find indices where mask changes value
 496                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
 497                    starts = np.where(diff == 1)[0]
 498                    ends = np.where(diff == -1)[0]
 499                    consecutive_scans = (ends - starts).max()
 500                else:
 501                    consecutive_scans = 0
 502                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
 503                    self.mass_features.pop(idx)
 504                    continue
 505                # Add start and final scan to mass_features and EICData
 506                left_scan, right_scan = (
 507                    myEIC.scans[l_a_r_scan_idx[0][0]],
 508                    myEIC.scans[l_a_r_scan_idx[0][2]],
 509                )
 510                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 511                myEIC.apexes = myEIC.apexes + mf_scan_apex
 512                self.mass_features[idx].start_scan = left_scan
 513                self.mass_features[idx].final_scan = right_scan
 514
 515                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 516                area = np.trapz(
 517                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 518                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 519                )
 520                mf_df.at[idx, "area"] = area
 521                myEIC.areas = myEIC.areas + [area]
 522                self.eics[mz] = myEIC
 523                self.mass_features[idx]._area = area
 524            else:
 525                if drop_if_fail is True:
 526                    self.mass_features.pop(idx)
 527
 528        if drop_duplicates:
 529            # Prepare mass feature dataframe
 530            mf_df = self.mass_features_to_df()
 531
 532            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
 533            # Keep the first mass feature (highest persistence)
 534            for idx, mass_feature in mf_df.iterrows():
 535                mz = mass_feature.mz
 536                apex_scan = mass_feature.apex_scan
 537
 538                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 539                mf_df_sub = mf_df[
 540                    mf_df["mz_diff_ppm"]
 541                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 542                    * 10**6
 543                ].copy()
 544
 545                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 546                for idx2, mass_feature2 in mf_df_sub.iterrows():
 547                    if idx2 != idx:
 548                        if (
 549                            mass_feature2.start_scan >= mass_feature.start_scan
 550                            and mass_feature2.final_scan <= mass_feature.final_scan
 551                        ):
 552                            if idx2 in self.mass_features.keys():
 553                                self.mass_features.pop(idx2)
 554
 555    def find_c13_mass_features(self):
 556        """Mark likely C13 isotopes and connect them to their monoisotopic mass features.
 557
 558        Returns
 559        -------
 560        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 561
 562        Raises
 563        ------
 564        ValueError
 565            If no mass features are found.
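
            Examples
            --------
            A hedged sketch; run after mass features have been found and integrated:

            >>> lcms.find_c13_mass_features()  # doctest: +SKIP
            >>> lcms.mass_features[0].isotopologue_type  # doctest: +SKIP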
 566        """
 567        verbose = self.parameters.lc_ms.verbose_processing
 568        if verbose:
 569            print("evaluating mass features for C13 isotopes")
 570        if self.mass_features is None:
 571            raise ValueError("No mass features found, run find_mass_features() first")
 572
 573        # Data prep for sparse distance matrix
 574        dims = ["mz", "scan_time"]
 575        mf_df = self.mass_features_to_df().copy()
 576        # Drop mass features that have no area (these are likely to be noise)
 577        mf_df = mf_df[mf_df["area"].notnull()]
 578        mf_df["mf_id"] = mf_df.index.values
 579        dims = ["mz", "scan_time"]
 580
 581        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 582        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 583
 584        mz_diff = 1.003355  # C13-C12 mass difference
 585        tol = [
 586            mf_df["mz"].median()
 587            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 588            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
 589        ]  # mz, in relative; scan_time in minutes
 590
 591        # Compute inter-feature distances
 592        distances = None
 593        for i in range(len(dims)):
 594            # Construct k-d tree
 595            values = mf_df[dims[i]].values
 596            tree = KDTree(values.reshape(-1, 1))
 597
 598            max_tol = tol[i]
 599            if dims[i] == "mz":
 600                # Maximum absolute tolerance
 601                max_tol = mz_diff + tol[i]
 602
 603            # Compute sparse distance matrix
 604            # the larger the max_tol, the slower this operation is
 605            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 606
 607            # Only consider forward case, exclude diagonal
 608            sdm = sparse.triu(sdm, k=1)
 609
 610            if dims[i] == "mz":
 611                min_tol = mz_diff - tol[i]
 612                # Get only the ones that are above the min tol
 613                idx = sdm.data > min_tol
 614
 615                # Reconstruct sparse distance matrix
 616                sdm = sparse.coo_matrix(
 617                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 618                    shape=(len(values), len(values)),
 619                )
 620
 621            # Cast as binary matrix
 622            sdm.data = np.ones_like(sdm.data)
 623
 624            # Stack distances
 625            if distances is None:
 626                distances = sdm
 627            else:
 628                distances = distances.multiply(sdm)
 629
 630        # Extract indices of within-tolerance points
 631        distances = distances.tocoo()
 632        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 633
 634        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 635        pairs_mf = pairs.copy()
 636        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 637        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 638
 639        # Connect monoisotopic masses with isotopologues within mass_features
 640        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 641        for mono in monos:
 642            self.mass_features[mono].monoisotopic_mf_id = mono
 643        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 644        while not pairs_iso_df.empty:
 645            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 646            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 647            for iso in m1_isos:
 648                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 649                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 650                if len(parent) > 1:
 651                    # Choose the parent that is closest in time to the isotopologue
 652                    parent_time = [self.mass_features[p].retention_time for p in parent]
 653                    time_diff = [
 654                        np.abs(self.mass_features[iso].retention_time - x)
 655                        for x in parent_time
 656                    ]
 657                    parent = parent[np.argmin(time_diff)]
 658                else:
 659                    parent = parent[0]
 660                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 661                    parent
 662                ].monoisotopic_mf_id
 663                if self.mass_features[iso].monoisotopic_mf_id is not None:
 664                    mass_diff = (
 665                        self.mass_features[iso].mz
 666                        - self.mass_features[
 667                            self.mass_features[iso].monoisotopic_mf_id
 668                        ].mz
 669                    )
 670                    self.mass_features[iso].isotopologue_type = "13C" + str(
 671                        int(round(mass_diff, 0))
 672                    )
 673
 674            # Drop the mono and iso from the pairs_iso_df
 675            pairs_iso_df = pairs_iso_df.drop(
 676                index=monos, errors="ignore"
 677            )  # Drop pairs where the parent is a child that is a child of a root
 678            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 679            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 680
 681            if not pairs_iso_df.empty:
 682                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 683                monos = np.setdiff1d(
 684                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 685                )
 686        if verbose:
 687            # Report fraction of compounds annotated with isotopes
 688            mf_df["c13_flag"] = np.where(
 689                np.logical_or(
 690                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 691                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 692                ),
 693                1,
 694                0,
 695            )
 696            print(
 697                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 698                + " of mass features have or are C13 isotopes"
 699            )
 700
 701    def deconvolute_ms1_mass_features(self):
 702        """Deconvolute MS1 mass features
 703
 704        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum over the EIC of the mass feature.
 705
 706        Parameters
 707        ----------
 708        None
 709
 710        Returns
 711        -------
 712        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 713        and associated_mass_features_deconvoluted attributes to the mass features in the
 714        mass_features attribute of the LCMSBase object.
 715
 716        Raises
 717        ------
 718        ValueError
 719            If no mass features are found, must run find_mass_features() first.
 720            If no EICs are found, did you run integrate_mass_features() first?
 721
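            Examples
            --------
            A hedged sketch; requires mass features, their EICs, and unprocessed MS1 data:

            >>> lcms.deconvolute_ms1_mass_features()  # doctest: +SKIP
            >>> lcms.mass_features[0].associated_mass_features_deconvoluted  # doctest: +SKIP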
 722        """
 723        # Checks for set mass_features and eics
 724        if self.mass_features is None:
 725            raise ValueError(
 726                "No mass features found, did you run find_mass_features() first?"
 727            )
 728
 729        if self.eics == {}:
 730            raise ValueError(
 731                "No EICs found, did you run integrate_mass_features() first?"
 732            )
 733
 734        if 1 not in self._ms_unprocessed.keys():
 735            raise ValueError("No unprocessed MS1 spectra found.")
 736
 737        # Prep ms1 data
 738        ms1_data = self._ms_unprocessed[1].copy()
 739        ms1_data = ms1_data.set_index("scan")
 740
 741        # Prep mass feature summary
 742        mass_feature_df = self.mass_features_to_df()
 743
 744        # Loop through each mass feature
 745        for mf_id, mass_feature in self.mass_features.items():
 746            # Check that the mass_feature.mz attribute matches its associated ms1 peak mz; skip if not
 747            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 748                continue
 749
 750            # Get the left and right limits of the EIC of the mass feature
 751            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 752
 753            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
 754            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 755            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 756
 757            # Get the centroided masses of the mass feature
 758            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 759
 760            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 761            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 762                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 763            ]
 764
 765            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
 766            ms1_data_sub["mz_diff_rel"] = (
 767                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 768                / ms1_data_sub["mz"]
 769            )
 770            ms1_data_sub = ms1_data_sub[
 771                ms1_data_sub["mz_diff_rel"]
 772                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 773            ].reset_index(drop=True)
 774
 775            # Group by mass_feature_mz and scan and sum intensity
 776            ms1_data_sub_group = (
 777                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 778                .sum()
 779                .reset_index()
 780            )
 781
 782            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 783            corr = (
 784                ms1_data_sub_group.pivot(
 785                    index="scan", columns="mass_feature_mz", values="intensity"
 786                )
 787                .fillna(0)
 788                .corr()
 789            )
 790
 791            # Subset the correlation matrix to only include the masses of the mass feature and those above the minimum deconvolution correlation
 792            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 793
 794            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
 795            try:
 796                corr_subset = corr.loc[mass_feature.mz,]
 797            except KeyError:
 798                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
 799                continue
 800
 801            corr_subset = corr_subset[corr_subset > decon_corr_min]
 802
 803            # Get the masses from the mass spectrum that are the result of the deconvolution
 804            mzs_decon = corr_subset.index.values
 805
 806            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 807            mzs_decon_idx = [
 808                id
 809                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 810                if mz in mzs_decon
 811            ]
 812            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 813
 814            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 815            if (
 816                mass_feature.ms1_peak.abundance
 817                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 818            ):
 819                mass_feature.mass_spectrum_deconvoluted_parent = True
 820            else:
 821                mass_feature.mass_spectrum_deconvoluted_parent = False
 822
 823            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 824            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 825            mass_feature_df_sub = mass_feature_df[
 826                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 827                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 828            ].copy()
 829            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 830            mass_feature_df_sub["mz_diff_ppm"] = [
 831                np.abs(mzs_decon - mz).min() / mz * 10**6
 832                for mz in mass_feature_df_sub["mz"]
 833            ]
 834            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
 835            mfs_associated_decon = mass_feature_df_sub[
 836                mass_feature_df_sub["mz_diff_ppm"]
 837                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 838            ].index.values
 839
 840            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
 841
 842
 843class PHCalculations:
 844    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 845
 846    Notes
 847    -----
 848    This class is intended to be used as a mixin for the LCMSBase class.
 849
 850    Methods
 851    -------
 852    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 853        Sparse implementation of a mean filter.
 854    * embed_unique_indices(a).
 855        Creates an array of indices, sorted by unique element.
 856    * sparse_upper_star(idx, V).
 857        Sparse implementation of an upper star filtration.
 858    * check_if_grid(data).
 859        Check if the data is gridded in mz space.
 860    * grid_data(data).
 861        Grid the data in the mz dimension.
 862    * find_mass_features_ph(ms_level=1, grid=True).
 863        Find mass features within an LCMSBase object using persistent homology.
 864    * cluster_mass_features(drop_children=True).
 865        Cluster regions of interest.
 866    """
 867
 868    @staticmethod
 869    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 870        """Sparse implementation of a mean filter.
 871
 872        Parameters
 873        ----------
 874        idx : :obj:`~numpy.array`
 875            Edge indices for each dimension (MxN).
 876        V : :obj:`~numpy.array`
 877            Array of intensity data (Mx1).
 878        radius : float or list
 879            Radius of the sparse filter in each dimension. Values less than
 880            one indicate no connectivity in that dimension.
 881
 882        Returns
 883        -------
 884        :obj:`~numpy.array`
 885            Filtered intensities (Mx1).
 886
 887        Notes
 888        -----
 889        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 890        This is a static method.
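
            Examples
            --------
            A small sketch on a toy three-point grid, with values chosen for easy hand-checking:

            >>> import numpy as np
            >>> idx = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 2]])
            >>> V = np.array([1.0, 10.0, 1.0])
            >>> PHCalculations.sparse_mean_filter(idx, V, radius=[0, 1, 1]).tolist()
            [5.5, 4.0, 5.5]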
 891        """
 892
 893        # Copy indices
 894        idx = idx.copy().astype(V.dtype)
 895
 896        # Scale
 897        for i, r in enumerate(radius):
 898            # Increase inter-index distance
 899            if r < 1:
 900                idx[:, i] *= 2
 901
 902            # Do nothing
 903            elif r == 1:
 904                pass
 905
 906            # Decrease inter-index distance
 907            else:
 908                idx[:, i] /= r
 909
 910        # Connectivity matrix
 911        cmat = KDTree(idx)
 912        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 913        cmat.setdiag(1)
 914
 915        # Pair indices
 916        I, J = cmat.nonzero()
 917
 918        # Delete cmat
 919        cmat_shape = cmat.shape
 920        del cmat
 921
 922        # Sum over columns
 923        V_sum = sparse.bsr_matrix(
 924            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 925        ).diagonal(0)
 926
 927        # Count over columns
 928        V_count = sparse.bsr_matrix(
 929            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 930        ).diagonal(0)
 931
 932        return V_sum / V_count
 933
 934    @staticmethod
 935    def embed_unique_indices(a):
 936        """Creates an array of indices, sorted by unique element.
 937
 938        Parameters
 939        ----------
 940        a : :obj:`~numpy.array`
 941            Array of unique elements (Mx1).
 942
 943        Returns
 944        -------
 945        :obj:`~numpy.array`
 946            Array of indices (Mx1).
 947
 948        Notes
 949        -----
 950        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 951        This is a static method.
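
            Examples
            --------
            A small sketch showing how repeated values are made unique by appending fractional indices:

            >>> import numpy as np
            >>> PHCalculations.embed_unique_indices(np.array([1, 1, 2])).tolist()
            [1.0, 1.1, 2.0]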
 952        """
 953
 954        def count_tens(n):
 955            # Count tens
 956            ntens = (n - 1) // 10
 957
 958            while True:
 959                ntens_test = (ntens + n - 1) // 10
 960
 961                if ntens_test == ntens:
 962                    return ntens
 963                else:
 964                    ntens = ntens_test
 965
 966        def arange_exclude_10s(n):
 967            # How many 10s will there be?
 968            ntens = count_tens(n)
 969
 970            # Base array
 971            arr = np.arange(0, n + ntens)
 972
 973            # Exclude 10s
 974            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 975
 976            return arr
 977
 978        # Creates an array of indices, sorted by unique element
 979        idx_sort = np.argsort(a)
 980        idx_unsort = np.argsort(idx_sort)
 981
 982        # Sorts records array so all unique elements are together
 983        sorted_a = a[idx_sort]
 984
 985        # Returns the unique values, the index of the first occurrence,
 986        # and the count for each element
 987        vals, idx_start, count = np.unique(
 988            sorted_a, return_index=True, return_counts=True
 989        )
 990
 991        # Splits the indices into separate arrays
 992        splits = np.split(idx_sort, idx_start[1:])
 993
 994        # Creates unique indices for each split
 995        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 996
 997        # Reorders according to input array
 998        idx_unq = idx_unq[idx_unsort]
 999
1000        # Magnitude of each index
1001        exp = np.log10(
1002            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
1003        )
1004        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1005
1006        # Result
1007        return a + idx_unq / idx_unq_mag
1008
1009    @staticmethod
1010    def roll_up_dataframe(
1011        df: pd.DataFrame,
1012        sort_by: str,
1013        tol: list,
1014        relative: list,
1015        dims: list,
1016        memory_opt_threshold: int = 10000,
1017    ):
1018        """Subset data by rolling up into apex in appropriate dimensions.
1019
1020        Parameters
1021        ----------
1022        df : pd.DataFrame
1023            The input data containing "dims" columns and the "sort_by" column.
1024        sort_by : str
1025            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
1026            (i.e., the mass feature with the highest value in the sort_by column).
1027        dims : list
1028            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1029        tol : list
1030            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1031            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1032            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1033            If relative is False, the tolerance will be used as is.
1034        relative : list
1035            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1036        memory_opt_threshold : int, optional
1037            Minimum number of rows to trigger memory-optimized processing. Default is 10000.
1038
1039        Returns
1040        -------
1041        pd.DataFrame
1042            A DataFrame with only the rolled up mass features, with the original index and columns.
1043
1044
1045        Raises
1046        ------
1047        ValueError
1048            If the input data is not a pandas DataFrame.
1049            If the input data does not have columns for each of the dimensions in "dims".
1050            If the length of "dims", "tol", and "relative" do not match.
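
            Examples
            --------
            A hedged sketch on a toy feature table; the second feature rolls up into the first because it is within both tolerances and has lower persistence:

            >>> import pandas as pd
            >>> features = pd.DataFrame(
            ...     {"mz": [100.0, 100.0005, 200.0],
            ...      "scan_time": [5.0, 5.1, 9.0],
            ...      "persistence": [10.0, 2.0, 5.0]}
            ... )
            >>> rolled = PHCalculations.roll_up_dataframe(
            ...     features, sort_by="persistence", dims=["mz", "scan_time"],
            ...     tol=[1e-5, 0.5], relative=[True, False]
            ... )  # doctest: +SKIP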
1051        """
1052        og_columns = df.columns.copy()
1053
1054        # Unindex the data, but keep the original index
1055        if df.index.name is not None:
1056            og_index = df.index.name
1057        else:
1058            og_index = "index"
1059        df = df.reset_index(drop=False)
1060
1061        # Sort data by sort_by column, and reindex
1062        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1063
1064        # Check that data is a DataFrame and has columns for each of the dims
1065        if not isinstance(df, pd.DataFrame):
1066            raise ValueError("Data must be a pandas DataFrame")
1067        for dim in dims:
1068            if dim not in df.columns:
1069                raise ValueError(f"Data must have a column for {dim}")
1070        if len(dims) != len(tol) or len(dims) != len(relative):
1071            raise ValueError(
1072                "Dimensions, tolerances, and relative flags must be the same length"
1073            )
1074
1075        # Pre-compute all values arrays
1076        all_values = [df[dim].values for dim in dims]
1077
1078        # Choose processing method based on dataframe size
1079        if len(df) >= memory_opt_threshold:
1080            # Memory-optimized approach for large dataframes
1081            distances = PHCalculations._compute_distances_memory_optimized(
1082                all_values, tol, relative
1083            )
1084        else:
1085            # Faster approach for smaller dataframes
1086            distances = PHCalculations._compute_distances_original(
1087                all_values, tol, relative
1088            )
1089
1090        # Process pairs with original logic but memory optimizations
1091        distances = distances.tocoo()
1092        pairs = np.stack((distances.row, distances.col), axis=1)
1093        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
1094        del distances, pairs  # Free memory immediately
1095
1096        to_drop = []
1097        while not pairs_df.empty:
1098            # Find root_parents and their children (original logic preserved)
1099            root_parents = np.setdiff1d(
1100                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1101            )
1102            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1103            to_drop.extend(children_of_roots)  # Use extend instead of append
1104
1105            # Remove root_children as possible parents from pairs_df for next iteration
1106            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1107            pairs_df = pairs_df.reset_index().set_index("child")
1108            # Remove root_children as possible children from pairs_df for next iteration
1109            pairs_df = pairs_df.drop(index=children_of_roots)
1110
1111            # Prepare for next iteration
1112            pairs_df = pairs_df.reset_index().set_index("parent")
1113
1114        # Convert to numpy array for efficient dropping
1115        to_drop = np.array(to_drop)
1116
1117        # Drop mass features that are not cluster parents
1118        df_sub = df.drop(index=to_drop)
1119
1120        # Set index back to og_index and only keep original columns
1121        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]
1122
1123        return df_sub
1124
1125    @staticmethod
1126    def _compute_distances_original(all_values, tol, relative):
1127        """Original distance computation method for smaller datasets.
1128
1129        This method computes the pairwise distances between features in the dataset
1130        using a straightforward approach. It is suitable for smaller datasets where
1131        memory usage is not a primary concern.
1132
1133        Parameters
1134        ----------
1135        all_values : list of :obj:`~numpy.array`
1136            List of arrays containing the values for each dimension.
1137        tol : list of float
1138            List of tolerances for each dimension.
1139        relative : list of bool
1140            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1141
1142        Returns
1143        -------
1144        :obj:`~scipy.sparse.coo_matrix`
1145            Sparse matrix indicating pairwise distances within tolerances.
1146        """
1147        # Compute inter-feature distances with memory optimization
1148        distances = None
1149        for i in range(len(all_values)):
1150            values = all_values[i]
1151            # Use single precision if possible to reduce memory
1152            tree = KDTree(values.reshape(-1, 1).astype(np.float32))
1153
1154            max_tol = tol[i]
1155            if relative[i] is True:
1156                max_tol = tol[i] * values.max()
1157
1158            # Compute sparse distance matrix with smaller chunks if memory is an issue
1159            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1160
1161            # Only consider forward case, exclude diagonal
1162            sdm = sparse.triu(sdm, k=1)
1163
1164            # Process relative distances more efficiently
1165            if relative[i] is True:
1166                # Vectorized computation without creating intermediate arrays
1167                row_values = values[sdm.row]
1168                valid_idx = sdm.data <= tol[i] * row_values
1169
1170                # Reconstruct sparse matrix more efficiently
1171                sdm = sparse.coo_matrix(
1172                    (
1173                        np.ones(valid_idx.sum(), dtype=np.uint8),
1174                        (sdm.row[valid_idx], sdm.col[valid_idx]),
1175                    ),
1176                    shape=(len(values), len(values)),
1177                )
1178            else:
1179                # Cast as binary matrix with smaller data type
1180                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)
1181
1182            # Stack distances with memory-efficient multiplication
1183            if distances is None:
1184                distances = sdm
1185            else:
1186                # Use in-place operations where possible
1187                distances = distances.multiply(sdm)
1188                del sdm  # Free memory immediately
1189
1190        return distances
1191
1192    @staticmethod
1193    def _compute_distances_memory_optimized(all_values, tol, relative):
1194        """Memory-optimized distance computation for large datasets.
1195
1196        This method computes the pairwise distances between features in the dataset
1197        using a more memory-efficient approach. It is suitable for larger datasets
1198        where memory usage is a primary concern.
1199
1200        Parameters
1201        ----------
1202        all_values : list of :obj:`~numpy.array`
1203            List of arrays containing the values for each dimension.
1204        tol : list of float
1205            List of tolerances for each dimension.
1206        relative : list of bool
1207            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1208
1209        Returns
1210        -------
1211        :obj:`~scipy.sparse.coo_matrix`
1212            Sparse matrix indicating pairwise distances within tolerances.
1213        """
1214        # Compute distance matrix for first dimension (full matrix as before)
1215        values_0 = all_values[0].astype(np.float32)
1216        tree_0 = KDTree(values_0.reshape(-1, 1))
1217
1218        max_tol_0 = tol[0]
1219        if relative[0] is True:
1220            max_tol_0 = tol[0] * values_0.max()
1221
1222        # Compute sparse distance matrix for first dimension
1223        distances = tree_0.sparse_distance_matrix(
1224            tree_0, max_tol_0, output_type="coo_matrix"
1225        )
1226        distances = sparse.triu(distances, k=1)
1227
1228        # Process relative distances for first dimension
1229        if relative[0] is True:
1230            row_values = values_0[distances.row]
1231            valid_idx = distances.data <= tol[0] * row_values
1232            distances = sparse.coo_matrix(
1233                (
1234                    np.ones(valid_idx.sum(), dtype=np.uint8),
1235                    (distances.row[valid_idx], distances.col[valid_idx]),
1236                ),
1237                shape=(len(values_0), len(values_0)),
1238            )
1239        else:
1240            distances.data = np.ones(len(distances.data), dtype=np.uint8)
1241
1242        # For remaining dimensions, work only on chunks defined by first dimension pairs
1243        if len(all_values) > 1:
1244            distances_coo = distances.tocoo()
1245            valid_pairs = []
1246
1247            # Process each pair from first dimension
1248            for idx in range(len(distances_coo.data)):
1249                i, j = distances_coo.row[idx], distances_coo.col[idx]
1250                is_valid_pair = True
1251
1252                # Check remaining dimensions for this specific pair
1253                for dim_idx in range(1, len(all_values)):
1254                    values = all_values[dim_idx]
1255                    val_i, val_j = values[i], values[j]
1256
1257                    max_tol = tol[dim_idx]
1258                    if relative[dim_idx] is True:
1259                        max_tol = tol[dim_idx] * values.max()
1260
1261                    distance_ij = abs(val_i - val_j)
1262
1263                    # Check if this pair satisfies the tolerance for this dimension
1264                    if relative[dim_idx] is True:
1265                        if distance_ij > tol[dim_idx] * val_i:
1266                            is_valid_pair = False
1267                            break
1268                    else:
1269                        if distance_ij > max_tol:
1270                            is_valid_pair = False
1271                            break
1272
1273                if is_valid_pair:
1274                    valid_pairs.append((i, j))
1275
1276            # Rebuild distances matrix with only valid pairs
1277            if valid_pairs:
1278                valid_pairs = np.array(valid_pairs)
1279                distances = sparse.coo_matrix(
1280                    (
1281                        np.ones(len(valid_pairs), dtype=np.uint8),
1282                        (valid_pairs[:, 0], valid_pairs[:, 1]),
1283                    ),
1284                    shape=(len(values_0), len(values_0)),
1285                )
1286            else:
1287                # No valid pairs found
1288                distances = sparse.coo_matrix(
1289                    (len(values_0), len(values_0)), dtype=np.uint8
1290                )
1291
1292        return distances
1293
1294    def sparse_upper_star(self, idx, V):
1295        """Sparse implementation of an upper star filtration.
1296
1297        Parameters
1298        ----------
1299        idx : :obj:`~numpy.array`
1300            Edge indices for each dimension (MxN).
1301        V : :obj:`~numpy.array`
1302            Array of intensity data (Mx1).
1303        Returns
1304        -------
1305        idx : :obj:`~numpy.array`
1306            Index of filtered points (Mx1).
1307        persistence : :obj:`~numpy.array`
1308            Persistence of each filtered point (Mx1).
1309
1310        Notes
1311        -----
1312        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1313        """
1314
1315        # Invert
1316        V = -1 * V.copy().astype(int)
1317
1318        # Embed indices
1319        V = self.embed_unique_indices(V)
1320
1321        # Connectivity matrix
1322        cmat = KDTree(idx)
1323        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1324        cmat.setdiag(1)
1325        cmat = sparse.triu(cmat)
1326
1327        # Pairwise minimums (maxima of the inverted intensities)
1328        I, J = cmat.nonzero()
1329        d = np.maximum(V[I], V[J])
1330
1331        # Delete connectivity matrix
1332        cmat_shape = cmat.shape
1333        del cmat
1334
1335        # Sparse distance matrix
1336        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1337
1338        # Delete pairwise mins
1339        del d, I, J
1340
1341        # Persistence homology
1342        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1343
1344        # Bound death values
1345        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1346
1347        # Construct tree to query against
1348        tree = KDTree(V.reshape((-1, 1)))
1349
1350        # Get the indexes of the first nearest neighbor by birth
1351        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1352
1353        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1354
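# --- Illustrative sketch (not part of the library source) ----------------------
# A minimal, runnable example of the upper-star idea used by sparse_upper_star
# above, with made-up numbers: invert a toy 1-D intensity trace, weight each edge
# between neighboring points by the pairwise maximum of the inverted values, put
# vertex values on the diagonal, and read 0-dimensional persistence off the
# sparse matrix with ripser.
import numpy as np
from ripser import ripser
from scipy import sparse

intensity = np.array([1, 3, 9, 4, 2, 7, 6, 1], dtype=float)
v = -intensity  # invert so intensity maxima enter the filtration first

rows = np.arange(len(v) - 1)           # edges between adjacent points only
cols = rows + 1
edge_w = np.maximum(v[rows], v[cols])  # an edge appears once both endpoints exist
diag = np.arange(len(v))               # vertex birth values go on the diagonal
sdm = sparse.coo_matrix(
    (
        np.concatenate([edge_w, v]),
        (np.concatenate([rows, diag]), np.concatenate([cols, diag])),
    ),
    shape=(len(v), len(v)),
)

dgm = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
print(dgm)  # rows of (birth, death); long-lived classes correspond to the two peaks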
1355    def check_if_grid(self, data):
1356        """Check if the data are gridded in mz space.
1357
1358        Parameters
1359        ----------
1360        data : DataFrame
1361            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1362
1363        Returns
1364        -------
1365        bool
1366            True if the data is gridded in the mz direction, False otherwise.
1367
1368        Notes
1369        -----
1370        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1371        """
1372        # Calculate the difference between consecutive mz values in a single scan
1373        dat_check = data.copy().reset_index(drop=True)
1374        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1375        mz_diff_min = (
1376            dat_check.groupby("scan")["mz_diff"].min().min()
1377        )  # within each scan, what is the smallest mz difference between consecutive mz values
1378
1379        # Find the minimum mz difference between mz values in the data, regardless of scan
1380        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1381        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1382        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1383
1384        # Get minimum mz_diff between mz values in the data
1385        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1386
1387        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1388        if mz_diff_min_raw < mz_diff_min:
1389            return False
1390        else:
1391            return True
1392
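# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the check above, with made-up values: gridded data reuse
# the same m/z axis in every scan, so the smallest m/z spacing across the whole
# dataset can never be smaller than the smallest within-scan spacing.
import pandas as pd

gridded = pd.DataFrame(
    {"scan": [1, 1, 1, 2, 2, 2], "mz": [100.00, 100.01, 100.02, 100.00, 100.01, 100.02]}
)
ungridded = pd.DataFrame(
    {"scan": [1, 1, 1, 2, 2, 2], "mz": [100.00, 100.01, 100.02, 100.003, 100.013, 100.023]}
)

def min_spacings(df):
    within = df.assign(mz_diff=df["mz"].diff().abs()).groupby("scan")["mz_diff"].min().min()
    overall = df["mz"].drop_duplicates().sort_values().diff().abs().min()
    return within, overall

print(min_spacings(gridded))    # overall spacing == within-scan spacing -> gridded
print(min_spacings(ungridded))  # overall spacing < within-scan spacing -> not gridded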
1393    def grid_data(self, data, attempts=5):
1394        """Grid the data in the mz dimension.
1395
1396        Data must be gridded prior to persistent homology calculations and to computing an average mass spectrum.
1397
1398        Parameters
1399        ----------
1400        data : DataFrame
1401            The input data containing mz, scan, scan_time, and intensity columns.
1402        attempts : int, optional
1403            The number of attempts to grid the data. Default is 5.
1404
1405        Returns
1406        -------
1407        DataFrame
1408            The gridded data with mz, scan, scan_time, and intensity columns.
1409
1410        Raises
1411        ------
1412        ValueError
1413            If gridding fails after the specified number of attempts.
1414        """
1415        attempt_i = 0
1416        while attempt_i < attempts:
1417            attempt_i += 1
1418            data = self._grid_data(data)
1419
1420            if self.check_if_grid(data):
1421                return data
1422
1423        if not self.check_if_grid(data):
1424            raise ValueError(
1425                "Gridding failed after "
1426                + str(attempt_i)
1427                + " attempts. Please check the data."
1428            )
1429        else:
1430            return data
1431
1432    def _grid_data(self, data):
1433        """Internal method to grid the data in the mz dimension.
1434
1435        Notes
1436        -----
1437        This method is called by the grid_data method and should not be called directly.
1438        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
1439        but it does not check if the data is already gridded or if the gridding is successful.
1440
1441        Parameters
1442        ----------
1443        data : pd.DataFrame or pl.DataFrame
1444            The input data to grid.
1445
1446        Returns
1447        -------
1448        pd.DataFrame or pl.DataFrame
1449            The data after attempting to grid it in the mz dimension.
1450        """
1451        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1452        data_w = data.copy().reset_index(drop=True)
1453        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1454        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1455
1456        # Need high intensity mz values first so they are parents in the output pairs stack
1457        dat_mz = data_w[["mz", "intensity"]].sort_values(
1458            by=["intensity"], ascending=False
1459        )
1460        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1461
1462        # Construct KD tree
1463        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1464        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1465        sdm = sparse.triu(sdm, k=1)
1466        sdm.data = np.ones_like(sdm.data)
1467        distances = sdm.tocoo()
1468        pairs = np.stack((distances.row, distances.col), axis=1)
1469
1470        # Cull pairs to keep only the root mz values
1471        to_drop = []
1472        while len(pairs) > 0:
1473            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1474            id_root_parents = np.isin(pairs[:, 0], root_parents)
1475            children_of_roots = np.unique(pairs[id_root_parents, 1])
1476            to_drop = np.append(to_drop, children_of_roots)
1477
1478            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1479            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1480            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1481        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1482        mz_dat_np = (
1483            dat_mz[["mz"]]
1484            .sort_values(by=["mz"])
1485            .reset_index(drop=True)
1486            .values.flatten()
1487        )
1488
1489        # Sort data by mz and recast mz to nearest value in mz_dat_np
1490        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1491        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1492        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1493
1494        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1495        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1496        new_data_w = (
1497            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1498            .groupby(["scan", "mz"])["intensity"]
1499            .sum()
1500            .reset_index()
1501        )
1502        new_data_w = (
1503            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1504            .reset_index(drop=True)
1505            .copy()
1506        )
1507
1508        return new_data_w
1509
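# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the final re-casting step above: snap observed m/z values
# onto a retained, sorted grid with the module-level find_closest helper.
import numpy as np
from corems.mass_spectra.calc.lc_calc import find_closest

grid = np.array([100.000, 100.010, 100.020])         # retained grid (sorted)
raw = np.array([99.999, 100.004, 100.011, 100.021])  # observed m/z values (sorted)
print(grid[find_closest(grid, raw)])                  # nearest grid value for each observed m/z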
1510    def find_mass_features_ph(self, ms_level=1, grid=True):
1511        """Find mass features within an LCMSBase object using persistent homology.
1512
1513        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1514
1515        Parameters
1516        ----------
1517        ms_level : int, optional
1518            The MS level to use. Default is 1.
1519        grid : bool, optional
1520            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1521
1522        Raises
1523        ------
1524        ValueError
1525            If no MS level data is found on the object.
1526            If data is not gridded and grid is False.
1527
1528        Returns
1529        -------
1530        None, but assigns the mass_features attribute to the object.
1531
1532        Notes
1533        -----
1534        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1535        """
1536        # Check that ms_level is a key in self._ms_unprocessed
1537        if ms_level not in self._ms_unprocessed.keys():
1538            raise ValueError(
1539                "No MS level "
1540                + str(ms_level)
1541                + " data found, did you instantiate with parser specific to MS level?"
1542            )
1543
1544        # Get ms data
1545        data = self._ms_unprocessed[ms_level].copy()
1546
1547        # Drop rows with missing intensity values and reset index
1548        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1549
1550        # Threshold data
1551        dims = ["mz", "scan_time"]
1552        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1553        persistence_threshold = (
1554            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1555        )
1556        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1557
1558        # Check if gridded, if not, grid
1559        gridded_mz = self.check_if_grid(data_thres)
1560        if gridded_mz is False:
1561            if grid is False:
1562                raise ValueError(
1563                    "Data are not gridded in mz dimension, try reprocessing with a different params or grid data before running this function"
1564                )
1565            else:
1566                data_thres = self.grid_data(data_thres)
1567
1568        # Add scan_time
1569        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1570        # Process in chunks if required
1571        if len(data_thres) > 10000:
1572            return self._find_mass_features_ph_partition(
1573                data_thres, dims, persistence_threshold
1574            )
1575        else:
1576            # Process all at once
1577            return self._find_mass_features_ph_single(
1578                data_thres, dims, persistence_threshold
1579            )
1580
1581    def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold):
1582        """Process all data at once (original logic)."""
1583        # Build factors
1584        factors = {
1585            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1586            for dim in dims
1587        }
1588
1589        # Build indexes
1590        index = {
1591            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1592            for dim in factors
1593        }
1594
1595        # Smooth and process
1596        mass_features_df = self._process_partition_ph(
1597            data_thres, index, dims, persistence_threshold
1598        )
1599
1600        # Roll up within chunk to remove duplicates
1601        mass_features_df = self.roll_up_dataframe(
1602            df=mass_features_df,
1603            sort_by="persistence",
1604            dims=["mz", "scan_time"],
1605            tol=[
1606                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1607                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1608            ],
1609            relative=[True, False],
1610        )
1611        mass_features_df = mass_features_df.reset_index(drop=True)
1612
1613        # Populate mass_features attribute
1614        self._populate_mass_features(mass_features_df)
1615
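# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the factor/index construction above, with made-up values:
# each dimension is mapped onto consecutive grid coordinates so the data can be
# treated as points on an integer lattice for smoothing and persistence.
import numpy as np
import pandas as pd

df = pd.DataFrame({"mz": [100.0, 100.0, 100.5, 101.0], "scan_time": [1.0, 1.5, 1.0, 1.5]})
dims = ["mz", "scan_time"]
factors = {d: pd.factorize(df[d], sort=True)[1].astype(np.float32) for d in dims}
index = {d: np.searchsorted(factors[d], df[d]).astype(np.float32) for d in factors}
print(index["mz"])         # -> [0. 0. 1. 2.]
print(index["scan_time"])  # -> [0. 1. 0. 1.]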
1616    def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold):
1617        """Partition the persistent homology mass feature detection for large datasets.
1618
1619        This method splits the input data into overlapping scan partitions, processes each partition to detect mass features
1620        using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute.
1621
1622        Parameters
1623        ----------
1624        data_thres : pd.DataFrame
1625            The thresholded input data containing mass spectrometry information.
1626        dims : list
1627            List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection.
1628        persistence_threshold : float
1629            Minimum persistence value required for a detected mass feature to be retained.
1630
1631        Returns
1632        -------
1633        None
1634            Populates the mass_features attribute of the object with detected mass features.
1635        """
1636        all_mass_features = []
1637
1638        # Split scans into partitions
1639        unique_scans = sorted(data_thres["scan"].unique())
1640        unique_scans_n = len(unique_scans)
1641
1642        # Calculate partition size in scans based on goal
1643        partition_size_goal = 5000
1644        scans_per_partition = max(
1645            1, partition_size_goal // (len(data_thres) // unique_scans_n)
1646        )
1647        if scans_per_partition == 0:
1648            scans_per_partition = 1
1649
1650        # Make partitions based on scans, with overlapping in partitioned scans
1651        scan_overlap = 4
1652        partition_scans = []
1653        for i in range(0, unique_scans_n, scans_per_partition):
1654            start_idx = max(0, i - scan_overlap)
1655            end_idx = min(
1656                unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap
1657            )
1658            scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]]
1659            partition_scans.append(scans_group)
1660
1661        # Set index to scan for faster filtering
1662        data_thres = data_thres.set_index("scan")
1663        for scans in partition_scans:
1664            # Select the data for this partition (the scan lists already include the overlap)
1665            partition_data = data_thres.loc[scans].reset_index(drop=False).copy()
1666
1667            if len(partition_data) == 0:
1668                continue
1669
1670            # Build factors for this partition
1671            factors = {
1672                dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32)
1673                for dim in dims
1674            }
1675
1676            # Build indexes
1677            index = {
1678                dim: np.searchsorted(factors[dim], partition_data[dim]).astype(
1679                    np.float32
1680                )
1681                for dim in factors
1682            }
1683
1684            # Process partition
1685            partition_features = self._process_partition_ph(
1686                partition_data, index, dims, persistence_threshold
1687            )
1688
1689            if len(partition_features) == 0:
1690                continue
1691
1692            # Roll up within partition
1693            partition_features = self.roll_up_dataframe(
1694                df=partition_features,
1695                sort_by="persistence",
1696                dims=["mz", "scan_time"],
1697                tol=[
1698                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1699                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1700                ],
1701                relative=[True, False],
1702            )
1703            partition_features = partition_features.reset_index(drop=True)
1704
1705            if len(partition_features) > 0:
1706                all_mass_features.append(partition_features)
1707
1708        # Combine results from all partitions
1709        if all_mass_features:
1710            combined_features = pd.concat(all_mass_features, ignore_index=True)
1711
1712            # Sort by persistence
1713            combined_features = combined_features.sort_values(
1714                by="persistence", ascending=False
1715            ).reset_index(drop=True)
1716
1717            # Remove duplicates from overlapping regions
1718            combined_features = self.roll_up_dataframe(
1719                df=combined_features,
1720                sort_by="persistence",
1721                dims=["mz", "scan_time"],
1722                tol=[
1723                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1724                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1725                ],
1726                relative=[True, False],
1727            )
1728
1729            # Reset index after the final roll-up
1730            combined_features = combined_features.reset_index(drop=True)
1731
1732            # Populate mass_features attribute
1733            self._populate_mass_features(combined_features)
1734        else:
1735            self.mass_features = {}
1736
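# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the overlapping scan partitioning used above, with made-up
# numbers (20 scans, 6 scans per partition, 4 scans of overlap on each side).
unique_scans = list(range(1, 21))
scans_per_partition, scan_overlap = 6, 4
partitions = []
for i in range(0, len(unique_scans), scans_per_partition):
    start = max(0, i - scan_overlap)
    end = min(len(unique_scans) - 1, i + scans_per_partition - 1 + scan_overlap)
    partitions.append(unique_scans[start : end + 1])
print(partitions)
# Neighboring windows share scans, so features near a partition boundary are found
# in both windows and merged afterwards by the cross-partition roll-up.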
1737    def _process_partition_ph(self, partition_data, index, dims, persistence_threshold):
1738        """Process a single partition with persistent homology."""
1739        # Smooth data
1740        iterations = self.parameters.lc_ms.ph_smooth_it
1741        smooth_radius = [
1742            self.parameters.lc_ms.ph_smooth_radius_mz,
1743            self.parameters.lc_ms.ph_smooth_radius_scan,
1744        ]
1745
1746        index_array = np.vstack([index[dim] for dim in dims]).T
1747        V = partition_data["intensity"].values
1748        resid = np.inf
1749
1750        for i in range(iterations):
1751            # Previous iteration
1752            V_prev = V.copy()
1753            resid_prev = resid
1754            V = self.sparse_mean_filter(index_array, V, radius=smooth_radius)
1755
1756            # Calculate residual with previous iteration
1757            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1758
1759            # Evaluate convergence
1760            if i > 0:
1761                # Percent change in residual
1762                test = np.abs(resid - resid_prev) / resid_prev
1763
1764                # Exit criteria
1765                if test <= 0:
1766                    break
1767
1768        # Overwrite values
1769        partition_data = partition_data.copy()
1770        partition_data["intensity"] = V
1771
1772        # Use persistent homology to find regions of interest
1773        pidx, pers = self.sparse_upper_star(index_array, V)
1774        pidx = pidx[pers > 1]
1775        pers = pers[pers > 1]
1776
1777        if len(pidx) == 0:
1778            return pd.DataFrame()
1779
1780        # Get peaks
1781        peaks = partition_data.iloc[pidx, :].reset_index(drop=True)
1782
1783        # Add persistence column
1784        peaks["persistence"] = pers
1785        mass_features = peaks.sort_values(
1786            by="persistence", ascending=False
1787        ).reset_index(drop=True)
1788
1789        # Filter by persistence threshold
1790        mass_features = mass_features.loc[
1791            mass_features["persistence"] > persistence_threshold, :
1792        ].reset_index(drop=True)
1793
1794        return mass_features
1795
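# --- Illustrative sketch (not part of the library source) ----------------------
# The smoothing loop above repeats a mean filter until the relative change in the
# RMS residual levels off. A stand-in using a plain moving average (np.convolve)
# instead of the library's sparse_mean_filter, with an assumed 5% exit tolerance:
import numpy as np

V = np.array([1, 2, 9, 3, 2, 8, 7, 1], dtype=float)
resid = np.inf
for i in range(10):
    V_prev, resid_prev = V.copy(), resid
    V = np.convolve(V, np.ones(3) / 3, mode="same")   # stand-in smoother
    resid = np.sqrt(np.mean(np.square(V - V_prev)))   # RMS change for this pass
    if i > 0 and abs(resid - resid_prev) / resid_prev <= 0.05:
        break                                          # change has leveled off
print(i, np.round(V, 2))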
1796    def _populate_mass_features(self, mass_features_df):
1797        """Populate the mass_features attribute from a DataFrame.
1798
1799        Parameters
1800        ----------
1801        mass_features_df : pd.DataFrame
1802            DataFrame containing mass feature information.
1803            Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute.
1804
1805        Returns
1806        -------
1807        None, but assigns the mass_features attribute to the object.
1808        """
1809        # Rename scan column to apex_scan
1810        mass_features_df = mass_features_df.rename(
1811            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1812        )
1813
1814        # Populate mass_features attribute
1815        self.mass_features = {}
1816        for row in mass_features_df.itertuples():
1817            row_dict = mass_features_df.iloc[row.Index].to_dict()
1818            lcms_feature = LCMSMassFeature(self, **row_dict)
1819            self.mass_features[lcms_feature.id] = lcms_feature
1820
1821        if self.parameters.lc_ms.verbose_processing:
1822            print("Found " + str(len(mass_features_df)) + " initial mass features")
1823
1824    def find_mass_features_ph_centroid(self, ms_level=1):
1825        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1826
1827        Parameters
1828        ----------
1829        ms_level : int, optional
1830            The MS level to use. Default is 1.
1831
1832        Raises
1833        ------
1834        ValueError
1835            If no MS level data is found on the object.
1836
1837        Returns
1838        -------
1839        None, but assigns the mass_features attribute to the object.
1840        """
1841        # Check that ms_level is a key in self._ms_unprocessed
1842        if ms_level not in self._ms_unprocessed.keys():
1843            raise ValueError(
1844                "No MS level "
1845                + str(ms_level)
1846                + " data found, did you instantiate with parser specific to MS level?"
1847            )
1848
1849        # Work with reference instead of copy
1850        data = self._ms_unprocessed[ms_level]
1851
1852        # Calculate threshold first to avoid multiple operations
1853        max_intensity = data["intensity"].max()
1854        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity
1855
1856        # Create single filter condition and apply to required columns only
1857        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
1858        required_cols = ["mz", "intensity", "scan"]
1859        data_thres = data.loc[valid_mask, required_cols].copy()
1860        data_thres["persistence"] = data_thres["intensity"]
1861
1862        # Merge with required scan data
1863        scan_subset = self.scan_df[["scan", "scan_time"]]
1864        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
1865        del data_thres, scan_subset
1866
1867        # Order by scan_time and then mz to ensure features near in rt are processed together
1868        # It's ok that different scans are in different partitions; we will roll up later
1869        mf_df = mf_df.sort_values(
1870            by=["scan_time", "mz"], ascending=[True, True]
1871        ).reset_index(drop=True)
1872        partition_size = 10000
1873        partitions = [
1874            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
1875            for i in range(0, len(mf_df), partition_size)
1876        ]
1877        del mf_df
1878
1879        # Run roll_up_dataframe on each partition
1880        rolled_partitions = []
1881        for part in partitions:
1882            rolled = self.roll_up_dataframe(
1883                df=part,
1884                sort_by="persistence",
1885                dims=["mz", "scan_time"],
1886                tol=[
1887                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1888                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1889                ],
1890                relative=[True, False],
1891            )
1892            rolled_partitions.append(rolled)
1893        del partitions
1894
1897        # Combine results from all partitions and run a final roll-up to merge features near partition boundaries
1898        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
1899        del rolled_partitions
1900
1901        # Reorder by persistence before final roll-up
1902        mf_df_final = mf_df_final.sort_values(
1903            by="persistence", ascending=False
1904        ).reset_index(drop=True)
1905
1906        mf_df_final = self.roll_up_dataframe(
1907            df=mf_df_final,
1908            sort_by="persistence",
1909            dims=["mz", "scan_time"],
1910            tol=[
1911                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1912                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1913            ],
1914            relative=[True, False],
1915        )
1916        # reset index
1917        mf_df_final = mf_df_final.reset_index(drop=True)
1918
1919        # Combine rename and sort operations
1920        mass_features = (
1921            mf_df_final.rename(
1922                columns={"scan": "apex_scan", "scan_time": "retention_time"}
1923            )
1924            .sort_values(by="persistence", ascending=False)
1925            .reset_index(drop=True)
1926        )
1927        del mf_df_final  # Free memory
1933
1934        self.mass_features = {}
1935        for idx, row in mass_features.iterrows():
1936            row_dict = row.to_dict()
1937            lcms_feature = LCMSMassFeature(self, **row_dict)
1938            self.mass_features[lcms_feature.id] = lcms_feature
1939
1940        if self.parameters.lc_ms.verbose_processing:
1941            print("Found " + str(len(mass_features)) + " initial mass features")
1942
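# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the fixed-size row partitioning used above (3 rows per
# partition here instead of 10,000, purely for brevity).
import pandas as pd

mf_df = pd.DataFrame({"mz": range(8), "persistence": range(8)})
partition_size = 3
partitions = [
    mf_df.iloc[i : i + partition_size].reset_index(drop=True)
    for i in range(0, len(mf_df), partition_size)
]
print([len(p) for p in partitions])  # -> [3, 3, 2]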
1943    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1944        """Cluster mass features
1945
1946        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
1947
1948        Parameters
1949        ----------
1950        drop_children : bool, optional
1951            Whether to drop the mass features that are not cluster parents. Default is True.
1952        sort_by : str, optional
1953            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1954
1955        Raises
1956        ------
1957        ValueError
1958            If no mass features are found.
1959            If too many mass features are found.
1960
1961        Returns
1962        -------
1963        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1964        """
1965        if self.mass_features is None:
1966            raise ValueError("No mass features found, run find_mass_features() first")
1967        if len(self.mass_features) > 400000:
1968            raise ValueError(
1969                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1970            )
1971        dims = ["mz", "scan_time"]
1972        mf_df_og = self.mass_features_to_df()
1973        mf_df = mf_df_og.copy()
1974
1975        tol = [
1976            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1977            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1978        ]  # mz, in relative; scan_time in minutes
1979        relative = [True, False]
1980
1981        # Roll up mass features based on their proximity in the declared dimensions
1982        mf_df_new = self.roll_up_dataframe(
1983            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
1984        )
1985
1986        mf_df["cluster_parent"] = np.where(
1987            np.isin(mf_df.index, mf_df_new.index), True, False
1988        )
1989
1990        # get mass feature ids of features that are not cluster parents
1991        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1992        if drop_children is True:
1993            # Drop mass features that are not cluster parents from self
1994            self.mass_features = {
1995                k: v
1996                for k, v in self.mass_features.items()
1997                if k not in cluster_daughters
1998            }
1999        else:
2000            return cluster_daughters
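# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the proximity test behind the roll-up used above: a KDTree
# sparse distance matrix flags feature pairs whose m/z values fall within an
# absolute cutoff (relative tolerances are first converted to such a cutoff).
import numpy as np
from scipy import sparse
from scipy.spatial import KDTree

mz = np.array([100.0000, 100.0005, 200.0000, 200.0008], dtype=np.float32)
tol = 5e-6 * mz.max()  # a 5 ppm relative tolerance expressed as an absolute cutoff
tree = KDTree(mz.reshape(-1, 1))
pairs = sparse.triu(tree.sparse_distance_matrix(tree, tol, output_type="coo_matrix"), k=1)
print(sorted((int(i), int(j)) for i, j in zip(pairs.row, pairs.col)))  # -> [(0, 1), (2, 3)]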
def find_closest(A, target):
14def find_closest(A, target):
15    """Find the index of closest value in A to each value in target.
16
17    Parameters
18    ----------
19    A : :obj:`~numpy.array`
20        The array to search (blueprint). A must be sorted.
21    target : :obj:`~numpy.array`
22        The array of values to search for. target must be sorted.
23
24    Returns
25    -------
26    :obj:`~numpy.array`
27        The indices of the closest values in A to each value in target.
28    """
29    idx = A.searchsorted(target)
30    idx = np.clip(idx, 1, len(A) - 1)
31    left = A[idx - 1]
32    right = A[idx]
33    idx -= target - left < right - target
34    return idx

Find the index of closest value in A to each value in target.

Parameters
  • A (~numpy.array): The array to search (blueprint). A must be sorted.
  • target (~numpy.array): The array of values to search for. target must be sorted.
Returns
  • ~numpy.array: The indices of the closest values in A to each value in target.
class LCCalculations:
 37class LCCalculations:
 38    """Methods for performing LC calculations on mass spectra data.
 39
 40    Notes
 41    -----
 42    This class is intended to be used as a mixin for the LCMSBase class.
 43
 44    Methods
 45    -------
 46    * get_max_eic(eic_data).
 47        Returns the maximum EIC value from the given EIC data. A static method.
 48    * smooth_tic(tic).
 49        Smooths the TIC data using the specified smoothing method and settings.
 50    * eic_centroid_detector(rt, eic, max_eic).
 51        Performs EIC centroid detection on the given EIC data.
 52    * find_nearest_scan(rt).
 53        Finds the nearest scan to the given retention time.
 54    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
 55        Returns an averaged mass spectrum object.
 56    * find_mass_features(ms_level=1).
 57        Find regions of interest for a given MS level (default is MS1).
 58    * integrate_mass_features(drop_if_fail=False, ms_level=1).
 59        Integrate mass features of interest and extracts EICs.
 60    * find_c13_mass_features().
 61        Evaluate mass features and mark likely C13 isotopes.
 62    * deconvolute_ms1_mass_features().
 63        Deconvolute mass features' ms1 mass spectra.
 64    """
 65
 66    @staticmethod
 67    def get_max_eic(eic_data: dict):
 68        """Returns the maximum EIC value from the given EIC data.
 69
 70        Notes
 71        -----
 72        This is a static method.
 73
 74        Parameters
 75        ----------
 76        eic_data : dict
 77            A dictionary containing EIC data.
 78
 79        Returns
 80        -------
 81        float
 82            The maximum EIC value.
 83        """
 84        max_eic = 0
 85        for eic_data in eic_data.values():
 86            ind_max_eic = max(eic_data.get("EIC"))
 87            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
 88
 89        return max_eic
 90
 91    def smooth_tic(self, tic):
 92        """Smooths the TIC or EIC data using the specified smoothing method and settings.
 93
 94        Parameters
 95        ----------
 96        tic : numpy.ndarray
 97            The TIC (or EIC) data to be smoothed.
 98
 99        Returns
100        -------
101        numpy.ndarray
102            The smoothed TIC data.
103        """
104        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
105
106        pol_order = self.parameters.lc_ms.savgol_pol_order
107
108        window_len = self.parameters.lc_ms.smooth_window
109
110        window = self.parameters.lc_ms.smooth_method
111
112        return sp.smooth_signal(
113            tic, window_len, window, pol_order, implemented_smooth_method
114        )
115
116    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
117        """Performs EIC centroid detection on the given EIC data.
118
119        Parameters
120        ----------
121        rt : numpy.ndarray
122            The retention time data.
123        eic : numpy.ndarray
124            The EIC data.
125        max_eic : float
126            The maximum EIC value.
127        apex_indexes : list, optional
128            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
129
130        Returns
131        -------
132        numpy.ndarray
133            The indexes of left, apex, and right limits as a generator.
134        """
135
136        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
137
138        max_height = self.parameters.lc_ms.peak_height_max_percent
139
140        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
141
142        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
143
144        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
145
146        include_indexes = sp.peak_picking_first_derivative(
147            domain=rt,
148            signal=eic,
149            max_height=max_height,
150            max_prominence=max_prominence,
151            max_signal=max_eic,
152            min_peak_datapoints=min_peak_datapoints,
153            peak_derivative_threshold=peak_derivative_threshold,
154            signal_threshold=signal_threshold,
155            correct_baseline=False,
156            plot_res=False,
157            apex_indexes=apex_indexes,
158        )
159
160        return include_indexes
161
162    def find_nearest_scan(self, rt):
163        """Finds the nearest scan to the given retention time.
164
165        Parameters
166        ----------
167        rt : float
168            The retention time (in minutes) to find the nearest scan for.
169
170        Returns
171        -------
172        int
173            The scan number of the nearest scan.
174        """
175        array_rt = np.array(self.retention_time)
176
177        scan_index = (np.abs(array_rt - rt)).argmin()
178
179        real_scan = self.scans_number[scan_index]
180
181        return real_scan
182
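# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the nearest-scan lookup above, with made-up retention times.
import numpy as np

retention_time = np.array([0.10, 0.25, 0.40, 0.55])  # minutes, one entry per scan
scans_number = [101, 102, 103, 104]
rt = 0.33
print(scans_number[np.abs(retention_time - rt).argmin()])  # -> 103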
183    def add_peak_metrics(self):
184        """Add peak metrics to the mass features.
185
186        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
187        """
188        # Check that at least some mass features have eic data
189        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
190            raise ValueError(
191                "No mass features have EIC data. Run integrate_mass_features first."
192            )
193
194        for mass_feature in self.mass_features.values():
195            # Check if the mass feature has been integrated
196            if mass_feature._eic_data is not None:
197                # Calculate peak metrics
198                mass_feature.calc_half_height_width()
199                mass_feature.calc_tailing_factor()
200                mass_feature.calc_dispersity_index()
201
202    def get_average_mass_spectrum(
203        self,
204        scan_list,
205        apex_scan,
206        spectrum_mode="profile",
207        ms_level=1,
208        auto_process=True,
209        use_parser=False,
210        perform_checks=True,
211        polarity=None,
212        ms_params=None,
213    ):
214        """Returns an averaged mass spectrum object
215
216        Parameters
217        ----------
218        scan_list : list
219            List of scan numbers to average.
220        apex_scan : int
221            Number of the apex scan
222        spectrum_mode : str, optional
223            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
224        ms_level : int, optional
225            The MS level to use. Defaults to 1.
226        auto_process : bool, optional
227            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
228        use_parser : bool, optional
229            If True, the mass spectra will be obtained from the parser. Defaults to False.
230        perform_checks : bool, optional
231            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
232        polarity : int, optional
233            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
234        ms_params : MSParameters, optional
235            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
236
237        Returns
238        -------
239        MassSpectrumProfile
240            The averaged mass spectrum object.
241
242        Raises
243        ------
244        ValueError
245            If the spectrum mode is not "profile".
246            If the MS level is not found in the unprocessed mass spectra dictionary.
247            If not all scan numbers are found in the unprocessed mass spectra dictionary.
248        """
249        if perform_checks:
250            if spectrum_mode != "profile":
251                raise ValueError("Averaging only supported for profile mode")
252
253        if polarity is None:
254            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
255            if self.polarity == "negative":
256                polarity = -1
257            elif self.polarity == "positive":
258                polarity = 1
259            else:
260                raise ValueError(
261                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
262                )
263
264        # if not using_parser, check that scan numbers are in _ms_unprocessed
265        if not use_parser:
266            if perform_checks:
267                # Set index to scan for faster lookup
268                ms_df = (
269                    self._ms_unprocessed[ms_level]
270                    .copy()
271                    .set_index("scan", drop=False)
272                    .sort_index()
273                )
274                # Check that all scan numbers are in the ms_df before selecting them
275                if not all(np.isin(scan_list, ms_df.index)):
276                    raise ValueError(
277                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
278                    )
279                my_ms_df = ms_df.loc[scan_list]
280            else:
281                my_ms_df = (
282                    pd.DataFrame({"scan": scan_list})
283                    .set_index("scan")
284                    .join(self._ms_unprocessed[ms_level], how="left")
285                )
286
287        if use_parser:
288            ms_list = [
289                self.spectra_parser.get_mass_spectrum_from_scan(
290                    x, spectrum_mode=spectrum_mode, auto_process=False
291                )
292                for x in scan_list
293            ]
294            ms_mz = [x._mz_exp for x in ms_list]
295            ms_int = [x._abundance for x in ms_list]
296            my_ms_df = []
297            for i in np.arange(len(ms_mz)):
298                my_ms_df.append(
299                    pd.DataFrame(
300                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
301                    )
302                )
303            my_ms_df = pd.concat(my_ms_df)
304
305        if not self.check_if_grid(my_ms_df):
306            my_ms_df = self.grid_data(my_ms_df)
307
308        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
309
310        ms = ms_from_array_profile(
311            my_ms_ave.mz,
312            my_ms_ave.intensity,
313            self.file_location,
314            polarity=polarity,
315            auto_process=False,
316        )
317
318        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
319        if ms is not None:
320            if ms_params is not None:
321                ms.parameters = ms_params
322            ms.scan_number = apex_scan
323            if auto_process:
324                ms.process_mass_spec()
325        return ms
326
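# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the averaging core above: once the scans share a common m/z
# grid, the averaged profile is simply the per-m/z sum of intensities over the
# scan list (two scans and two m/z values here, with made-up intensities).
import pandas as pd

my_ms_df = pd.DataFrame(
    {
        "scan": [10, 10, 11, 11],
        "mz": [100.0, 100.5, 100.0, 100.5],
        "intensity": [5.0, 1.0, 7.0, 3.0],
    }
)
print(my_ms_df.groupby("mz")["intensity"].sum().reset_index())
# -> m/z 100.0 sums to 12.0 and m/z 100.5 to 4.0 across the two scans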
327    def find_mass_features(self, ms_level=1, grid=True):
328        """Find mass features within an LCMSBase object
329
330        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
331
332        Parameters
333        ----------
334        ms_level : int, optional
335            The MS level to use for peak picking. Default is 1.
336        grid : bool, optional
337            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
338            used for persistent homology peak picking for profile data only. Default is True.
339
340        Raises
341        ------
342        ValueError
343            If no MS level data is found on the object.
344            If persistent homology peak picking is attempted on non-profile mode data.
345            If data is not gridded and grid is False.
346            If peak picking method is not implemented.
347
348        Returns
349        -------
350        None, but assigns the mass_features and eics attributes to the object.
351
352        """
353        pp_method = self.parameters.lc_ms.peak_picking_method
354
355        if pp_method == "persistent homology":
356            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
357            if all(msx_scan_df["ms_format"] == "profile"):
358                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
359            else:
360                raise ValueError(
361                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
362                        ms_level
363                    )
364                )
365        elif pp_method == "centroided_persistent_homology":
366            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
367            if all(msx_scan_df["ms_format"] == "centroid"):
368                self.find_mass_features_ph_centroid(ms_level=ms_level)
369            else:
370                raise ValueError(
371                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
372                        ms_level
373                    )
374                )
375        else:
376            raise ValueError("Peak picking method not implemented")
377
378    def integrate_mass_features(
379        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
380    ):
381        """Integrate mass features and extract EICs.
382
383        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
384
385        Parameters
386        ----------
387        drop_if_fail : bool, optional
388            Whether to drop mass features if the EIC limit calculations fail.
389            Default is True.
390        drop_duplicates : bool, optional
391            Whether to drop mass features that appear to be duplicates
392            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
393            Default is True.
394        ms_level : int, optional
395            The MS level to use. Default is 1.
396
397        Raises
398        ------
399        ValueError
400            If no mass features are found.
401            If no MS level data is found for the given MS level (either in data or in the scan data)
402
403        Returns
404        -------
405        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
406
407        Notes
408        -----
409        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
410        """
411        # Check if there is data
412        if ms_level in self._ms_unprocessed.keys():
413            raw_data = self._ms_unprocessed[ms_level].copy()
414        else:
415            raise ValueError("No MS level " + str(ms_level) + " data found")
416        if self.mass_features is not None:
417            mf_df = self.mass_features_to_df().copy()
418        else:
419            raise ValueError(
420                "No mass features found, did you run find_mass_features() first?"
421            )
422        # Check if mass_spectrum exists on each mass feature
423        if not all(
424            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
425        ):
426            raise ValueError(
427                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
428            )
429
430        # Subset scan data to only include correct ms_level
431        scan_df_sub = self.scan_df[
432            self.scan_df["ms_level"] == int(ms_level)
433        ].reset_index(drop=True)
434        if scan_df_sub.empty:
435            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
436        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
437
438        mzs_to_extract = np.unique(mf_df["mz"].values)
439        mzs_to_extract.sort()
440
441        # Pre-sort raw_data by mz for faster filtering
442        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
443        raw_data_mz = raw_data_sorted["mz"].values
444
445        # Get EICs for each unique mz in mass features list
446        for mz in mzs_to_extract:
447            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
448            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
449
450            # Use binary search for faster mz range filtering
451            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
452            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
453            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
454
455            raw_data_sub = (
456                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
457            )
458            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
459            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
460            myEIC = EIC_Data(
461                scans=raw_data_sub["scan"].values,
462                time=raw_data_sub["scan_time"].values,
463                eic=raw_data_sub["intensity"].values,
464            )
465            # Smooth EIC
466            smoothed_eic = self.smooth_tic(myEIC.eic)
467            smoothed_eic[smoothed_eic < 0] = 0
468            myEIC.eic_smoothed = smoothed_eic
469            self.eics[mz] = myEIC
470
471        # Get limits of mass features using EIC centroid detector and integrate
472        mf_df["area"] = np.nan
473        for idx, mass_feature in mf_df.iterrows():
474            mz = mass_feature.mz
475            apex_scan = mass_feature.apex_scan
476
477            # Pull EIC data and find apex scan index
478            myEIC = self.eics[mz]
479            self.mass_features[idx]._eic_data = myEIC
480            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
481
482            # Find left and right limits of peak using EIC centroid detector, add to EICData
483            centroid_eics = self.eic_centroid_detector(
484                myEIC.time,
485                myEIC.eic_smoothed,
486                mass_feature.intensity * 1.1,
487                apex_indexes=[int(apex_index)],
488            )
489            l_a_r_scan_idx = [i for i in centroid_eics]
490            if len(l_a_r_scan_idx) > 0:
491                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
492                # Find the number of consecutive non-zero values in the EIC segment
493                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
494                # Find the longest run of consecutive True values
495                if np.any(mask):
496                    # Find indices where mask changes value
497                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
498                    starts = np.where(diff == 1)[0]
499                    ends = np.where(diff == -1)[0]
500                    consecutive_scans = (ends - starts).max()
501                else:
502                    consecutive_scans = 0
503                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
504                    self.mass_features.pop(idx)
505                    continue
506                # Add start and final scan to mass_features and EICData
507                left_scan, right_scan = (
508                    myEIC.scans[l_a_r_scan_idx[0][0]],
509                    myEIC.scans[l_a_r_scan_idx[0][2]],
510                )
511                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
512                myEIC.apexes = myEIC.apexes + mf_scan_apex
513                self.mass_features[idx].start_scan = left_scan
514                self.mass_features[idx].final_scan = right_scan
515
516                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
517                area = np.trapz(
518                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
519                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
520                )
521                mf_df.at[idx, "area"] = area
522                myEIC.areas = myEIC.areas + [area]
523                self.eics[mz] = myEIC
524                self.mass_features[idx]._area = area
525            else:
526                if drop_if_fail is True:
527                    self.mass_features.pop(idx)
528
529        if drop_duplicates:
530            # Prepare mass feature dataframe
531            mf_df = self.mass_features_to_df()
532
533            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
534            # Keep the first mass feature (highest persistence)
535            for idx, mass_feature in mf_df.iterrows():
536                mz = mass_feature.mz
537                apex_scan = mass_feature.apex_scan
538
539                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
540                mf_df_sub = mf_df[
541                    mf_df["mz_diff_ppm"]
542                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
543                    * 10**6
544                ].copy()
545
546                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
547                for idx2, mass_feature2 in mf_df_sub.iterrows():
548                    if idx2 != idx:
549                        if (
550                            mass_feature2.start_scan >= mass_feature.start_scan
551                            and mass_feature2.final_scan <= mass_feature.final_scan
552                        ):
553                            if idx2 in self.mass_features.keys():
554                                self.mass_features.pop(idx2)
555
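# --- Illustrative sketch (not part of the library source) ----------------------
# Toy illustration of the consecutive-scan check used above: count the longest run
# of non-zero EIC points inside the integration window via a diff on the mask.
import numpy as np

eic = np.array([0.0, 3.0, 8.0, 6.0, 0.0, 2.0, 1.0, 0.0])
mask = eic > 0
diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
starts, ends = np.where(diff == 1)[0], np.where(diff == -1)[0]
print((ends - starts).max())  # -> 3 (longest stretch of consecutive non-zero scans)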
556    def find_c13_mass_features(self):
557        """Mark likely C13 isotopes and connect to monoisoitopic mass features.
558
559        Returns
560        -------
561        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
562
563        Raises
564        ------
565        ValueError
566            If no mass features are found.
567        """
568        verbose = self.parameters.lc_ms.verbose_processing
569        if verbose:
570            print("evaluating mass features for C13 isotopes")
571        if self.mass_features is None:
572            raise ValueError("No mass features found, run find_mass_features() first")
573
574        # Data prep for sparse distance matrix
575        dims = ["mz", "scan_time"]
576        mf_df = self.mass_features_to_df().copy()
577        # Drop mass features that have no area (these are likely to be noise)
578        mf_df = mf_df[mf_df["area"].notnull()]
579        mf_df["mf_id"] = mf_df.index.values
581
582        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
583        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
584
585        mz_diff = 1.003355  # C13-C12 mass difference
586        tol = [
587            mf_df["mz"].median()
588            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
589            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
590        ]  # mz, in relative; scan_time in minutes
591
592        # Compute inter-feature distances
593        distances = None
594        for i in range(len(dims)):
595            # Construct k-d tree
596            values = mf_df[dims[i]].values
597            tree = KDTree(values.reshape(-1, 1))
598
599            max_tol = tol[i]
600            if dims[i] == "mz":
601                # Maximum absolute tolerance
602                max_tol = mz_diff + tol[i]
603
604            # Compute sparse distance matrix
605            # the larger the max_tol, the slower this operation is
606            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
607
608            # Only consider forward case, exclude diagonal
609            sdm = sparse.triu(sdm, k=1)
610
611            if dims[i] == "mz":
612                min_tol = mz_diff - tol[i]
613                # Get only the ones that are above the min tol
614                idx = sdm.data > min_tol
615
616                # Reconstruct sparse distance matrix
617                sdm = sparse.coo_matrix(
618                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
619                    shape=(len(values), len(values)),
620                )
621
622            # Cast as binary matrix
623            sdm.data = np.ones_like(sdm.data)
624
625            # Stack distances
626            if distances is None:
627                distances = sdm
628            else:
629                distances = distances.multiply(sdm)
630
631        # Extract indices of within-tolerance points
632        distances = distances.tocoo()
633        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
634
635        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
636        pairs_mf = pairs.copy()
637        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
638        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
639
640        # Connect monoisotopic masses with isotopologues within mass_features
641        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
642        for mono in monos:
643            self.mass_features[mono].monoisotopic_mf_id = mono
644        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
645        while not pairs_iso_df.empty:
646            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
647            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
648            for iso in m1_isos:
649                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
650                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
651                if len(parent) > 1:
652                    # Choose the parent that is closest in time to the isotopologue
653                    parent_time = [self.mass_features[p].retention_time for p in parent]
654                    time_diff = [
655                        np.abs(self.mass_features[iso].retention_time - x)
656                        for x in parent_time
657                    ]
658                    parent = parent[np.argmin(time_diff)]
659                else:
660                    parent = parent[0]
661                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
662                    parent
663                ].monoisotopic_mf_id
664                if self.mass_features[iso].monoisotopic_mf_id is not None:
665                    mass_diff = (
666                        self.mass_features[iso].mz
667                        - self.mass_features[
668                            self.mass_features[iso].monoisotopic_mf_id
669                        ].mz
670                    )
671                    self.mass_features[iso].isotopologue_type = "13C" + str(
672                        int(round(mass_diff, 0))
673                    )
674
675            # Drop the mono and iso from the pairs_iso_df
676            pairs_iso_df = pairs_iso_df.drop(
677                index=monos, errors="ignore"
678            )  # Drop pairs whose parent was one of this round's monoisotopic features (already assigned)
679            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
680            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
681
682            if not pairs_iso_df.empty:
683                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
684                monos = np.setdiff1d(
685                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
686                )
687        if verbose:
688            # Report fraction of compounds annotated with isotopes
689            mf_df["c13_flag"] = np.where(
690                np.logical_or(
691                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
692                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
693                ),
694                1,
695                0,
696            )
697            print(
698                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
699                + " of mass features have or are C13 isotopes"
700            )
701
702    def deconvolute_ms1_mass_features(self):
703        """Deconvolute MS1 mass features
704
705        Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features
706
707        Parameters
708        ----------
709        None
710
711        Returns
712        -------
713        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
714        and associated_mass_features_deconvoluted attributes to the mass features in the
715        mass_features attribute of the LCMSBase object.
716
717        Raises
718        ------
719        ValueError
720            If no mass features are found, must run find_mass_features() first.
721            If no EICs are found, did you run integrate_mass_features() first?
722
723        """
724        # Checks for set mass_features and eics
725        if self.mass_features is None:
726            raise ValueError(
727                "No mass features found, did you run find_mass_features() first?"
728            )
729
730        if self.eics == {}:
731            raise ValueError(
732                "No EICs found, did you run integrate_mass_features() first?"
733            )
734
735        if 1 not in self._ms_unprocessed.keys():
736            raise ValueError("No unprocessed MS1 spectra found.")
737
738        # Prep ms1 data
739        ms1_data = self._ms_unprocessed[1].copy()
740        ms1_data = ms1_data.set_index("scan")
741
742        # Prep mass feature summary
743        mass_feature_df = self.mass_features_to_df()
744
745        # Loop through each mass feature
746        for mf_id, mass_feature in self.mass_features.items():
747            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
748            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
749                continue
750
751            # Get the left and right limits of the EIC of the mass feature
752            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
753
754            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
755            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
756            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
757
758            # Get the centroided masses of the mass feature
759            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
760
761            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
762            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
763                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
764            ]
765
766            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
767            ms1_data_sub["mz_diff_rel"] = (
768                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
769                / ms1_data_sub["mz"]
770            )
771            ms1_data_sub = ms1_data_sub[
772                ms1_data_sub["mz_diff_rel"]
773                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
774            ].reset_index(drop=True)
775
776            # Group by mass_feature_mz and scan and sum intensity
777            ms1_data_sub_group = (
778                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
779                .sum()
780                .reset_index()
781            )
782
783            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
784            corr = (
785                ms1_data_sub_group.pivot(
786                    index="scan", columns="mass_feature_mz", values="intensity"
787                )
788                .fillna(0)
789                .corr()
790            )
791
792            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above ms1_deconvolution_corr_min
793            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
794
795            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
796            try:
797                corr_subset = corr.loc[mass_feature.mz,]
798            except KeyError:
799                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
800                continue
801
802            corr_subset = corr_subset[corr_subset > decon_corr_min]
803
804            # Get the masses from the mass spectrum that are the result of the deconvolution
805            mzs_decon = corr_subset.index.values
806
807            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
808            mzs_decon_idx = [
809                id
810                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
811                if mz in mzs_decon
812            ]
813            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
814
815            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
816            if (
817                mass_feature.ms1_peak.abundance
818                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
819            ):
820                mass_feature.mass_spectrum_deconvoluted_parent = True
821            else:
822                mass_feature.mass_spectrum_deconvoluted_parent = False
823
824            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
825            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
826            mass_feature_df_sub = mass_feature_df[
827                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
828                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
829            ].copy()
830            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
831            mass_feature_df_sub["mz_diff_ppm"] = [
832                np.abs(mzs_decon - mz).min() / mz * 10**6
833                for mz in mass_feature_df_sub["mz"]
834            ]
835            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
836            mfs_associated_decon = mass_feature_df_sub[
837                mass_feature_df_sub["mz_diff_ppm"]
838                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
839            ].index.values
840
841            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Methods for performing LC calculations on mass spectra data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • get_max_eic(eic_data). Returns the maximum EIC value from the given EIC data. A static method.
  • smooth_tic(tic). Smooths the TIC data using the specified smoothing method and settings.
  • eic_centroid_detector(rt, eic, max_eic). Performs EIC centroid detection on the given EIC data.
  • find_nearest_scan(rt). Finds the nearest scan to the given retention time.
  • get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None). Returns an averaged mass spectrum object.
  • find_mass_features(ms_level=1). Find regions of interest for a given MS level (default is MS1).
  • integrate_mass_features(drop_if_fail=False, ms_level=1). Integrate mass features of interest and extracts EICs.
  • find_c13_mass_features(). Evaluate mass features and mark likely C13 isotopes.
  • deconvolute_ms1_mass_features(). Deconvolute mass features' ms1 mass spectra.
@staticmethod
def get_max_eic(eic_data: dict):
66    @staticmethod
67    def get_max_eic(eic_data: dict):
68        """Returns the maximum EIC value from the given EIC data.
69
70        Notes
71        -----
72        This is a static method.
73
74        Parameters
75        ----------
76        eic_data : dict
77            A dictionary containing EIC data.
78
79        Returns
80        -------
81        float
82            The maximum EIC value.
83        """
84        max_eic = 0
85        for eic_data in eic_data.values():
86            ind_max_eic = max(eic_data.get("EIC"))
87            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
88
89        return max_eic

Returns the maximum EIC value from the given EIC data.

Notes

This is a static method.

Parameters
  • eic_data (dict): A dictionary containing EIC data.
Returns
  • float: The maximum EIC value.
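
Example (illustrative sketch; the dictionary keys and intensity values below are made up, and only the "EIC" entry of each value is used by this method):

    from corems.mass_spectra.calc.lc_calc import LCCalculations

    eic_data = {
        212.1201: {"EIC": [0.0, 1.5e5, 3.2e5, 9.0e4]},
        301.0712: {"EIC": [0.0, 4.0e4, 7.5e4]},
    }
    print(LCCalculations.get_max_eic(eic_data))  # 320000.0
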
def smooth_tic(self, tic):
 91    def smooth_tic(self, tic):
 92        """Smooths the TIC or EIC data using the specified smoothing method and settings.
 93
 94        Parameters
 95        ----------
 96        tic : numpy.ndarray
 97            The TIC (or EIC) data to be smoothed.
 98
 99        Returns
100        -------
101        numpy.ndarray
102            The smoothed TIC data.
103        """
104        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
105
106        pol_order = self.parameters.lc_ms.savgol_pol_order
107
108        window_len = self.parameters.lc_ms.smooth_window
109
110        window = self.parameters.lc_ms.smooth_method
111
112        return sp.smooth_signal(
113            tic, window_len, window, pol_order, implemented_smooth_method
114        )

Smooths the TIC or EIC data using the specified smoothing method and settings.

Parameters
  • tic (numpy.ndarray): The TIC (or EIC) data to be smoothed.
Returns
  • numpy.ndarray: The smoothed TIC data.
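
This method pulls the window length, polynomial order, and smoothing method from self.parameters.lc_ms and dispatches to sp.smooth_signal. The standalone snippet below only illustrates the Savitzky-Golay case with arbitrary settings; it is not the CoreMS call path:

    import numpy as np
    from scipy.signal import savgol_filter

    tic = np.array([0, 2, 10, 30, 55, 60, 40, 15, 5, 1], dtype=float)
    smoothed = savgol_filter(tic, window_length=5, polyorder=2)  # illustrative settings only
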
def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
116    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
117        """Performs EIC centroid detection on the given EIC data.
118
119        Parameters
120        ----------
121        rt : numpy.ndarray
122            The retention time data.
123        eic : numpy.ndarray
124            The EIC data.
125        max_eic : float
126            The maximum EIC value.
127        apex_indexes : list, optional
128            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
129
130        Returns
131        -------
132        numpy.ndarray
133            The indexes of left, apex, and right limits as a generator.
134        """
135
136        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
137
138        max_height = self.parameters.lc_ms.peak_height_max_percent
139
140        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
141
142        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
143
144        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
145
146        include_indexes = sp.peak_picking_first_derivative(
147            domain=rt,
148            signal=eic,
149            max_height=max_height,
150            max_prominence=max_prominence,
151            max_signal=max_eic,
152            min_peak_datapoints=min_peak_datapoints,
153            peak_derivative_threshold=peak_derivative_threshold,
154            signal_threshold=signal_threshold,
155            correct_baseline=False,
156            plot_res=False,
157            apex_indexes=apex_indexes,
158        )
159
160        return include_indexes

Performs EIC centroid detection on the given EIC data.

Parameters
  • rt (numpy.ndarray): The retention time data.
  • eic (numpy.ndarray): The EIC data.
  • max_eic (float): The maximum EIC value.
  • apex_indexes (list, optional): The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
Returns
  • numpy.ndarray: The indexes of left, apex, and right limits as a generator.
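
The limit detection itself is delegated to sp.peak_picking_first_derivative with thresholds taken from self.parameters.lc_ms. As a rough, standalone analogue (not the CoreMS algorithm) of recovering (left, apex, right) index triplets from an EIC:

    import numpy as np
    from scipy import signal

    eic = np.array([0, 1, 3, 8, 12, 8, 3, 1, 0], dtype=float)
    apexes, _ = signal.find_peaks(eic, prominence=1)
    widths, heights, left_ips, right_ips = signal.peak_widths(eic, apexes, rel_height=0.95)
    triplets = [
        (int(np.floor(left)), int(apex), int(np.ceil(right)))
        for apex, left, right in zip(apexes, left_ips, right_ips)
    ]
    print(triplets)  # [(0, 4, 8)]
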
def find_nearest_scan(self, rt):
162    def find_nearest_scan(self, rt):
163        """Finds the nearest scan to the given retention time.
164
165        Parameters
166        ----------
167        rt : float
168            The retention time (in minutes) to find the nearest scan for.
169
170        Returns
171        -------
172        int
173            The scan number of the nearest scan.
174        """
175        array_rt = np.array(self.retention_time)
176
177        scan_index = (np.abs(array_rt - rt)).argmin()
178
179        real_scan = self.scans_number[scan_index]
180
181        return real_scan

Finds the nearest scan to the given retention time.

Parameters
  • rt (float): The retention time (in minutes) to find the nearest scan for.
Returns
  • int: The scan number of the nearest scan.
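
A standalone restatement of the lookup this method performs, with made-up retention times and scan numbers:

    import numpy as np

    retention_time = np.array([0.10, 0.25, 0.40, 0.55])  # minutes
    scans_number = [1, 2, 3, 4]
    rt = 0.42
    nearest = scans_number[int(np.abs(retention_time - rt).argmin())]
    print(nearest)  # 3
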
def add_peak_metrics(self):
183    def add_peak_metrics(self):
184        """Add peak metrics to the mass features.
185
186        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
187        """
188        # Check that at least some mass features have eic data
189        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
190            raise ValueError(
191                "No mass features have EIC data. Run integrate_mass_features first."
192            )
193
194        for mass_feature in self.mass_features.values():
195            # Check if the mass feature has been integrated
196            if mass_feature._eic_data is not None:
197                # Calculate peak metrics
198                mass_feature.calc_half_height_width()
199                mass_feature.calc_tailing_factor()
200                mass_feature.calc_dispersity_index()

Add peak metrics to the mass features.

This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.

def get_average_mass_spectrum(self, scan_list, apex_scan, spectrum_mode='profile', ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None):
202    def get_average_mass_spectrum(
203        self,
204        scan_list,
205        apex_scan,
206        spectrum_mode="profile",
207        ms_level=1,
208        auto_process=True,
209        use_parser=False,
210        perform_checks=True,
211        polarity=None,
212        ms_params=None,
213    ):
214        """Returns an averaged mass spectrum object
215
216        Parameters
217        ----------
218        scan_list : list
219            List of scan numbers to average.
220        apex_scan : int
221            Number of the apex scan
222        spectrum_mode : str, optional
223            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
224        ms_level : int, optional
225            The MS level to use. Defaults to 1.
226        auto_process : bool, optional
227            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
228        use_parser : bool, optional
229            If True, the mass spectra will be obtained from the parser. Defaults to False.
230        perform_checks : bool, optional
231            If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
232        polarity : int, optional
233            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
234        ms_params : MSParameters, optional
235            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
236
237        Returns
238        -------
239        MassSpectrumProfile
240            The averaged mass spectrum object.
241
242        Raises
243        ------
244        ValueError
245            If the spectrum mode is not "profile".
246            If the MS level is not found in the unprocessed mass spectra dictionary.
247            If not all scan numbers are found in the unprocessed mass spectra dictionary.
248        """
249        if perform_checks:
250            if spectrum_mode != "profile":
251                raise ValueError("Averaging only supported for profile mode")
252
253        if polarity is None:
254            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
255            if self.polarity == "negative":
256                polarity = -1
257            elif self.polarity == "positive":
258                polarity = 1
259            else:
260                raise ValueError(
261                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
262                )
263
264        # if not using_parser, check that scan numbers are in _ms_unprocessed
265        if not use_parser:
266            if perform_checks:
267                # Set index to scan for faster lookup
268                ms_df = (
269                    self._ms_unprocessed[ms_level]
270                    .copy()
271                    .set_index("scan", drop=False)
272                    .sort_index()
273                )
274                my_ms_df = ms_df.loc[scan_list]
275                # Check that all scan numbers are in the ms_df
276                if not all(np.isin(scan_list, ms_df.index)):
277                    raise ValueError(
278                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
279                    )
280            else:
281                my_ms_df = (
282                    pd.DataFrame({"scan": scan_list})
283                    .set_index("scan")
284                    .join(self._ms_unprocessed[ms_level], how="left")
285                )
286
287        if use_parser:
288            ms_list = [
289                self.spectra_parser.get_mass_spectrum_from_scan(
290                    x, spectrum_mode=spectrum_mode, auto_process=False
291                )
292                for x in scan_list
293            ]
294            ms_mz = [x._mz_exp for x in ms_list]
295            ms_int = [x._abundance for x in ms_list]
296            my_ms_df = []
297            for i in np.arange(len(ms_mz)):
298                my_ms_df.append(
299                    pd.DataFrame(
300                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
301                    )
302                )
303            my_ms_df = pd.concat(my_ms_df)
304
305        if not self.check_if_grid(my_ms_df):
306            my_ms_df = self.grid_data(my_ms_df)
307
308        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
309
310        ms = ms_from_array_profile(
311            my_ms_ave.mz,
312            my_ms_ave.intensity,
313            self.file_location,
314            polarity=polarity,
315            auto_process=False,
316        )
317
318        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
319        if ms is not None:
320            if ms_params is not None:
321                ms.parameters = ms_params
322            ms.scan_number = apex_scan
323            if auto_process:
324                ms.process_mass_spec()
325        return ms

Returns an averaged mass spectrum object

Parameters
  • scan_list (list): List of scan numbers to average.
  • apex_scan (int): Number of the apex scan
  • spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
  • ms_level (int, optional): The MS level to use. Defaults to 1.
  • auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
  • use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
  • perform_checks (bool, optional): If True, the function will check that the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
  • polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
  • ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
  • MassSpectrumProfile: The averaged mass spectrum object.
Raises
  • ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
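
At its core, the averaging step collects (mz, intensity, scan) rows for the requested scans, grids them if needed, and sums intensity per m/z before building the profile mass spectrum. A toy illustration of that aggregation step (values are made up; the gridding and ms_from_array_profile steps are omitted):

    import pandas as pd

    my_ms_df = pd.DataFrame({
        "scan": [10, 10, 11, 11],
        "mz": [100.0, 100.5, 100.0, 100.5],
        "intensity": [5.0, 2.0, 7.0, 1.0],
    })
    my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
    # mz 100.0 -> 12.0, mz 100.5 -> 3.0
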
def find_mass_features(self, ms_level=1, grid=True):
327    def find_mass_features(self, ms_level=1, grid=True):
328        """Find mass features within an LCMSBase object
329
330        Note that this is a wrapper function that calls the find_mass_features_ph or find_mass_features_ph_centroid function, depending on the configured peak picking method, but can be extended to support other peak picking methods in the future.
331
332        Parameters
333        ----------
334        ms_level : int, optional
335            The MS level to use for peak picking. Default is 1.
336        grid : bool, optional
337            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
338            used for persistent homology peak picking for profile data only. Default is True.
339
340        Raises
341        ------
342        ValueError
343            If no MS level data is found on the object.
344            If persistent homology peak picking is attempted on non-profile mode data.
345            If data is not gridded and grid is False.
346            If peak picking method is not implemented.
347
348        Returns
349        -------
350        None, but assigns the mass_features and eics attributes to the object.
351
352        """
353        pp_method = self.parameters.lc_ms.peak_picking_method
354
355        if pp_method == "persistent homology":
356            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
357            if all(msx_scan_df["ms_format"] == "profile"):
358                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
359            else:
360                raise ValueError(
361                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
362                        ms_level
363                    )
364                )
365        elif pp_method == "centroided_persistent_homology":
366            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
367            if all(msx_scan_df["ms_format"] == "centroid"):
368                self.find_mass_features_ph_centroid(ms_level=ms_level)
369            else:
370                raise ValueError(
371                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
372                        ms_level
373                    )
374                )
375        else:
376            raise ValueError("Peak picking method not implemented")

Find mass features within an LCMSBase object

Note that this is a wrapper function that calls the find_mass_features_ph or find_mass_features_ph_centroid function, depending on the configured peak picking method, but can be extended to support other peak picking methods in the future.

Parameters
  • ms_level (int, optional): The MS level to use for peak picking. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
  • None, but assigns the mass_features and eics attributes to the object.
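
The ValueError messages raised throughout this mixin imply a processing order for these methods. The function below is a hypothetical ordering sketch only: the name lcms_feature_workflow is invented, lcms is assumed to be an LCMSBase object with MS1 scan data loaded, and the add_associated_ms1 step referenced by integrate_mass_features is noted but not called because its arguments are not shown in this module:

    def lcms_feature_workflow(lcms):
        """Hypothetical ordering sketch; not a CoreMS-provided helper."""
        lcms.parameters.lc_ms.peak_picking_method = "persistent homology"  # or "centroided_persistent_homology"
        lcms.find_mass_features(ms_level=1, grid=True)
        # associate an MS1 mass spectrum with each mass feature here (add_associated_ms1)
        lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
        lcms.add_peak_metrics()
        lcms.find_c13_mass_features()
        lcms.deconvolute_ms1_mass_features()
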
def integrate_mass_features(self, drop_if_fail=True, drop_duplicates=True, ms_level=1):
378    def integrate_mass_features(
379        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
380    ):
381        """Integrate mass features and extract EICs.
382
383        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
384
385        Parameters
386        ----------
387        drop_if_fail : bool, optional
388            Whether to drop mass features if the EIC limit calculations fail.
389            Default is True.
390        drop_duplicates : bool, optional
391            Whether to drop mass features that appear to be duplicates
392            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
393            Default is True.
394        ms_level : int, optional
395            The MS level to use. Default is 1.
396
397        Raises
398        ------
399        ValueError
400            If no mass features are found.
401            If no MS level data is found for the given MS level (either in data or in the scan data)
402
403        Returns
404        -------
405        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
406
407        Notes
408        -----
409        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
410        """
411        # Check if there is data
412        if ms_level in self._ms_unprocessed.keys():
413            raw_data = self._ms_unprocessed[ms_level].copy()
414        else:
415            raise ValueError("No MS level " + str(ms_level) + " data found")
416        if self.mass_features is not None:
417            mf_df = self.mass_features_to_df().copy()
418        else:
419            raise ValueError(
420                "No mass features found, did you run find_mass_features() first?"
421            )
422        # Check if mass_spectrum exists on each mass feature
423        if not all(
424            [mf.mass_spectrum is not None for mf in self.mass_features.values()]
425        ):
426            raise ValueError(
427                "Mass spectrum must be associated with each mass feature, did you run add_associated_ms1() first?"
428            )
429
430        # Subset scan data to only include correct ms_level
431        scan_df_sub = self.scan_df[
432            self.scan_df["ms_level"] == int(ms_level)
433        ].reset_index(drop=True)
434        if scan_df_sub.empty:
435            raise ValueError("No MS level " + ms_level + " data found in scan data")
436        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
437
438        mzs_to_extract = np.unique(mf_df["mz"].values)
439        mzs_to_extract.sort()
440
441        # Pre-sort raw_data by mz for faster filtering
442        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
443        raw_data_mz = raw_data_sorted["mz"].values
444
445        # Get EICs for each unique mz in mass features list
446        for mz in mzs_to_extract:
447            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
448            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
449
450            # Use binary search for faster mz range filtering
451            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
452            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
453            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
454
455            raw_data_sub = (
456                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
457            )
458            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
459            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
460            myEIC = EIC_Data(
461                scans=raw_data_sub["scan"].values,
462                time=raw_data_sub["scan_time"].values,
463                eic=raw_data_sub["intensity"].values,
464            )
465            # Smooth EIC
466            smoothed_eic = self.smooth_tic(myEIC.eic)
467            smoothed_eic[smoothed_eic < 0] = 0
468            myEIC.eic_smoothed = smoothed_eic
469            self.eics[mz] = myEIC
470
471        # Get limits of mass features using EIC centroid detector and integrate
472        mf_df["area"] = np.nan
473        for idx, mass_feature in mf_df.iterrows():
474            mz = mass_feature.mz
475            apex_scan = mass_feature.apex_scan
476
477            # Pull EIC data and find apex scan index
478            myEIC = self.eics[mz]
479            self.mass_features[idx]._eic_data = myEIC
480            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
481
482            # Find left and right limits of peak using EIC centroid detector, add to EICData
483            centroid_eics = self.eic_centroid_detector(
484                myEIC.time,
485                myEIC.eic_smoothed,
486                mass_feature.intensity * 1.1,
487                apex_indexes=[int(apex_index)],
488            )
489            l_a_r_scan_idx = [i for i in centroid_eics]
490            if len(l_a_r_scan_idx) > 0:
491                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
492                # Find the number of consecutive non-zero values in the EIC segment
493                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
494                # Find the longest run of consecutive True values
495                if np.any(mask):
496                    # Find indices where mask changes value
497                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
498                    starts = np.where(diff == 1)[0]
499                    ends = np.where(diff == -1)[0]
500                    consecutive_scans = (ends - starts).max()
501                else:
502                    consecutive_scans = 0
503                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
504                    self.mass_features.pop(idx)
505                    continue
506                # Add start and final scan to mass_features and EICData
507                left_scan, right_scan = (
508                    myEIC.scans[l_a_r_scan_idx[0][0]],
509                    myEIC.scans[l_a_r_scan_idx[0][2]],
510                )
511                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
512                myEIC.apexes = myEIC.apexes + mf_scan_apex
513                self.mass_features[idx].start_scan = left_scan
514                self.mass_features[idx].final_scan = right_scan
515
516                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
517                area = np.trapz(
518                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
519                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
520                )
521                mf_df.at[idx, "area"] = area
522                myEIC.areas = myEIC.areas + [area]
523                self.eics[mz] = myEIC
524                self.mass_features[idx]._area = area
525            else:
526                if drop_if_fail is True:
527                    self.mass_features.pop(idx)
528
529        if drop_duplicates:
530            # Prepare mass feature dataframe
531            mf_df = self.mass_features_to_df()
532
533            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
534            # Keep the first mass feature (highest persistence)
535            for idx, mass_feature in mf_df.iterrows():
536                mz = mass_feature.mz
537                apex_scan = mass_feature.apex_scan
538
539                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
540                mf_df_sub = mf_df[
541                    mf_df["mz_diff_ppm"]
542                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
543                    * 10**6
544                ].copy()
545
546                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
547                for idx2, mass_feature2 in mf_df_sub.iterrows():
548                    if idx2 != idx:
549                        if (
550                            mass_feature2.start_scan >= mass_feature.start_scan
551                            and mass_feature2.final_scan <= mass_feature.final_scan
552                        ):
553                            if idx2 in self.mass_features.keys():
554                                self.mass_features.pop(idx2)

Integrate mass features and extract EICs.

Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

Parameters
  • drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
  • drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). Default is True.
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
  • None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes

drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
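
The EIC extraction inside this method applies a ppm window around each target m/z to a pre-sorted m/z array using binary search. A self-contained illustration of that windowing step (the m/z values and the 5 ppm tolerance are made up):

    import numpy as np

    mz_sorted = np.array([199.9990, 200.0000, 200.0008, 200.1000])  # raw data m/z, pre-sorted
    mz, tol_ppm = 200.0, 5.0
    mz_min = mz - tol_ppm * mz / 1e6
    mz_max = mz + tol_ppm * mz / 1e6
    left = np.searchsorted(mz_sorted, mz_min, side="left")
    right = np.searchsorted(mz_sorted, mz_max, side="right")
    print(mz_sorted[left:right])  # the three points within 5 ppm of 200.0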

def find_c13_mass_features(self):
556    def find_c13_mass_features(self):
557        """Mark likely C13 isotopes and connect to monoisoitopic mass features.
558
559        Returns
560        -------
561        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
562
563        Raises
564        ------
565        ValueError
566            If no mass features are found.
567        """
568        verbose = self.parameters.lc_ms.verbose_processing
569        if verbose:
570            print("evaluating mass features for C13 isotopes")
571        if self.mass_features is None:
572            raise ValueError("No mass features found, run find_mass_features() first")
573
574        # Data prep for sparse distance matrix
575        dims = ["mz", "scan_time"]
576        mf_df = self.mass_features_to_df().copy()
577        # Drop mass features that have no area (these are likely to be noise)
578        mf_df = mf_df[mf_df["area"].notnull()]
579        mf_df["mf_id"] = mf_df.index.values
580        dims = ["mz", "scan_time"]
581
582        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
583        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
584
585        mz_diff = 1.003355  # C13-C12 mass difference
586        tol = [
587            mf_df["mz"].median()
588            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
589            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
590        ]  # mz, in relative; scan_time in minutes
591
592        # Compute inter-feature distances
593        distances = None
594        for i in range(len(dims)):
595            # Construct k-d tree
596            values = mf_df[dims[i]].values
597            tree = KDTree(values.reshape(-1, 1))
598
599            max_tol = tol[i]
600            if dims[i] == "mz":
601                # Maximum absolute tolerance
602                max_tol = mz_diff + tol[i]
603
604            # Compute sparse distance matrix
605            # the larger the max_tol, the slower this operation is
606            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
607
608            # Only consider forward case, exclude diagonal
609            sdm = sparse.triu(sdm, k=1)
610
611            if dims[i] == "mz":
612                min_tol = mz_diff - tol[i]
613                # Get only the ones that are above the min tol
614                idx = sdm.data > min_tol
615
616                # Reconstruct sparse distance matrix
617                sdm = sparse.coo_matrix(
618                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
619                    shape=(len(values), len(values)),
620                )
621
622            # Cast as binary matrix
623            sdm.data = np.ones_like(sdm.data)
624
625            # Stack distances
626            if distances is None:
627                distances = sdm
628            else:
629                distances = distances.multiply(sdm)
630
631        # Extract indices of within-tolerance points
632        distances = distances.tocoo()
633        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
634
635        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
636        pairs_mf = pairs.copy()
637        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
638        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
639
640        # Connect monoisotopic masses with isotopologues within mass_features
641        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
642        for mono in monos:
643            self.mass_features[mono].monoisotopic_mf_id = mono
644        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
645        while not pairs_iso_df.empty:
646            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
647            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
648            for iso in m1_isos:
649                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
650                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
651                if len(parent) > 1:
652                    # Choose the parent that is closest in time to the isotopologue
653                    parent_time = [self.mass_features[p].retention_time for p in parent]
654                    time_diff = [
655                        np.abs(self.mass_features[iso].retention_time - x)
656                        for x in parent_time
657                    ]
658                    parent = parent[np.argmin(time_diff)]
659                else:
660                    parent = parent[0]
661                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
662                    parent
663                ].monoisotopic_mf_id
664                if self.mass_features[iso].monoisotopic_mf_id is not None:
665                    mass_diff = (
666                        self.mass_features[iso].mz
667                        - self.mass_features[
668                            self.mass_features[iso].monoisotopic_mf_id
669                        ].mz
670                    )
671                    self.mass_features[iso].isotopologue_type = "13C" + str(
672                        int(round(mass_diff, 0))
673                    )
674
675            # Drop the mono and iso from the pairs_iso_df
676            pairs_iso_df = pairs_iso_df.drop(
677                index=monos, errors="ignore"
678            )  # Drop pairs whose parent was one of this round's monoisotopic features (already assigned)
679            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
680            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
681
682            if not pairs_iso_df.empty:
683                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
684                monos = np.setdiff1d(
685                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
686                )
687        if verbose:
688            # Report fraction of compounds annotated with isotopes
689            mf_df["c13_flag"] = np.where(
690                np.logical_or(
691                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
692                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
693                ),
694                1,
695                0,
696            )
697            print(
698                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
699                + " of mass features have or are C13 isotopes"
700            )

Mark likely C13 isotopes and connect them to monoisotopic mass features.

Returns
  • None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found.
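
The isotope search builds a sparse distance matrix in the m/z dimension and keeps only forward pairs whose spacing falls within a tolerance of the 13C-12C mass difference (1.003355). A minimal standalone sketch of that pairing step, with made-up m/z values and tolerance:

    import numpy as np
    from scipy import sparse
    from scipy.spatial import KDTree

    mzs = np.array([200.0000, 201.0034, 350.1000])  # monoisotopic peak, candidate 13C peak, unrelated peak
    mz_diff, tol = 1.003355, 0.002

    tree = KDTree(mzs.reshape(-1, 1))
    sdm = tree.sparse_distance_matrix(tree, mz_diff + tol, output_type="coo_matrix")
    sdm = sparse.triu(sdm, k=1)        # forward pairs only, diagonal excluded
    keep = sdm.data > (mz_diff - tol)  # spacing must match the 13C-12C difference
    pairs = np.stack((sdm.row[keep], sdm.col[keep]), axis=1)
    print(pairs)  # [[0 1]] -> feature 1 is a likely 13C isotopologue of feature 0
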
def deconvolute_ms1_mass_features(self):
702    def deconvolute_ms1_mass_features(self):
703        """Deconvolute MS1 mass features
704
705        Deconvolute mass features ms1 spectrum based on the correlation of all masses within a spectrum over the EIC of the mass features
706
707        Parameters
708        ----------
709        None
710
711        Returns
712        -------
713        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
714        and associated_mass_features_deconvoluted attributes to the mass features in the
715        mass_features attribute of the LCMSBase object.
716
717        Raises
718        ------
719        ValueError
720            If no mass features are found, must run find_mass_features() first.
721            If no EICs are found, did you run integrate_mass_features() first?
722
723        """
724        # Checks for set mass_features and eics
725        if self.mass_features is None:
726            raise ValueError(
727                "No mass features found, did you run find_mass_features() first?"
728            )
729
730        if self.eics == {}:
731            raise ValueError(
732                "No EICs found, did you run integrate_mass_features() first?"
733            )
734
735        if 1 not in self._ms_unprocessed.keys():
736            raise ValueError("No unprocessed MS1 spectra found.")
737
738        # Prep ms1 data
739        ms1_data = self._ms_unprocessed[1].copy()
740        ms1_data = ms1_data.set_index("scan")
741
742        # Prep mass feature summary
743        mass_feature_df = self.mass_features_to_df()
744
745        # Loop through each mass feature
746        for mf_id, mass_feature in self.mass_features.items():
747            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
748            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
749                continue
750
751            # Get the left and right limits of the EIC of the mass feature
752            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
753
754            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
755            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
756            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
757
758            # Get the centroided masses of the mass feature
759            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
760
761            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
762            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
763                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
764            ]
765
766            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
767            ms1_data_sub["mz_diff_rel"] = (
768                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
769                / ms1_data_sub["mz"]
770            )
771            ms1_data_sub = ms1_data_sub[
772                ms1_data_sub["mz_diff_rel"]
773                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
774            ].reset_index(drop=True)
775
776            # Group by mass_feature_mz and scan and sum intensity
777            ms1_data_sub_group = (
778                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
779                .sum()
780                .reset_index()
781            )
782
783            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
784            corr = (
785                ms1_data_sub_group.pivot(
786                    index="scan", columns="mass_feature_mz", values="intensity"
787                )
788                .fillna(0)
789                .corr()
790            )
791
792            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above ms1_deconvolution_corr_min
793            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
794
795            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
796            try:
797                corr_subset = corr.loc[mass_feature.mz,]
798            except KeyError:
799                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
800                continue
801
802            corr_subset = corr_subset[corr_subset > decon_corr_min]
803
804            # Get the masses from the mass spectrum that are the result of the deconvolution
805            mzs_decon = corr_subset.index.values
806
807            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
808            mzs_decon_idx = [
809                id
810                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
811                if mz in mzs_decon
812            ]
813            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
814
815            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
816            if (
817                mass_feature.ms1_peak.abundance
818                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
819            ):
820                mass_feature.mass_spectrum_deconvoluted_parent = True
821            else:
822                mass_feature.mass_spectrum_deconvoluted_parent = False
823
824            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
825            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
826            mass_feature_df_sub = mass_feature_df[
827                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
828                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
829            ].copy()
830            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
831            mass_feature_df_sub["mz_diff_ppm"] = [
832                np.abs(mzs_decon - mz).min() / mz * 10**6
833                for mz in mass_feature_df_sub["mz"]
834            ]
835            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
836            mfs_associated_decon = mass_feature_df_sub[
837                mass_feature_df_sub["mz_diff_ppm"]
838                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
839            ].index.values
840
841            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Deconvolute MS1 mass features.

Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.

Parameters
  • None
Returns
  • None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found, must run find_mass_features() first. If no EICs are found, did you run integrate_mass_features() first?
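
The deconvolution hinges on pivoting per-scan intensities of candidate masses into a scan-by-mass table and correlating its columns; masses whose intensity profiles correlate with the mass feature's own m/z above ms1_deconvolution_corr_min are retained. A toy version of that step with invented intensities and a 0.8 cutoff:

    import pandas as pd

    ms1 = pd.DataFrame({
        "scan": [1, 1, 1, 2, 2, 2, 3, 3, 3],
        "mass_feature_mz": [200.0, 242.0, 350.0] * 3,
        "intensity": [10, 20, 5, 50, 100, 4, 20, 40, 6],
    })
    corr = (
        ms1.pivot(index="scan", columns="mass_feature_mz", values="intensity")
        .fillna(0)
        .corr()
    )
    related = corr.loc[200.0]
    print(related[related > 0.8].index.values)  # [200. 242.] (200.0 correlates with itself)
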
class PHCalculations:
 844class PHCalculations:
 845    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
 846
 847    Notes
 848    -----
 849    This class is intended to be used as a mixin for the LCMSBase class.
 850
 851    Methods
 852    -------
 853    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
 854        Sparse implementation of a mean filter.
 855    * embed_unique_indices(a).
 856        Creates an array of indices, sorted by unique element.
 857    * sparse_upper_star(idx, V).
 858        Sparse implementation of an upper star filtration.
 859    * check_if_grid(data).
 860        Check if the data is gridded in mz space.
 861    * grid_data(data).
 862        Grid the data in the mz dimension.
 863    * find_mass_features_ph(ms_level=1, grid=True).
 864        Find mass features within an LCMSBase object using persistent homology.
 865    * cluster_mass_features(drop_children=True).
 866        Cluster regions of interest.
 867    """
 868
 869    @staticmethod
 870    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
 871        """Sparse implementation of a mean filter.
 872
 873        Parameters
 874        ----------
 875        idx : :obj:`~numpy.array`
 876            Edge indices for each dimension (MxN).
 877        V : :obj:`~numpy.array`
 878            Array of intensity data (Mx1).
 879        radius : float or list
 880            Radius of the sparse filter in each dimension. Values less than
 881            zero indicate no connectivity in that dimension.
 882
 883        Returns
 884        -------
 885        :obj:`~numpy.array`
 886            Filtered intensities (Mx1).
 887
 888        Notes
 889        -----
 890        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
 891        This is a static method.
 892        """
 893
 894        # Copy indices
 895        idx = idx.copy().astype(V.dtype)
 896
 897        # Scale
 898        for i, r in enumerate(radius):
 899            # Increase inter-index distance
 900            if r < 1:
 901                idx[:, i] *= 2
 902
 903            # Do nothing
 904            elif r == 1:
 905                pass
 906
 907            # Decrease inter-index distance
 908            else:
 909                idx[:, i] /= r
 910
 911        # Connectivity matrix
 912        cmat = KDTree(idx)
 913        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
 914        cmat.setdiag(1)
 915
 916        # Pair indices
 917        I, J = cmat.nonzero()
 918
 919        # Delete cmat
 920        cmat_shape = cmat.shape
 921        del cmat
 922
 923        # Sum over columns
 924        V_sum = sparse.bsr_matrix(
 925            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
 926        ).diagonal(0)
 927
 928        # Count over columns
 929        V_count = sparse.bsr_matrix(
 930            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
 931        ).diagonal(0)
 932
 933        return V_sum / V_count
 934
 935    @staticmethod
 936    def embed_unique_indices(a):
 937        """Creates an array of indices, sorted by unique element.
 938
 939        Parameters
 940        ----------
 941        a : :obj:`~numpy.array`
 942            Array of unique elements (Mx1).
 943
 944        Returns
 945        -------
 946        :obj:`~numpy.array`
 947            Array of indices (Mx1).
 948
 949        Notes
 950        -----
 951        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 952        This is a static method.
 953        """
 954
 955        def count_tens(n):
 956            # Count tens
 957            ntens = (n - 1) // 10
 958
 959            while True:
 960                ntens_test = (ntens + n - 1) // 10
 961
 962                if ntens_test == ntens:
 963                    return ntens
 964                else:
 965                    ntens = ntens_test
 966
 967        def arange_exclude_10s(n):
 968            # How many 10s will there be?
 969            ntens = count_tens(n)
 970
 971            # Base array
 972            arr = np.arange(0, n + ntens)
 973
 974            # Exclude 10s
 975            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 976
 977            return arr
 978
 979        # Creates an array of indices, sorted by unique element
 980        idx_sort = np.argsort(a)
 981        idx_unsort = np.argsort(idx_sort)
 982
 983        # Sorts records array so all unique elements are together
 984        sorted_a = a[idx_sort]
 985
 986        # Returns the unique values, the index of the first occurrence,
 987        # and the count for each element
 988        vals, idx_start, count = np.unique(
 989            sorted_a, return_index=True, return_counts=True
 990        )
 991
 992        # Splits the indices into separate arrays
 993        splits = np.split(idx_sort, idx_start[1:])
 994
 995        # Creates unique indices for each split
 996        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 997
 998        # Reorders according to input array
 999        idx_unq = idx_unq[idx_unsort]
1000
1001        # Magnitude of each index
1002        exp = np.log10(
1003            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
1004        )
1005        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1006
1007        # Result
1008        return a + idx_unq / idx_unq_mag
1009
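A small worked example (editor's sketch, not part of the module source) of how tied values are made distinct by embedding a fractional index; the output in the comment follows from the logic above.

    import numpy as np
    from corems.mass_spectra.calc.lc_calc import PHCalculations

    a = np.array([1, 1, 2])
    PHCalculations.embed_unique_indices(a)
    # -> array([1. , 1.1, 2. ])
    # The first occurrence of each value keeps an offset of 0; later duplicates
    # receive .1, .2, ..., which is what allows sparse_upper_star (below) to
    # treat equal intensities as distinct filtration values.
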
1010    @staticmethod
1011    def roll_up_dataframe(
1012        df: pd.DataFrame,
1013        sort_by: str,
1014        tol: list,
1015        relative: list,
1016        dims: list,
1017        memory_opt_threshold: int = 10000,
1018    ):
1019        """Subset data by rolling up into apex in appropriate dimensions.
1020
1021        Parameters
1022        ----------
1023        df : pd.DataFrame
1024            The input data containing "dims" columns and the "sort_by" column.
1025        sort_by : str
1026            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
1027            (i.e., the mass feature with the highest value in the sort_by column).
1028        dims : list
1029            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1030        tol : list
1031            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1032            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1033            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1034            If relative is False, the tolerance will be used as is.
1035        relative : list
1036            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1037        memory_opt_threshold : int, optional
1038            Minimum number of rows to trigger memory-optimized processing. Default is 10000.
1039
1040        Returns
1041        -------
1042        pd.DataFrame
1043            A DataFrame with only the rolled up mass features, with the original index and columns.
1044
1045
1046        Raises
1047        ------
1048        ValueError
1049            If the input data is not a pandas DataFrame.
1050            If the input data does not have columns for each of the dimensions in "dims".
1051            If the length of "dims", "tol", and "relative" do not match.
1052        """
1053        og_columns = df.columns.copy()
1054
1055        # Unindex the data, but keep the original index
1056        if df.index.name is not None:
1057            og_index = df.index.name
1058        else:
1059            og_index = "index"
1060        df = df.reset_index(drop=False)
1061
1062        # Sort data by sort_by column, and reindex
1063        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1064
1065        # Check that data is a DataFrame and has columns for each of the dims
1066        if not isinstance(df, pd.DataFrame):
1067            raise ValueError("Data must be a pandas DataFrame")
1068        for dim in dims:
1069            if dim not in df.columns:
1070                raise ValueError(f"Data must have a column for {dim}")
1071        if len(dims) != len(tol) or len(dims) != len(relative):
1072            raise ValueError(
1073                "Dimensions, tolerances, and relative flags must be the same length"
1074            )
1075
1076        # Pre-compute all values arrays
1077        all_values = [df[dim].values for dim in dims]
1078
1079        # Choose processing method based on dataframe size
1080        if len(df) >= memory_opt_threshold:
1081            # Memory-optimized approach for large dataframes
1082            distances = PHCalculations._compute_distances_memory_optimized(
1083                all_values, tol, relative
1084            )
1085        else:
1086            # Faster approach for smaller dataframes
1087            distances = PHCalculations._compute_distances_original(
1088                all_values, tol, relative
1089            )
1090
1091        # Process pairs with original logic but memory optimizations
1092        distances = distances.tocoo()
1093        pairs = np.stack((distances.row, distances.col), axis=1)
1094        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
1095        del distances, pairs  # Free memory immediately
1096
1097        to_drop = []
1098        while not pairs_df.empty:
1099            # Find root_parents and their children (original logic preserved)
1100            root_parents = np.setdiff1d(
1101                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1102            )
1103            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1104            to_drop.extend(children_of_roots)  # Use extend instead of append
1105
1106            # Remove root_children as possible parents from pairs_df for next iteration
1107            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1108            pairs_df = pairs_df.reset_index().set_index("child")
1109            # Remove root_children as possible children from pairs_df for next iteration
1110            pairs_df = pairs_df.drop(index=children_of_roots)
1111
1112            # Prepare for next iteration
1113            pairs_df = pairs_df.reset_index().set_index("parent")
1114
1115        # Convert to numpy array for efficient dropping
1116        to_drop = np.array(to_drop)
1117
1118        # Drop mass features that are not cluster parents
1119        df_sub = df.drop(index=to_drop)
1120
1121        # Set index back to og_index and only keep original columns
1122        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]
1123
1124        return df_sub
1125
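Illustrative usage (editor's sketch, not part of the module source): rolling up three toy features, two of which fall within tolerance of each other. The column names mirror how the method is called elsewhere in this module; the tolerance values are chosen only for the example.

    import pandas as pd
    from corems.mass_spectra.calc.lc_calc import PHCalculations

    features = pd.DataFrame(
        {
            "mz": [200.0000, 200.0002, 350.0000],
            "scan_time": [5.00, 5.05, 5.00],
            "persistence": [1000.0, 50.0, 400.0],
        }
    )

    rolled = PHCalculations.roll_up_dataframe(
        df=features,
        sort_by="persistence",
        dims=["mz", "scan_time"],
        tol=[5e-6, 0.2],      # mz: 5 ppm (relative); scan_time: 0.2 min (absolute)
        relative=[True, False],
    )
    # The 200.0002 feature lies within both tolerances of the more persistent
    # 200.0000 feature, so it is rolled up; only the rows with original index
    # 0 and 2 are returned.
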
1126    @staticmethod
1127    def _compute_distances_original(all_values, tol, relative):
1128        """Original distance computation method for smaller datasets.
1129
1130        This method computes the pairwise distances between features in the dataset
1131        using a straightforward approach. It is suitable for smaller datasets where
1132        memory usage is not a primary concern.
1133
1134        Parameters
1135        ----------
1136        all_values : list of :obj:`~numpy.array`
1137            List of arrays containing the values for each dimension.
1138        tol : list of float
1139            List of tolerances for each dimension.
1140        relative : list of bool
1141            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1142
1143        Returns
1144        -------
1145        :obj:`~scipy.sparse.coo_matrix`
1146            Sparse matrix indicating pairwise distances within tolerances.
1147        """
1148        # Compute inter-feature distances with memory optimization
1149        distances = None
1150        for i in range(len(all_values)):
1151            values = all_values[i]
1152            # Use single precision if possible to reduce memory
1153            tree = KDTree(values.reshape(-1, 1).astype(np.float32))
1154
1155            max_tol = tol[i]
1156            if relative[i] is True:
1157                max_tol = tol[i] * values.max()
1158
1159            # Compute sparse distance matrix with smaller chunks if memory is an issue
1160            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1161
1162            # Only consider forward case, exclude diagonal
1163            sdm = sparse.triu(sdm, k=1)
1164
1165            # Process relative distances more efficiently
1166            if relative[i] is True:
1167                # Vectorized computation without creating intermediate arrays
1168                row_values = values[sdm.row]
1169                valid_idx = sdm.data <= tol[i] * row_values
1170
1171                # Reconstruct sparse matrix more efficiently
1172                sdm = sparse.coo_matrix(
1173                    (
1174                        np.ones(valid_idx.sum(), dtype=np.uint8),
1175                        (sdm.row[valid_idx], sdm.col[valid_idx]),
1176                    ),
1177                    shape=(len(values), len(values)),
1178                )
1179            else:
1180                # Cast as binary matrix with smaller data type
1181                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)
1182
1183            # Stack distances with memory-efficient multiplication
1184            if distances is None:
1185                distances = sdm
1186            else:
1187                # Use in-place operations where possible
1188                distances = distances.multiply(sdm)
1189                del sdm  # Free memory immediately
1190
1191        return distances
1192
1193    @staticmethod
1194    def _compute_distances_memory_optimized(all_values, tol, relative):
1195        """Memory-optimized distance computation for large datasets.
1196
1197        This method computes the pairwise distances between features in the dataset
1198        using a more memory-efficient approach. It is suitable for larger datasets
1199        where memory usage is a primary concern.
1200
1201        Parameters
1202        ----------
1203        all_values : list of :obj:`~numpy.array`
1204            List of arrays containing the values for each dimension.
1205        tol : list of float
1206            List of tolerances for each dimension.
1207        relative : list of bool
1208            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1209
1210        Returns
1211        -------
1212        :obj:`~scipy.sparse.coo_matrix`
1213            Sparse matrix indicating pairwise distances within tolerances.
1214        """
1215        # Compute distance matrix for first dimension (full matrix as before)
1216        values_0 = all_values[0].astype(np.float32)
1217        tree_0 = KDTree(values_0.reshape(-1, 1))
1218
1219        max_tol_0 = tol[0]
1220        if relative[0] is True:
1221            max_tol_0 = tol[0] * values_0.max()
1222
1223        # Compute sparse distance matrix for first dimension
1224        distances = tree_0.sparse_distance_matrix(
1225            tree_0, max_tol_0, output_type="coo_matrix"
1226        )
1227        distances = sparse.triu(distances, k=1)
1228
1229        # Process relative distances for first dimension
1230        if relative[0] is True:
1231            row_values = values_0[distances.row]
1232            valid_idx = distances.data <= tol[0] * row_values
1233            distances = sparse.coo_matrix(
1234                (
1235                    np.ones(valid_idx.sum(), dtype=np.uint8),
1236                    (distances.row[valid_idx], distances.col[valid_idx]),
1237                ),
1238                shape=(len(values_0), len(values_0)),
1239            )
1240        else:
1241            distances.data = np.ones(len(distances.data), dtype=np.uint8)
1242
1243        # For remaining dimensions, work only on chunks defined by first dimension pairs
1244        if len(all_values) > 1:
1245            distances_coo = distances.tocoo()
1246            valid_pairs = []
1247
1248            # Process each pair from first dimension
1249            for idx in range(len(distances_coo.data)):
1250                i, j = distances_coo.row[idx], distances_coo.col[idx]
1251                is_valid_pair = True
1252
1253                # Check remaining dimensions for this specific pair
1254                for dim_idx in range(1, len(all_values)):
1255                    values = all_values[dim_idx]
1256                    val_i, val_j = values[i], values[j]
1257
1258                    max_tol = tol[dim_idx]
1259                    if relative[dim_idx] is True:
1260                        max_tol = tol[dim_idx] * values.max()
1261
1262                    distance_ij = abs(val_i - val_j)
1263
1264                    # Check if this pair satisfies the tolerance for this dimension
1265                    if relative[dim_idx] is True:
1266                        if distance_ij > tol[dim_idx] * val_i:
1267                            is_valid_pair = False
1268                            break
1269                    else:
1270                        if distance_ij > max_tol:
1271                            is_valid_pair = False
1272                            break
1273
1274                if is_valid_pair:
1275                    valid_pairs.append((i, j))
1276
1277            # Rebuild distances matrix with only valid pairs
1278            if valid_pairs:
1279                valid_pairs = np.array(valid_pairs)
1280                distances = sparse.coo_matrix(
1281                    (
1282                        np.ones(len(valid_pairs), dtype=np.uint8),
1283                        (valid_pairs[:, 0], valid_pairs[:, 1]),
1284                    ),
1285                    shape=(len(values_0), len(values_0)),
1286                )
1287            else:
1288                # No valid pairs found
1289                distances = sparse.coo_matrix(
1290                    (len(values_0), len(values_0)), dtype=np.uint8
1291                )
1292
1293        return distances
1294
1295    def sparse_upper_star(self, idx, V):
1296        """Sparse implementation of an upper star filtration.
1297
1298        Parameters
1299        ----------
1300        idx : :obj:`~numpy.array`
1301            Edge indices for each dimension (MxN).
1302        V : :obj:`~numpy.array`
1303            Array of intensity data (Mx1).
1304        Returns
1305        -------
1306        idx : :obj:`~numpy.array`
1307            Index of filtered points (Mx1).
1308        persistence : :obj:`~numpy.array`
1309            Persistence of each filtered point (Mx1).
1310
1311        Notes
1312        -----
1313        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1314        """
1315
1316        # Invert
1317        V = -1 * V.copy().astype(int)
1318
1319        # Embed indices
1320        V = self.embed_unique_indices(V)
1321
1322        # Connectivity matrix
1323        cmat = KDTree(idx)
1324        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1325        cmat.setdiag(1)
1326        cmat = sparse.triu(cmat)
1327
1328        # Pairwise minimums of the original intensities (maxima of the negated values)
1329        I, J = cmat.nonzero()
1330        d = np.maximum(V[I], V[J])
1331
1332        # Delete connectivity matrix
1333        cmat_shape = cmat.shape
1334        del cmat
1335
1336        # Sparse distance matrix
1337        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1338
1339        # Delete pairwise mins
1340        del d, I, J
1341
1342        # Persistence homology
1343        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1344
1345        # Bound death values
1346        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1347
1348        # Construct tree to query against
1349        tree = KDTree(V.reshape((-1, 1)))
1350
1351        # Get the indexes of the first nearest neighbor by birth
1352        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1353
1354        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1355
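A hedged sketch (editor's addition) of how this method is used by _process_partition_ph further below: given integer grid positions and smoothed intensities, it returns the row indices of persistence-diagram birth points and their persistence values. Here lcms_obj stands for an already-instantiated LCMSBase-derived object and is an assumption made only for illustration.

    import numpy as np

    idx = np.array([[0, 0], [1, 0], [2, 0], [3, 0]])  # (mz index, scan index) pairs
    V = np.array([5, 50, 5, 40])                      # intensities at those positions

    pidx, pers = lcms_obj.sparse_upper_star(idx, V)
    # pidx -> indices into V of the points seeding 0-dimensional persistence classes
    # pers -> persistence of each class; larger values indicate more prominent peaks
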
1356    def check_if_grid(self, data):
1357        """Check if the data are gridded in mz space.
1358
1359        Parameters
1360        ----------
1361        data : DataFrame
1362            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1363
1364        Returns
1365        -------
1366        bool
1367            True if the data is gridded in the mz direction, False otherwise.
1368
1369        Notes
1370        -----
1371        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1372        """
1373        # Calculate the difference between consecutive mz values in a single scan
1374        dat_check = data.copy().reset_index(drop=True)
1375        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1376        mz_diff_min = (
1377            dat_check.groupby("scan")["mz_diff"].min().min()
1378        )  # within each scan, what is the smallest mz difference between consecutive mz values
1379
1380        # Find the minimum mz difference between mz values in the data, regardless of scan
1381        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1382        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1383        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1384
1385        # Get minimum mz_diff between mz values in the data
1386        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1387
1388        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1389        if mz_diff_min_raw < mz_diff_min:
1390            return False
1391        else:
1392            return True
1393
1394    def grid_data(self, data, attempts=5):
1395        """Grid the data in the mz dimension.
1396
1397        Data must be gridded prior to persistent homology calculations and to computing an averaged mass spectrum.
1398
1399        Parameters
1400        ----------
1401        data : DataFrame
1402            The input data containing mz, scan, scan_time, and intensity columns.
1403        attempts : int, optional
1404            The number of attempts to grid the data. Default is 5.
1405
1406        Returns
1407        -------
1408        DataFrame
1409            The gridded data with mz, scan, scan_time, and intensity columns.
1410
1411        Raises
1412        ------
1413        ValueError
1414            If gridding fails after the specified number of attempts.
1415        """
1416        attempt_i = 0
1417        while attempt_i < attempts:
1418            attempt_i += 1
1419            data = self._grid_data(data)
1420
1421            if self.check_if_grid(data):
1422                return data
1423
1424        if not self.check_if_grid(data):
1425            raise ValueError(
1426                "Gridding failed after "
1427                + str(attempt_i)
1428                + " attempts. Please check the data."
1429            )
1430        else:
1431            return data
1432
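A hedged usage sketch (editor's addition), following the pattern used in find_mass_features_ph below: check whether the profile data are gridded in mz and regrid if not. lcms_obj and the shape of its unprocessed MS1 table (mz, scan, and intensity columns) are assumptions made for illustration.

    data = lcms_obj._ms_unprocessed[1].dropna(subset=["intensity"]).reset_index(drop=True)

    if not lcms_obj.check_if_grid(data):
        data = lcms_obj.grid_data(data)  # raises ValueError if gridding fails after 5 attempts
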
1433    def _grid_data(self, data):
1434        """Internal method to grid the data in the mz dimension.
1435
1436        Notes
1437        -----
1438        This method is called by the grid_data method and should not be called directly.
1439        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
1440        but it does not check if the data is already gridded or if the gridding is successful.
1441
1442        Parameters
1443        ----------
1444        data : pd.DataFrame
1445            The input data to grid.
1446
1447        Returns
1448        -------
1449        pd.DataFrame
1450            The data after attempting to grid it in the mz dimension.
1451        """
1452        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1453        data_w = data.copy().reset_index(drop=True)
1454        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1455        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1456
1457        # Need high intensity mz values first so they are parents in the output pairs stack
1458        dat_mz = data_w[["mz", "intensity"]].sort_values(
1459            by=["intensity"], ascending=False
1460        )
1461        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1462
1463        # Construct KD tree
1464        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1465        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1466        sdm = sparse.triu(sdm, k=1)
1467        sdm.data = np.ones_like(sdm.data)
1468        distances = sdm.tocoo()
1469        pairs = np.stack((distances.row, distances.col), axis=1)
1470
1471        # Cull pairs to just get root
1472        to_drop = []
1473        while len(pairs) > 0:
1474            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1475            id_root_parents = np.isin(pairs[:, 0], root_parents)
1476            children_of_roots = np.unique(pairs[id_root_parents, 1])
1477            to_drop = np.append(to_drop, children_of_roots)
1478
1479            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1480            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1481            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1482        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1483        mz_dat_np = (
1484            dat_mz[["mz"]]
1485            .sort_values(by=["mz"])
1486            .reset_index(drop=True)
1487            .values.flatten()
1488        )
1489
1490        # Sort data by mz and recast mz to nearest value in mz_dat_np
1491        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1492        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1493        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1494
1495        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1496        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1497        new_data_w = (
1498            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1499            .groupby(["scan", "mz"])["intensity"]
1500            .sum()
1501            .reset_index()
1502        )
1503        new_data_w = (
1504            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1505            .reset_index(drop=True)
1506            .copy()
1507        )
1508
1509        return new_data_w
1510
1511    def find_mass_features_ph(self, ms_level=1, grid=True):
1512        """Find mass features within an LCMSBase object using persistent homology.
1513
1514        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1515
1516        Parameters
1517        ----------
1518        ms_level : int, optional
1519            The MS level to use. Default is 1.
1520        grid : bool, optional
1521            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1522
1523        Raises
1524        ------
1525        ValueError
1526            If no MS level data is found on the object.
1527            If data is not gridded and grid is False.
1528
1529        Returns
1530        -------
1531        None, but assigns the mass_features attribute to the object.
1532
1533        Notes
1534        -----
1535        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1536        """
1537        # Check that ms_level is a key in self._ms_unprocessed
1538        if ms_level not in self._ms_unprocessed.keys():
1539            raise ValueError(
1540                "No MS level "
1541                + str(ms_level)
1542                + " data found, did you instantiate with parser specific to MS level?"
1543            )
1544
1545        # Get ms data
1546        data = self._ms_unprocessed[ms_level].copy()
1547
1548        # Drop rows with missing intensity values and reset index
1549        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1550
1551        # Threshold data
1552        dims = ["mz", "scan_time"]
1553        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1554        persistence_threshold = (
1555            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1556        )
1557        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1558
1559        # Check if gridded, if not, grid
1560        gridded_mz = self.check_if_grid(data_thres)
1561        if gridded_mz is False:
1562            if grid is False:
1563                raise ValueError(
1564                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1565                )
1566            else:
1567                data_thres = self.grid_data(data_thres)
1568
1569        # Add scan_time
1570        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1571        # Process in chunks if required
1572        if len(data_thres) > 10000:
1573            return self._find_mass_features_ph_partition(
1574                data_thres, dims, persistence_threshold
1575            )
1576        else:
1577            # Process all at once
1578            return self._find_mass_features_ph_single(
1579                data_thres, dims, persistence_threshold
1580            )
1581
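Typical usage, sketched by the editor and hedged: lcms_obj is assumed to be an LCMSBase-derived object instantiated with an MS1 profile-mode parser, so that its _ms_unprocessed[1] table is populated.

    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)

    print(len(lcms_obj.mass_features))                    # number of initial mass features
    first = next(iter(lcms_obj.mass_features.values()))   # an LCMSMassFeature object
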
1582    def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold):
1583        """Process all data at once (original logic)."""
1584        # Build factors
1585        factors = {
1586            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1587            for dim in dims
1588        }
1589
1590        # Build indexes
1591        index = {
1592            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1593            for dim in factors
1594        }
1595
1596        # Smooth and process
1597        mass_features_df = self._process_partition_ph(
1598            data_thres, index, dims, persistence_threshold
1599        )
1600
1601        # Roll up within chunk to remove duplicates
1602        mass_features_df = self.roll_up_dataframe(
1603            df=mass_features_df,
1604            sort_by="persistence",
1605            dims=["mz", "scan_time"],
1606            tol=[
1607                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1608                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1609            ],
1610            relative=[True, False],
1611        )
1612        mass_features_df = mass_features_df.reset_index(drop=True)
1613
1614        # Populate mass_features attribute
1615        self._populate_mass_features(mass_features_df)
1616
1617    def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold):
1618        """Partition the persistent homology mass feature detection for large datasets.
1619
1620        This method splits the input data into overlapping scan partitions, processes each partition to detect mass features
1621        using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute.
1622
1623        Parameters
1624        ----------
1625        data_thres : pd.DataFrame
1626            The thresholded input data containing mass spectrometry information.
1627        dims : list
1628            List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection.
1629        persistence_threshold : float
1630            Minimum persistence value required for a detected mass feature to be retained.
1631
1632        Returns
1633        -------
1634        None
1635            Populates the mass_features attribute of the object with detected mass features.
1636        """
1637        all_mass_features = []
1638
1639        # Split scans into partitions
1640        unique_scans = sorted(data_thres["scan"].unique())
1641        unique_scans_n = len(unique_scans)
1642
1643        # Calculate partition size in scans based on goal
1644        partition_size_goal = 5000
1645        scans_per_partition = max(
1646            1, partition_size_goal // (len(data_thres) // unique_scans_n)
1647        )
1648        if scans_per_partition == 0:
1649            scans_per_partition = 1
1650
1651        # Make partitions based on scans, with overlapping in partitioned scans
1652        scan_overlap = 4
1653        partition_scans = []
1654        for i in range(0, unique_scans_n, scans_per_partition):
1655            start_idx = max(0, i - scan_overlap)
1656            end_idx = min(
1657                unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap
1658            )
1659            scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]]
1660            partition_scans.append(scans_group)
1661
1662        # Set index to scan for faster filtering
1663        data_thres = data_thres.set_index("scan")
1664        for scans in partition_scans:
1665            # Select the scans for this partition (including the overlap scans)
1666            partition_data = data_thres.loc[scans].reset_index(drop=False).copy()
1667
1668            if len(partition_data) == 0:
1669                continue
1670
1671            # Build factors for this partition
1672            factors = {
1673                dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32)
1674                for dim in dims
1675            }
1676
1677            # Build indexes
1678            index = {
1679                dim: np.searchsorted(factors[dim], partition_data[dim]).astype(
1680                    np.float32
1681                )
1682                for dim in factors
1683            }
1684
1685            # Process partition
1686            partition_features = self._process_partition_ph(
1687                partition_data, index, dims, persistence_threshold
1688            )
1689
1690            if len(partition_features) == 0:
1691                continue
1692
1693            # Roll up within partition
1694            partition_features = self.roll_up_dataframe(
1695                df=partition_features,
1696                sort_by="persistence",
1697                dims=["mz", "scan_time"],
1698                tol=[
1699                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1700                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1701                ],
1702                relative=[True, False],
1703            )
1704            partition_features = partition_features.reset_index(drop=True)
1705
1706            if len(partition_features) > 0:
1707                all_mass_features.append(partition_features)
1708
1709        # Combine results from all partitions
1710        if all_mass_features:
1711            combined_features = pd.concat(all_mass_features, ignore_index=True)
1712
1713            # Sort by persistence
1714            combined_features = combined_features.sort_values(
1715                by="persistence", ascending=False
1716            ).reset_index(drop=True)
1717
1718            # Remove duplicates from overlapping regions
1719            combined_features = self.roll_up_dataframe(
1720                df=combined_features,
1721                sort_by="persistence",
1722                dims=["mz", "scan_time"],
1723                tol=[
1724                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1725                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1726                ],
1727                relative=[True, False],
1728            )
1729
1730            # Reset index after the final roll-up
1731            combined_features = combined_features.reset_index(drop=True)
1732
1733            # Populate mass_features attribute
1734            self._populate_mass_features(combined_features)
1735        else:
1736            self.mass_features = {}
1737
1738    def _process_partition_ph(self, partition_data, index, dims, persistence_threshold):
1739        """Process a single partition with persistent homology."""
1740        # Smooth data
1741        iterations = self.parameters.lc_ms.ph_smooth_it
1742        smooth_radius = [
1743            self.parameters.lc_ms.ph_smooth_radius_mz,
1744            self.parameters.lc_ms.ph_smooth_radius_scan,
1745        ]
1746
1747        index_array = np.vstack([index[dim] for dim in dims]).T
1748        V = partition_data["intensity"].values
1749        resid = np.inf
1750
1751        for i in range(iterations):
1752            # Previous iteration
1753            V_prev = V.copy()
1754            resid_prev = resid
1755            V = self.sparse_mean_filter(index_array, V, radius=smooth_radius)
1756
1757            # Calculate residual with previous iteration
1758            resid = np.sqrt(np.mean(np.square(V - V_prev)))
1759
1760            # Evaluate convergence
1761            if i > 0:
1762                # Percent change in residual
1763                test = np.abs(resid - resid_prev) / resid_prev
1764
1765                # Exit criteria
1766                if test <= 0:
1767                    break
1768
1769        # Overwrite values
1770        partition_data = partition_data.copy()
1771        partition_data["intensity"] = V
1772
1773        # Use persistent homology to find regions of interest
1774        pidx, pers = self.sparse_upper_star(index_array, V)
1775        pidx = pidx[pers > 1]
1776        pers = pers[pers > 1]
1777
1778        if len(pidx) == 0:
1779            return pd.DataFrame()
1780
1781        # Get peaks
1782        peaks = partition_data.iloc[pidx, :].reset_index(drop=True)
1783
1784        # Add persistence column
1785        peaks["persistence"] = pers
1786        mass_features = peaks.sort_values(
1787            by="persistence", ascending=False
1788        ).reset_index(drop=True)
1789
1790        # Filter by persistence threshold
1791        mass_features = mass_features.loc[
1792            mass_features["persistence"] > persistence_threshold, :
1793        ].reset_index(drop=True)
1794
1795        return mass_features
1796
1797    def _populate_mass_features(self, mass_features_df):
1798        """Populate the mass_features attribute from a DataFrame.
1799
1800        Parameters
1801        ----------
1802        mass_features_df : pd.DataFrame
1803            DataFrame containing mass feature information.
1804            Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute.
1805
1806        Returns
1807        -------
1808        None, but assigns the mass_features attribute to the object.
1809        """
1810        # Rename scan column to apex_scan
1811        mass_features_df = mass_features_df.rename(
1812            columns={"scan": "apex_scan", "scan_time": "retention_time"}
1813        )
1814
1815        # Populate mass_features attribute
1816        self.mass_features = {}
1817        for row in mass_features_df.itertuples():
1818            row_dict = mass_features_df.iloc[row.Index].to_dict()
1819            lcms_feature = LCMSMassFeature(self, **row_dict)
1820            self.mass_features[lcms_feature.id] = lcms_feature
1821
1822        if self.parameters.lc_ms.verbose_processing:
1823            print("Found " + str(len(mass_features_df)) + " initial mass features")
1824
1825    def find_mass_features_ph_centroid(self, ms_level=1):
1826        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1827
1828        Parameters
1829        ----------
1830        ms_level : int, optional
1831            The MS level to use. Default is 1.
1832
1833        Raises
1834        ------
1835        ValueError
1836            If no MS level data is found on the object.
1837
1838        Returns
1839        -------
1840        None, but assigns the mass_features attribute to the object.
1841        """
1842        # Check that ms_level is a key in self._ms_unprocessed
1843        if ms_level not in self._ms_unprocessed.keys():
1844            raise ValueError(
1845                "No MS level "
1846                + str(ms_level)
1847                + " data found, did you instantiate with parser specific to MS level?"
1848            )
1849
1850        # Work with reference instead of copy
1851        data = self._ms_unprocessed[ms_level]
1852
1853        # Calculate threshold first to avoid multiple operations
1854        max_intensity = data["intensity"].max()
1855        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity
1856
1857        # Create single filter condition and apply to required columns only
1858        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
1859        required_cols = ["mz", "intensity", "scan"]
1860        data_thres = data.loc[valid_mask, required_cols].copy()
1861        data_thres["persistence"] = data_thres["intensity"]
1862
1863        # Merge with required scan data
1864        scan_subset = self.scan_df[["scan", "scan_time"]]
1865        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
1866        del data_thres, scan_subset
1867
1868        # Order by scan_time and then mz to ensure features near in rt are processed together
1869        # It's ok that different scans are in different partitions; we will roll up later
1870        mf_df = mf_df.sort_values(
1871            by=["scan_time", "mz"], ascending=[True, True]
1872        ).reset_index(drop=True)
1873        partition_size = 10000
1874        partitions = [
1875            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
1876            for i in range(0, len(mf_df), partition_size)
1877        ]
1878        del mf_df
1879
1880        # Run roll_up_dataframe on each partition
1881        rolled_partitions = []
1882        for part in partitions:
1883            rolled = self.roll_up_dataframe(
1884                df=part,
1885                sort_by="persistence",
1886                dims=["mz", "scan_time"],
1887                tol=[
1888                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1889                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1890                ],
1891                relative=[True, False],
1892            )
1893            rolled_partitions.append(rolled)
1894        del partitions
1895
1896
1897        # Combine results from all partitions and run a final roll-up to merge
1898        # features that straddle partition boundaries
1899        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
1900        del rolled_partitions
1901
1902        # Reorder by persistence before final roll-up
1903        mf_df_final = mf_df_final.sort_values(
1904            by="persistence", ascending=False
1905        ).reset_index(drop=True)
1906
1907        mf_df_final = self.roll_up_dataframe(
1908            df=mf_df_final,
1909            sort_by="persistence",
1910            dims=["mz", "scan_time"],
1911            tol=[
1912                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1913                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1914            ],
1915            relative=[True, False],
1916        )
1917        # reset index
1918        mf_df_final = mf_df_final.reset_index(drop=True)
1919
1920        # Combine rename and sort operations
1921        mass_features = (
1922            mf_df_final.rename(
1923                columns={"scan": "apex_scan", "scan_time": "retention_time"}
1924            )
1925            .sort_values(by="persistence", ascending=False)
1926            .reset_index(drop=True)
1927        )
1928        del mf_df_final  # Free memory
1929
1930        # Order by persistence and reset index
1931        mass_features = mass_features.sort_values(
1932            by="persistence", ascending=False
1933        ).reset_index(drop=True)
1934
1935        self.mass_features = {}
1936        for idx, row in mass_features.iterrows():
1937            row_dict = row.to_dict()
1938            lcms_feature = LCMSMassFeature(self, **row_dict)
1939            self.mass_features[lcms_feature.id] = lcms_feature
1940
1941        if self.parameters.lc_ms.verbose_processing:
1942            print("Found " + str(len(mass_features)) + " initial mass features")
1943
1944    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1945        """Cluster mass features
1946
1947        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest persistence.
1948
1949        Parameters
1950        ----------
1951        drop_children : bool, optional
1952            Whether to drop the mass features that are not cluster parents. Default is True.
1953        sort_by : str, optional
1954            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
1955
1956        Raises
1957        ------
1958        ValueError
1959            If no mass features are found.
1960            If too many mass features are found.
1961
1962        Returns
1963        -------
1964        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1965        """
1966        if self.mass_features is None:
1967            raise ValueError("No mass features found, run find_mass_features() first")
1968        if len(self.mass_features) > 400000:
1969            raise ValueError(
1970                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1971            )
1972        dims = ["mz", "scan_time"]
1973        mf_df_og = self.mass_features_to_df()
1974        mf_df = mf_df_og.copy()
1975
1976        tol = [
1977            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1978            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1979        ]  # mz, in relative; scan_time in minutes
1980        relative = [True, False]
1981
1982        # Roll up mass features based on their proximity in the declared dimensions
1983        mf_df_new = self.roll_up_dataframe(
1984            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
1985        )
1986
1987        mf_df["cluster_parent"] = np.where(
1988            np.isin(mf_df.index, mf_df_new.index), True, False
1989        )
1990
1991        # get mass feature ids of features that are not cluster parents
1992        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1993        if drop_children is True:
1994            # Drop mass features that are not cluster parents from self
1995            self.mass_features = {
1996                k: v
1997                for k, v in self.mass_features.items()
1998                if k not in cluster_daughters
1999            }
2000        else:
2001            return cluster_daughters
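A hedged end-to-end sketch (editor's addition): after detecting features with either find_mass_features_ph (profile data) or find_mass_features_ph_centroid (centroided data), cluster_mass_features removes features that roll up into a more persistent parent. lcms_obj is again an assumed LCMSBase-derived object.

    lcms_obj.find_mass_features_ph_centroid(ms_level=1)
    lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")
    # Only cluster-parent mass features remain in lcms_obj.mass_features.

    # Alternatively, keep all features and retrieve only the ids of the
    # non-parent (child) features:
    child_ids = lcms_obj.cluster_mass_features(drop_children=False)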

Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
  • embed_unique_indices(a). Creates an array of indices, sorted by unique element.
  • sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
  • check_if_grid(data). Check if the data is gridded in mz space.
  • grid_data(data). Grid the data in the mz dimension.
  • find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
  • cluster_mass_features(drop_children=True). Cluster regions of interest.
@staticmethod
def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
869    @staticmethod
870    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
871        """Sparse implementation of a mean filter.
872
873        Parameters
874        ----------
875        idx : :obj:`~numpy.array`
876            Edge indices for each dimension (MxN).
877        V : :obj:`~numpy.array`
878            Array of intensity data (Mx1).
879        radius : float or list
880            Radius of the sparse filter in each dimension. Values less than
881            zero indicate no connectivity in that dimension.
882
883        Returns
884        -------
885        :obj:`~numpy.array`
886            Filtered intensities (Mx1).
887
888        Notes
889        -----
890        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
891        This is a static method.
892        """
893
894        # Copy indices
895        idx = idx.copy().astype(V.dtype)
896
897        # Scale
898        for i, r in enumerate(radius):
899            # Increase inter-index distance
900            if r < 1:
901                idx[:, i] *= 2
902
903            # Do nothing
904            elif r == 1:
905                pass
906
907            # Decrease inter-index distance
908            else:
909                idx[:, i] /= r
910
911        # Connectivity matrix
912        cmat = KDTree(idx)
913        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
914        cmat.setdiag(1)
915
916        # Pair indices
917        I, J = cmat.nonzero()
918
919        # Delete cmat
920        cmat_shape = cmat.shape
921        del cmat
922
923        # Sum over columns
924        V_sum = sparse.bsr_matrix(
925            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
926        ).diagonal(0)
927
928        # Count over columns
929        V_count = sparse.bsr_matrix(
930            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
931        ).diagonal(0)
932
933        return V_sum / V_count

Sparse implementation of a mean filter.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
  • radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
  • ~numpy.array: Filtered intensities (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.

@staticmethod
def embed_unique_indices(a):
 935    @staticmethod
 936    def embed_unique_indices(a):
 937        """Creates an array of indices, sorted by unique element.
 938
 939        Parameters
 940        ----------
 941        a : :obj:`~numpy.array`
 942            Array of unique elements (Mx1).
 943
 944        Returns
 945        -------
 946        :obj:`~numpy.array`
 947            Array of indices (Mx1).
 948
 949        Notes
 950        -----
 951        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
 952        This is a static method.
 953        """
 954
 955        def count_tens(n):
 956            # Count tens
 957            ntens = (n - 1) // 10
 958
 959            while True:
 960                ntens_test = (ntens + n - 1) // 10
 961
 962                if ntens_test == ntens:
 963                    return ntens
 964                else:
 965                    ntens = ntens_test
 966
 967        def arange_exclude_10s(n):
 968            # How many 10s will there be?
 969            ntens = count_tens(n)
 970
 971            # Base array
 972            arr = np.arange(0, n + ntens)
 973
 974            # Exclude 10s
 975            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
 976
 977            return arr
 978
 979        # Creates an array of indices, sorted by unique element
 980        idx_sort = np.argsort(a)
 981        idx_unsort = np.argsort(idx_sort)
 982
 983        # Sorts records array so all unique elements are together
 984        sorted_a = a[idx_sort]
 985
 986        # Returns the unique values, the index of the first occurrence,
 987        # and the count for each element
 988        vals, idx_start, count = np.unique(
 989            sorted_a, return_index=True, return_counts=True
 990        )
 991
 992        # Splits the indices into separate arrays
 993        splits = np.split(idx_sort, idx_start[1:])
 994
 995        # Creates unique indices for each split
 996        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
 997
 998        # Reorders according to input array
 999        idx_unq = idx_unq[idx_unsort]
1000
1001        # Magnitude of each index
1002        exp = np.log10(
1003            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
1004        )
1005        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1006
1007        # Result
1008        return a + idx_unq / idx_unq_mag

Creates an array of indices, sorted by unique element.

Parameters
  • a (~numpy.array): Array of unique elements (Mx1).
Returns
  • ~numpy.array: Array of indices (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos This is a static method.

@staticmethod
def roll_up_dataframe( df: pandas.core.frame.DataFrame, sort_by: str, tol: list, relative: list, dims: list, memory_opt_threshold: int = 10000):
1010    @staticmethod
1011    def roll_up_dataframe(
1012        df: pd.DataFrame,
1013        sort_by: str,
1014        tol: list,
1015        relative: list,
1016        dims: list,
1017        memory_opt_threshold: int = 10000,
1018    ):
1019        """Subset data by rolling up into apex in appropriate dimensions.
1020
1021        Parameters
1022        ----------
1023        data : pd.DataFrame
1024            The input data containing "dims" columns and the "sort_by" column.
1025        sort_by : str
1026            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
1027            (i.e., the mass feature with the highest value in the sort_by column).
1028        dims : list
1029            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1030        tol : list
1031            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1032            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1033            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1034            If relative is False, the tolerance will be used as is.
1035        relative : list
1036            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1037        memory_opt_threshold : int, optional
1038            Minimum number of rows to trigger memory-optimized processing. Default is 10000.
1039
1040        Returns
1041        -------
1042        pd.DataFrame
1043            A DataFrame with only the rolled up mass features, with the original index and columns.
1044
1045
1046        Raises
1047        ------
1048        ValueError
1049            If the input data is not a pandas DataFrame.
1050            If the input data does not have columns for each of the dimensions in "dims".
1051            If the length of "dims", "tol", and "relative" do not match.
1052        """
1053        og_columns = df.columns.copy()
1054
1055        # Unindex the data, but keep the original index
1056        if df.index.name is not None:
1057            og_index = df.index.name
1058        else:
1059            og_index = "index"
1060        df = df.reset_index(drop=False)
1061
1062        # Sort data by sort_by column, and reindex
1063        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1064
1065        # Check that data is a DataFrame and has columns for each of the dims
1066        if not isinstance(df, pd.DataFrame):
1067            raise ValueError("Data must be a pandas DataFrame")
1068        for dim in dims:
1069            if dim not in df.columns:
1070                raise ValueError(f"Data must have a column for {dim}")
1071        if len(dims) != len(tol) or len(dims) != len(relative):
1072            raise ValueError(
1073                "Dimensions, tolerances, and relative flags must be the same length"
1074            )
1075
1076        # Pre-compute all values arrays
1077        all_values = [df[dim].values for dim in dims]
1078
1079        # Choose processing method based on dataframe size
1080        if len(df) >= memory_opt_threshold:
1081            # Memory-optimized approach for large dataframes
1082            distances = PHCalculations._compute_distances_memory_optimized(
1083                all_values, tol, relative
1084            )
1085        else:
1086            # Faster approach for smaller dataframes
1087            distances = PHCalculations._compute_distances_original(
1088                all_values, tol, relative
1089            )
1090
1091        # Process pairs with original logic but memory optimizations
1092        distances = distances.tocoo()
1093        pairs = np.stack((distances.row, distances.col), axis=1)
1094        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
1095        del distances, pairs  # Free memory immediately
1096
1097        to_drop = []
1098        while not pairs_df.empty:
1099            # Find root_parents and their children (original logic preserved)
1100            root_parents = np.setdiff1d(
1101                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1102            )
1103            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1104            to_drop.extend(children_of_roots)  # Use extend instead of append
1105
1106            # Remove root_children as possible parents from pairs_df for next iteration
1107            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1108            pairs_df = pairs_df.reset_index().set_index("child")
1109            # Remove root_children as possible children from pairs_df for next iteration
1110            pairs_df = pairs_df.drop(index=children_of_roots)
1111
1112            # Prepare for next iteration
1113            pairs_df = pairs_df.reset_index().set_index("parent")
1114
1115        # Convert to numpy array for efficient dropping
1116        to_drop = np.array(to_drop)
1117
1118        # Drop mass features that are not cluster parents
1119        df_sub = df.drop(index=to_drop)
1120
1121        # Set index back to og_index and only keep original columns
1122        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]
1123
1124        return df_sub

Subset data by rolling up into apex in appropriate dimensions.

Parameters
  • data (pd.DataFrame): The input data containing "dims" columns and the "sort_by" column.
  • sort_by (str): The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
  • dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
  • tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance will be multiplied by the maximum value in that dimension. If relative is False, the tolerance will be used as is.
  • relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
  • memory_opt_threshold (int, optional): Minimum number of rows to trigger memory-optimized processing. Default is 10000.
Returns
  • pd.DataFrame: A DataFrame with only the rolled up mass features, with the original index and columns.
Raises
  • ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
def sparse_upper_star(self, idx, V):
1295    def sparse_upper_star(self, idx, V):
1296        """Sparse implementation of an upper star filtration.
1297
1298        Parameters
1299        ----------
1300        idx : :obj:`~numpy.array`
1301            Edge indices for each dimension (MxN).
1302        V : :obj:`~numpy.array`
1303            Array of intensity data (Mx1).
1304        Returns
1305        -------
1306        idx : :obj:`~numpy.array`
1307            Index of filtered points (Mx1).
1308        persistence : :obj:`~numpy.array`
1309            Persistence of each filtered point (Mx1).
1310
1311        Notes
1312        -----
1313        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1314        """
1315
1316        # Invert
1317        V = -1 * V.copy().astype(int)
1318
1319        # Embed indices
1320        V = self.embed_unique_indices(V)
1321
1322        # Connectivity matrix
1323        cmat = KDTree(idx)
1324        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1325        cmat.setdiag(1)
1326        cmat = sparse.triu(cmat)
1327
1328        # Pairwise minimums
1329        I, J = cmat.nonzero()
1330        d = np.maximum(V[I], V[J])
1331
1332        # Delete connectivity matrix
1333        cmat_shape = cmat.shape
1334        del cmat
1335
1336        # Sparse distance matrix
1337        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1338
1339        # Delete pairwise mins
1340        del d, I, J
1341
1342        # Persistence homology
1343        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1344
1345        # Bound death values
1346        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1347
1348        # Construct tree to query against
1349        tree = KDTree(V.reshape((-1, 1)))
1350
1351        # Get the indexes of the first nearest neighbor by birth
1352        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1353
1354        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)

Sparse implementation of an upper star filtration.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
Returns
  • idx (~numpy.array): Index of filtered points (Mx1).
  • persistence (~numpy.array): Persistence of each filtered point (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
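
As a rough, hedged illustration of the expected inputs (idx as integer grid coordinates, V as the matching intensities), the sketch below runs the filtration on a tiny 3 x 3 intensity surface; lcms_obj stands in for any object exposing these PHCalculations methods:

    import numpy as np

    # Toy 3 x 3 intensity surface with a single local maximum at the centre
    intensity = np.array(
        [
            [1, 2, 1],
            [2, 9, 2],
            [1, 2, 1],
        ]
    )

    # idx: integer grid coordinates of every point (M x 2); V: flattened intensities (M,)
    rows, cols = np.indices(intensity.shape)
    idx = np.stack((rows.ravel(), cols.ravel()), axis=1)
    V = intensity.ravel().astype(float)

    # pts indexes the persistent points; persistence holds their persistence values
    pts, persistence = lcms_obj.sparse_upper_star(idx, V)

Under these assumptions the central maximum should appear among the returned points with the largest persistence, since it is the only local maximum of the surface.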

def check_if_grid(self, data):
1356    def check_if_grid(self, data):
1357        """Check if the data are gridded in mz space.
1358
1359        Parameters
1360        ----------
1361        data : DataFrame
1362            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1363
1364        Returns
1365        -------
1366        bool
1367            True if the data is gridded in the mz direction, False otherwise.
1368
1369        Notes
1370        -----
1371        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1372        """
1373        # Calculate the difference between consecutive mz values in a single scan
1374        dat_check = data.copy().reset_index(drop=True)
1375        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1376        mz_diff_min = (
1377            dat_check.groupby("scan")["mz_diff"].min().min()
1378        )  # within each scan, what is the smallest mz difference between consecutive mz values
1379
1380        # Find the minimum mz difference between mz values in the data, regardless of scan
1381        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1382        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1383        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1384
1385        # Get minimum mz_diff between mz values in the data
1386        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1387
1388        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1389        if mz_diff_min_raw < mz_diff_min:
1390            return False
1391        else:
1392            return True

Check if the data are gridded in mz space.

Parameters
  • data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
  • bool: True if the data is gridded in the mz direction, False otherwise.
Notes

This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
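
To make the check concrete, a small sketch (lcms_obj is a hypothetical object exposing this method): when every scan shares the same mz axis the global minimum mz gap equals the within-scan gap and the check passes; shifting one scan's mz values introduces a smaller global gap and the check fails.

    import pandas as pd

    # Gridded: both scans share the same mz axis (within-scan gap = global gap = 0.01)
    gridded = pd.DataFrame(
        {
            "scan": [1, 1, 1, 2, 2, 2],
            "mz": [100.00, 100.01, 100.02, 100.00, 100.01, 100.02],
            "intensity": [5, 9, 4, 6, 8, 3],
        }
    )

    # Not gridded: scan 2 is offset by 0.004, so the global minimum gap (0.004)
    # is smaller than the smallest within-scan gap (0.01)
    ragged = gridded.copy()
    ragged.loc[ragged["scan"] == 2, "mz"] += 0.004

    lcms_obj.check_if_grid(gridded)  # True under these assumptions
    lcms_obj.check_if_grid(ragged)   # False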

def grid_data(self, data, attempts=5):
1394    def grid_data(self, data, attempts=5):
1395        """Grid the data in the mz dimension.
1396
1397        Data must be gridded prior to persistent homology calculations and computing average mass spectrum
1398
1399        Parameters
1400        ----------
1401        data : DataFrame
1402            The input data containing mz, scan, scan_time, and intensity columns.
1403        attempts : int, optional
1404            The number of attempts to grid the data. Default is 5.
1405
1406        Returns
1407        -------
1408        DataFrame
1409            The gridded data with mz, scan, scan_time, and intensity columns.
1410
1411        Raises
1412        ------
1413        ValueError
1414            If gridding fails after the specified number of attempts.
1415        """
1416        attempt_i = 0
1417        while attempt_i < attempts:
1418            attempt_i += 1
1419            data = self._grid_data(data)
1420
1421            if self.check_if_grid(data):
1422                return data
1423
1424        if not self.check_if_grid(data):
1425            raise ValueError(
1426                "Gridding failed after "
1427                + str(attempt_i)
1428                + " attempts. Please check the data."
1429            )
1430        else:
1431            return data

Grid the data in the mz dimension.

Data must be gridded prior to persistent homology calculations and computing average mass spectrum

Parameters
  • data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
  • attempts (int, optional): The number of attempts to grid the data. Default is 5.
Returns
  • DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
  • ValueError: If gridding fails after the specified number of attempts.
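
In practice the method is paired with check_if_grid, as in find_mass_features_ph below; a minimal sketch, assuming data_thres is a thresholded DataFrame with mz, scan, scan_time, and intensity columns:

    # Regrid only when the mz axis is not already shared across scans;
    # grid_data raises ValueError if gridding still fails after 5 attempts
    if not lcms_obj.check_if_grid(data_thres):
        data_thres = lcms_obj.grid_data(data_thres)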
def find_mass_features_ph(self, ms_level=1, grid=True):
1511    def find_mass_features_ph(self, ms_level=1, grid=True):
1512        """Find mass features within an LCMSBase object using persistent homology.
1513
1514        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1515
1516        Parameters
1517        ----------
1518        ms_level : int, optional
1519            The MS level to use. Default is 1.
1520        grid : bool, optional
1521            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1522
1523        Raises
1524        ------
1525        ValueError
1526            If no MS level data is found on the object.
1527            If data is not gridded and grid is False.
1528
1529        Returns
1530        -------
1531        None, but assigns the mass_features attribute to the object.
1532
1533        Notes
1534        -----
1535        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1536        """
1537        # Check that ms_level is a key in self._ms_unprocessed
1538        if ms_level not in self._ms_unprocessed.keys():
1539            raise ValueError(
1540                "No MS level "
1541                + str(ms_level)
1542                + " data found, did you instantiate with a parser specific to this MS level?"
1543            )
1544
1545        # Get ms data
1546        data = self._ms_unprocessed[ms_level].copy()
1547
1548        # Drop rows with missing intensity values and reset index
1549        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1550
1551        # Threshold data
1552        dims = ["mz", "scan_time"]
1553        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1554        persistence_threshold = (
1555            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1556        )
1557        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1558
1559        # Check if gridded, if not, grid
1560        gridded_mz = self.check_if_grid(data_thres)
1561        if gridded_mz is False:
1562            if grid is False:
1563                raise ValueError(
1564                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1565                )
1566            else:
1567                data_thres = self.grid_data(data_thres)
1568
1569        # Add scan_time
1570        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1571        # Process in chunks if required
1572        if len(data_thres) > 10000:
1573            return self._find_mass_features_ph_partition(
1574                data_thres, dims, persistence_threshold
1575            )
1576        else:
1577            # Process all at once
1578            return self._find_mass_features_ph_single(
1579                data_thres, dims, persistence_threshold
1580            )

Find mass features within an LCMSBase object using persistent homology.

Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
  • None, but assigns the mass_features attribute to the object.
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
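
A hedged end-to-end sketch of driving the persistent-homology feature finder; the relative thresholds shown are illustrative values for the parameters the method reads from parameters.lc_ms, and lcms_obj is assumed to already hold unprocessed MS1 data:

    # Illustrative relative thresholds (fractions of the maximum MS1 intensity)
    lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001
    lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001

    # Detect MS1 mass features; thresholded data are regridded first if needed
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)

    # Results are stored on the object as LCMSMassFeature objects keyed by id
    print(len(lcms_obj.mass_features), "mass features found")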

def find_mass_features_ph_centroid(self, ms_level=1):
1825    def find_mass_features_ph_centroid(self, ms_level=1):
1826        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
1827
1828        Parameters
1829        ----------
1830        ms_level : int, optional
1831            The MS level to use. Default is 1.
1832
1833        Raises
1834        ------
1835        ValueError
1836            If no MS level data is found on the object.
1837
1838        Returns
1839        -------
1840        None, but assigns the mass_features attribute to the object.
1841        """
1842        # Check that ms_level is a key in self._ms_unprocessed
1843        if ms_level not in self._ms_unprocessed.keys():
1844            raise ValueError(
1845                "No MS level "
1846                + str(ms_level)
1847                + " data found, did you instantiate with a parser specific to this MS level?"
1848            )
1849
1850        # Work with reference instead of copy
1851        data = self._ms_unprocessed[ms_level]
1852
1853        # Calculate threshold first to avoid multiple operations
1854        max_intensity = data["intensity"].max()
1855        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity
1856
1857        # Create single filter condition and apply to required columns only
1858        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
1859        required_cols = ["mz", "intensity", "scan"]
1860        data_thres = data.loc[valid_mask, required_cols].copy()
1861        data_thres["persistence"] = data_thres["intensity"]
1862
1863        # Merge with required scan data
1864        scan_subset = self.scan_df[["scan", "scan_time"]]
1865        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
1866        del data_thres, scan_subset
1867
1868        # Order by scan_time and then mz so that features close in retention time are processed together
1869        # It's ok that different scans are in different partitions; we will roll up later
1870        mf_df = mf_df.sort_values(
1871            by=["scan_time", "mz"], ascending=[True, True]
1872        ).reset_index(drop=True)
1873        partition_size = 10000
1874        partitions = [
1875            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
1876            for i in range(0, len(mf_df), partition_size)
1877        ]
1878        del mf_df
1879
1880        # Run roll_up_dataframe on each partition
1881        rolled_partitions = []
1882        for part in partitions:
1883            rolled = self.roll_up_dataframe(
1884                df=part,
1885                sort_by="persistence",
1886                dims=["mz", "scan_time"],
1887                tol=[
1888                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1889                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1890                ],
1891                relative=[True, False],
1892            )
1893            rolled_partitions.append(rolled)
1894        del partitions
1895
1896
1897
1898        # Combine partition results and run a final roll-up to merge features near partition boundaries
1899        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
1900        del rolled_partitions
1901
1902        # Reorder by persistence before final roll-up
1903        mf_df_final = mf_df_final.sort_values(
1904            by="persistence", ascending=False
1905        ).reset_index(drop=True)
1906
1907        mf_df_final = self.roll_up_dataframe(
1908            df=mf_df_final,
1909            sort_by="persistence",
1910            dims=["mz", "scan_time"],
1911            tol=[
1912                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1913                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1914            ],
1915            relative=[True, False],
1916        )
1917        # reset index
1918        mf_df_final = mf_df_final.reset_index(drop=True)
1919
1920        # Combine rename and sort operations
1921        mass_features = (
1922            mf_df_final.rename(
1923                columns={"scan": "apex_scan", "scan_time": "retention_time"}
1924            )
1925            .sort_values(by="persistence", ascending=False)
1926            .reset_index(drop=True)
1927        )
1928        del mf_df_final  # Free memory
1929
1930        # Order by persistence and reset index
1931        mass_features = mass_features.sort_values(
1932            by="persistence", ascending=False
1933        ).reset_index(drop=True)
1934
1935        self.mass_features = {}
1936        for idx, row in mass_features.iterrows():
1937            row_dict = row.to_dict()
1938            lcms_feature = LCMSMassFeature(self, **row_dict)
1939            self.mass_features[lcms_feature.id] = lcms_feature
1940
1941        if self.parameters.lc_ms.verbose_processing:
1942            print("Found " + str(len(mass_features)) + " initial mass features")

Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no MS level data is found on the object.
Returns
  • None, but assigns the mass_features attribute to the object.
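
For centroided data the call is analogous; a brief sketch under the same assumptions about lcms_obj:

    # Centroid variant: persistence is taken directly from intensity, and features
    # are rolled up in mz / scan_time within partitions of 10000 rows, then merged
    lcms_obj.find_mass_features_ph_centroid(ms_level=1)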
def cluster_mass_features(self, drop_children=True, sort_by='persistence'):
1944    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
1945        """Cluster mass features
1946
1947        Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence.
1948
1949        Parameters
1950        ----------
1951        drop_children : bool, optional
1952            Whether to drop the mass features that are not cluster parents. Default is True.
1953        sort_by : str, optional
1954            The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
1955
1956        Raises
1957        ------
1958        ValueError
1959            If no mass features are found.
1960            If too many mass features are found.
1961
1962        Returns
1963        -------
1964        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
1965        """
1966        if self.mass_features is None:
1967            raise ValueError("No mass features found, run find_mass_features() first")
1968        if len(self.mass_features) > 400000:
1969            raise ValueError(
1970                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
1971            )
1972        dims = ["mz", "scan_time"]
1973        mf_df_og = self.mass_features_to_df()
1974        mf_df = mf_df_og.copy()
1975
1976        tol = [
1977            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1978            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1979        ]  # mz, in relative; scan_time in minutes
1980        relative = [True, False]
1981
1982        # Roll up mass features based on their proximity in the declared dimensions
1983        mf_df_new = self.roll_up_dataframe(
1984            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
1985        )
1986
1987        mf_df["cluster_parent"] = np.where(
1988            np.isin(mf_df.index, mf_df_new.index), True, False
1989        )
1990
1991        # get mass feature ids of features that are not cluster parents
1992        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
1993        if drop_children is True:
1994            # Drop mass features that are not cluster parents from self
1995            self.mass_features = {
1996                k: v
1997                for k, v in self.mass_features.items()
1998                if k not in cluster_daughters
1999            }
2000        else:
2001            return cluster_daughters

Cluster mass features

Clusters mass features based on their proximity in the mz and scan_time dimensions, prioritizing those with the highest persistence.

Parameters
  • drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
  • sort_by (str, optional): The column to sort the mass features by; this determines which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
  • ValueError: If no mass features are found. If too many mass features are found.
Returns
  • None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
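
Finally, a short sketch of the clustering step; the tolerance values are illustrative, and lcms_obj is the same hypothetical object as above:

    # Tolerances used by the roll-up: relative in mz, minutes in scan_time
    lcms_obj.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel = 5e-6
    lcms_obj.parameters.lc_ms.mass_feature_cluster_rt_tolerance = 0.3

    # With drop_children=False the ids of non-parent features are returned
    # instead of being removed from lcms_obj.mass_features
    child_ids = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")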