corems.mass_spectra.calc.lc_calc

   1import numpy as np
   2import pandas as pd
   3from ripser import ripser
   4from scipy import sparse
   5from scipy.spatial import KDTree
   6
   7from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
   8from corems.mass_spectra.calc import SignalProcessing as sp
   9from corems.mass_spectra.factory.chromat_data import EIC_Data
  10from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  11
  12
  13def find_closest(A, target):
  14    """Find the index of closest value in A to each value in target.
  15
  16    Parameters
  17    ----------
  18    A : :obj:`~numpy.array`
  19        The array to search (blueprint). A must be sorted.
  20    target : :obj:`~numpy.array`
  21        The array of values to search for. target must be sorted.
  22
  23    Returns
  24    -------
  25    :obj:`~numpy.array`
  26        The indices of the closest values in A to each value in target.
  27    """
  28    idx = A.searchsorted(target)
  29    idx = np.clip(idx, 1, len(A) - 1)
  30    left = A[idx - 1]
  31    right = A[idx]
  32    idx -= target - left < right - target
  33    return idx
  34
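# Illustrative usage of find_closest above (hypothetical values, not part of the library's tests):
# >>> A = np.array([100.0, 200.0, 300.0])      # must be sorted
# >>> target = np.array([101.0, 290.0])        # must be sorted
# >>> find_closest(A, target)
# array([0, 2])
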
  35
  36class LCCalculations:
  37    """Methods for performing LC calculations on mass spectra data.
  38
  39    Notes
  40    -----
  41    This class is intended to be used as a mixin for the LCMSBase class.
  42
  43    Methods
  44    -------
  45    * get_max_eic(eic_data).
  46        Returns the maximum EIC value from the given EIC data. A static method.
  47    * smooth_tic(tic).
  48        Smooths the TIC data using the specified smoothing method and settings.
  49    * eic_centroid_detector(rt, eic, max_eic).
  50        Performs EIC centroid detection on the given EIC data.
  51    * find_nearest_scan(rt).
  52        Finds the nearest scan to the given retention time.
  53    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
  54        Returns an averaged mass spectrum object.
  55    * find_mass_features(ms_level=1).
  56        Find regions of interest for a given MS level (default is MS1).
  57    * integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1).
  58        Integrate mass features of interest and extract EICs.
  59    * find_c13_mass_features().
  60        Evaluate mass features and mark likely C13 isotopes.
  61    * deconvolute_ms1_mass_features().
  62        Deconvolute mass features' ms1 mass spectra.
  63    """
  64
  65    @staticmethod
  66    def get_max_eic(eic_data: dict):
  67        """Returns the maximum EIC value from the given EIC data.
  68
  69        Notes
  70        -----
  71        This is a static method.
  72
  73        Parameters
  74        ----------
  75        eic_data : dict
  76            A dictionary containing EIC data.
  77
  78        Returns
  79        -------
  80        float
  81            The maximum EIC value.
  82        """
  83        max_eic = 0
  84        for eic in eic_data.values():
  85            ind_max_eic = max(eic.get("EIC"))
  86            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  87
  88        return max_eic
  89
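    # Illustrative usage of get_max_eic above (hypothetical EIC dictionary whose values expose an
    # "EIC" entry, matching the access pattern used in the method):
    # >>> eics = {100.05: {"EIC": [0, 10, 250, 30]}, 200.10: {"EIC": [5, 80, 40]}}
    # >>> LCCalculations.get_max_eic(eics)
    # 250
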
  90    def smooth_tic(self, tic):
  91        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  92
  93        Parameters
  94        ----------
  95        tic : numpy.ndarray
  96            The TIC (or EIC) data to be smoothed.
  97
  98        Returns
  99        -------
 100        numpy.ndarray
 101            The smoothed TIC data.
 102        """
 103        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 104
 105        pol_order = self.parameters.lc_ms.savgol_pol_order
 106
 107        window_len = self.parameters.lc_ms.smooth_window
 108
 109        window = self.parameters.lc_ms.smooth_method
 110
 111        return sp.smooth_signal(
 112            tic, window_len, window, pol_order, implemented_smooth_method
 113        )
 114
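    # Illustrative usage of smooth_tic above (assumes `lcms` is an LCMSBase-derived object with
    # configured lc_ms smoothing parameters and `raw_tic` is a 1-D intensity array):
    # >>> smoothed = lcms.smooth_tic(raw_tic)
    # >>> len(smoothed) == len(raw_tic)
    # True
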
 115    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 116        """Performs EIC centroid detection on the given EIC data.
 117
 118        Parameters
 119        ----------
 120        rt : numpy.ndarray
 121            The retention time data.
 122        eic : numpy.ndarray
 123            The EIC data.
 124        max_eic : float
 125            The maximum EIC value.
 126        apex_indexes : list, optional
 127            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 128
 129        Returns
 130        -------
 131        list
 132            A list of (left_index, apex_index, right_index) tuples for each detected peak.
 133        """
 134
 135        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 136
 137        max_height = self.parameters.lc_ms.peak_height_max_percent
 138
 139        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 140
 141        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 142
 143        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 144
 145        include_indexes = sp.peak_picking_first_derivative(
 146            domain=rt,
 147            signal=eic,
 148            max_height=max_height,
 149            max_prominence=max_prominence,
 150            max_signal=max_eic,
 151            min_peak_datapoints=min_peak_datapoints,
 152            peak_derivative_threshold=peak_derivative_threshold,
 153            signal_threshold=signal_threshold,
 154            correct_baseline=False,
 155            plot_res=False,
 156            apex_indexes=apex_indexes,
 157        )
 158        # include_indexes is a generator of tuples (left_index, apex_index, right_index)
 159        include_indexes = list(include_indexes)
 160        # Check that there are at least 1/2 of min_peak_datapoints on either side of the apex
 161        indices = [x for x in include_indexes]
 162        for idx in indices:
 163            if (idx[1] - idx[0] < min_peak_datapoints / 2) or (
 164                idx[2] - idx[1] < min_peak_datapoints / 2
 165            ):
 166                include_indexes.remove(idx)
 167        return include_indexes
 168
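    # Illustrative usage of eic_centroid_detector above (assumes `lcms` is an LCMSBase-derived object
    # and `my_eic` is an EIC_Data object, e.g. one of the values in lcms.eics):
    # >>> limits = lcms.eic_centroid_detector(my_eic.time, my_eic.eic_smoothed, max(my_eic.eic_smoothed))
    # >>> limits  # list of (left_index, apex_index, right_index) tuples, data dependent
    # [(12, 18, 25)]
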
 169    def find_nearest_scan(self, rt):
 170        """Finds the nearest scan to the given retention time.
 171
 172        Parameters
 173        ----------
 174        rt : float
 175            The retention time (in minutes) to find the nearest scan for.
 176
 177        Returns
 178        -------
 179        int
 180            The scan number of the nearest scan.
 181        """
 182        array_rt = np.array(self.retention_time)
 183
 184        scan_index = (np.abs(array_rt - rt)).argmin()
 185
 186        real_scan = self.scans_number[scan_index]
 187
 188        return real_scan
 189
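    # Illustrative usage of find_nearest_scan above (hypothetical retention time; `lcms` as above):
    # >>> lcms.find_nearest_scan(rt=5.2)  # scan number whose scan time is closest to 5.2 min, data dependent
    # 1042
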
 190    def add_peak_metrics(self, remove_by_metrics=True):
 191        """Add peak metrics to the mass features.
 192
 193        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
 194
 195        Parameters
 196        ----------
 197        remove_by_metrics : bool, optional
 198            If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity,
 199            dispersity index, and noise score. Default is True, which checks the setting in the processing parameters.
 200            If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters.
 201        """
 202        # Check that at least some mass features have eic data
 203        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 204            raise ValueError(
 205                "No mass features have EIC data. Run integrate_mass_features first."
 206            )
 207
 208        for mass_feature in self.mass_features.values():
 209            # Check if the mass feature has been integrated
 210            if mass_feature._eic_data is not None:
 211                # Calculate peak metrics
 212                mass_feature.calc_half_height_width()
 213                mass_feature.calc_tailing_factor()
 214                mass_feature.calc_dispersity_index()
 215                mass_feature.calc_gaussian_similarity()
 216                mass_feature.calc_noise_score()
 217        
 218        # Remove mass features by peak metrics if designated in parameters
 219        if self.parameters.lc_ms.remove_mass_features_by_peak_metrics and remove_by_metrics:
 220            self._remove_mass_features_by_peak_metrics()
 221
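    # Illustrative placement of add_peak_metrics above in a typical workflow (`lcms` as above):
    # >>> lcms.find_mass_features()
    # >>> lcms.integrate_mass_features()
    # >>> lcms.add_peak_metrics(remove_by_metrics=False)  # calculate metrics without removing features
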
 222    def get_average_mass_spectrum(
 223        self,
 224        scan_list,
 225        apex_scan,
 226        spectrum_mode="profile",
 227        ms_level=1,
 228        auto_process=True,
 229        use_parser=False,
 230        perform_checks=True,
 231        polarity=None,
 232        ms_params=None,
 233    ):
 234        """Returns an averaged mass spectrum object
 235
 236        Parameters
 237        ----------
 238        scan_list : list
 239            List of scan numbers to average.
 240        apex_scan : int
 241            Number of the apex scan
 242        spectrum_mode : str, optional
 243            The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
 244        ms_level : int, optional
 245            The MS level to use. Defaults to 1.
 246        auto_process : bool, optional
 247            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 248        use_parser : bool, optional
 249            If True, the mass spectra will be obtained from the parser. Defaults to False.
 250        perform_checks : bool, optional
 251            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile mode and (if not using the parser) are present in the ms_unprocessed dictionary, which must be indexed on scan.
 252        polarity : int, optional
 253            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
 254        ms_params : MSParameters, optional
 255            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 256
 257        Returns
 258        -------
 259        MassSpectrumProfile
 260            The averaged mass spectrum object.
 261
 262        Raises
 263        ------
 264        ValueError
 265            If the spectrum mode is not "profile".
 266            If the MS level is not found in the unprocessed mass spectra dictionary.
 267            If not all scan numbers are found in the unprocessed mass spectra dictionary.
 268        """
 269        if perform_checks:
 270            if spectrum_mode != "profile":
 271                raise ValueError("Averaging only supported for profile mode")
 272
 273        if polarity is None:
 274            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 275            if self.polarity == "negative":
 276                polarity = -1
 277            elif self.polarity == "positive":
 278                polarity = 1
 279            else:
 280                raise ValueError(
 281                    "Polarity not set for dataset; it must be set to either 'positive' or 'negative'"
 282                )
 283
 284        # If not use_parser, check that scan numbers are in _ms_unprocessed
 285        if not use_parser:
 286            if perform_checks:
 287                # Set index to scan for faster lookup
 288                ms_df = (
 289                    self._ms_unprocessed[ms_level]
 290                    .copy()
 291                    .set_index("scan", drop=False)
 292                    .sort_index()
 293                )
 294                # Check that all scan numbers are in the ms_df
 295                if not all(np.isin(scan_list, ms_df.index)):
 296                    raise ValueError(
 297                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
 298                    )
 299                my_ms_df = ms_df.loc[scan_list]
 300            else:
 301                my_ms_df = (
 302                    pd.DataFrame({"scan": scan_list})
 303                    .set_index("scan")
 304                    .join(self._ms_unprocessed[ms_level], how="left")
 305                )
 306
 307        if use_parser:
 308            ms_list = [
 309                self.spectra_parser.get_mass_spectrum_from_scan(
 310                    x, spectrum_mode=spectrum_mode, auto_process=False
 311                )
 312                for x in scan_list
 313            ]
 314            ms_mz = [x._mz_exp for x in ms_list]
 315            ms_int = [x._abundance for x in ms_list]
 316            my_ms_df = []
 317            for i in np.arange(len(ms_mz)):
 318                my_ms_df.append(
 319                    pd.DataFrame(
 320                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 321                    )
 322                )
 323            my_ms_df = pd.concat(my_ms_df)
 324
 325        if not self.check_if_grid(my_ms_df):
 326            my_ms_df = self.grid_data(my_ms_df)
 327
 328        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 329
 330        ms = ms_from_array_profile(
 331            my_ms_ave.mz,
 332            my_ms_ave.intensity,
 333            self.file_location,
 334            polarity=polarity,
 335            auto_process=False,
 336        )
 337
 338        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 339        if ms is not None:
 340            if ms_params is not None:
 341                ms.parameters = ms_params
 342            ms.scan_number = apex_scan
 343            if auto_process:
 344                ms.process_mass_spec()
 345        return ms
 346
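    # Illustrative usage of get_average_mass_spectrum above (hypothetical scan numbers; `lcms` as above):
    # >>> ms_ave = lcms.get_average_mass_spectrum(
    # ...     scan_list=[100, 101, 102], apex_scan=101, spectrum_mode="profile",
    # ...     ms_level=1, auto_process=True, use_parser=False, polarity=-1,
    # ... )
    # >>> ms_ave.scan_number
    # 101
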
 347    def find_mass_features(self, ms_level=1, grid=True):
 348        """Find mass features within an LCMSBase object
 349
 350        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
 351
 352        Parameters
 353        ----------
 354        ms_level : int, optional
 355            The MS level to use for peak picking. Default is 1.
 356        grid : bool, optional
 357            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
 358            used for persistent homology peak picking for profile data only. Default is True.
 359
 360        Raises
 361        ------
 362        ValueError
 363            If no MS level data is found on the object.
 364            If persistent homology peak picking is attempted on non-profile mode data.
 365            If data is not gridded and grid is False.
 366            If peak picking method is not implemented.
 367
 368        Returns
 369        -------
 370        None, but assigns the mass_features and eics attributes to the object.
 371
 372        """
 373        pp_method = self.parameters.lc_ms.peak_picking_method
 374
 375        if pp_method == "persistent homology":
 376            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 377            if all(msx_scan_df["ms_format"] == "profile"):
 378                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 379            else:
 380                raise ValueError(
 381                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 382                        ms_level
 383                    )
 384                )
 385        elif pp_method == "centroided_persistent_homology":
 386            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 387            if all(msx_scan_df["ms_format"] == "centroid"):
 388                self.find_mass_features_ph_centroid(ms_level=ms_level)
 389            else:
 390                raise ValueError(
 391                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
 392                        ms_level
 393                    )
 394                )
 395        else:
 396            raise ValueError("Peak picking method not implemented")
 397        
 398        # Remove redundant mass features if designated in parameters
 399        if self.parameters.lc_ms.remove_redundant_mass_features:
 400            self._remove_redundant_mass_features()
 401
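    # Illustrative usage of find_mass_features above (profile MS1 data; `lcms` as above):
    # >>> lcms.parameters.lc_ms.peak_picking_method = "persistent homology"
    # >>> lcms.find_mass_features(ms_level=1, grid=True)
    # >>> len(lcms.mass_features)  # number of detected mass features, data dependent
    # 2345
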
 402    def integrate_mass_features(
 403        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 404    ):
 405        """Integrate mass features and extract EICs.
 406
 407        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 408
 409        Parameters
 410        ----------
 411        drop_if_fail : bool, optional
 412            Whether to drop mass features if the EIC limit calculations fail.
 413            Default is True.
 414        drop_duplicates : bool, optional
 415            Whether to drop mass features that appear to be duplicates
 416            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 417            Default is True.
 418        ms_level : int, optional
 419            The MS level to use. Default is 1.
 420
 421        Raises
 422        ------
 423        ValueError
 424            If no mass features are found.
 425            If no MS level data is found for the given MS level (either in data or in the scan data)
 426
 427        Returns
 428        -------
 429        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 430
 431        Notes
 432        -----
 433        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
 434        """
 435        # Check if there is data
 436        if ms_level in self._ms_unprocessed.keys():
 437            raw_data = self._ms_unprocessed[ms_level].copy()
 438        else:
 439            raise ValueError("No MS level " + str(ms_level) + " data found")
 440        if self.mass_features is not None:
 441            mf_df = self.mass_features_to_df().copy()
 442        else:
 443            raise ValueError(
 444                "No mass features found, did you run find_mass_features() first?"
 445            )
 446
 447        # Subset scan data to only include correct ms_level
 448        scan_df_sub = self.scan_df[
 449            self.scan_df["ms_level"] == int(ms_level)
 450        ].reset_index(drop=True)
 451        if scan_df_sub.empty:
 452            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
 453        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 454
 455        mzs_to_extract = np.unique(mf_df["mz"].values)
 456        mzs_to_extract.sort()
 457
 458        # Pre-sort raw_data by mz for faster filtering
 459        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
 460        raw_data_mz = raw_data_sorted["mz"].values
 461
 462        # Get EICs for each unique mz in mass features list
 463        for mz in mzs_to_extract:
 464            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 465            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 466
 467            # Use binary search for faster mz range filtering
 468            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
 469            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
 470            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
 471
 472            raw_data_sub = (
 473                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 474            )
 475            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 476            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 477            myEIC = EIC_Data(
 478                scans=raw_data_sub["scan"].values,
 479                time=raw_data_sub["scan_time"].values,
 480                eic=raw_data_sub["intensity"].values,
 481            )
 482            # Smooth EIC
 483            smoothed_eic = self.smooth_tic(myEIC.eic)
 484            smoothed_eic[smoothed_eic < 0] = 0
 485            myEIC.eic_smoothed = smoothed_eic
 486            self.eics[mz] = myEIC
 487
 488        # Get limits of mass features using EIC centroid detector and integrate
 489        mf_df["area"] = np.nan
 490        for idx, mass_feature in mf_df.iterrows():
 491            mz = mass_feature.mz
 492            apex_scan = mass_feature.apex_scan
 493
 494            # Pull EIC data and find apex scan index
 495            myEIC = self.eics[mz]
 496            self.mass_features[idx]._eic_data = myEIC
 497            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 498
 499            # Find left and right limits of peak using EIC centroid detector, add to EICData
 500            centroid_eics = self.eic_centroid_detector(
 501                myEIC.time,
 502                myEIC.eic_smoothed,
 503                mass_feature.intensity * 1.1,
 504                apex_indexes=[int(apex_index)],
 505            )
 506            l_a_r_scan_idx = [i for i in centroid_eics]
 507            if len(l_a_r_scan_idx) > 0:
 508                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
 509                # Find the number of consecutive non-zero values in the EIC segment
 510                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
 511                # Find the longest run of consecutive True values
 512                if np.any(mask):
 513                    # Find indices where mask changes value
 514                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
 515                    starts = np.where(diff == 1)[0]
 516                    ends = np.where(diff == -1)[0]
 517                    consecutive_scans = (ends - starts).max()
 518                else:
 519                    consecutive_scans = 0
 520                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
 521                    self.mass_features.pop(idx)
 522                    continue
 523                # Add start and final scan to mass_features and EICData
 524                left_scan, right_scan = (
 525                    myEIC.scans[l_a_r_scan_idx[0][0]],
 526                    myEIC.scans[l_a_r_scan_idx[0][2]],
 527                )
 528                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 529                myEIC.apexes = myEIC.apexes + mf_scan_apex
 530                self.mass_features[idx].start_scan = left_scan
 531                self.mass_features[idx].final_scan = right_scan
 532
 533                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 534                area = np.trapz(
 535                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 536                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 537                )
 538                mf_df.at[idx, "area"] = area
 539                myEIC.areas = myEIC.areas + [area]
 540                self.eics[mz] = myEIC
 541                self.mass_features[idx]._area = area
 542            else:
 543                if drop_if_fail is True:
 544                    self.mass_features.pop(idx)
 545
 546        if drop_duplicates:
 547            # Prepare mass feature dataframe
 548            mf_df = self.mass_features_to_df()
 549
 550            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
 551            # Keep the first mass feature (highest persistence)
 552            for idx, mass_feature in mf_df.iterrows():
 553                mz = mass_feature.mz
 554                apex_scan = mass_feature.apex_scan
 555
 556                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 557                mf_df_sub = mf_df[
 558                    mf_df["mz_diff_ppm"]
 559                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 560                    * 10**6
 561                ].copy()
 562
 563                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 564                for idx2, mass_feature2 in mf_df_sub.iterrows():
 565                    if idx2 != idx:
 566                        if (
 567                            mass_feature2.start_scan >= mass_feature.start_scan
 568                            and mass_feature2.final_scan <= mass_feature.final_scan
 569                        ):
 570                            if idx2 in self.mass_features.keys():
 571                                self.mass_features.pop(idx2)
 572
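    # Illustrative usage of integrate_mass_features above (`lcms` as above, after find_mass_features):
    # >>> lcms.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
    # >>> mf = next(iter(lcms.mass_features.values()))
    # >>> mf.start_scan, mf.final_scan  # EIC limits found by the centroid detector, data dependent
    # (95, 110)
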
 573    def find_c13_mass_features(self):
 574        """Mark likely C13 isotopes and connect to monoisotopic mass features.
 575
 576        Returns
 577        -------
 578        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 579
 580        Raises
 581        ------
 582        ValueError
 583            If no mass features are found.
 584        """
 585        verbose = self.parameters.lc_ms.verbose_processing
 586        if verbose:
 587            print("evaluating mass features for C13 isotopes")
 588        if self.mass_features is None:
 589            raise ValueError("No mass features found, run find_mass_features() first")
 590
 591        # Data prep for sparse distance matrix
 592        dims = ["mz", "scan_time"]
 593        mf_df = self.mass_features_to_df().copy()
 594        # Drop mass features that have no area (these are likely to be noise)
 595        mf_df = mf_df[mf_df["area"].notnull()]
 596        mf_df["mf_id"] = mf_df.index.values
 597        dims = ["mz", "scan_time"]
 598
 599        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 600        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 601
 602        mz_diff = 1.003355  # C13-C12 mass difference
 603        tol = [
 604            mf_df["mz"].median()
 605            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 606            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
 607        ]  # mz tolerance (absolute, scaled from the relative tolerance); scan_time tolerance in minutes
 608
 609        # Compute inter-feature distances
 610        distances = None
 611        for i in range(len(dims)):
 612            # Construct k-d tree
 613            values = mf_df[dims[i]].values
 614            tree = KDTree(values.reshape(-1, 1))
 615
 616            max_tol = tol[i]
 617            if dims[i] == "mz":
 618                # Maximum absolute tolerance
 619                max_tol = mz_diff + tol[i]
 620
 621            # Compute sparse distance matrix
 622            # the larger the max_tol, the slower this operation is
 623            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 624
 625            # Only consider forward case, exclude diagonal
 626            sdm = sparse.triu(sdm, k=1)
 627
 628            if dims[i] == "mz":
 629                min_tol = mz_diff - tol[i]
 630                # Get only the ones that are above the min tol
 631                idx = sdm.data > min_tol
 632
 633                # Reconstruct sparse distance matrix
 634                sdm = sparse.coo_matrix(
 635                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 636                    shape=(len(values), len(values)),
 637                )
 638
 639            # Cast as binary matrix
 640            sdm.data = np.ones_like(sdm.data)
 641
 642            # Stack distances
 643            if distances is None:
 644                distances = sdm
 645            else:
 646                distances = distances.multiply(sdm)
 647
 648        # Extract indices of within-tolerance points
 649        distances = distances.tocoo()
 650        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 651
 652        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 653        pairs_mf = pairs.copy()
 654        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 655        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 656
 657        # Connect monoisotopic masses with isotopologues within mass_features
 658        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 659        for mono in monos:
 660            self.mass_features[mono].monoisotopic_mf_id = mono
 661        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 662        while not pairs_iso_df.empty:
 663            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 664            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 665            for iso in m1_isos:
 666                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 667                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 668                if len(parent) > 1:
 669                    # Choose the parent that is closest in time to the isotopologue
 670                    parent_time = [self.mass_features[p].retention_time for p in parent]
 671                    time_diff = [
 672                        np.abs(self.mass_features[iso].retention_time - x)
 673                        for x in parent_time
 674                    ]
 675                    parent = parent[np.argmin(time_diff)]
 676                else:
 677                    parent = parent[0]
 678                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 679                    parent
 680                ].monoisotopic_mf_id
 681                if self.mass_features[iso].monoisotopic_mf_id is not None:
 682                    mass_diff = (
 683                        self.mass_features[iso].mz
 684                        - self.mass_features[
 685                            self.mass_features[iso].monoisotopic_mf_id
 686                        ].mz
 687                    )
 688                    self.mass_features[iso].isotopologue_type = "13C" + str(
 689                        int(round(mass_diff, 0))
 690                    )
 691
 692            # Drop the mono and iso from the pairs_iso_df
 693            pairs_iso_df = pairs_iso_df.drop(
 694                index=monos, errors="ignore"
 695            )  # Drop pairs whose parent is itself a child of a root
 696            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 697            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 698
 699            if not pairs_iso_df.empty:
 700                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 701                monos = np.setdiff1d(
 702                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 703                )
 704        if verbose:
 705            # Report fraction of compounds annotated with isotopes
 706            mf_df["c13_flag"] = np.where(
 707                np.logical_or(
 708                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 709                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 710                ),
 711                1,
 712                0,
 713            )
 714            print(
 715                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 716                + " of mass features have or are C13 isotopes"
 717            )
 718
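    # Sketch of the isotope-pairing criterion applied above (hypothetical numbers): a candidate C13
    # isotopologue must sit ~1.003355 m/z above its monoisotopic partner (within the relative mz
    # tolerance) and elute within half the rt cluster tolerance.
    # >>> mz_mono, mz_iso = 300.1000, 301.1034
    # >>> tol_abs = 300.1000 * lcms.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
    # >>> abs((mz_iso - mz_mono) - 1.003355) < tol_abs  # True for typical ppm-scale tolerances
    # True
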
 719    def deconvolute_ms1_mass_features(self):
 720        """Deconvolute MS1 mass features
 721
 722        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum across the EIC of the mass feature.
 723
 724        Parameters
 725        ----------
 726        None
 727
 728        Returns
 729        -------
 730        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 731        and associated_mass_features_deconvoluted attributes to the mass features in the
 732        mass_features attribute of the LCMSBase object.
 733
 734        Raises
 735        ------
 736        ValueError
 737            If no mass features are found; run find_mass_features() first.
 738            If no EICs are found; run integrate_mass_features() first.
 739
 740        """
 741        # Checks for set mass_features and eics
 742        if self.mass_features is None:
 743            raise ValueError(
 744                "No mass features found, did you run find_mass_features() first?"
 745            )
 746
 747        if self.eics == {}:
 748            raise ValueError(
 749                "No EICs found, did you run integrate_mass_features() first?"
 750            )
 751
 752        if 1 not in self._ms_unprocessed.keys():
 753            raise ValueError("No unprocessed MS1 spectra found.")
 754
 755        # Prep ms1 data
 756        ms1_data = self._ms_unprocessed[1].copy()
 757        ms1_data = ms1_data.set_index("scan")
 758
 759        # Prep mass feature summary
 760        mass_feature_df = self.mass_features_to_df()
 761
 762        # Loop through each mass feature
 763        for mf_id, mass_feature in self.mass_features.items():
 764            # Check that the mass_feature.mz attribute == the mz of the mass feature in the mass_feature_df
 765            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 766                continue
 767
 768            # Get the left and right limits of the EIC of the mass feature
 769            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 770
 771            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
 772            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 773            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 774
 775            # Get the centroided masses of the mass feature
 776            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 777
 778            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 779            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 780                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 781            ]
 782
 783            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
 784            ms1_data_sub["mz_diff_rel"] = (
 785                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 786                / ms1_data_sub["mz"]
 787            )
 788            ms1_data_sub = ms1_data_sub[
 789                ms1_data_sub["mz_diff_rel"]
 790                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 791            ].reset_index(drop=True)
 792
 793            # Group by mass_feature_mz and scan and sum intensity
 794            ms1_data_sub_group = (
 795                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 796                .sum()
 797                .reset_index()
 798            )
 799
 800            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 801            corr = (
 802                ms1_data_sub_group.pivot(
 803                    index="scan", columns="mass_feature_mz", values="intensity"
 804                )
 805                .fillna(0)
 806                .corr()
 807            )
 808
 809            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation above the deconvolution minimum
 810            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 811
 812            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
 813            try:
 814                corr_subset = corr.loc[mass_feature.mz,]
 815            except KeyError:
 816                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
 817                continue
 818
 819            corr_subset = corr_subset[corr_subset > decon_corr_min]
 820
 821            # Get the masses from the mass spectrum that are the result of the deconvolution
 822            mzs_decon = corr_subset.index.values
 823
 824            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 825            mzs_decon_idx = [
 826                i
 827                for i, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 828                if mz in mzs_decon
 829            ]
 830            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 831
 832            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 833            if (
 834                mass_feature.ms1_peak.abundance
 835                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 836            ):
 837                mass_feature.mass_spectrum_deconvoluted_parent = True
 838            else:
 839                mass_feature.mass_spectrum_deconvoluted_parent = False
 840
 841            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 842            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 843            mass_feature_df_sub = mass_feature_df[
 844                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 845                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 846            ].copy()
 847            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 848            mass_feature_df_sub["mz_diff_ppm"] = [
 849                np.abs(mzs_decon - mz).min() / mz * 10**6
 850                for mz in mass_feature_df_sub["mz"]
 851            ]
 852            # Subset mass_feature_df to only include mass features within the cluster mz tolerance (in ppm) of the deconvoluted masses
 853            mfs_associated_decon = mass_feature_df_sub[
 854                mass_feature_df_sub["mz_diff_ppm"]
 855                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 856            ].index.values
 857
 858            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
 859
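    # Illustrative usage of deconvolute_ms1_mass_features above (`lcms` as above, after integration):
    # >>> lcms.deconvolute_ms1_mass_features()
    # >>> mf = next(iter(lcms.mass_features.values()))
    # >>> mf.mass_spectrum_deconvoluted_parent, len(mf._ms_deconvoluted_idx)  # data dependent
    # (True, 7)
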
 860    def _remove_redundant_mass_features(
 861        self,
 862    ) -> None:
 863        """
 864        Identify and remove redundant mass features that are likely contaminants based on their m/z values and scan frequency. 
 865        Especially useful for HILIC data where signals do not return to baseline between peaks or for data with significant background noise.
 866        
 867        Contaminants are characterized by:
 868        1. Similar m/z values (within ppm_tolerance)
 869        2. High frequency across scan numbers (ubiquitous presence)
 870    
 871        Notes
 872        -----
 873        Depends on self.mass_features being populated; uses the parameters in self.parameters.lc_ms for tolerances (mass_feature_cluster_mz_tolerance_rel).
 874        """
 875        ppm_tolerance = self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 1e6
 876        min_scan_frequency = self.parameters.lc_ms.redundant_scan_frequency_min
 877        n_retain = self.parameters.lc_ms.redundant_feature_retain_n
 878
 879        df = self.mass_features_to_df()
 880
 881        if df.empty:
 882            return
 883        # df index should be mf_id
 884        if 'mf_id' not in df.columns:
 885            if 'mf_id' in df.index.names:
 886                df = df.reset_index()
 887            else:
 888                raise ValueError("DataFrame must contain 'mf_id' column or index.")
 889        
 890        # Sort by m/z for efficient grouping
 891        df_sorted = df.sort_values('mz').reset_index(drop=True)
 892        
 893        # Calculate total number of unique scans for frequency calculation
 894        # Calculate total possible scans (check the cluster rt tolerance and the min rt and max rt of the data)
 895        total_time = self.scan_df['scan_time'].max() - self.scan_df['scan_time'].min()
 896        cluster_rt_tolerance = self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 897        # If the feature was detected in every possible scan (and then rolled up), it would be in this many scans
 898        total_scans = int(total_time / cluster_rt_tolerance) + 1
 899
 900        # Group similar m/z values using ppm tolerance
 901        mz_groups = []
 902        current_group = []
 903        
 904        for i, row in df_sorted.iterrows():
 905            current_mz = row['mz']
 906            
 907            if not current_group:
 908                # Start first group
 909                current_group = [i]
 910            else:
 911                # Check if current m/z is within tolerance of group representative
 912                group_representative_mz = df_sorted.iloc[current_group[0]]['mz']
 913                ppm_diff = abs(current_mz - group_representative_mz) / group_representative_mz * 1e6
 914                
 915                if ppm_diff <= ppm_tolerance:
 916                    # Add to current group
 917                    current_group.append(i)
 918                else:
 919                    # Start new group, but first process current group
 920                    if len(current_group) > 0:
 921                        mz_groups.append(current_group)
 922                    current_group = [i]
 923        
 924        # Don't forget the last group
 925        if current_group:
 926            mz_groups.append(current_group)
 927        
 928        # Analyze each m/z group for contaminant characteristics
 929        
 930        for group_indices in mz_groups:
 931            group_data = df_sorted.iloc[group_indices]
 932            
 933            # Calculate group statistics
 934            unique_scans = group_data['apex_scan'].nunique()
 935            scan_frequency = unique_scans / total_scans
 936            
 937            # Check if this group meets contaminant criteria
 938            if scan_frequency >= min_scan_frequency:
 939                group_data = group_data.sort_values('intensity', ascending=False)
 940                non_representative_mf_id = group_data.iloc[n_retain:]['mf_id'].tolist()  # These will be removed
 941
 942                self.mass_features = {
 943                    k: v for k, v in self.mass_features.items() if k not in non_representative_mf_id
 944                }
 945
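    # Sketch of the redundancy criterion used above (hypothetical numbers): features sharing an m/z
    # within the ppm tolerance are flagged as redundant when they occupy at least
    # redundant_scan_frequency_min of the possible rt windows; only the top n_retain are kept.
    # >>> unique_scans, total_scans = 45, 50
    # >>> unique_scans / total_scans >= 0.8  # assuming redundant_scan_frequency_min = 0.8
    # True
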
 946    def _remove_mass_features_by_peak_metrics(self) -> None:
 947        """Remove mass features based on peak metrics defined in mass_feature_attribute_filter_dict.
 948        
 949        This method filters mass features based on various peak shape metrics and quality indicators
 950        such as noise scores, Gaussian similarity, tailing factors, dispersity index, etc.
 951        
 952        The filtering criteria are defined in the mass_feature_attribute_filter_dict parameter,
 953        which should contain attribute names as keys and filter specifications as values.
 954        
 955        Filter specification format:
 956        {attribute_name: {'value': threshold, 'operator': comparison}}
 957        
 958        Available operators:
 959        - '>' or 'greater': Keep features where attribute > threshold
 960        - '<' or 'less': Keep features where attribute < threshold  
 961        - '>=' or 'greater_equal': Keep features where attribute >= threshold
 962        - '<=' or 'less_equal': Keep features where attribute <= threshold
 963        
 964        Examples:
 965        - {'noise_score_max': {'value': 0.5, 'operator': '>='}} - Keep features with noise_score_max >= 0.5
 966        - {'dispersity_index': {'value': 0.1, 'operator': '<'}} - Keep features with dispersity_index < 0.1
 967        - {'gaussian_similarity': {'value': 0.7, 'operator': '>='}} - Keep features with gaussian_similarity >= 0.7
 968        
 969        Returns
 970        -------
 971        None
 972            Modifies self.mass_features in place by removing filtered features.
 973            
 974        Raises
 975        ------
 976        ValueError
 977            If no mass features are found, if an invalid attribute is specified, or if filter specification is malformed.
 978        """
 979        if self.mass_features is None or len(self.mass_features) == 0:
 980            raise ValueError("No mass features found, run find_mass_features() first")
 981            
 982        filter_dict = self.parameters.lc_ms.mass_feature_attribute_filter_dict
 983        
 984        if not filter_dict:
 985            # No filtering criteria specified, return early
 986            return
 987            
 988        verbose = self.parameters.lc_ms.verbose_processing
 989        initial_count = len(self.mass_features)
 990        
 991        if verbose:
 992            print(f"Filtering mass features using peak metrics. Initial count: {initial_count}")
 993            
 994        # List to collect IDs of mass features to remove
 995        features_to_remove = []
 996        
 997        for mf_id, mass_feature in self.mass_features.items():
 998            should_remove = False
 999            
1000            for attribute_name, filter_spec in filter_dict.items():
1001                # Validate filter specification structure
1002                if not isinstance(filter_spec, dict):
1003                    raise ValueError(f"Filter specification for '{attribute_name}' must be a dictionary with 'value' and 'operator' keys")
1004                
1005                if 'value' not in filter_spec or 'operator' not in filter_spec:
1006                    raise ValueError(f"Filter specification for '{attribute_name}' must contain both 'value' and 'operator' keys")
1007                
1008                threshold_value = filter_spec['value']
1009                operator = filter_spec['operator'].lower().strip()
1010                
1011                # Validate operator
1012                valid_operators = {'>', '<', '>=', '<=', 'greater', 'less', 'greater_equal', 'less_equal'}
1013                if operator not in valid_operators:
1014                    raise ValueError(f"Invalid operator '{operator}' for attribute '{attribute_name}'. Valid operators: {valid_operators}")
1015                
1016                # Normalize operator names
1017                operator_map = {
1018                    'greater': '>',
1019                    'less': '<', 
1020                    'greater_equal': '>=',
1021                    'less_equal': '<='
1022                }
1023                operator = operator_map.get(operator, operator)
1024                
1025                # Get the attribute value from the mass feature
1026                try:
1027                    if hasattr(mass_feature, attribute_name):
1028                        attribute_value = getattr(mass_feature, attribute_name)
1029                    else:
1030                        raise ValueError(f"Mass feature does not have attribute '{attribute_name}'")
1031                        
1032                    # Handle None values or attributes that haven't been calculated
1033                    if attribute_value is None:
1034                        if verbose:
1035                            print(f"Warning: Mass feature {mf_id} has None value for '{attribute_name}'. Removing feature.")
1036                        should_remove = True
1037                        break
1038                        
1039                    # Handle numpy arrays (like half_height_width which returns mean)
1040                    if hasattr(attribute_value, '__len__') and not isinstance(attribute_value, str):
1041                        # For arrays, we use the mean or appropriate summary statistic
1042                        if attribute_name == 'half_height_width':
1043                            # half_height_width property already returns the mean
1044                            pass
1045                        else:
1046                            attribute_value = float(np.mean(attribute_value))
1047                    
1048                    # Handle NaN values
1049                    if np.isnan(float(attribute_value)):
1050                        if verbose:
1051                            print(f"Warning: Mass feature {mf_id} has NaN value for '{attribute_name}'. Removing feature.")
1052                        should_remove = True
1053                        break
1054                    
1055                    # Apply the threshold comparison based on operator
1056                    attribute_value = float(attribute_value)
1057                    threshold_value = float(threshold_value)
1058                    
1059                    if operator == '>' and not (attribute_value > threshold_value):
1060                        should_remove = True
1061                        break
1062                    elif operator == '<' and not (attribute_value < threshold_value):
1063                        should_remove = True
1064                        break
1065                    elif operator == '>=' and not (attribute_value >= threshold_value):
1066                        should_remove = True
1067                        break  
1068                    elif operator == '<=' and not (attribute_value <= threshold_value):
1069                        should_remove = True
1070                        break
1071                        
1072                except (AttributeError, ValueError, TypeError) as e:
1073                    if verbose:
1074                        print(f"Error evaluating filter '{attribute_name}' for mass feature {mf_id}: {e}")
1075                    should_remove = True
1076                    break
1077            
1078            if should_remove:
1079                features_to_remove.append(mf_id)
1080        
1081        # Remove filtered mass features
1082        for mf_id in features_to_remove:
1083            del self.mass_features[mf_id]
1084        
1085        # Clean up unassociated EICs and ms1 data
1086        self._remove_unassociated_eics()
1087        self._remove_unassociated_ms1_spectra()
1088            
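    # Illustrative configuration of the peak-metric filter used above (hypothetical thresholds; `lcms` as above):
    # >>> lcms.parameters.lc_ms.mass_feature_attribute_filter_dict = {
    # ...     "noise_score_max": {"value": 0.5, "operator": ">="},
    # ...     "dispersity_index": {"value": 0.1, "operator": "<"},
    # ... }
    # >>> lcms._remove_mass_features_by_peak_metrics()
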
1089    def _remove_unassociated_eics(self) -> None:
1090        """Remove EICs that are not associated with any mass features.
1091
1092        This method cleans up the eics attribute by removing any EICs that do not correspond to
1093        any mass features currently stored in the mass_features attribute. This is useful for
1094        freeing up memory and ensuring that only relevant EICs are retained.
1095
1096        Returns
1097        -------
1098        None
1099            Modifies self.eics in place by removing unassociated EICs.
1100        """
1101        if self.mass_features is None or len(self.mass_features) == 0:
1102            self.eics = {}
1103            return
1104
1105        # Get the set of m/z values associated with current mass features
1106        associated_mzs = {mf.mz for mf in self.mass_features.values()}
1107
1108        # Remove EICs that are not associated with any mass features
1109        self.eics = {mz: eic for mz, eic in self.eics.items() if mz in associated_mzs}
1110    
1111    def _remove_unassociated_ms1_spectra(self) -> None:
1112        """Remove MS1 spectra that are not associated with any mass features.
1113        This method cleans up the _ms attribute by removing any stored MS1 spectra that do not correspond to
1114        any mass features currently stored in the mass_features attribute. This is useful for freeing up memory
1115        and ensuring that only relevant MS1 spectra are retained.
1116
1117        Returns
1118        -------
1119        None
1120        """
1121        if self.mass_features is None or len(self.mass_features) == 0:
1122            self._ms_unprocessed = {}
1123            return
1124
1125        # Get the set of apex scan numbers associated with current mass features
1126        associated_ms1_scans = {mf.apex_scan for mf in self.mass_features.values()}
1127        associated_ms1_scans = [int(scan) for scan in associated_ms1_scans]
1128        
1129        # Get keys within the _ms attribute (these are individual MassSpectrum objects)
1130        current_stored_spectra = list(set(self._ms.keys()))
1131        if len(current_stored_spectra) == 0:
1132            return
1133        current_stored_spectra = [int(scan) for scan in current_stored_spectra]
1134
1135        # Filter the current_stored_spectra to only ms1 scans
1136        current_stored_spectra_ms1 = [scan for scan in current_stored_spectra if scan in self.ms1_scans]
1137
1138        # Remove MS1 spectra that are not associated with any mass features
1139        scans_to_drop = [scan for scan in current_stored_spectra_ms1 if scan not in associated_ms1_scans]
1140        for scan in scans_to_drop:
1141            if scan in self._ms:
1142                del self._ms[scan]
1143
1144class PHCalculations:
1145    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
1146
1147    Notes
1148    -----
1149    This class is intended to be used as a mixin for the LCMSBase class.
1150
1151    Methods
1152    -------
1153    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
1154        Sparse implementation of a mean filter.
1155    * embed_unique_indices(a).
1156        Creates an array of indices, sorted by unique element.
1157    * sparse_upper_star(idx, V).
1158        Sparse implementation of an upper star filtration.
1159    * check_if_grid(data).
1160        Check if the data is gridded in mz space.
1161    * grid_data(data).
1162        Grid the data in the mz dimension.
1163    * find_mass_features_ph(ms_level=1, grid=True).
1164        Find mass features within an LCMSBase object using persistent homology.
1165    * cluster_mass_features(drop_children=True).
1166        Cluster regions of interest.
1167    """
1168
1169    @staticmethod
1170    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
1171        """Sparse implementation of a mean filter.
1172
1173        Parameters
1174        ----------
1175        idx : :obj:`~numpy.array`
1176            Edge indices for each dimension (MxN).
1177        V : :obj:`~numpy.array`
1178            Array of intensity data (Mx1).
1179        radius : float or list
1180            Radius of the sparse filter in each dimension. Values less than
1181            zero indicate no connectivity in that dimension.
1182
1183        Returns
1184        -------
1185        :obj:`~numpy.array`
1186            Filtered intensities (Mx1).
1187
1188        Notes
1189        -----
1190        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
1191        This is a static method.
1192        """
1193
1194        # Copy indices
1195        idx = idx.copy().astype(V.dtype)
1196
1197        # Scale
1198        for i, r in enumerate(radius):
1199            # Increase inter-index distance
1200            if r < 1:
1201                idx[:, i] *= 2
1202
1203            # Do nothing
1204            elif r == 1:
1205                pass
1206
1207            # Decrease inter-index distance
1208            else:
1209                idx[:, i] /= r
1210
1211        # Connectivity matrix
1212        cmat = KDTree(idx)
1213        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1214        cmat.setdiag(1)
1215
1216        # Pair indices
1217        I, J = cmat.nonzero()
1218
1219        # Delete cmat
1220        cmat_shape = cmat.shape
1221        del cmat
1222
1223        # Sum over columns
1224        V_sum = sparse.bsr_matrix(
1225            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
1226        ).diagonal(0)
1227
1228        # Count over columns
1229        V_count = sparse.bsr_matrix(
1230            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
1231        ).diagonal(0)
1232
1233        return V_sum / V_count
1234
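    # Illustrative sketch for sparse_mean_filter (not part of the library source);
    # the toy values below are assumptions for demonstration only. Three points
    # spaced one index apart are each averaged with their neighbours within a
    # radius of one index unit.
    #
    # >>> import numpy as np
    # >>> idx = np.array([[0.0], [1.0], [2.0]])
    # >>> V = np.array([0.0, 3.0, 6.0])
    # >>> PHCalculations.sparse_mean_filter(idx, V, radius=[1])
    # The end points are averaged with one neighbour each and the middle point
    # with both, giving approximately [1.5, 3.0, 4.5].
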
1235    @staticmethod
1236    def embed_unique_indices(a):
1237        """Creates an array of indices, sorted by unique element.
1238
1239        Parameters
1240        ----------
1241        a : :obj:`~numpy.array`
1242            Array of unique elements (Mx1).
1243
1244        Returns
1245        -------
1246        :obj:`~numpy.array`
1247            Array of values with unique decimal suffixes embedded (Mx1).
1248
1249        Notes
1250        -----
1251        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1252        This is a static method.
1253        """
1254
1255        def count_tens(n):
1256            # Count tens
1257            ntens = (n - 1) // 10
1258
1259            while True:
1260                ntens_test = (ntens + n - 1) // 10
1261
1262                if ntens_test == ntens:
1263                    return ntens
1264                else:
1265                    ntens = ntens_test
1266
1267        def arange_exclude_10s(n):
1268            # How many 10s will there be?
1269            ntens = count_tens(n)
1270
1271            # Base array
1272            arr = np.arange(0, n + ntens)
1273
1274            # Exclude 10s
1275            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
1276
1277            return arr
1278
1279        # Creates an array of indices, sorted by unique element
1280        idx_sort = np.argsort(a)
1281        idx_unsort = np.argsort(idx_sort)
1282
1283        # Sorts records array so all unique elements are together
1284        sorted_a = a[idx_sort]
1285
1286        # Returns the unique values, the index of the first occurrence,
1287        # and the count for each element
1288        vals, idx_start, count = np.unique(
1289            sorted_a, return_index=True, return_counts=True
1290        )
1291
1292        # Splits the indices into separate arrays
1293        splits = np.split(idx_sort, idx_start[1:])
1294
1295        # Creates unique indices for each split
1296        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
1297
1298        # Reorders according to input array
1299        idx_unq = idx_unq[idx_unsort]
1300
1301        # Magnitude of each index
1302        exp = np.log10(
1303            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
1304        )
1305        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1306
1307        # Result
1308        return a + idx_unq / idx_unq_mag
1309
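    # Illustrative sketch for embed_unique_indices (not part of the library
    # source), using assumed toy values: duplicate entries receive distinct
    # decimal suffixes so that downstream steps see strictly unique values.
    #
    # >>> import numpy as np
    # >>> PHCalculations.embed_unique_indices(np.array([5, 5, 7]))
    # array([5. , 5.1, 7. ])
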
1310    @staticmethod
1311    def roll_up_dataframe(
1312        df: pd.DataFrame,
1313        sort_by: str,
1314        tol: list,
1315        relative: list,
1316        dims: list,
1317        memory_opt_threshold: int = 10000,
1318    ):
1319        """Subset data by rolling up into apex in appropriate dimensions.
1320
1321        Parameters
1322        ----------
1323        df : pd.DataFrame
1324            The input data containing "dims" columns and the "sort_by" column.
1325        sort_by : str
1326            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
1327            (i.e., the mass feature with the highest value in the sort_by column).
1328        dims : list
1329            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1330        tol : list
1331            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1332            A relative tolerance is interpreted as a fraction of the parent feature's value in that dimension
1333            (e.g., a relative m/z tolerance), while an absolute tolerance is applied as-is,
1334            in the units of that dimension.
1335        relative : list
1336            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1337        memory_opt_threshold : int, optional
1338            Minimum number of rows to trigger memory-optimized processing. Default is 10000.
1339
1340        Returns
1341        -------
1342        pd.DataFrame
1343            A DataFrame with only the rolled up mass features, with the original index and columns.
1344
1345
1346        Raises
1347        ------
1348        ValueError
1349            If the input data is not a pandas DataFrame.
1350            If the input data does not have columns for each of the dimensions in "dims".
1351            If the length of "dims", "tol", and "relative" do not match.
1352        """
1353        # Check that df is a DataFrame and has columns for each of the dims
1354        if not isinstance(df, pd.DataFrame):
1355            raise ValueError("Data must be a pandas DataFrame")
1356        for dim in dims:
1357            if dim not in df.columns:
1358                raise ValueError(f"Data must have a column for {dim}")
1359        if len(dims) != len(tol) or len(dims) != len(relative):
1360            raise ValueError(
1361                "Dimensions, tolerances, and relative flags must be the same length"
1362            )
1363
1364        og_columns = df.columns.copy()
1365
1366        # Unindex the data, but keep the original index
1367        if df.index.name is not None:
1368            og_index = df.index.name
1369        else:
1370            og_index = "index"
1371        df = df.reset_index(drop=False)
1372
1373        # Sort data by sort_by column, and reindex
1374        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1375
1376        # Pre-compute all values arrays
1377        all_values = [df[dim].values for dim in dims]
1378
1379        # Choose processing method based on dataframe size
1380        if len(df) >= memory_opt_threshold:
1381            # Memory-optimized approach for large dataframes
1382            distances = PHCalculations._compute_distances_memory_optimized(
1383                all_values, tol, relative
1384            )
1385        else:
1386            # Faster approach for smaller dataframes
1387            distances = PHCalculations._compute_distances_original(
1388                all_values, tol, relative
1389            )
1390
1391        # Process pairs with original logic but memory optimizations
1392        distances = distances.tocoo()
1393        pairs = np.stack((distances.row, distances.col), axis=1)
1394        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
1395        del distances, pairs  # Free memory immediately
1396
1397        to_drop = []
1398        while not pairs_df.empty:
1399            # Find root_parents and their children (original logic preserved)
1400            root_parents = np.setdiff1d(
1401                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1402            )
1403            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1404            to_drop.extend(children_of_roots)  # Collect children to drop after the loop
1405
1406            # Remove root_children as possible parents from pairs_df for next iteration
1407            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1408            pairs_df = pairs_df.reset_index().set_index("child")
1409            # Remove root_children as possible children from pairs_df for next iteration
1410            pairs_df = pairs_df.drop(index=children_of_roots)
1411
1412            # Prepare for next iteration
1413            pairs_df = pairs_df.reset_index().set_index("parent")
1414
1415        # Convert to numpy array for efficient dropping
1416        to_drop = np.array(to_drop)
1417
1418        # Drop mass features that are not cluster parents
1419        df_sub = df.drop(index=to_drop)
1420
1421        # Set index back to og_index and only keep original columns
1422        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]
1423
1424        return df_sub
1425
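    # Illustrative sketch for roll_up_dataframe (not part of the library source).
    # The toy features and tolerances below are assumptions for demonstration:
    # the second row falls within both tolerances of the more persistent first
    # row, so it is rolled up (dropped); the third row is kept.
    #
    # >>> import pandas as pd
    # >>> feats = pd.DataFrame({
    # ...     "mz": [100.0000, 100.0001, 200.0],
    # ...     "scan_time": [5.00, 5.01, 7.00],
    # ...     "persistence": [10.0, 2.0, 5.0],
    # ... })
    # >>> PHCalculations.roll_up_dataframe(
    # ...     feats, sort_by="persistence", dims=["mz", "scan_time"],
    # ...     tol=[1e-5, 0.1], relative=[True, False],
    # ... )
    # Only the rows with persistence 10.0 and 5.0 remain, keeping their original index.
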
1426    @staticmethod
1427    def _compute_distances_original(all_values, tol, relative):
1428        """Original distance computation method for smaller datasets.
1429
1430        This method computes the pairwise distances between features in the dataset
1431        using a straightforward approach. It is suitable for smaller datasets where
1432        memory usage is not a primary concern.
1433
1434        Parameters
1435        ----------
1436        all_values : list of :obj:`~numpy.array`
1437            List of arrays containing the values for each dimension.
1438        tol : list of float
1439            List of tolerances for each dimension.
1440        relative : list of bool
1441            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1442
1443        Returns
1444        -------
1445        :obj:`~scipy.sparse.coo_matrix`
1446            Sparse matrix indicating pairwise distances within tolerances.
1447        """
1448        # Compute inter-feature distances with memory optimization
1449        distances = None
1450        for i in range(len(all_values)):
1451            values = all_values[i]
1452            # Use single precision if possible to reduce memory
1453            tree = KDTree(values.reshape(-1, 1).astype(np.float32))
1454
1455            max_tol = tol[i]
1456            if relative[i] is True:
1457                max_tol = tol[i] * values.max()
1458
1459            # Compute sparse distance matrix of pairs within the maximum tolerance
1460            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1461
1462            # Only consider forward case, exclude diagonal
1463            sdm = sparse.triu(sdm, k=1)
1464
1465            # Process relative distances more efficiently
1466            if relative[i] is True:
1467                # Vectorized computation without creating intermediate arrays
1468                row_values = values[sdm.row]
1469                valid_idx = sdm.data <= tol[i] * row_values
1470
1471                # Reconstruct sparse matrix more efficiently
1472                sdm = sparse.coo_matrix(
1473                    (
1474                        np.ones(valid_idx.sum(), dtype=np.uint8),
1475                        (sdm.row[valid_idx], sdm.col[valid_idx]),
1476                    ),
1477                    shape=(len(values), len(values)),
1478                )
1479            else:
1480                # Cast as binary matrix with smaller data type
1481                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)
1482
1483            # Stack distances with memory-efficient multiplication
1484            if distances is None:
1485                distances = sdm
1486            else:
1487                # Combine constraints across dimensions (elementwise AND via multiply)
1488                distances = distances.multiply(sdm)
1489                del sdm  # Free memory immediately
1490
1491        return distances
1492
1493    @staticmethod
1494    def _compute_distances_memory_optimized(all_values, tol, relative):
1495        """Memory-optimized distance computation for large datasets.
1496
1497        This method computes the pairwise distances between features in the dataset
1498        using a more memory-efficient approach. It is suitable for larger datasets
1499        where memory usage is a primary concern.
1500
1501        Parameters
1502        ----------
1503        all_values : list of :obj:`~numpy.array`
1504            List of arrays containing the values for each dimension.
1505        tol : list of float
1506            List of tolerances for each dimension.
1507        relative : list of bool
1508            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1509
1510        Returns
1511        -------
1512        :obj:`~scipy.sparse.coo_matrix`
1513            Sparse matrix indicating pairwise distances within tolerances.
1514        """
1515        # Compute distance matrix for first dimension (full matrix as before)
1516        values_0 = all_values[0].astype(np.float32)
1517        tree_0 = KDTree(values_0.reshape(-1, 1))
1518
1519        max_tol_0 = tol[0]
1520        if relative[0] is True:
1521            max_tol_0 = tol[0] * values_0.max()
1522
1523        # Compute sparse distance matrix for first dimension
1524        distances = tree_0.sparse_distance_matrix(
1525            tree_0, max_tol_0, output_type="coo_matrix"
1526        )
1527        distances = sparse.triu(distances, k=1)
1528
1529        # Process relative distances for first dimension
1530        if relative[0] is True:
1531            row_values = values_0[distances.row]
1532            valid_idx = distances.data <= tol[0] * row_values
1533            distances = sparse.coo_matrix(
1534                (
1535                    np.ones(valid_idx.sum(), dtype=np.uint8),
1536                    (distances.row[valid_idx], distances.col[valid_idx]),
1537                ),
1538                shape=(len(values_0), len(values_0)),
1539            )
1540        else:
1541            distances.data = np.ones(len(distances.data), dtype=np.uint8)
1542
1543        # For remaining dimensions, work only on chunks defined by first dimension pairs
1544        if len(all_values) > 1:
1545            distances_coo = distances.tocoo()
1546            valid_pairs = []
1547
1548            # Process each pair from first dimension
1549            for idx in range(len(distances_coo.data)):
1550                i, j = distances_coo.row[idx], distances_coo.col[idx]
1551                is_valid_pair = True
1552
1553                # Check remaining dimensions for this specific pair
1554                for dim_idx in range(1, len(all_values)):
1555                    values = all_values[dim_idx]
1556                    val_i, val_j = values[i], values[j]
1557
1558                    # Distance between the pair in this dimension
1559                    distance_ij = abs(val_i - val_j)
1560
1561                    # Check if this pair satisfies the tolerance for this
1562                    # dimension; a relative tolerance is a fraction of the
1563                    # parent value (val_i), an absolute tolerance is used
1564                    # as-is in the units of that dimension
1565                    if relative[dim_idx] is True:
1566                        if distance_ij > tol[dim_idx] * val_i:
1567                            is_valid_pair = False
1568                            break
1569                    else:
1570                        if distance_ij > tol[dim_idx]:
1571                            is_valid_pair = False
1572                            break
1573
1574                if is_valid_pair:
1575                    valid_pairs.append((i, j))
1576
1577            # Rebuild distances matrix with only valid pairs
1578            if valid_pairs:
1579                valid_pairs = np.array(valid_pairs)
1580                distances = sparse.coo_matrix(
1581                    (
1582                        np.ones(len(valid_pairs), dtype=np.uint8),
1583                        (valid_pairs[:, 0], valid_pairs[:, 1]),
1584                    ),
1585                    shape=(len(values_0), len(values_0)),
1586                )
1587            else:
1588                # No valid pairs found
1589                distances = sparse.coo_matrix(
1590                    (len(values_0), len(values_0)), dtype=np.uint8
1591                )
1592
1593        return distances
1594
1595    def sparse_upper_star(self, idx, V):
1596        """Sparse implementation of an upper star filtration.
1597
1598        Parameters
1599        ----------
1600        idx : :obj:`~numpy.array`
1601            Edge indices for each dimension (MxN).
1602        V : :obj:`~numpy.array`
1603            Array of intensity data (Mx1).
1604        Returns
1605        -------
1606        idx : :obj:`~numpy.array`
1607            Index of filtered points (Mx1).
1608        persistence : :obj:`~numpy.array`
1609            Persistence of each filtered point (Mx1).
1610
1611        Notes
1612        -----
1613        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1614        """
1615
1616        # Invert
1617        V = -1 * V.copy().astype(int)
1618
1619        # Embed indices
1620        V = self.embed_unique_indices(V)
1621
1622        # Connectivity matrix
1623        cmat = KDTree(idx)
1624        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1625        cmat.setdiag(1)
1626        cmat = sparse.triu(cmat)
1627
1628        # Pairwise maximums of the inverted intensities (upper-star edge weights)
1629        I, J = cmat.nonzero()
1630        d = np.maximum(V[I], V[J])
1631
1632        # Delete connectivity matrix
1633        cmat_shape = cmat.shape
1634        del cmat
1635
1636        # Sparse distance matrix
1637        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1638
1639        # Delete pairwise mins
1640        del d, I, J
1641
1642        # Persistent homology
1643        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1644
1645        # Bound death values
1646        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1647
1648        # Construct tree to query against
1649        tree = KDTree(V.reshape((-1, 1)))
1650
1651        # Get the indexes of the first nearest neighbor by birth
1652        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1653
1654        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1655
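    # Illustrative sketch for sparse_upper_star (not part of the library source),
    # using an assumed toy trace with a single apex; `lcms_obj` stands in for an
    # LCMSBase instance that mixes in PHCalculations.
    #
    # >>> import numpy as np
    # >>> idx = np.arange(5, dtype=float).reshape(-1, 1)
    # >>> V = np.array([1, 5, 10, 4, 1])
    # >>> pidx, pers = lcms_obj.sparse_upper_star(idx, V)
    # The most persistent returned point maps back to the apex (index 2), and its
    # persistence is roughly the apex height above the surrounding baseline.
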
1656    def check_if_grid(self, data):
1657        """Check if the data are gridded in mz space.
1658
1659        Parameters
1660        ----------
1661        data : DataFrame
1662            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1663
1664        Returns
1665        -------
1666        bool
1667            True if the data is gridded in the mz direction, False otherwise.
1668
1669        Notes
1670        -----
1671        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1672        """
1673        # Calculate the difference between consecutive mz values in a single scan
1674        dat_check = data.copy().reset_index(drop=True)
1675        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1676        mz_diff_min = (
1677            dat_check.groupby("scan")["mz_diff"].min().min()
1678        )  # within each scan, what is the smallest mz difference between consecutive mz values
1679
1680        # Find the minimum mz difference between mz values in the data, regardless of scan
1681        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1682        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1683        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1684
1685        # Get minimum mz_diff between mz values in the data
1686        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1687
1688        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1689        if mz_diff_min_raw < mz_diff_min:
1690            return False
1691        else:
1692            return True
1693
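    # Illustrative sketch for check_if_grid (not part of the library source),
    # with assumed toy data and an assumed `lcms_obj` LCMSBase instance: scans
    # that share a common m/z axis pass the check, while m/z values that jitter
    # between scans fail it.
    #
    # >>> import pandas as pd
    # >>> gridded = pd.DataFrame(
    # ...     {"scan": [1, 1, 2, 2], "mz": [100.00, 100.01, 100.00, 100.01]}
    # ... )
    # >>> lcms_obj.check_if_grid(gridded)
    # True
    # >>> jittered = gridded.assign(mz=[100.00, 100.01, 100.001, 100.011])
    # >>> lcms_obj.check_if_grid(jittered)
    # False
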
1694    def grid_data(self, data, attempts=5):
1695        """Grid the data in the mz dimension.
1696
1697        Data must be gridded prior to persistent homology calculations and to computing an averaged mass spectrum.
1698
1699        Parameters
1700        ----------
1701        data : DataFrame
1702            The input data containing mz, scan, scan_time, and intensity columns.
1703        attempts : int, optional
1704            The number of attempts to grid the data. Default is 5.
1705
1706        Returns
1707        -------
1708        DataFrame
1709            The gridded data with mz, scan, scan_time, and intensity columns.
1710
1711        Raises
1712        ------
1713        ValueError
1714            If gridding fails after the specified number of attempts.
1715        """
1716        attempt_i = 0
1717        while attempt_i < attempts:
1718            attempt_i += 1
1719            data = self._grid_data(data)
1720
1721            if self.check_if_grid(data):
1722                return data
1723
1724        if not self.check_if_grid(data):
1725            raise ValueError(
1726                "Gridding failed after "
1727                + str(attempt_i)
1728                + " attempts. Please check the data."
1729            )
1730        else:
1731            return data
1732
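    # Illustrative usage sketch for grid_data (not part of the library source),
    # assuming `lcms_obj` is an LCMSBase instance and `data_thres` is a DataFrame
    # of thresholded profile data with scan, mz, and intensity columns:
    #
    # >>> if not lcms_obj.check_if_grid(data_thres):
    # ...     data_thres = lcms_obj.grid_data(data_thres)
    # >>> lcms_obj.check_if_grid(data_thres)
    # True
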
1733    def _grid_data(self, data):
1734        """Internal method to grid the data in the mz dimension.
1735
1736        Notes
1737        -----
1738        This method is called by the grid_data method and should not be called directly.
1739        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
1740        but it does not check if the data is already gridded or if the gridding is successful.
1741
1742        Parameters
1743        ----------
1744        data : pd.DataFrame
1745            The input data to grid.
1746
1747        Returns
1748        -------
1749        pd.DataFrame
1750            The data after attempting to grid it in the mz dimension.
1751        """
1752        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1753        data_w = data.copy().reset_index(drop=True)
1754        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1755        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1756
1757        # Need high intensity mz values first so they are parents in the output pairs stack
1758        dat_mz = data_w[["mz", "intensity"]].sort_values(
1759            by=["intensity"], ascending=False
1760        )
1761        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1762
1763        # Construct KD tree
1764        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1765        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1766        sdm = sparse.triu(sdm, k=1)
1767        sdm.data = np.ones_like(sdm.data)
1768        distances = sdm.tocoo()
1769        pairs = np.stack((distances.row, distances.col), axis=1)
1770
1771        # Cull pairs to just get root
1772        to_drop = []
1773        while len(pairs) > 0:
1774            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1775            id_root_parents = np.isin(pairs[:, 0], root_parents)
1776            children_of_roots = np.unique(pairs[id_root_parents, 1])
1777            to_drop = np.append(to_drop, children_of_roots)
1778
1779            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1780            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1781            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1782        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1783        mz_dat_np = (
1784            dat_mz[["mz"]]
1785            .sort_values(by=["mz"])
1786            .reset_index(drop=True)
1787            .values.flatten()
1788        )
1789
1790        # Sort data by mz and recast mz to nearest value in mz_dat_np
1791        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1792        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1793        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1794
1795        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1796        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1797        new_data_w = (
1798            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1799            .groupby(["scan", "mz"])["intensity"]
1800            .sum()
1801            .reset_index()
1802        )
1803        new_data_w = (
1804            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1805            .reset_index(drop=True)
1806            .copy()
1807        )
1808
1809        return new_data_w
1810
1811    def find_mass_features_ph(self, ms_level=1, grid=True):
1812        """Find mass features within an LCMSBase object using persistent homology.
1813
1814        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1815
1816        Parameters
1817        ----------
1818        ms_level : int, optional
1819            The MS level to use. Default is 1.
1820        grid : bool, optional
1821            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1822
1823        Raises
1824        ------
1825        ValueError
1826            If no MS level data is found on the object.
1827            If data is not gridded and grid is False.
1828
1829        Returns
1830        -------
1831        None, but assigns the mass_features attribute to the object.
1832
1833        Notes
1834        -----
1835        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1836        """
1837        # Check that ms_level is a key in self._ms_unprocessed
1838        if ms_level not in self._ms_unprocessed.keys():
1839            raise ValueError(
1840                "No MS level "
1841                + str(ms_level)
1842                + " data found, did you instantiate with parser specific to MS level?"
1843            )
1844
1845        # Get ms data
1846        data = self._ms_unprocessed[ms_level].copy()
1847
1848        # Drop rows with missing intensity values and reset index
1849        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1850
1851        # Threshold data
1852        dims = ["mz", "scan_time"]
1853        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1854        persistence_threshold = (
1855            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1856        )
1857        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1858
1859        # Check if gridded, if not, grid
1860        gridded_mz = self.check_if_grid(data_thres)
1861        if gridded_mz is False:
1862            if grid is False:
1863                raise ValueError(
1864                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1865                )
1866            else:
1867                data_thres = self.grid_data(data_thres)
1868
1869        # Add scan_time
1870        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1871        # Process in chunks if required
1872        if len(data_thres) > 10000:
1873            return self._find_mass_features_ph_partition(
1874                data_thres, dims, persistence_threshold
1875            )
1876        else:
1877            # Process all at once
1878            return self._find_mass_features_ph_single(
1879                data_thres, dims, persistence_threshold
1880            )
1881
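    # Illustrative usage sketch for find_mass_features_ph (not part of the library
    # source), assuming `lcms_obj` is an LCMSBase instance with unprocessed MS1
    # profile data loaded:
    #
    # >>> lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
    # >>> len(lcms_obj.mass_features)  # dict of LCMSMassFeature objects keyed by id
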
1882    def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold):
1883        """Process all data at once (original logic)."""
1884        # Build factors
1885        factors = {
1886            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1887            for dim in dims
1888        }
1889
1890        # Build indexes
1891        index = {
1892            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1893            for dim in factors
1894        }
1895
1896        # Smooth and process
1897        mass_features_df = self._process_partition_ph(
1898            data_thres, index, dims, persistence_threshold
1899        )
1900
1901        # Roll up within chunk to remove duplicates
1902        mass_features_df = self.roll_up_dataframe(
1903            df=mass_features_df,
1904            sort_by="persistence",
1905            dims=["mz", "scan_time"],
1906            tol=[
1907                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1908                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1909            ],
1910            relative=[True, False],
1911        )
1912        mass_features_df = mass_features_df.reset_index(drop=True)
1913
1914        # Populate mass_features attribute
1915        self._populate_mass_features(mass_features_df)
1916
1917    def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold):
1918        """Partition the persistent homology mass feature detection for large datasets.
1919
1920        This method splits the input data into overlapping scan partitions, processes each partition to detect mass features
1921        using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute.
1922
1923        Parameters
1924        ----------
1925        data_thres : pd.DataFrame
1926            The thresholded input data containing mass spectrometry information.
1927        dims : list
1928            List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection.
1929        persistence_threshold : float
1930            Minimum persistence value required for a detected mass feature to be retained.
1931
1932        Returns
1933        -------
1934        None
1935            Populates the mass_features attribute of the object with detected mass features.
1936        """
1937        all_mass_features = []
1938
1939        # Split scans into partitions
1940        unique_scans = sorted(data_thres["scan"].unique())
1941        unique_scans_n = len(unique_scans)
1942
1943        # Calculate partition size in scans based on goal
1944        partition_size_goal = 5000
1945        scans_per_partition = max(
1946            1, partition_size_goal // (len(data_thres) // unique_scans_n)
1947        )
1948        if scans_per_partition == 0:
1949            scans_per_partition = 1
1950
1951        # Make partitions based on scans, with overlapping in partitioned scans
1952        scan_overlap = 4
1953        partition_scans = []
1954        for i in range(0, unique_scans_n, scans_per_partition):
1955            start_idx = max(0, i - scan_overlap)
1956            end_idx = min(
1957                unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap
1958            )
1959            scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]]
1960            partition_scans.append(scans_group)
1961
1962        # Set index to scan for faster filtering
1963        data_thres = data_thres.set_index("scan")
1964        for scans in partition_scans:
1965            # Select the data for this partition's scans (the scan lists already include the overlap)
1966            partition_data = data_thres.loc[scans].reset_index(drop=False).copy()
1967
1968            if len(partition_data) == 0:
1969                continue
1970
1971            # Build factors for this partition
1972            factors = {
1973                dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32)
1974                for dim in dims
1975            }
1976
1977            # Build indexes
1978            index = {
1979                dim: np.searchsorted(factors[dim], partition_data[dim]).astype(
1980                    np.float32
1981                )
1982                for dim in factors
1983            }
1984
1985            # Process partition
1986            partition_features = self._process_partition_ph(
1987                partition_data, index, dims, persistence_threshold
1988            )
1989
1990            if len(partition_features) == 0:
1991                continue
1992
1993            # Roll up within partition
1994            partition_features = self.roll_up_dataframe(
1995                df=partition_features,
1996                sort_by="persistence",
1997                dims=["mz", "scan_time"],
1998                tol=[
1999                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2000                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2001                ],
2002                relative=[True, False],
2003            )
2004            partition_features = partition_features.reset_index(drop=True)
2005
2006            if len(partition_features) > 0:
2007                all_mass_features.append(partition_features)
2008
2009        # Combine results from all partitions
2010        if all_mass_features:
2011            combined_features = pd.concat(all_mass_features, ignore_index=True)
2012
2013            # Sort by persistence
2014            combined_features = combined_features.sort_values(
2015                by="persistence", ascending=False
2016            ).reset_index(drop=True)
2017
2018            # Remove duplicates from overlapping regions
2019            combined_features = self.roll_up_dataframe(
2020                df=combined_features,
2021                sort_by="persistence",
2022                dims=["mz", "scan_time"],
2023                tol=[
2024                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2025                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2026                ],
2027                relative=[True, False],
2028            )
2029
2030            # Reset the index after the final roll-up
2031            combined_features = combined_features.reset_index(drop=True)
2032
2033            # Populate mass_features attribute
2034            self._populate_mass_features(combined_features)
2035        else:
2036            self.mass_features = {}
2037
2038    def _process_partition_ph(self, partition_data, index, dims, persistence_threshold):
2039        """Process a single partition with persistent homology."""
2040        # Smooth data
2041        iterations = self.parameters.lc_ms.ph_smooth_it
2042        smooth_radius = [
2043            self.parameters.lc_ms.ph_smooth_radius_mz,
2044            self.parameters.lc_ms.ph_smooth_radius_scan,
2045        ]
2046
2047        index_array = np.vstack([index[dim] for dim in dims]).T
2048        V = partition_data["intensity"].values
2049        resid = np.inf
2050
2051        for i in range(iterations):
2052            # Previous iteration
2053            V_prev = V.copy()
2054            resid_prev = resid
2055            V = self.sparse_mean_filter(index_array, V, radius=smooth_radius)
2056
2057            # Calculate residual with previous iteration
2058            resid = np.sqrt(np.mean(np.square(V - V_prev)))
2059
2060            # Evaluate convergence
2061            if i > 0:
2062                # Percent change in residual
2063                test = np.abs(resid - resid_prev) / resid_prev
2064
2065                # Exit criteria
2066                if test <= 0:
2067                    break
2068
2069        # Overwrite values
2070        partition_data = partition_data.copy()
2071        partition_data["intensity"] = V
2072
2073        # Use persistent homology to find regions of interest
2074        pidx, pers = self.sparse_upper_star(index_array, V)
2075        pidx = pidx[pers > 1]
2076        pers = pers[pers > 1]
2077
2078        if len(pidx) == 0:
2079            return pd.DataFrame()
2080
2081        # Get peaks
2082        peaks = partition_data.iloc[pidx, :].reset_index(drop=True)
2083
2084        # Add persistence column
2085        peaks["persistence"] = pers
2086        mass_features = peaks.sort_values(
2087            by="persistence", ascending=False
2088        ).reset_index(drop=True)
2089
2090        # Filter by persistence threshold
2091        mass_features = mass_features.loc[
2092            mass_features["persistence"] > persistence_threshold, :
2093        ].reset_index(drop=True)
2094
2095        return mass_features
2096
2097    def _populate_mass_features(self, mass_features_df):
2098        """Populate the mass_features attribute from a DataFrame.
2099
2100        Parameters
2101        ----------
2102        mass_features_df : pd.DataFrame
2103            DataFrame containing mass feature information.
2104            Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute.
2105
2106        Returns
2107        -------
2108        None, but assigns the mass_features attribute to the object.
2109        """
2110        # Rename scan and scan_time columns to apex_scan and retention_time
2111        mass_features_df = mass_features_df.rename(
2112            columns={"scan": "apex_scan", "scan_time": "retention_time"}
2113        )
2114
2115        # Populate mass_features attribute
2116        self.mass_features = {}
2117        for row in mass_features_df.itertuples():
2118            row_dict = mass_features_df.iloc[row.Index].to_dict()
2119            lcms_feature = LCMSMassFeature(self, **row_dict)
2120            self.mass_features[lcms_feature.id] = lcms_feature
2121
2122        if self.parameters.lc_ms.verbose_processing:
2123            print("Found " + str(len(mass_features_df)) + " initial mass features")
2124
2125    def find_mass_features_ph_centroid(self, ms_level=1):
2126        """Find mass features within an LCMSBase object using persistent homology-type approach but with centroided data.
2127
2128        Parameters
2129        ----------
2130        ms_level : int, optional
2131            The MS level to use. Default is 1.
2132
2133        Raises
2134        ------
2135        ValueError
2136            If no MS level data is found on the object.
2137
2138        Returns
2139        -------
2140        None, but assigns the mass_features attribute to the object.
2141        """
2142        # Check that ms_level is a key in self._ms_unprocessed
2143        if ms_level not in self._ms_unprocessed.keys():
2144            raise ValueError(
2145                "No MS level "
2146                + str(ms_level)
2147                + " data found, did you instantiate with parser specific to MS level?"
2148            )
2149
2150        # Work with reference instead of copy
2151        data = self._ms_unprocessed[ms_level]
2152
2153        # Calculate threshold first to avoid multiple operations
2154        max_intensity = data["intensity"].max()
2155        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity
2156
2157        # Create single filter condition and apply to required columns only
2158        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
2159        required_cols = ["mz", "intensity", "scan"]
2160        data_thres = data.loc[valid_mask, required_cols].copy()
2161        data_thres["persistence"] = data_thres["intensity"]
2162
2163        # Merge with required scan data
2164        scan_subset = self.scan_df[["scan", "scan_time"]]
2165        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
2166        del data_thres, scan_subset
2167
2168        # Order by scan_time and then mz to ensure features near in rt are processed together
2169        # It's ok that different scans are in different partitions; we will roll up later
2170        mf_df = mf_df.sort_values(
2171            by=["scan_time", "mz"], ascending=[True, True]
2172        ).reset_index(drop=True)
2173        partition_size = 10000
2174        partitions = [
2175            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
2176            for i in range(0, len(mf_df), partition_size)
2177        ]
2178        del mf_df
2179
2180        # Run roll_up_dataframe on each partition
2181        rolled_partitions = []
2182        for part in partitions:
2183            rolled = self.roll_up_dataframe(
2184                df=part,
2185                sort_by="persistence",
2186                dims=["mz", "scan_time"],
2187                tol=[
2188                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2189                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2190                ],
2191                relative=[True, False],
2192            )
2193            rolled_partitions.append(rolled)
2194        del partitions
2195
2196        # Combine results from all partitions and run a final roll-up so that
2197        # features detected in the overlap regions of adjacent partitions are
2198        # collapsed into the most persistent feature
2199        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
2200        del rolled_partitions
2201
2202        # Reorder by persistence before final roll-up
2203        mf_df_final = mf_df_final.sort_values(
2204            by="persistence", ascending=False
2205        ).reset_index(drop=True)
2206
2207        mf_df_final = self.roll_up_dataframe(
2208            df=mf_df_final,
2209            sort_by="persistence",
2210            dims=["mz", "scan_time"],
2211            tol=[
2212                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2213                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2214            ],
2215            relative=[True, False],
2216        )
2217        # reset index
2218        mf_df_final = mf_df_final.reset_index(drop=True)
2219
2220        # Combine rename and sort operations
2221        mass_features = (
2222            mf_df_final.rename(
2223                columns={"scan": "apex_scan", "scan_time": "retention_time"}
2224            )
2225            .sort_values(by="persistence", ascending=False)
2226            .reset_index(drop=True)
2227        )
2228        del mf_df_final  # Free memory
2229
2230        # Order by persistence and reset index
2231        mass_features = mass_features.sort_values(
2232            by="persistence", ascending=False
2233        ).reset_index(drop=True)
2234
2235        self.mass_features = {}
2236        for idx, row in mass_features.iterrows():
2237            row_dict = row.to_dict()
2238            lcms_feature = LCMSMassFeature(self, **row_dict)
2239            self.mass_features[lcms_feature.id] = lcms_feature
2240
2241        if self.parameters.lc_ms.verbose_processing:
2242            print("Found " + str(len(mass_features)) + " initial mass features")
2243
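    # Illustrative usage sketch for find_mass_features_ph_centroid (not part of
    # the library source), assuming `lcms_obj` holds centroided MS1 data in its
    # _ms_unprocessed dictionary:
    #
    # >>> lcms_obj.find_mass_features_ph_centroid(ms_level=1)
    # >>> len(lcms_obj.mass_features)
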
2244    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
2245        """Cluster mass features
2246
2247        Based on their proximity in the mz and scan_time dimensions, prioritizing the mass features with the highest value of the sort_by column (persistence by default).
2248
2249        Parameters
2250        ----------
2251        drop_children : bool, optional
2252            Whether to drop the mass features that are not cluster parents. Default is True.
2253        sort_by : str, optional
2254            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
2255
2256        Raises
2257        ------
2258        ValueError
2259            If no mass features are found.
2260            If too many mass features are found.
2261
2262        Returns
2263        -------
2264        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
2265        """
2266        if self.mass_features is None:
2267            raise ValueError("No mass features found, run find_mass_features() first")
2268        if len(self.mass_features) > 400000:
2269            raise ValueError(
2270                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
2271            )
2272        dims = ["mz", "scan_time"]
2273        mf_df_og = self.mass_features_to_df()
2274        mf_df = mf_df_og.copy()
2275
2276        tol = [
2277            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2278            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2279        ]  # mz, in relative; scan_time in minutes
2280        relative = [True, False]
2281
2282        # Roll up mass features based on their proximity in the declared dimensions
2283        mf_df_new = self.roll_up_dataframe(
2284            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
2285        )
2286
2287        mf_df["cluster_parent"] = np.where(
2288            np.isin(mf_df.index, mf_df_new.index), True, False
2289        )
2290
2291        # get mass feature ids of features that are not cluster parents
2292        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
2293        if drop_children is True:
2294            # Drop mass features that are not cluster parents from self
2295            self.mass_features = {
2296                k: v
2297                for k, v in self.mass_features.items()
2298                if k not in cluster_daughters
2299            }
2300        else:
2301            return cluster_daughters
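
    # Illustrative usage sketch for cluster_mass_features (not part of the library
    # source), assuming `lcms_obj` already has mass_features assigned by one of
    # the find_mass_features_* methods:
    #
    # >>> lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")
    # >>> # Only cluster-parent mass features remain in lcms_obj.mass_features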
class LCCalculations:
  37class LCCalculations:
  38    """Methods for performing LC calculations on mass spectra data.
  39
  40    Notes
  41    -----
  42    This class is intended to be used as a mixin for the LCMSBase class.
  43
  44    Methods
  45    -------
  46    * get_max_eic(eic_data).
  47        Returns the maximum EIC value from the given EIC data. A static method.
  48    * smooth_tic(tic).
  49        Smooths the TIC data using the specified smoothing method and settings.
  50    * eic_centroid_detector(rt, eic, max_eic).
  51        Performs EIC centroid detection on the given EIC data.
  52    * find_nearest_scan(rt).
  53        Finds the nearest scan to the given retention time.
  54    * get_average_mass_spectrum(scan_list, apex_scan, spectrum_mode="profile", ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None).
  55        Returns an averaged mass spectrum object.
  56    * find_mass_features(ms_level=1).
  57        Find regions of interest for a given MS level (default is MS1).
  58    * integrate_mass_features(drop_if_fail=False, ms_level=1).
  59        Integrate mass features of interest and extracts EICs.
  60    * find_c13_mass_features().
  61        Evaluate mass features and mark likely C13 isotopes.
  62    * deconvolute_ms1_mass_features().
  63        Deconvolute mass features' ms1 mass spectra.
  64    """
  65
  66    @staticmethod
  67    def get_max_eic(eic_data: dict):
  68        """Returns the maximum EIC value from the given EIC data.
  69
  70        Notes
  71        -----
  72        This is a static method.
  73
  74        Parameters
  75        ----------
  76        eic_data : dict
  77            A dictionary containing EIC data.
  78
  79        Returns
  80        -------
  81        float
  82            The maximum EIC value.
  83        """
  84        max_eic = 0
  85        for eic_data in eic_data.values():
  86            ind_max_eic = max(eic_data.get("EIC"))
  87            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
  88
  89        return max_eic
  90
  91    def smooth_tic(self, tic):
  92        """Smooths the TIC or EIC data using the specified smoothing method and settings.
  93
  94        Parameters
  95        ----------
  96        tic : numpy.ndarray
  97            The TIC (or EIC) data to be smoothed.
  98
  99        Returns
 100        -------
 101        numpy.ndarray
 102            The smoothed TIC data.
 103        """
 104        implemented_smooth_method = self.parameters.lc_ms.implemented_smooth_method
 105
 106        pol_order = self.parameters.lc_ms.savgol_pol_order
 107
 108        window_len = self.parameters.lc_ms.smooth_window
 109
 110        window = self.parameters.lc_ms.smooth_method
 111
 112        return sp.smooth_signal(
 113            tic, window_len, window, pol_order, implemented_smooth_method
 114        )
 115
 116    def eic_centroid_detector(self, rt, eic, max_eic, apex_indexes=[]):
 117        """Performs EIC centroid detection on the given EIC data.
 118
 119        Parameters
 120        ----------
 121        rt : numpy.ndarray
 122            The retention time data.
 123        eic : numpy.ndarray
 124            The EIC data.
 125        max_eic : float
 126            The maximum EIC value.
 127        apex_indexes : list, optional
 128            The apexes of the EIC peaks. Defaults to [], which means that the apexes will be calculated by the function.
 129
 130        Returns
 131        -------
 132        numpy.ndarray
 133            The indexes of left, apex, and right limits as a generator.
 134        """
 135
 136        max_prominence = self.parameters.lc_ms.peak_max_prominence_percent
 137
 138        max_height = self.parameters.lc_ms.peak_height_max_percent
 139
 140        signal_threshold = self.parameters.lc_ms.eic_signal_threshold
 141
 142        min_peak_datapoints = self.parameters.lc_ms.min_peak_datapoints
 143
 144        peak_derivative_threshold = self.parameters.lc_ms.peak_derivative_threshold
 145
 146        include_indexes = sp.peak_picking_first_derivative(
 147            domain=rt,
 148            signal=eic,
 149            max_height=max_height,
 150            max_prominence=max_prominence,
 151            max_signal=max_eic,
 152            min_peak_datapoints=min_peak_datapoints,
 153            peak_derivative_threshold=peak_derivative_threshold,
 154            signal_threshold=signal_threshold,
 155            correct_baseline=False,
 156            plot_res=False,
 157            apex_indexes=apex_indexes,
 158        )
 159        #include_indexes is a generator of tuples (left_index, apex_index, right_index)
 160        include_indexes = list(include_indexes)
 161        # Add check to make sure that there are at least 1/2 of min_peak_datapoints on either side of the apex
 162        indicies = [x for x in include_indexes]
 163        for idx in indicies:
 164            if (idx[1] - idx[0] < min_peak_datapoints / 2) or (
 165                idx[2] - idx[1] < min_peak_datapoints / 2
 166            ):
 167                include_indexes.remove(idx)
 168        return include_indexes
 169
 170    def find_nearest_scan(self, rt):
 171        """Finds the nearest scan to the given retention time.
 172
 173        Parameters
 174        ----------
 175        rt : float
 176            The retention time (in minutes) to find the nearest scan for.
 177
 178        Returns
 179        -------
 180        int
 181            The scan number of the nearest scan.
 182        """
 183        array_rt = np.array(self.retention_time)
 184
 185        scan_index = (np.abs(array_rt - rt)).argmin()
 186
 187        real_scan = self.scans_number[scan_index]
 188
 189        return real_scan
 190
 191    def add_peak_metrics(self, remove_by_metrics=True):
 192        """Add peak metrics to the mass features.
 193
 194        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
 195
 196        Parameters
 197        ----------
 198        remove_by_metrics : bool, optional
 199            If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity,
 200            dispersity index, and noise score. Default is True, which checks the setting in the processing parameters.
 201            If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters.
 202        """
 203        # Check that at least some mass features have eic data
 204        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
 205            raise ValueError(
 206                "No mass features have EIC data. Run integrate_mass_features first."
 207            )
 208
 209        for mass_feature in self.mass_features.values():
 210            # Check if the mass feature has been integrated
 211            if mass_feature._eic_data is not None:
 212                # Calculate peak metrics
 213                mass_feature.calc_half_height_width()
 214                mass_feature.calc_tailing_factor()
 215                mass_feature.calc_dispersity_index()
 216                mass_feature.calc_gaussian_similarity()
 217                mass_feature.calc_noise_score()
 218        
 219        # Remove mass features by peak metrics if designated in parameters
 220        if self.parameters.lc_ms.remove_mass_features_by_peak_metrics and remove_by_metrics:
 221            self._remove_mass_features_by_peak_metrics()
 222
 223    def get_average_mass_spectrum(
 224        self,
 225        scan_list,
 226        apex_scan,
 227        spectrum_mode="profile",
 228        ms_level=1,
 229        auto_process=True,
 230        use_parser=False,
 231        perform_checks=True,
 232        polarity=None,
 233        ms_params=None,
 234    ):
 235        """Returns an averaged mass spectrum object
 236
 237        Parameters
 238        ----------
 239        scan_list : list
 240            List of scan numbers to average.
 241        apex_scan : int
 242            Number of the apex scan
 243        spectrum_mode : str, optional
 244            The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging.
 245        ms_level : int, optional
 246            The MS level to use. Defaults to 1.
 247        auto_process : bool, optional
 248            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
 249        use_parser : bool, optional
 250            If True, the mass spectra will be obtained from the parser. Defaults to False.
 251        perform_checks : bool, optional
 252            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary!  ms_unprocessed dictionary also must be indexed on scan
 253        polarity : int, optional
 254            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
 255        ms_params : MSParameters, optional
 256            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
 257
 258        Returns
 259        -------
 260        MassSpectrumProfile
 261            The averaged mass spectrum object.
 262
 263        Raises
 264        ------
 265        ValueError
 266            If the spectrum mode is not "profile".
 267            If the MS level is not found in the unprocessed mass spectra dictionary.
 268            If not all scan numbers are found in the unprocessed mass spectra dictionary.
 269        """
 270        if perform_checks:
 271            if spectrum_mode != "profile":
 272                raise ValueError("Averaging only supported for profile mode")
 273
 274        if polarity is None:
 275            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 276            if self.polarity == "negative":
 277                polarity = -1
 278            elif self.polarity == "positive":
 279                polarity = 1
 280            else:
 281                raise ValueError(
 282                    "Polarity not set for dataset; polarity must be either 'positive' or 'negative'"
 283                )
 284
 285        # if not using_parser, check that scan numbers are in _ms_unprocessed
 286        if not use_parser:
 287            if perform_checks:
 288                # Set index to scan for faster lookup
 289                ms_df = (
 290                    self._ms_unprocessed[ms_level]
 291                    .copy()
 292                    .set_index("scan", drop=False)
 293                    .sort_index()
 294                )
 295                # Check that all scan numbers are in the ms_df before indexing
 296                if not all(np.isin(scan_list, ms_df.index)):
 297                    raise ValueError(
 298                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
 299                    )
 300                my_ms_df = ms_df.loc[scan_list]
 301            else:
 302                my_ms_df = (
 303                    pd.DataFrame({"scan": scan_list})
 304                    .set_index("scan")
 305                    .join(self._ms_unprocessed[ms_level], how="left")
 306                )
 307
 308        if use_parser:
 309            ms_list = [
 310                self.spectra_parser.get_mass_spectrum_from_scan(
 311                    x, spectrum_mode=spectrum_mode, auto_process=False
 312                )
 313                for x in scan_list
 314            ]
 315            ms_mz = [x._mz_exp for x in ms_list]
 316            ms_int = [x._abundance for x in ms_list]
 317            my_ms_df = []
 318            for i in np.arange(len(ms_mz)):
 319                my_ms_df.append(
 320                    pd.DataFrame(
 321                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
 322                    )
 323                )
 324            my_ms_df = pd.concat(my_ms_df)
 325
 326        if not self.check_if_grid(my_ms_df):
 327            my_ms_df = self.grid_data(my_ms_df)
 328
 329        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
 330
 331        ms = ms_from_array_profile(
 332            my_ms_ave.mz,
 333            my_ms_ave.intensity,
 334            self.file_location,
 335            polarity=polarity,
 336            auto_process=False,
 337        )
 338
 339        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 340        if ms is not None:
 341            if ms_params is not None:
 342                ms.parameters = ms_params
 343            ms.scan_number = apex_scan
 344            if auto_process:
 345                ms.process_mass_spec()
 346        return ms
 347
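    # Illustrative usage of get_average_mass_spectrum (not part of the source):
    # assuming `lcms` is an LCMSBase instance whose _ms_unprocessed dictionary
    # already holds MS1 profile data for scans 100-102, an averaged spectrum
    # around the apex scan could be built as:
    #     ms_ave = lcms.get_average_mass_spectrum([100, 101, 102], apex_scan=101)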
 348    def find_mass_features(self, ms_level=1, grid=True):
 349        """Find mass features within an LCMSBase object
 350
 351        Note that this is a wrapper function that calls the find_mass_features_ph or find_mass_features_ph_centroid functions, and can be extended to support other peak picking methods in the future.
 352
 353        Parameters
 354        ----------
 355        ms_level : int, optional
 356            The MS level to use for peak picking. Default is 1.
 357        grid : bool, optional
 358            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
 359            used for persistent homology peak picking for profile data only. Default is True.
 360
 361        Raises
 362        ------
 363        ValueError
 364            If no MS level data is found on the object.
 365            If persistent homology peak picking is attempted on non-profile mode data.
 366            If data is not gridded and grid is False.
 367            If peak picking method is not implemented.
 368
 369        Returns
 370        -------
 371        None, but assigns the mass_features and eics attributes to the object.
 372
 373        """
 374        pp_method = self.parameters.lc_ms.peak_picking_method
 375
 376        if pp_method == "persistent homology":
 377            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 378            if all(msx_scan_df["ms_format"] == "profile"):
 379                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
 380            else:
 381                raise ValueError(
 382                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
 383                        ms_level
 384                    )
 385                )
 386        elif pp_method == "centroided_persistent_homology":
 387            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
 388            if all(msx_scan_df["ms_format"] == "centroid"):
 389                self.find_mass_features_ph_centroid(ms_level=ms_level)
 390            else:
 391                raise ValueError(
 392                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
 393                        ms_level
 394                    )
 395                )
 396        else:
 397            raise ValueError("Peak picking method not implemented")
 398        
 399        # Remove redundant mass features if designated in parameters
 400        if self.parameters.lc_ms.remove_redundant_mass_features:
 401            self._remove_redundant_mass_features()
 402
 403    def integrate_mass_features(
 404        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
 405    ):
 406        """Integrate mass features and extract EICs.
 407
 408        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
 409
 410        Parameters
 411        ----------
 412        drop_if_fail : bool, optional
 413            Whether to drop mass features if the EIC limit calculations fail.
 414            Default is True.
 415        drop_duplicates : bool, optional
 416            Whether to drop mass features that appear to be duplicates
 417            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
 418            Default is True.
 419        ms_level : int, optional
 420            The MS level to use. Default is 1.
 421
 422        Raises
 423        ------
 424        ValueError
 425            If no mass features are found.
 426            If no MS level data is found for the given MS level (either in data or in the scan data)
 427
 428        Returns
 429        -------
 430        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
 431
 432        Notes
 433        -----
 434        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
 435        """
 436        # Check if there is data
 437        if ms_level in self._ms_unprocessed.keys():
 438            raw_data = self._ms_unprocessed[ms_level].copy()
 439        else:
 440            raise ValueError("No MS level " + str(ms_level) + " data found")
 441        if self.mass_features is not None:
 442            mf_df = self.mass_features_to_df().copy()
 443        else:
 444            raise ValueError(
 445                "No mass features found, did you run find_mass_features() first?"
 446            )
 447
 448        # Subset scan data to only include correct ms_level
 449        scan_df_sub = self.scan_df[
 450            self.scan_df["ms_level"] == int(ms_level)
 451        ].reset_index(drop=True)
 452        if scan_df_sub.empty:
 453            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
 454        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
 455
 456        mzs_to_extract = np.unique(mf_df["mz"].values)
 457        mzs_to_extract.sort()
 458
 459        # Pre-sort raw_data by mz for faster filtering
 460        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
 461        raw_data_mz = raw_data_sorted["mz"].values
 462
 463        # Get EICs for each unique mz in mass features list
 464        for mz in mzs_to_extract:
 465            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
 466            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
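            # For scale (illustrative numbers only): with eic_tolerance_ppm = 5 and
            # mz = 500.0, the extraction window is +/- 0.0025 Da, i.e. 499.9975-500.0025.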
 467
 468            # Use binary search for faster mz range filtering
 469            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
 470            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
 471            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
 472
 473            raw_data_sub = (
 474                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
 475            )
 476            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
 477            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
 478            myEIC = EIC_Data(
 479                scans=raw_data_sub["scan"].values,
 480                time=raw_data_sub["scan_time"].values,
 481                eic=raw_data_sub["intensity"].values,
 482            )
 483            # Smooth EIC
 484            smoothed_eic = self.smooth_tic(myEIC.eic)
 485            smoothed_eic[smoothed_eic < 0] = 0
 486            myEIC.eic_smoothed = smoothed_eic
 487            self.eics[mz] = myEIC
 488
 489        # Get limits of mass features using EIC centroid detector and integrate
 490        mf_df["area"] = np.nan
 491        for idx, mass_feature in mf_df.iterrows():
 492            mz = mass_feature.mz
 493            apex_scan = mass_feature.apex_scan
 494
 495            # Pull EIC data and find apex scan index
 496            myEIC = self.eics[mz]
 497            self.mass_features[idx]._eic_data = myEIC
 498            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
 499
 500            # Find left and right limits of peak using EIC centroid detector, add to EICData
 501            centroid_eics = self.eic_centroid_detector(
 502                myEIC.time,
 503                myEIC.eic_smoothed,
 504                mass_feature.intensity * 1.1,
 505                apex_indexes=[int(apex_index)],
 506            )
 507            l_a_r_scan_idx = [i for i in centroid_eics]
 508            if len(l_a_r_scan_idx) > 0:
 509                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
 510                # Find the number of consecutive non-zero values in the EIC segment
 511                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
 512                # Find the longest run of consecutive True values
 513                if np.any(mask):
 514                    # Find indices where mask changes value
 515                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
 516                    starts = np.where(diff == 1)[0]
 517                    ends = np.where(diff == -1)[0]
 518                    consecutive_scans = (ends - starts).max()
 519                else:
 520                    consecutive_scans = 0
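                # Illustration of the run-length trick above: for mask
                # [True, True, False, True, True, True], the padded diff gives
                # starts [0, 3] and ends [2, 6], so the longest run is 3.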
 521                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
 522                    self.mass_features.pop(idx)
 523                    continue
 524                # Add start and final scan to mass_features and EICData
 525                left_scan, right_scan = (
 526                    myEIC.scans[l_a_r_scan_idx[0][0]],
 527                    myEIC.scans[l_a_r_scan_idx[0][2]],
 528                )
 529                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
 530                myEIC.apexes = myEIC.apexes + mf_scan_apex
 531                self.mass_features[idx].start_scan = left_scan
 532                self.mass_features[idx].final_scan = right_scan
 533
 534                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
 535                area = np.trapz(
 536                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 537                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
 538                )
 539                mf_df.at[idx, "area"] = area
 540                myEIC.areas = myEIC.areas + [area]
 541                self.eics[mz] = myEIC
 542                self.mass_features[idx]._area = area
 543            else:
 544                if drop_if_fail is True:
 545                    self.mass_features.pop(idx)
 546
 547        if drop_duplicates:
 548            # Prepare mass feature dataframe
 549            mf_df = self.mass_features_to_df()
 550
 551            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
 552            # Keep the first mass feature (highest persistence)
 553            for idx, mass_feature in mf_df.iterrows():
 554                mz = mass_feature.mz
 555                apex_scan = mass_feature.apex_scan
 556
 557                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
 558                mf_df_sub = mf_df[
 559                    mf_df["mz_diff_ppm"]
 560                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 561                    * 10**6
 562                ].copy()
 563
 564                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
 565                for idx2, mass_feature2 in mf_df_sub.iterrows():
 566                    if idx2 != idx:
 567                        if (
 568                            mass_feature2.start_scan >= mass_feature.start_scan
 569                            and mass_feature2.final_scan <= mass_feature.final_scan
 570                        ):
 571                            if idx2 in self.mass_features.keys():
 572                                self.mass_features.pop(idx2)
 573
 574    def find_c13_mass_features(self):
 575        """Mark likely C13 isotopes and connect to monoisotopic mass features.
 576
 577        Returns
 578        -------
 579        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
 580
 581        Raises
 582        ------
 583        ValueError
 584            If no mass features are found.
 585        """
 586        verbose = self.parameters.lc_ms.verbose_processing
 587        if verbose:
 588            print("evaluating mass features for C13 isotopes")
 589        if self.mass_features is None:
 590            raise ValueError("No mass features found, run find_mass_features() first")
 591
 592        # Data prep for sparse distance matrix
 593        dims = ["mz", "scan_time"]
 594        mf_df = self.mass_features_to_df().copy()
 595        # Drop mass features that have no area (these are likely to be noise)
 596        mf_df = mf_df[mf_df["area"].notnull()]
 597        mf_df["mf_id"] = mf_df.index.values
 598        dims = ["mz", "scan_time"]
 599
 600        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
 601        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
 602
 603        mz_diff = 1.003355  # C13-C12 mass difference
 604        tol = [
 605            mf_df["mz"].median()
 606            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
 607            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
 608        ]  # mz, in relative; scan_time in minutes
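        # For example (illustrative numbers): with a 5 ppm relative tolerance and a
        # median mz of 400, the mz window is ~0.002 Da around the expected 1.003355 Da
        # C13-C12 offset, and candidate pairs must also co-elute within half the
        # retention-time cluster tolerance.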
 609
 610        # Compute inter-feature distances
 611        distances = None
 612        for i in range(len(dims)):
 613            # Construct k-d tree
 614            values = mf_df[dims[i]].values
 615            tree = KDTree(values.reshape(-1, 1))
 616
 617            max_tol = tol[i]
 618            if dims[i] == "mz":
 619                # Maximum absolute tolerance
 620                max_tol = mz_diff + tol[i]
 621
 622            # Compute sparse distance matrix
 623            # the larger the max_tol, the slower this operation is
 624            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 625
 626            # Only consider forward case, exclude diagonal
 627            sdm = sparse.triu(sdm, k=1)
 628
 629            if dims[i] == "mz":
 630                min_tol = mz_diff - tol[i]
 631                # Get only the ones that are above the min tol
 632                idx = sdm.data > min_tol
 633
 634                # Reconstruct sparse distance matrix
 635                sdm = sparse.coo_matrix(
 636                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
 637                    shape=(len(values), len(values)),
 638                )
 639
 640            # Cast as binary matrix
 641            sdm.data = np.ones_like(sdm.data)
 642
 643            # Stack distances
 644            if distances is None:
 645                distances = sdm
 646            else:
 647                distances = distances.multiply(sdm)
 648
 649        # Extract indices of within-tolerance points
 650        distances = distances.tocoo()
 651        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
 652
 653        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
 654        pairs_mf = pairs.copy()
 655        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
 656        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
 657
 658        # Connect monoisotopic masses with isotopologues within mass_features
 659        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
 660        for mono in monos:
 661            self.mass_features[mono].monoisotopic_mf_id = mono
 662        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
 663        while not pairs_iso_df.empty:
 664            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
 665            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
 666            for iso in m1_isos:
 667                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
 668                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
 669                if len(parent) > 1:
 670                    # Choose the parent that is closest in time to the isotopologue
 671                    parent_time = [self.mass_features[p].retention_time for p in parent]
 672                    time_diff = [
 673                        np.abs(self.mass_features[iso].retention_time - x)
 674                        for x in parent_time
 675                    ]
 676                    parent = parent[np.argmin(time_diff)]
 677                else:
 678                    parent = parent[0]
 679                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
 680                    parent
 681                ].monoisotopic_mf_id
 682                if self.mass_features[iso].monoisotopic_mf_id is not None:
 683                    mass_diff = (
 684                        self.mass_features[iso].mz
 685                        - self.mass_features[
 686                            self.mass_features[iso].monoisotopic_mf_id
 687                        ].mz
 688                    )
 689                    self.mass_features[iso].isotopologue_type = "13C" + str(
 690                        int(round(mass_diff, 0))
 691                    )
 692
 693            # Drop the mono and iso from the pairs_iso_df
 694            pairs_iso_df = pairs_iso_df.drop(
 695                index=monos, errors="ignore"
 696            )  # Drop pairs whose parent was handled as a monoisotopic feature in this round
 697            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
 698            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
 699
 700            if not pairs_iso_df.empty:
 701                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
 702                monos = np.setdiff1d(
 703                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
 704                )
 705        if verbose:
 706            # Report fraction of compounds annotated with isotopes
 707            mf_df["c13_flag"] = np.where(
 708                np.logical_or(
 709                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
 710                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
 711                ),
 712                1,
 713                0,
 714            )
 715            print(
 716                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
 717                + " of mass features have or are C13 isotopes"
 718            )
 719
 720    def deconvolute_ms1_mass_features(self):
 721        """Deconvolute MS1 mass features
 722
 723        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature
 724
 725        Parameters
 726        ----------
 727        None
 728
 729        Returns
 730        -------
 731        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
 732        and associated_mass_features_deconvoluted attributes to the mass features in the
 733        mass_features attribute of the LCMSBase object.
 734
 735        Raises
 736        ------
 737        ValueError
 738            If no mass features are found, must run find_mass_features() first.
 739            If no EICs are found, did you run integrate_mass_features() first?
 740
 741        """
 742        # Checks for set mass_features and eics
 743        if self.mass_features is None:
 744            raise ValueError(
 745                "No mass features found, did you run find_mass_features() first?"
 746            )
 747
 748        if self.eics == {}:
 749            raise ValueError(
 750                "No EICs found, did you run integrate_mass_features() first?"
 751            )
 752
 753        if 1 not in self._ms_unprocessed.keys():
 754            raise ValueError("No unprocessed MS1 spectra found.")
 755
 756        # Prep ms1 data
 757        ms1_data = self._ms_unprocessed[1].copy()
 758        ms1_data = ms1_data.set_index("scan")
 759
 760        # Prep mass feature summary
 761        mass_feature_df = self.mass_features_to_df()
 762
 763        # Loop through each mass feature
 764        for mf_id, mass_feature in self.mass_features.items():
 765            # Skip the mass feature if its mz does not match the mz of its ms1 peak (apex centroid)
 766            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
 767                continue
 768
 769            # Get the left and right limits of the EIC of the mass feature
 770            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
 771
 772            # Pull from the _ms1_unprocessed data the scan range of interest and sort by mz
 773            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
 774            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
 775
 776            # Get the centroided masses of the mass feature
 777            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
 778
 779            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
 780            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
 781                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
 782            ]
 783
 784            # Drop rows with mz_diff > 0.01 between the mass feature mz and the ms1 data mz
 785            ms1_data_sub["mz_diff_rel"] = (
 786                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
 787                / ms1_data_sub["mz"]
 788            )
 789            ms1_data_sub = ms1_data_sub[
 790                ms1_data_sub["mz_diff_rel"]
 791                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
 792            ].reset_index(drop=True)
 793
 794            # Group by mass_feature_mz and scan and sum intensity
 795            ms1_data_sub_group = (
 796                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
 797                .sum()
 798                .reset_index()
 799            )
 800
 801            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
 802            corr = (
 803                ms1_data_sub_group.pivot(
 804                    index="scan", columns="mass_feature_mz", values="intensity"
 805                )
 806                .fillna(0)
 807                .corr()
 808            )
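            # The pivot produces a scan-by-mz intensity matrix; .corr() then gives the
            # Pearson correlation between every pair of mz traces across the scans
            # spanning this mass feature's EIC.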
 809
 810            # Subset the correlation matrix to only include the masses of the mass feature and those with a correlation > 0.8
 811            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
 812
 813            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
 814            try:
 815                corr_subset = corr.loc[mass_feature.mz,]
 816            except KeyError:
 817                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
 818                continue
 819
 820            corr_subset = corr_subset[corr_subset > decon_corr_min]
 821
 822            # Get the masses from the mass spectrum that are the result of the deconvolution
 823            mzs_decon = corr_subset.index.values
 824
 825            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
 826            mzs_decon_idx = [
 827                id
 828                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
 829                if mz in mzs_decon
 830            ]
 831            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
 832
 833            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
 834            if (
 835                mass_feature.ms1_peak.abundance
 836                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
 837            ):
 838                mass_feature.mass_spectrum_deconvoluted_parent = True
 839            else:
 840                mass_feature.mass_spectrum_deconvoluted_parent = False
 841
 842            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
 843            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
 844            mass_feature_df_sub = mass_feature_df[
 845                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
 846                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 847            ].copy()
 848            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
 849            mass_feature_df_sub["mz_diff_ppm"] = [
 850                np.abs(mzs_decon - mz).min() / mz * 10**6
 851                for mz in mass_feature_df_sub["mz"]
 852            ]
 853            # Subset mass_feature_df to only include mass features that are within 1 ppm of the deconvoluted masses
 854            mfs_associated_decon = mass_feature_df_sub[
 855                mass_feature_df_sub["mz_diff_ppm"]
 856                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
 857            ].index.values
 858
 859            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon
 860
 861    def _remove_redundant_mass_features(
 862        self,
 863    ) -> None:
 864        """
 865        Identify and remove redundant mass features that are likely contaminants based on their m/z values and scan frequency. 
 866        Especially useful for HILIC data where signals do not return to baseline between peaks or for data with significant background noise.
 867        
 868        Contaminants are characterized by:
 869        1. Similar m/z values (within ppm_tolerance)
 870        2. High frequency across scan numbers (ubiquitous presence)
 871    
 872        Notes
 873        -----
 874        Depends on self.mass_features being populated; uses the parameters in self.parameters.lc_ms for tolerances and thresholds (mass_feature_cluster_mz_tolerance_rel, mass_feature_cluster_rt_tolerance, redundant_scan_frequency_min, redundant_feature_retain_n).
 875        """
 876        ppm_tolerance = self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel*1e6
 877        min_scan_frequency = self.parameters.lc_ms.redundant_scan_frequency_min
 878        n_retain = self.parameters.lc_ms.redundant_feature_retain_n
 879
 880        df = self.mass_features_to_df()
 881
 882        if df.empty:
 883            return
 884        # df index should be mf_id
 885        if 'mf_id' not in df.columns:
 886            if 'mf_id' in df.index.names:
 887                df = df.reset_index()
 888            else:
 889                raise ValueError("DataFrame must contain 'mf_id' column or index.")
 890        
 891        # Sort by m/z for efficient grouping
 892        df_sorted = df.sort_values('mz').reset_index(drop=True)
 893        
 894        # Calculate total number of unique scans for frequency calculation
 895        # Calculate total possible scans (check the cluster rt tolerance and the min rt and max rt of the data)
 896        total_time = self.scan_df['scan_time'].max() - self.scan_df['scan_time'].min()
 897        cluster_rt_tolerance = self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
 898        # If the feature was detected in every possible scan (and then rolled up), it would be in this many scans
 899        total_scans = int(total_time / cluster_rt_tolerance) + 1
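        # Illustration (hypothetical values): a 30-minute run with a 0.3-minute
        # cluster RT tolerance gives total_scans = 101, so an m/z group spanning at
        # least min_scan_frequency * 101 distinct apex scans is treated as redundant.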
 900
 901        # Group similar m/z values using ppm tolerance
 902        mz_groups = []
 903        current_group = []
 904        
 905        for i, row in df_sorted.iterrows():
 906            current_mz = row['mz']
 907            
 908            if not current_group:
 909                # Start first group
 910                current_group = [i]
 911            else:
 912                # Check if current m/z is within tolerance of group representative
 913                group_representative_mz = df_sorted.iloc[current_group[0]]['mz']
 914                ppm_diff = abs(current_mz - group_representative_mz) / group_representative_mz * 1e6
 915                
 916                if ppm_diff <= ppm_tolerance:
 917                    # Add to current group
 918                    current_group.append(i)
 919                else:
 920                    # Start new group, but first process current group
 921                    if len(current_group) > 0:
 922                        mz_groups.append(current_group)
 923                    current_group = [i]
 924        
 925        # Don't forget the last group
 926        if current_group:
 927            mz_groups.append(current_group)
 928        
 929        # Analyze each m/z group for contaminant characteristics
 930        
 931        for group_indices in mz_groups:
 932            group_data = df_sorted.iloc[group_indices]
 933            
 934            # Calculate group statistics
 935            unique_scans = group_data['apex_scan'].nunique()
 936            scan_frequency = unique_scans / total_scans
 937            
 938            # Check if this group meets contaminant criteria
 939            if scan_frequency >= min_scan_frequency:
 940                group_data = group_data.sort_values('intensity', ascending=False)
 941                non_representative_mf_id = group_data.iloc[n_retain:]['mf_id'].tolist()  # These will be removed
 942
 943                self.mass_features = {
 944                    k: v for k, v in self.mass_features.items() if k not in non_representative_mf_id
 945                }
 946
 947    def _remove_mass_features_by_peak_metrics(self) -> None:
 948        """Remove mass features based on peak metrics defined in mass_feature_attribute_filter_dict.
 949        
 950        This method filters mass features based on various peak shape metrics and quality indicators
 951        such as noise scores, Gaussian similarity, tailing factors, dispersity index, etc.
 952        
 953        The filtering criteria are defined in the mass_feature_attribute_filter_dict parameter,
 954        which should contain attribute names as keys and filter specifications as values.
 955        
 956        Filter specification format:
 957        {attribute_name: {'value': threshold, 'operator': comparison}}
 958        
 959        Available operators:
 960        - '>' or 'greater': Keep features where attribute > threshold
 961        - '<' or 'less': Keep features where attribute < threshold  
 962        - '>=' or 'greater_equal': Keep features where attribute >= threshold
 963        - '<=' or 'less_equal': Keep features where attribute <= threshold
 964        
 965        Examples:
 966        - {'noise_score_max': {'value': 0.5, 'operator': '>='}} - Keep features with noise_score_max >= 0.5
 967        - {'dispersity_index': {'value': 0.1, 'operator': '<'}} - Keep features with dispersity_index < 0.1
 968        - {'gaussian_similarity': {'value': 0.7, 'operator': '>='}} - Keep features with gaussian_similarity >= 0.7
 969        
 970        Returns
 971        -------
 972        None
 973            Modifies self.mass_features in place by removing filtered features.
 974            
 975        Raises
 976        ------
 977        ValueError
 978            If no mass features are found, if an invalid attribute is specified, or if filter specification is malformed.
 979        """
 980        if self.mass_features is None or len(self.mass_features) == 0:
 981            raise ValueError("No mass features found, run find_mass_features() first")
 982            
 983        filter_dict = self.parameters.lc_ms.mass_feature_attribute_filter_dict
 984        
 985        if not filter_dict:
 986            # No filtering criteria specified, return early
 987            return
 988            
 989        verbose = self.parameters.lc_ms.verbose_processing
 990        initial_count = len(self.mass_features)
 991        
 992        if verbose:
 993            print(f"Filtering mass features using peak metrics. Initial count: {initial_count}")
 994            
 995        # List to collect IDs of mass features to remove
 996        features_to_remove = []
 997        
 998        for mf_id, mass_feature in self.mass_features.items():
 999            should_remove = False
1000            
1001            for attribute_name, filter_spec in filter_dict.items():
1002                # Validate filter specification structure
1003                if not isinstance(filter_spec, dict):
1004                    raise ValueError(f"Filter specification for '{attribute_name}' must be a dictionary with 'value' and 'operator' keys")
1005                
1006                if 'value' not in filter_spec or 'operator' not in filter_spec:
1007                    raise ValueError(f"Filter specification for '{attribute_name}' must contain both 'value' and 'operator' keys")
1008                
1009                threshold_value = filter_spec['value']
1010                operator = filter_spec['operator'].lower().strip()
1011                
1012                # Validate operator
1013                valid_operators = {'>', '<', '>=', '<=', 'greater', 'less', 'greater_equal', 'less_equal'}
1014                if operator not in valid_operators:
1015                    raise ValueError(f"Invalid operator '{operator}' for attribute '{attribute_name}'. Valid operators: {valid_operators}")
1016                
1017                # Normalize operator names
1018                operator_map = {
1019                    'greater': '>',
1020                    'less': '<', 
1021                    'greater_equal': '>=',
1022                    'less_equal': '<='
1023                }
1024                operator = operator_map.get(operator, operator)
1025                
1026                # Get the attribute value from the mass feature
1027                try:
1028                    if hasattr(mass_feature, attribute_name):
1029                        attribute_value = getattr(mass_feature, attribute_name)
1030                    else:
1031                        raise ValueError(f"Mass feature does not have attribute '{attribute_name}'")
1032                        
1033                    # Handle None values or attributes that haven't been calculated
1034                    if attribute_value is None:
1035                        if verbose:
1036                            print(f"Warning: Mass feature {mf_id} has None value for '{attribute_name}'. Removing feature.")
1037                        should_remove = True
1038                        break
1039                        
1040                    # Handle numpy arrays (like half_height_width which returns mean)
1041                    if hasattr(attribute_value, '__len__') and not isinstance(attribute_value, str):
1042                        # For arrays, we use the mean or appropriate summary statistic
1043                        if attribute_name == 'half_height_width':
1044                            # half_height_width property already returns the mean
1045                            pass
1046                        else:
1047                            attribute_value = float(np.mean(attribute_value))
1048                    
1049                    # Handle NaN values
1050                    if np.isnan(float(attribute_value)):
1051                        if verbose:
1052                            print(f"Warning: Mass feature {mf_id} has NaN value for '{attribute_name}'. Removing feature.")
1053                        should_remove = True
1054                        break
1055                    
1056                    # Apply the threshold comparison based on operator
1057                    attribute_value = float(attribute_value)
1058                    threshold_value = float(threshold_value)
1059                    
1060                    if operator == '>' and not (attribute_value > threshold_value):
1061                        should_remove = True
1062                        break
1063                    elif operator == '<' and not (attribute_value < threshold_value):
1064                        should_remove = True
1065                        break
1066                    elif operator == '>=' and not (attribute_value >= threshold_value):
1067                        should_remove = True
1068                        break  
1069                    elif operator == '<=' and not (attribute_value <= threshold_value):
1070                        should_remove = True
1071                        break
1072                        
1073                except (AttributeError, ValueError, TypeError) as e:
1074                    if verbose:
1075                        print(f"Error evaluating filter '{attribute_name}' for mass feature {mf_id}: {e}")
1076                    should_remove = True
1077                    break
1078            
1079            if should_remove:
1080                features_to_remove.append(mf_id)
1081        
1082        # Remove filtered mass features
1083        for mf_id in features_to_remove:
1084            del self.mass_features[mf_id]
1085        
1086        # Clean up unassociated EICs and ms1 data
1087        self._remove_unassociated_eics()
1088        self._remove_unassociated_ms1_spectra()
1089            
1090    def _remove_unassociated_eics(self) -> None:
1091        """Remove EICs that are not associated with any mass features.
1092
1093        This method cleans up the eics attribute by removing any EICs that do not correspond to
1094        any mass features currently stored in the mass_features attribute. This is useful for
1095        freeing up memory and ensuring that only relevant EICs are retained.
1096
1097        Returns
1098        -------
1099        None
1100            Modifies self.eics in place by removing unassociated EICs.
1101        """
1102        if self.mass_features is None or len(self.mass_features) == 0:
1103            self.eics = {}
1104            return
1105
1106        # Get the set of m/z values associated with current mass features
1107        associated_mzs = {mf.mz for mf in self.mass_features.values()}
1108
1109        # Remove EICs that are not associated with any mass features
1110        self.eics = {mz: eic for mz, eic in self.eics.items() if mz in associated_mzs}
1111    
1112    def _remove_unassociated_ms1_spectra(self) -> None:
1113        """Remove MS1 spectra that are not associated with any mass features.
1114        This method cleans up the stored MS1 spectra (the _ms attribute) by removing any that do not correspond to
1115        any mass features currently stored in the mass_features attribute. This is useful for freeing up memory
1116        and ensuring that only relevant MS1 spectra are retained.
1117
1118        Returns
1119        -------
1120        None
1121        """
1122        if self.mass_features is None or len(self.mass_features) == 0:
1123            self._ms = {}
1124            return
1125
1126        # Get the set of apex scan numbers associated with current mass features
1127        associated_ms1_scans = {mf.apex_scan for mf in self.mass_features.values()}
1128        associated_ms1_scans = [int(scan) for scan in associated_ms1_scans]
1129        
1130        # Get keys within the _ms attribute (these are individual MassSpectrum objects)
1131        current_stored_spectra = list(set(self._ms.keys()))
1132        if len(current_stored_spectra) == 0:
1133            return
1134        current_stored_spectra = [int(scan) for scan in current_stored_spectra]
1135
1136        # Filter the current_stored_spectra to only ms1 scans
1137        current_stored_spectra_ms1 = [scan for scan in current_stored_spectra if scan in self.ms1_scans]
1138
1139        # Remove MS1 spectra that are not associated with any mass features
1140        scans_to_drop = [scan for scan in current_stored_spectra_ms1 if scan not in associated_ms1_scans]
1141        for scan in scans_to_drop:
1142            if scan in self._ms:
1143                del self._ms[scan]
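
For orientation, the sketch below shows the order in which the LCCalculations methods defined above are typically chained. It is illustrative only and assumes `lcms` is an LCMSBase-derived object that already has scan data loaded and its lc_ms parameters configured.

# Minimal workflow sketch (illustrative; `lcms` is an assumed LCMSBase-derived object)
lcms.find_mass_features(ms_level=1)        # locate regions of interest
lcms.integrate_mass_features(ms_level=1)   # extract EICs, peak limits, and areas
lcms.add_peak_metrics()                    # half-height width, tailing factor, etc.
lcms.find_c13_mass_features()              # flag likely 13C isotopologues
lcms.deconvolute_ms1_mass_features()       # correlate co-eluting MS1 masses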

def add_peak_metrics(self, remove_by_metrics=True):
191    def add_peak_metrics(self, remove_by_metrics=True):
192        """Add peak metrics to the mass features.
193
194        This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.
195
196        Parameters
197        ----------
198        remove_by_metrics : bool, optional
199            If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity,
200            dispersity index, and noise score. Default is True, which checks the setting in the processing parameters.
201            If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters.
202        """
203        # Check that at least some mass features have eic data
204        if not any([mf._eic_data is not None for mf in self.mass_features.values()]):
205            raise ValueError(
206                "No mass features have EIC data. Run integrate_mass_features first."
207            )
208
209        for mass_feature in self.mass_features.values():
210            # Check if the mass feature has been integrated
211            if mass_feature._eic_data is not None:
212                # Calculate peak metrics
213                mass_feature.calc_half_height_width()
214                mass_feature.calc_tailing_factor()
215                mass_feature.calc_dispersity_index()
216                mass_feature.calc_gaussian_similarity()
217                mass_feature.calc_noise_score()
218        
219        # Remove mass features by peak metrics if designated in parameters
220        if self.parameters.lc_ms.remove_mass_features_by_peak_metrics and remove_by_metrics:
221            self._remove_mass_features_by_peak_metrics()

Add peak metrics to the mass features.

This function calculates the peak metrics for each mass feature and adds them to the mass feature objects.

Parameters
  • remove_by_metrics (bool, optional): If True, remove mass features based on their peak metrics such as S/N, Gaussian similarity, dispersity index, and noise score. Default is True, which checks the setting in the processing parameters. If False, peak metrics are calculated but no mass features are removed, regardless of the setting in the processing parameters.
def get_average_mass_spectrum( self, scan_list, apex_scan, spectrum_mode='profile', ms_level=1, auto_process=True, use_parser=False, perform_checks=True, polarity=None, ms_params=None):
223    def get_average_mass_spectrum(
224        self,
225        scan_list,
226        apex_scan,
227        spectrum_mode="profile",
228        ms_level=1,
229        auto_process=True,
230        use_parser=False,
231        perform_checks=True,
232        polarity=None,
233        ms_params=None,
234    ):
235        """Returns an averaged mass spectrum object
236
237        Parameters
238        ----------
239        scan_list : list
240            List of scan numbers to average.
241        apex_scan : int
242            Number of the apex scan
243        spectrum_mode : str, optional
244            The spectrum mode to use. Defaults to "profile". Not that only "profile" mode is supported for averaging.
245        ms_level : int, optional
246            The MS level to use. Defaults to 1.
247        auto_process : bool, optional
248            If True, the averaged mass spectrum will be auto-processed. Defaults to True.
249        use_parser : bool, optional
250            If True, the mass spectra will be obtained from the parser. Defaults to False.
251        perform_checks : bool, optional
252            If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile, and (if not using the parser) are in the ms_unprocessed dictionary!  ms_unprocessed dictionary also must be indexed on scan
253        polarity : int, optional
254            The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
255        ms_params : MSParameters, optional
256            The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
257
258        Returns
259        -------
260        MassSpectrumProfile
261            The averaged mass spectrum object.
262
263        Raises
264        ------
265        ValueError
266            If the spectrum mode is not "profile".
267            If the MS level is not found in the unprocessed mass spectra dictionary.
268            If not all scan numbers are found in the unprocessed mass spectra dictionary.
269        """
270        if perform_checks:
271            if spectrum_mode != "profile":
272                raise ValueError("Averaging only supported for profile mode")
273
274        if polarity is None:
275            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
276            if self.polarity == "negative":
277                polarity = -1
278            elif self.polarity == "positive":
279                polarity = 1
280            else:
281                raise ValueError(
282                    "Polarity not set for dataset, must be a set containing either 'positive' or 'negative'"
283                )
284
285        # if not using_parser, check that scan numbers are in _ms_unprocessed
286        if not use_parser:
287            if perform_checks:
288                # Set index to scan for faster lookup
289                ms_df = (
290                    self._ms_unprocessed[ms_level]
291                    .copy()
292                    .set_index("scan", drop=False)
293                    .sort_index()
294                )
295                my_ms_df = ms_df.loc[scan_list]
296                # Check that all scan numbers are in the ms_df
297                if not all(np.isin(scan_list, ms_df.index)):
298                    raise ValueError(
299                        "Not all scan numbers found in the unprocessed mass spectra dictionary"
300                    )
301            else:
302                my_ms_df = (
303                    pd.DataFrame({"scan": scan_list})
304                    .set_index("scan")
305                    .join(self._ms_unprocessed[ms_level], how="left")
306                )
307
308        if use_parser:
309            ms_list = [
310                self.spectra_parser.get_mass_spectrum_from_scan(
311                    x, spectrum_mode=spectrum_mode, auto_process=False
312                )
313                for x in scan_list
314            ]
315            ms_mz = [x._mz_exp for x in ms_list]
316            ms_int = [x._abundance for x in ms_list]
317            my_ms_df = []
318            for i in np.arange(len(ms_mz)):
319                my_ms_df.append(
320                    pd.DataFrame(
321                        {"mz": ms_mz[i], "intensity": ms_int[i], "scan": scan_list[i]}
322                    )
323                )
324            my_ms_df = pd.concat(my_ms_df)
325
326        if not self.check_if_grid(my_ms_df):
327            my_ms_df = self.grid_data(my_ms_df)
328
329        my_ms_ave = my_ms_df.groupby("mz")["intensity"].sum().reset_index()
330
331        ms = ms_from_array_profile(
332            my_ms_ave.mz,
333            my_ms_ave.intensity,
334            self.file_location,
335            polarity=polarity,
336            auto_process=False,
337        )
338
339        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
340        if ms is not None:
341            if ms_params is not None:
342                ms.parameters = ms_params
343            ms.scan_number = apex_scan
344            if auto_process:
345                ms.process_mass_spec()
346        return ms

Returns an averaged mass spectrum object

Parameters
  • scan_list (list): List of scan numbers to average.
  • apex_scan (int): Number of the apex scan.
  • spectrum_mode (str, optional): The spectrum mode to use. Defaults to "profile". Note that only "profile" mode is supported for averaging.
  • ms_level (int, optional): The MS level to use. Defaults to 1.
  • auto_process (bool, optional): If True, the averaged mass spectrum will be auto-processed. Defaults to True.
  • use_parser (bool, optional): If True, the mass spectra will be obtained from the parser. Defaults to False.
  • perform_checks (bool, optional): If True, the function will check if the data are within the ms_unprocessed dictionary and are the correct mode. Defaults to True. Only set to False if you are sure the data are profile and (if not using the parser) are in the ms_unprocessed dictionary; the ms_unprocessed dictionary must also be indexed on scan.
  • polarity (int, optional): The polarity of the mass spectra (1 or -1). If not set, the polarity will be determined from the dataset. Defaults to None. (fastest if set to -1 or 1)
  • ms_params (MSParameters, optional): The mass spectrum parameters to use. If not set (None), the globally set parameters will be used. Defaults to None.
Returns
  • MassSpectrumProfile: The averaged mass spectrum object.
Raises
  • ValueError: If the spectrum mode is not "profile". If the MS level is not found in the unprocessed mass spectra dictionary. If not all scan numbers are found in the unprocessed mass spectra dictionary.
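
A minimal usage sketch (hedged illustration only; `lcms_obj` and the scan numbers are hypothetical stand-ins for a parsed LCMSBase object with MS1 profile data loaded into _ms_unprocessed):

    # Hypothetical example; scan numbers are illustrative
    scans_to_average = [100, 101, 102, 103, 104]
    avg_ms = lcms_obj.get_average_mass_spectrum(
        scan_list=scans_to_average,
        apex_scan=102,
        spectrum_mode="profile",   # only profile mode is supported for averaging
        ms_level=1,
        auto_process=True,
        use_parser=False,          # read intensities from _ms_unprocessed rather than the parser
        polarity=-1,               # passing polarity explicitly skips the dataset lookup
    )
    print(avg_ms.scan_number)      # the apex scan number is stored on the averaged spectrum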
def find_mass_features(self, ms_level=1, grid=True):
348    def find_mass_features(self, ms_level=1, grid=True):
349        """Find mass features within an LCMSBase object
350
351        Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.
352
353        Parameters
354        ----------
355        ms_level : int, optional
356            The MS level to use for peak picking. Default is 1.
357        grid : bool, optional
358            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded),
359            used for persistent homology peak picking for profile data only. Default is True.
360
361        Raises
362        ------
363        ValueError
364            If no MS level data is found on the object.
365            If persistent homology peak picking is attempted on non-profile mode data.
366            If data is not gridded and grid is False.
367            If peak picking method is not implemented.
368
369        Returns
370        -------
371        None, but assigns the mass_features and eics attributes to the object.
372
373        """
374        pp_method = self.parameters.lc_ms.peak_picking_method
375
376        if pp_method == "persistent homology":
377            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
378            if all(msx_scan_df["ms_format"] == "profile"):
379                self.find_mass_features_ph(ms_level=ms_level, grid=grid)
380            else:
381                raise ValueError(
382                    "MS{} scans are not profile mode, which is required for persistent homology peak picking.".format(
383                        ms_level
384                    )
385                )
386        elif pp_method == "centroided_persistent_homology":
387            msx_scan_df = self.scan_df[self.scan_df["ms_level"] == ms_level]
388            if all(msx_scan_df["ms_format"] == "centroid"):
389                self.find_mass_features_ph_centroid(ms_level=ms_level)
390            else:
391                raise ValueError(
392                    "MS{} scans are not centroid mode, which is required for persistent homology centroided peak picking.".format(
393                        ms_level
394                    )
395                )
396        else:
397            raise ValueError("Peak picking method not implemented")
398        
399        # Remove redundant mass features if designated in parameters
400        if self.parameters.lc_ms.remove_redundant_mass_features:
401            self._remove_redundant_mass_features()

Find mass features within an LCMSBase object

Note that this is a wrapper function that calls the find_mass_features_ph function, but can be extended to support other peak picking methods in the future.

Parameters
  • ms_level (int, optional): The MS level to use for peak picking. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded), used for persistent homology peak picking for profile data only. Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If persistent homology peak picking is attempted on non-profile mode data. If data is not gridded and grid is False. If peak picking method is not implemented.
Returns
  • None, but assigns the mass_features and eics attributes to the object.
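
A minimal usage sketch (hypothetical; `lcms_obj` stands in for a parsed LCMSBase object with profile-mode MS1 scans):

    # Hypothetical example; the parameter value matches one of the methods handled above
    lcms_obj.parameters.lc_ms.peak_picking_method = "persistent homology"
    lcms_obj.find_mass_features(ms_level=1, grid=True)
    print(len(lcms_obj.mass_features), "mass features found")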
def integrate_mass_features(self, drop_if_fail=True, drop_duplicates=True, ms_level=1):
403    def integrate_mass_features(
404        self, drop_if_fail=True, drop_duplicates=True, ms_level=1
405    ):
406        """Integrate mass features and extract EICs.
407
408        Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.
409
410        Parameters
411        ----------
412        drop_if_fail : bool, optional
413            Whether to drop mass features if the EIC limit calculations fail.
414            Default is True.
415        drop_duplicates : bool, optional
416            Whether to drop mass features that appear to be duplicates
417            (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating).
418            Default is True.
419        ms_level : int, optional
420            The MS level to use. Default is 1.
421
422        Raises
423        ------
424        ValueError
425            If no mass features are found.
426            If no MS level data is found for the given MS level (either in data or in the scan data)
427
428        Returns
429        -------
430        None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
431
432        Notes
433        -----
434        drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
435        """
436        # Check if there is data
437        if ms_level in self._ms_unprocessed.keys():
438            raw_data = self._ms_unprocessed[ms_level].copy()
439        else:
440            raise ValueError("No MS level " + str(ms_level) + " data found")
441        if self.mass_features is not None:
442            mf_df = self.mass_features_to_df().copy()
443        else:
444            raise ValueError(
445                "No mass features found, did you run find_mass_features() first?"
446            )
447
448        # Subset scan data to only include correct ms_level
449        scan_df_sub = self.scan_df[
450            self.scan_df["ms_level"] == int(ms_level)
451        ].reset_index(drop=True)
452        if scan_df_sub.empty:
453            raise ValueError("No MS level " + str(ms_level) + " data found in scan data")
454        scan_df_sub = scan_df_sub[["scan", "scan_time"]].copy()
455
456        mzs_to_extract = np.unique(mf_df["mz"].values)
457        mzs_to_extract.sort()
458
459        # Pre-sort raw_data by mz for faster filtering
460        raw_data_sorted = raw_data.sort_values(["mz", "scan"]).reset_index(drop=True)
461        raw_data_mz = raw_data_sorted["mz"].values
462
463        # Get EICs for each unique mz in mass features list
464        for mz in mzs_to_extract:
465            mz_max = mz + self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
466            mz_min = mz - self.parameters.lc_ms.eic_tolerance_ppm * mz / 1e6
467
468            # Use binary search for faster mz range filtering
469            left_idx = np.searchsorted(raw_data_mz, mz_min, side="left")
470            right_idx = np.searchsorted(raw_data_mz, mz_max, side="right")
471            raw_data_sub = raw_data_sorted.iloc[left_idx:right_idx].copy()
472
473            raw_data_sub = (
474                raw_data_sub.groupby(["scan"])["intensity"].sum().reset_index()
475            )
476            raw_data_sub = scan_df_sub.merge(raw_data_sub, on="scan", how="left")
477            raw_data_sub["intensity"] = raw_data_sub["intensity"].fillna(0)
478            myEIC = EIC_Data(
479                scans=raw_data_sub["scan"].values,
480                time=raw_data_sub["scan_time"].values,
481                eic=raw_data_sub["intensity"].values,
482            )
483            # Smooth EIC
484            smoothed_eic = self.smooth_tic(myEIC.eic)
485            smoothed_eic[smoothed_eic < 0] = 0
486            myEIC.eic_smoothed = smoothed_eic
487            self.eics[mz] = myEIC
488
489        # Get limits of mass features using EIC centroid detector and integrate
490        mf_df["area"] = np.nan
491        for idx, mass_feature in mf_df.iterrows():
492            mz = mass_feature.mz
493            apex_scan = mass_feature.apex_scan
494
495            # Pull EIC data and find apex scan index
496            myEIC = self.eics[mz]
497            self.mass_features[idx]._eic_data = myEIC
498            apex_index = np.where(myEIC.scans == apex_scan)[0][0]
499
500            # Find left and right limits of peak using EIC centroid detector, add to EICData
501            centroid_eics = self.eic_centroid_detector(
502                myEIC.time,
503                myEIC.eic_smoothed,
504                mass_feature.intensity * 1.1,
505                apex_indexes=[int(apex_index)],
506            )
507            l_a_r_scan_idx = [i for i in centroid_eics]
508            if len(l_a_r_scan_idx) > 0:
509                # Calculate number of consecutive scans with intensity > 0 and check if it is above the minimum consecutive scans
510                # Find the number of consecutive non-zero values in the EIC segment
511                mask = myEIC.eic[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1] > 0
512                # Find the longest run of consecutive True values
513                if np.any(mask):
514                    # Find indices where mask changes value
515                    diff = np.diff(np.concatenate(([0], mask.astype(int), [0])))
516                    starts = np.where(diff == 1)[0]
517                    ends = np.where(diff == -1)[0]
518                    consecutive_scans = (ends - starts).max()
519                else:
520                    consecutive_scans = 0
521                if consecutive_scans < self.parameters.lc_ms.consecutive_scan_min:
522                    self.mass_features.pop(idx)
523                    continue
524                # Add start and final scan to mass_features and EICData
525                left_scan, right_scan = (
526                    myEIC.scans[l_a_r_scan_idx[0][0]],
527                    myEIC.scans[l_a_r_scan_idx[0][2]],
528                )
529                mf_scan_apex = [(left_scan, int(apex_scan), right_scan)]
530                myEIC.apexes = myEIC.apexes + mf_scan_apex
531                self.mass_features[idx].start_scan = left_scan
532                self.mass_features[idx].final_scan = right_scan
533
534                # Find area under peak using limits from EIC centroid detector, add to mass_features and EICData
535                area = np.trapz(
536                    myEIC.eic_smoothed[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
537                    myEIC.time[l_a_r_scan_idx[0][0] : l_a_r_scan_idx[0][2] + 1],
538                )
539                mf_df.at[idx, "area"] = area
540                myEIC.areas = myEIC.areas + [area]
541                self.eics[mz] = myEIC
542                self.mass_features[idx]._area = area
543            else:
544                if drop_if_fail is True:
545                    self.mass_features.pop(idx)
546
547        if drop_duplicates:
548            # Prepare mass feature dataframe
549            mf_df = self.mass_features_to_df()
550
551            # For each mass feature, find all mass features within the clustering tolerance ppm and drop if their start and end times are within another mass feature
552            # Keep the first mass feature (highest persistence)
553            for idx, mass_feature in mf_df.iterrows():
554                mz = mass_feature.mz
555                apex_scan = mass_feature.apex_scan
556
557                mf_df["mz_diff_ppm"] = np.abs(mf_df["mz"] - mz) / mz * 10**6
558                mf_df_sub = mf_df[
559                    mf_df["mz_diff_ppm"]
560                    < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
561                    * 10**6
562                ].copy()
563
564                # For all mass features within the clustering tolerance, check if the start and end times are within the start and end times of the mass feature
565                for idx2, mass_feature2 in mf_df_sub.iterrows():
566                    if idx2 != idx:
567                        if (
568                            mass_feature2.start_scan >= mass_feature.start_scan
569                            and mass_feature2.final_scan <= mass_feature.final_scan
570                        ):
571                            if idx2 in self.mass_features.keys():
572                                self.mass_features.pop(idx2)

Integrate mass features and extract EICs.

Populates the _eics attribute on the LCMSBase object for each unique mz in the mass_features dataframe and adds data (start_scan, final_scan, area) to the mass_features attribute.

Parameters
  • drop_if_fail (bool, optional): Whether to drop mass features if the EIC limit calculations fail. Default is True.
  • drop_duplicates (bool, optional): Whether to drop mass features that appear to be duplicates (i.e., mz is similar to another mass feature and limits of the EIC are similar or encapsulating). Default is True.
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no mass features are found. If no MS level data is found for the given MS level (either in data or in the scan data)
Returns
  • None, but populates the eics attribute on the LCMSBase object and adds data (start_scan, final_scan, area) to the mass_features attribute.
Notes

drop_if_fail is useful for discarding mass features that do not have good shapes, usually due to a detection on a shoulder of a peak or a noisy region (especially if minimal smoothing is used during mass feature detection).
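
A minimal usage sketch (hypothetical; `lcms_obj` stands in for an LCMSBase object on which find_mass_features() has already been run):

    # Hypothetical example of integrating mass features and inspecting the results
    lcms_obj.integrate_mass_features(drop_if_fail=True, drop_duplicates=True, ms_level=1)
    print(len(lcms_obj.eics), "EICs extracted")
    for mf_id, mf in lcms_obj.mass_features.items():
        print(mf_id, mf.start_scan, mf.final_scan)  # limits assigned during integration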

def find_c13_mass_features(self):
574    def find_c13_mass_features(self):
575        """Mark likely C13 isotopes and connect to monoisotopic mass features.
576
577        Returns
578        -------
579        None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
580
581        Raises
582        ------
583        ValueError
584            If no mass features are found.
585        """
586        verbose = self.parameters.lc_ms.verbose_processing
587        if verbose:
588            print("evaluating mass features for C13 isotopes")
589        if self.mass_features is None:
590            raise ValueError("No mass features found, run find_mass_features() first")
591
592        # Data prep for sparse distance matrix
593        dims = ["mz", "scan_time"]
594        mf_df = self.mass_features_to_df().copy()
595        # Drop mass features that have no area (these are likely to be noise)
596        mf_df = mf_df[mf_df["area"].notnull()]
597        mf_df["mf_id"] = mf_df.index.values
598        dims = ["mz", "scan_time"]
599
600        # Sort by ascending mz so we always get the monoisotopic mass first, regardless of the order/intensity of the mass features
601        mf_df = mf_df.sort_values(by=["mz"]).reset_index(drop=True).copy()
602
603        mz_diff = 1.003355  # C13-C12 mass difference
604        tol = [
605            mf_df["mz"].median()
606            * self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
607            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance * 0.5,
608        ]  # mz, in relative; scan_time in minutes
609
610        # Compute inter-feature distances
611        distances = None
612        for i in range(len(dims)):
613            # Construct k-d tree
614            values = mf_df[dims[i]].values
615            tree = KDTree(values.reshape(-1, 1))
616
617            max_tol = tol[i]
618            if dims[i] == "mz":
619                # Maximum absolute tolerance
620                max_tol = mz_diff + tol[i]
621
622            # Compute sparse distance matrix
623            # the larger the max_tol, the slower this operation is
624            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
625
626            # Only consider forward case, exclude diagonal
627            sdm = sparse.triu(sdm, k=1)
628
629            if dims[i] == "mz":
630                min_tol = mz_diff - tol[i]
631                # Get only the ones that are above the min tol
632                idx = sdm.data > min_tol
633
634                # Reconstruct sparse distance matrix
635                sdm = sparse.coo_matrix(
636                    (sdm.data[idx], (sdm.row[idx], sdm.col[idx])),
637                    shape=(len(values), len(values)),
638                )
639
640            # Cast as binary matrix
641            sdm.data = np.ones_like(sdm.data)
642
643            # Stack distances
644            if distances is None:
645                distances = sdm
646            else:
647                distances = distances.multiply(sdm)
648
649        # Extract indices of within-tolerance points
650        distances = distances.tocoo()
651        pairs = np.stack((distances.row, distances.col), axis=1)  # C12 to C13 pairs
652
653        # Turn pairs (which are index of mf_df) into mf_id and then into two dataframes to join to mf_df
654        pairs_mf = pairs.copy()
655        pairs_mf[:, 0] = mf_df.iloc[pairs[:, 0]].mf_id.values
656        pairs_mf[:, 1] = mf_df.iloc[pairs[:, 1]].mf_id.values
657
658        # Connect monoisotopic masses with isotopologues within mass_features
659        monos = np.setdiff1d(np.unique(pairs_mf[:, 0]), np.unique(pairs_mf[:, 1]))
660        for mono in monos:
661            self.mass_features[mono].monoisotopic_mf_id = mono
662        pairs_iso_df = pd.DataFrame(pairs_mf, columns=["parent", "child"])
663        while not pairs_iso_df.empty:
664            pairs_iso_df = pairs_iso_df.set_index("parent", drop=False)
665            m1_isos = pairs_iso_df.loc[monos, "child"].unique()
666            for iso in m1_isos:
667                # Set monoisotopic_mf_id and isotopologue_type for isotopologues
668                parent = pairs_mf[pairs_mf[:, 1] == iso, 0]
669                if len(parent) > 1:
670                    # Choose the parent that is closest in time to the isotopologue
671                    parent_time = [self.mass_features[p].retention_time for p in parent]
672                    time_diff = [
673                        np.abs(self.mass_features[iso].retention_time - x)
674                        for x in parent_time
675                    ]
676                    parent = parent[np.argmin(time_diff)]
677                else:
678                    parent = parent[0]
679                self.mass_features[iso].monoisotopic_mf_id = self.mass_features[
680                    parent
681                ].monoisotopic_mf_id
682                if self.mass_features[iso].monoisotopic_mf_id is not None:
683                    mass_diff = (
684                        self.mass_features[iso].mz
685                        - self.mass_features[
686                            self.mass_features[iso].monoisotopic_mf_id
687                        ].mz
688                    )
689                    self.mass_features[iso].isotopologue_type = "13C" + str(
690                        int(round(mass_diff, 0))
691                    )
692
693            # Drop the mono and iso from the pairs_iso_df
694            pairs_iso_df = pairs_iso_df.drop(
695                index=monos, errors="ignore"
696            )  # Drop pairs whose parent was just processed as a monoisotopic feature
697            pairs_iso_df = pairs_iso_df.set_index("child", drop=False)
698            pairs_iso_df = pairs_iso_df.drop(index=m1_isos, errors="ignore")
699
700            if not pairs_iso_df.empty:
701                # Get new monos, recognizing that these are just 13C isotopologues that are connected to other 13C isotopologues to repeat the process
702                monos = np.setdiff1d(
703                    np.unique(pairs_iso_df.parent), np.unique(pairs_iso_df.child)
704                )
705        if verbose:
706            # Report fraction of compounds annotated with isotopes
707            mf_df["c13_flag"] = np.where(
708                np.logical_or(
709                    np.isin(mf_df["mf_id"], pairs_mf[:, 0]),
710                    np.isin(mf_df["mf_id"], pairs_mf[:, 1]),
711                ),
712                1,
713                0,
714            )
715            print(
716                str(round(len(mf_df[mf_df["c13_flag"] == 1]) / len(mf_df), ndigits=3))
717                + " of mass features have or are C13 isotopes"
718            )

Mark likely C13 isotopes and connect to monoisotopic mass features.

Returns
  • None, but populates the monoisotopic_mf_id and isotopologue_type attributes of the individual LCMSMassFeatures within the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found.
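
A minimal usage sketch (hypothetical; assumes mass features have been found and integrated first, and that features without an isotope annotation simply lack the isotopologue attributes):

    # Hypothetical example listing features flagged as 13C isotopologues
    lcms_obj.find_c13_mass_features()
    for mf_id, mf in lcms_obj.mass_features.items():
        iso_type = getattr(mf, "isotopologue_type", None)
        if iso_type is not None:
            print(mf_id, iso_type, "monoisotopic parent:", mf.monoisotopic_mf_id)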
def deconvolute_ms1_mass_features(self):
720    def deconvolute_ms1_mass_features(self):
721        """Deconvolute MS1 mass features
722
723        Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.
724
725        Parameters
726        ----------
727        None
728
729        Returns
730        -------
731        None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent,
732        and associated_mass_features_deconvoluted attributes to the mass features in the
733        mass_features attribute of the LCMSBase object.
734
735        Raises
736        ------
737        ValueError
738            If no mass features are found; run find_mass_features() first.
739            If no EICs are found; run integrate_mass_features() first.
740
741        """
742        # Checks for set mass_features and eics
743        if self.mass_features is None:
744            raise ValueError(
745                "No mass features found, did you run find_mass_features() first?"
746            )
747
748        if self.eics == {}:
749            raise ValueError(
750                "No EICs found, did you run integrate_mass_features() first?"
751            )
752
753        if 1 not in self._ms_unprocessed.keys():
754            raise ValueError("No unprocessed MS1 spectra found.")
755
756        # Prep ms1 data
757        ms1_data = self._ms_unprocessed[1].copy()
758        ms1_data = ms1_data.set_index("scan")
759
760        # Prep mass feature summary
761        mass_feature_df = self.mass_features_to_df()
762
763        # Loop through each mass feature
764        for mf_id, mass_feature in self.mass_features.items():
765            # Skip the mass feature if its mz attribute does not match the mz of its ms1 peak
766            if mass_feature.mz != mass_feature.ms1_peak.mz_exp:
767                continue
768
769            # Get the left and right limits of the EIC of the mass feature
770            l_scan, _, r_scan = mass_feature._eic_data.apexes[0]
771
772            # Pull the scan range of interest from the unprocessed MS1 data and sort by mz
773            ms1_data_sub = ms1_data.loc[l_scan:r_scan].copy()
774            ms1_data_sub = ms1_data_sub.sort_values(by=["mz"]).reset_index(drop=False)
775
776            # Get the centroided masses of the mass feature
777            mf_mspeak_mzs = mass_feature.mass_spectrum.mz_exp
778
779            # Find the closest mz in the ms1 data to the centroided masses of the mass feature
780            ms1_data_sub["mass_feature_mz"] = mf_mspeak_mzs[
781                find_closest(mf_mspeak_mzs, ms1_data_sub.mz.values)
782            ]
783
784            # Drop rows where the relative mz difference between the mass feature mz and the ms1 data mz exceeds the cluster tolerance
785            ms1_data_sub["mz_diff_rel"] = (
786                np.abs(ms1_data_sub["mass_feature_mz"] - ms1_data_sub["mz"])
787                / ms1_data_sub["mz"]
788            )
789            ms1_data_sub = ms1_data_sub[
790                ms1_data_sub["mz_diff_rel"]
791                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel
792            ].reset_index(drop=True)
793
794            # Group by mass_feature_mz and scan and sum intensity
795            ms1_data_sub_group = (
796                ms1_data_sub.groupby(["mass_feature_mz", "scan"])["intensity"]
797                .sum()
798                .reset_index()
799            )
800
801            # Calculate the correlation of the intensities of the mass feature and the ms1 data (set to 0 if no intensity)
802            corr = (
803                ms1_data_sub_group.pivot(
804                    index="scan", columns="mass_feature_mz", values="intensity"
805                )
806                .fillna(0)
807                .corr()
808            )
809
810            # Subset the correlation matrix to only include masses with a correlation above the deconvolution minimum
811            decon_corr_min = self.parameters.lc_ms.ms1_deconvolution_corr_min
812
813            # Try catch for KeyError in case the mass feature mz is not in the correlation matrix
814            try:
815                corr_subset = corr.loc[mass_feature.mz,]
816            except KeyError:
817                # If the mass feature mz is not in the correlation matrix, skip to the next mass feature
818                continue
819
820            corr_subset = corr_subset[corr_subset > decon_corr_min]
821
822            # Get the masses from the mass spectrum that are the result of the deconvolution
823            mzs_decon = corr_subset.index.values
824
825            # Get the indices of the mzs_decon in mass_feature.mass_spectrum.mz_exp and assign to the mass feature
826            mzs_decon_idx = [
827                id
828                for id, mz in enumerate(mass_feature.mass_spectrum.mz_exp)
829                if mz in mzs_decon
830            ]
831            mass_feature._ms_deconvoluted_idx = mzs_decon_idx
832
833            # Check if the mass feature's ms1 peak is the largest in the deconvoluted mass spectrum
834            if (
835                mass_feature.ms1_peak.abundance
836                == mass_feature.mass_spectrum.abundance[mzs_decon_idx].max()
837            ):
838                mass_feature.mass_spectrum_deconvoluted_parent = True
839            else:
840                mass_feature.mass_spectrum_deconvoluted_parent = False
841
842            # Check for other mass features that are in the deconvoluted mass spectrum and add the deconvoluted mass spectrum to the mass feature
843            # Subset mass_feature_df to only include mass features that are within the clustering tolerance
844            mass_feature_df_sub = mass_feature_df[
845                abs(mass_feature.retention_time - mass_feature_df["scan_time"])
846                < self.parameters.lc_ms.mass_feature_cluster_rt_tolerance
847            ].copy()
848            # Calculate the mz difference in ppm between the mass feature and the peaks in the deconvoluted mass spectrum
849            mass_feature_df_sub["mz_diff_ppm"] = [
850                np.abs(mzs_decon - mz).min() / mz * 10**6
851                for mz in mass_feature_df_sub["mz"]
852            ]
853            # Subset mass_feature_df to only include mass features within the cluster mz tolerance of the deconvoluted masses
854            mfs_associated_decon = mass_feature_df_sub[
855                mass_feature_df_sub["mz_diff_ppm"]
856                < self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel * 10**6
857            ].index.values
858
859            mass_feature.associated_mass_features_deconvoluted = mfs_associated_decon

Deconvolute MS1 mass features

Deconvolute each mass feature's MS1 spectrum based on the correlation of all masses within the spectrum with the EIC of the mass feature.

Parameters
  • None
Returns
  • None, but assigns the _ms_deconvoluted_idx, mass_spectrum_deconvoluted_parent, and associated_mass_features_deconvoluted attributes to the mass features in the mass_features attribute of the LCMSBase object.
Raises
  • ValueError: If no mass features are found; run find_mass_features() first. If no EICs are found; run integrate_mass_features() first.
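
A minimal end-to-end sketch of the processing order implied by the checks above (hypothetical; `lcms_obj` stands in for a fully parsed LCMSBase object, and skipped features may simply lack the deconvolution attributes):

    # Hypothetical pipeline example
    lcms_obj.find_mass_features()             # 1. detect mass features
    lcms_obj.integrate_mass_features()        # 2. extract EICs and integrate areas
    lcms_obj.find_c13_mass_features()         # 3. flag 13C isotopologues
    lcms_obj.deconvolute_ms1_mass_features()  # 4. correlate co-eluting MS1 masses

    # Inspect deconvolution results where they were assigned
    for mf_id, mf in lcms_obj.mass_features.items():
        associated = getattr(mf, "associated_mass_features_deconvoluted", None)
        if associated is not None:
            print(mf_id, mf.mass_spectrum_deconvoluted_parent, associated)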
class PHCalculations:
1145class PHCalculations:
1146    """Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.
1147
1148    Notes
1149    -----
1150    This class is intended to be used as a mixin for the LCMSBase class.
1151
1152    Methods
1153    -------
1154    * sparse_mean_filter(idx, V, radius=[0, 1, 1]).
1155        Sparse implementation of a mean filter.
1156    * embed_unique_indices(a).
1157        Creates an array of indices, sorted by unique element.
1158    * sparse_upper_star(idx, V).
1159        Sparse implementation of an upper star filtration.
1160    * check_if_grid(data).
1161        Check if the data is gridded in mz space.
1162    * grid_data(data).
1163        Grid the data in the mz dimension.
1164    * find_mass_features_ph(ms_level=1, grid=True).
1165        Find mass features within an LCMSBase object using persistent homology.
1166    * cluster_mass_features(drop_children=True).
1167        Cluster regions of interest.
1168    """
1169
1170    @staticmethod
1171    def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
1172        """Sparse implementation of a mean filter.
1173
1174        Parameters
1175        ----------
1176        idx : :obj:`~numpy.array`
1177            Edge indices for each dimension (MxN).
1178        V : :obj:`~numpy.array`
1179            Array of intensity data (Mx1).
1180        radius : float or list
1181            Radius of the sparse filter in each dimension. Values less than
1182            zero indicate no connectivity in that dimension.
1183
1184        Returns
1185        -------
1186        :obj:`~numpy.array`
1187            Filtered intensities (Mx1).
1188
1189        Notes
1190        -----
1191        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos.
1192        This is a static method.
1193        """
1194
1195        # Copy indices
1196        idx = idx.copy().astype(V.dtype)
1197
1198        # Scale
1199        for i, r in enumerate(radius):
1200            # Increase inter-index distance
1201            if r < 1:
1202                idx[:, i] *= 2
1203
1204            # Do nothing
1205            elif r == 1:
1206                pass
1207
1208            # Decrease inter-index distance
1209            else:
1210                idx[:, i] /= r
1211
1212        # Connectivity matrix
1213        cmat = KDTree(idx)
1214        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1215        cmat.setdiag(1)
1216
1217        # Pair indices
1218        I, J = cmat.nonzero()
1219
1220        # Delete cmat
1221        cmat_shape = cmat.shape
1222        del cmat
1223
1224        # Sum over columns
1225        V_sum = sparse.bsr_matrix(
1226            (V[J], (I, I)), shape=cmat_shape, dtype=V.dtype
1227        ).diagonal(0)
1228
1229        # Count over columns
1230        V_count = sparse.bsr_matrix(
1231            (np.ones_like(J), (I, I)), shape=cmat_shape, dtype=V.dtype
1232        ).diagonal(0)
1233
1234        return V_sum / V_count
1235
1236    @staticmethod
1237    def embed_unique_indices(a):
1238        """Creates an array of indices, sorted by unique element.
1239
1240        Parameters
1241        ----------
1242        a : :obj:`~numpy.array`
1243            Array of unique elements (Mx1).
1244
1245        Returns
1246        -------
1247        :obj:`~numpy.array`
1248            Array of indices (Mx1).
1249
1250        Notes
1251        -----
1252        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1253        This is a static method.
1254        """
1255
1256        def count_tens(n):
1257            # Count tens
1258            ntens = (n - 1) // 10
1259
1260            while True:
1261                ntens_test = (ntens + n - 1) // 10
1262
1263                if ntens_test == ntens:
1264                    return ntens
1265                else:
1266                    ntens = ntens_test
1267
1268        def arange_exclude_10s(n):
1269            # How many 10s will there be?
1270            ntens = count_tens(n)
1271
1272            # Base array
1273            arr = np.arange(0, n + ntens)
1274
1275            # Exclude 10s
1276            arr = arr[(arr == 0) | (arr % 10 != 0)][:n]
1277
1278            return arr
1279
1280        # Creates an array of indices, sorted by unique element
1281        idx_sort = np.argsort(a)
1282        idx_unsort = np.argsort(idx_sort)
1283
1284        # Sorts records array so all unique elements are together
1285        sorted_a = a[idx_sort]
1286
1287        # Returns the unique values, the index of the first occurrence,
1288        # and the count for each element
1289        vals, idx_start, count = np.unique(
1290            sorted_a, return_index=True, return_counts=True
1291        )
1292
1293        # Splits the indices into separate arrays
1294        splits = np.split(idx_sort, idx_start[1:])
1295
1296        # Creates unique indices for each split
1297        idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])
1298
1299        # Reorders according to input array
1300        idx_unq = idx_unq[idx_unsort]
1301
1302        # Magnitude of each index
1303        exp = np.log10(
1304            idx_unq, where=idx_unq > 0, out=np.zeros_like(idx_unq, dtype=np.float64)
1305        )
1306        idx_unq_mag = np.power(10, np.floor(exp) + 1)
1307
1308        # Result
1309        return a + idx_unq / idx_unq_mag
1310
1311    @staticmethod
1312    def roll_up_dataframe(
1313        df: pd.DataFrame,
1314        sort_by: str,
1315        tol: list,
1316        relative: list,
1317        dims: list,
1318        memory_opt_threshold: int = 10000,
1319    ):
1320        """Subset data by rolling up into apex in appropriate dimensions.
1321
1322        Parameters
1323        ----------
1324        df : pd.DataFrame
1325            The input data containing "dims" columns and the "sort_by" column.
1326        sort_by : str
1327            The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature
1328            (i.e., the mass feature with the highest value in the sort_by column).
1329        dims : list
1330            A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
1331        tol : list
1332            A list of tolerances for each dimension. The length of the list must match the number of dimensions.
1333            The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension).
1334            If relative is True, the tolerance will be multiplied by the maximum value in that dimension.
1335            If relative is False, the tolerance will be used as is.
1336        relative : list
1337            A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1338        memory_opt_threshold : int, optional
1339            Minimum number of rows to trigger memory-optimized processing. Default is 10000.
1340
1341        Returns
1342        -------
1343        pd.DataFrame
1344            A DataFrame with only the rolled up mass features, with the original index and columns.
1345
1346
1347        Raises
1348        ------
1349        ValueError
1350            If the input data is not a pandas DataFrame.
1351            If the input data does not have columns for each of the dimensions in "dims".
1352            If the length of "dims", "tol", and "relative" do not match.
1353        """
1354        og_columns = df.columns.copy()
1355
1356        # Unindex the data, but keep the original index
1357        if df.index.name is not None:
1358            og_index = df.index.name
1359        else:
1360            og_index = "index"
1361        df = df.reset_index(drop=False)
1362
1363        # Sort data by sort_by column, and reindex
1364        df = df.sort_values(by=sort_by, ascending=False).reset_index(drop=True)
1365
1366        # Check that data is a DataFrame and has columns for each of the dims
1367        if not isinstance(df, pd.DataFrame):
1368            raise ValueError("Data must be a pandas DataFrame")
1369        for dim in dims:
1370            if dim not in df.columns:
1371                raise ValueError(f"Data must have a column for {dim}")
1372        if len(dims) != len(tol) or len(dims) != len(relative):
1373            raise ValueError(
1374                "Dimensions, tolerances, and relative flags must be the same length"
1375            )
1376
1377        # Pre-compute all values arrays
1378        all_values = [df[dim].values for dim in dims]
1379
1380        # Choose processing method based on dataframe size
1381        if len(df) >= memory_opt_threshold:
1382            # Memory-optimized approach for large dataframes
1383            distances = PHCalculations._compute_distances_memory_optimized(
1384                all_values, tol, relative
1385            )
1386        else:
1387            # Faster approach for smaller dataframes
1388            distances = PHCalculations._compute_distances_original(
1389                all_values, tol, relative
1390            )
1391
1392        # Process pairs with original logic but memory optimizations
1393        distances = distances.tocoo()
1394        pairs = np.stack((distances.row, distances.col), axis=1)
1395        pairs_df = pd.DataFrame(pairs, columns=["parent", "child"]).set_index("parent")
1396        del distances, pairs  # Free memory immediately
1397
1398        to_drop = []
1399        while not pairs_df.empty:
1400            # Find root_parents and their children (original logic preserved)
1401            root_parents = np.setdiff1d(
1402                np.unique(pairs_df.index.values), np.unique(pairs_df.child.values)
1403            )
1404            children_of_roots = pairs_df.loc[root_parents, "child"].unique()
1405            to_drop.extend(children_of_roots)  # Use extend instead of append
1406
1407            # Remove root_children as possible parents from pairs_df for next iteration
1408            pairs_df = pairs_df.drop(index=children_of_roots, errors="ignore")
1409            pairs_df = pairs_df.reset_index().set_index("child")
1410            # Remove root_children as possible children from pairs_df for next iteration
1411            pairs_df = pairs_df.drop(index=children_of_roots)
1412
1413            # Prepare for next iteration
1414            pairs_df = pairs_df.reset_index().set_index("parent")
1415
1416        # Convert to numpy array for efficient dropping
1417        to_drop = np.array(to_drop)
1418
1419        # Drop mass features that are not cluster parents
1420        df_sub = df.drop(index=to_drop)
1421
1422        # Set index back to og_index and only keep original columns
1423        df_sub = df_sub.set_index(og_index).sort_index()[og_columns]
1424
1425        return df_sub
1426
1427    @staticmethod
1428    def _compute_distances_original(all_values, tol, relative):
1429        """Original distance computation method for smaller datasets.
1430
1431        This method computes the pairwise distances between features in the dataset
1432        using a straightforward approach. It is suitable for smaller datasets where
1433        memory usage is not a primary concern.
1434
1435        Parameters
1436        ----------
1437        all_values : list of :obj:`~numpy.array`
1438            List of arrays containing the values for each dimension.
1439        tol : list of float
1440            List of tolerances for each dimension.
1441        relative : list of bool
1442            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1443
1444        Returns
1445        -------
1446        :obj:`~scipy.sparse.coo_matrix`
1447            Sparse matrix indicating pairwise distances within tolerances.
1448        """
1449        # Compute inter-feature distances with memory optimization
1450        distances = None
1451        for i in range(len(all_values)):
1452            values = all_values[i]
1453            # Use single precision if possible to reduce memory
1454            tree = KDTree(values.reshape(-1, 1).astype(np.float32))
1455
1456            max_tol = tol[i]
1457            if relative[i] is True:
1458                max_tol = tol[i] * values.max()
1459
1460            # Compute sparse distance matrix with smaller chunks if memory is an issue
1461            sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
1462
1463            # Only consider forward case, exclude diagonal
1464            sdm = sparse.triu(sdm, k=1)
1465
1466            # Process relative distances more efficiently
1467            if relative[i] is True:
1468                # Vectorized computation without creating intermediate arrays
1469                row_values = values[sdm.row]
1470                valid_idx = sdm.data <= tol[i] * row_values
1471
1472                # Reconstruct sparse matrix more efficiently
1473                sdm = sparse.coo_matrix(
1474                    (
1475                        np.ones(valid_idx.sum(), dtype=np.uint8),
1476                        (sdm.row[valid_idx], sdm.col[valid_idx]),
1477                    ),
1478                    shape=(len(values), len(values)),
1479                )
1480            else:
1481                # Cast as binary matrix with smaller data type
1482                sdm.data = np.ones(len(sdm.data), dtype=np.uint8)
1483
1484            # Stack distances with memory-efficient multiplication
1485            if distances is None:
1486                distances = sdm
1487            else:
1488                # Use in-place operations where possible
1489                distances = distances.multiply(sdm)
1490                del sdm  # Free memory immediately
1491
1492        return distances
1493
1494    @staticmethod
1495    def _compute_distances_memory_optimized(all_values, tol, relative):
1496        """Memory-optimized distance computation for large datasets.
1497
1498        This method computes the pairwise distances between features in the dataset
1499        using a more memory-efficient approach. It is suitable for larger datasets
1500        where memory usage is a primary concern.
1501
1502        Parameters
1503        ----------
1504        all_values : list of :obj:`~numpy.array`
1505            List of arrays containing the values for each dimension.
1506        tol : list of float
1507            List of tolerances for each dimension.
1508        relative : list of bool
1509            List of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
1510
1511        Returns
1512        -------
1513        :obj:`~scipy.sparse.coo_matrix`
1514            Sparse matrix indicating pairwise distances within tolerances.
1515        """
1516        # Compute distance matrix for first dimension (full matrix as before)
1517        values_0 = all_values[0].astype(np.float32)
1518        tree_0 = KDTree(values_0.reshape(-1, 1))
1519
1520        max_tol_0 = tol[0]
1521        if relative[0] is True:
1522            max_tol_0 = tol[0] * values_0.max()
1523
1524        # Compute sparse distance matrix for first dimension
1525        distances = tree_0.sparse_distance_matrix(
1526            tree_0, max_tol_0, output_type="coo_matrix"
1527        )
1528        distances = sparse.triu(distances, k=1)
1529
1530        # Process relative distances for first dimension
1531        if relative[0] is True:
1532            row_values = values_0[distances.row]
1533            valid_idx = distances.data <= tol[0] * row_values
1534            distances = sparse.coo_matrix(
1535                (
1536                    np.ones(valid_idx.sum(), dtype=np.uint8),
1537                    (distances.row[valid_idx], distances.col[valid_idx]),
1538                ),
1539                shape=(len(values_0), len(values_0)),
1540            )
1541        else:
1542            distances.data = np.ones(len(distances.data), dtype=np.uint8)
1543
1544        # For remaining dimensions, work only on chunks defined by first dimension pairs
1545        if len(all_values) > 1:
1546            distances_coo = distances.tocoo()
1547            valid_pairs = []
1548
1549            # Process each pair from first dimension
1550            for idx in range(len(distances_coo.data)):
1551                i, j = distances_coo.row[idx], distances_coo.col[idx]
1552                is_valid_pair = True
1553
1554                # Check remaining dimensions for this specific pair
1555                for dim_idx in range(1, len(all_values)):
1556                    values = all_values[dim_idx]
1557                    val_i, val_j = values[i], values[j]
1558
1559                    max_tol = tol[dim_idx]
1560                    if relative[dim_idx] is True:
1561                        max_tol = tol[dim_idx] * values.max()
1562
1563                    distance_ij = abs(val_i - val_j)
1564
1565                    # Check if this pair satisfies the tolerance for this dimension
1566                    if relative[dim_idx] is True:
1567                        if distance_ij > tol[dim_idx] * val_i:
1568                            is_valid_pair = False
1569                            break
1570                    else:
1571                        if distance_ij > max_tol:
1572                            is_valid_pair = False
1573                            break
1574
1575                if is_valid_pair:
1576                    valid_pairs.append((i, j))
1577
1578            # Rebuild distances matrix with only valid pairs
1579            if valid_pairs:
1580                valid_pairs = np.array(valid_pairs)
1581                distances = sparse.coo_matrix(
1582                    (
1583                        np.ones(len(valid_pairs), dtype=np.uint8),
1584                        (valid_pairs[:, 0], valid_pairs[:, 1]),
1585                    ),
1586                    shape=(len(values_0), len(values_0)),
1587                )
1588            else:
1589                # No valid pairs found
1590                distances = sparse.coo_matrix(
1591                    (len(values_0), len(values_0)), dtype=np.uint8
1592                )
1593
1594        return distances
1595
1596    def sparse_upper_star(self, idx, V):
1597        """Sparse implementation of an upper star filtration.
1598
1599        Parameters
1600        ----------
1601        idx : :obj:`~numpy.array`
1602            Edge indices for each dimension (MxN).
1603        V : :obj:`~numpy.array`
1604            Array of intensity data (Mx1).
1605        Returns
1606        -------
1607        idx : :obj:`~numpy.array`
1608            Index of filtered points (Mx1).
1609        persistence : :obj:`~numpy.array`
1610            Persistence of each filtered point (Mx1).
1611
1612        Notes
1613        -----
1614        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1615        """
1616
1617        # Invert
1618        V = -1 * V.copy().astype(int)
1619
1620        # Embed indices
1621        V = self.embed_unique_indices(V)
1622
1623        # Connectivity matrix
1624        cmat = KDTree(idx)
1625        cmat = cmat.sparse_distance_matrix(cmat, 1, p=np.inf, output_type="coo_matrix")
1626        cmat.setdiag(1)
1627        cmat = sparse.triu(cmat)
1628
1629        # Pairwise minimums
1630        I, J = cmat.nonzero()
1631        d = np.maximum(V[I], V[J])
1632
1633        # Delete connectivity matrix
1634        cmat_shape = cmat.shape
1635        del cmat
1636
1637        # Sparse distance matrix
1638        sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)
1639
1640        # Delete pairwise mins
1641        del d, I, J
1642
1643        # Persistent homology
1644        ph = ripser(sdm, distance_matrix=True, maxdim=0)["dgms"][0]
1645
1646        # Bound death values
1647        ph[ph[:, 1] == np.inf, 1] = np.max(V)
1648
1649        # Construct tree to query against
1650        tree = KDTree(V.reshape((-1, 1)))
1651
1652        # Get the indexes of the first nearest neighbor by birth
1653        _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)
1654
1655        return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)
1656
1657    def check_if_grid(self, data):
1658        """Check if the data are gridded in mz space.
1659
1660        Parameters
1661        ----------
1662        data : DataFrame
1663            DataFrame containing the mass spectrometry data.  Needs to have mz and scan columns.
1664
1665        Returns
1666        -------
1667        bool
1668            True if the data is gridded in the mz direction, False otherwise.
1669
1670        Notes
1671        -----
1672        This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
1673        """
1674        # Calculate the difference between consecutive mz values in a single scan
1675        dat_check = data.copy().reset_index(drop=True)
1676        dat_check["mz_diff"] = np.abs(dat_check["mz"].diff())
1677        mz_diff_min = (
1678            dat_check.groupby("scan")["mz_diff"].min().min()
1679        )  # within each scan, what is the smallest mz difference between consecutive mz values
1680
1681        # Find the minimum mz difference between mz values in the data, regardless of scan
1682        dat_check_mz = dat_check[["mz"]].drop_duplicates().copy()
1683        dat_check_mz = dat_check_mz.sort_values(by=["mz"]).reset_index(drop=True)
1684        dat_check_mz["mz_diff"] = np.abs(dat_check_mz["mz"].diff())
1685
1686        # Get minimum mz_diff between mz values in the data
1687        mz_diff_min_raw = dat_check_mz["mz_diff"].min()
1688
1689        # If the minimum mz difference between mz values in the data is less than the minimum mz difference between mz values within a single scan, then the data is not gridded
1690        if mz_diff_min_raw < mz_diff_min:
1691            return False
1692        else:
1693            return True
1694
1695    def grid_data(self, data, attempts=5):
1696        """Grid the data in the mz dimension.
1697
1698        Data must be gridded prior to persistent homology calculations and computing average mass spectrum
1699
1700        Parameters
1701        ----------
1702        data : DataFrame
1703            The input data containing mz, scan, scan_time, and intensity columns.
1704        attempts : int, optional
1705            The number of attempts to grid the data. Default is 5.
1706
1707        Returns
1708        -------
1709        DataFrame
1710            The gridded data with mz, scan, scan_time, and intensity columns.
1711
1712        Raises
1713        ------
1714        ValueError
1715            If gridding fails after the specified number of attempts.
1716        """
1717        attempt_i = 0
1718        while attempt_i < attempts:
1719            attempt_i += 1
1720            data = self._grid_data(data)
1721
1722            if self.check_if_grid(data):
1723                return data
1724
1725        if not self.check_if_grid(data):
1726            raise ValueError(
1727                "Gridding failed after "
1728                + str(attempt_i)
1729                + " attempts. Please check the data."
1730            )
1731        else:
1732            return data
1733
1734    def _grid_data(self, data):
1735        """Internal method to grid the data in the mz dimension.
1736
1737        Notes
1738        -----
1739        This method is called by the grid_data method and should not be called directly.
1740        It will attempt to grid the data in the mz dimension by creating a grid of mz values based on the minimum mz difference within each scan,
1741        but it does not check if the data is already gridded or if the gridding is successful.
1742
1743        Parameters
1744        ----------
1745        data : pd.DataFrame or pl.DataFrame
1746            The input data to grid.
1747
1748        Returns
1749        -------
1750        pd.DataFrame or pl.DataFrame
1751            The data after attempting to grid it in the mz dimension.
1752        """
1753        # Calculate the difference between consecutive mz values in a single scan for grid spacing
1754        data_w = data.copy().reset_index(drop=True)
1755        data_w["mz_diff"] = np.abs(data_w["mz"].diff())
1756        mz_diff_min = data_w.groupby("scan")["mz_diff"].min().min() * 0.99999
1757
1758        # Need high intensity mz values first so they are parents in the output pairs stack
1759        dat_mz = data_w[["mz", "intensity"]].sort_values(
1760            by=["intensity"], ascending=False
1761        )
1762        dat_mz = dat_mz[["mz"]].drop_duplicates().reset_index(drop=True).copy()
1763
1764        # Construct KD tree
1765        tree = KDTree(dat_mz.mz.values.reshape(-1, 1))
1766        sdm = tree.sparse_distance_matrix(tree, mz_diff_min, output_type="coo_matrix")
1767        sdm = sparse.triu(sdm, k=1)
1768        sdm.data = np.ones_like(sdm.data)
1769        distances = sdm.tocoo()
1770        pairs = np.stack((distances.row, distances.col), axis=1)
1771
1772        # Cull pairs to just get root
1773        to_drop = []
1774        while len(pairs) > 0:
1775            root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
1776            id_root_parents = np.isin(pairs[:, 0], root_parents)
1777            children_of_roots = np.unique(pairs[id_root_parents, 1])
1778            to_drop = np.append(to_drop, children_of_roots)
1779
1780            # Set up pairs array for next iteration by removing pairs with children or parents already dropped
1781            pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
1782            pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
1783        dat_mz = dat_mz.reset_index(drop=True).drop(index=np.array(to_drop))
1784        mz_dat_np = (
1785            dat_mz[["mz"]]
1786            .sort_values(by=["mz"])
1787            .reset_index(drop=True)
1788            .values.flatten()
1789        )
1790
1791        # Sort data by mz and recast mz to nearest value in mz_dat_np
1792        data_w = data_w.sort_values(by=["mz"]).reset_index(drop=True).copy()
1793        data_w["mz_new"] = mz_dat_np[find_closest(mz_dat_np, data_w["mz"].values)]
1794        data_w["mz_diff"] = np.abs(data_w["mz"] - data_w["mz_new"])
1795
1796        # Rename mz_new as mz; drop mz_diff; groupby scan and mz and sum intensity
1797        new_data_w = data_w.rename(columns={"mz": "mz_orig", "mz_new": "mz"}).copy()
1798        new_data_w = (
1799            new_data_w.drop(columns=["mz_diff", "mz_orig"])
1800            .groupby(["scan", "mz"])["intensity"]
1801            .sum()
1802            .reset_index()
1803        )
1804        new_data_w = (
1805            new_data_w.sort_values(by=["scan", "mz"], ascending=[True, True])
1806            .reset_index(drop=True)
1807            .copy()
1808        )
1809
1810        return new_data_w
1811
1812    def find_mass_features_ph(self, ms_level=1, grid=True):
1813        """Find mass features within an LCMSBase object using persistent homology.
1814
1815        Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)
1816
1817        Parameters
1818        ----------
1819        ms_level : int, optional
1820            The MS level to use. Default is 1.
1821        grid : bool, optional
1822            If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
1823
1824        Raises
1825        ------
1826        ValueError
1827            If no MS level data is found on the object.
1828            If data is not gridded and grid is False.
1829
1830        Returns
1831        -------
1832        None, but assigns the mass_features attribute to the object.
1833
1834        Notes
1835        -----
1836        This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
1837        """
1838        # Check that ms_level is a key in self._ms_unprocessed
1839        if ms_level not in self._ms_unprocessed.keys():
1840            raise ValueError(
1841                "No MS level "
1842                + str(ms_level)
1843                + " data found, did you instantiate with parser specific to MS level?"
1844            )
1845
1846        # Get ms data
1847        data = self._ms_unprocessed[ms_level].copy()
1848
1849        # Drop rows with missing intensity values and reset index
1850        data = data.dropna(subset=["intensity"]).reset_index(drop=True)
1851
1852        # Threshold data
1853        dims = ["mz", "scan_time"]
1854        threshold = self.parameters.lc_ms.ph_inten_min_rel * data.intensity.max()
1855        persistence_threshold = (
1856            self.parameters.lc_ms.ph_persis_min_rel * data.intensity.max()
1857        )
1858        data_thres = data[data["intensity"] > threshold].reset_index(drop=True).copy()
1859
1860        # Check if gridded, if not, grid
1861        gridded_mz = self.check_if_grid(data_thres)
1862        if gridded_mz is False:
1863            if grid is False:
1864                raise ValueError(
1865                    "Data are not gridded in the mz dimension; try reprocessing with different parameters or grid the data before running this function"
1866                )
1867            else:
1868                data_thres = self.grid_data(data_thres)
1869
1870        # Add scan_time
1871        data_thres = data_thres.merge(self.scan_df[["scan", "scan_time"]], on="scan")
1872        # Process in chunks if required
1873        if len(data_thres) > 10000:
1874            return self._find_mass_features_ph_partition(
1875                data_thres, dims, persistence_threshold
1876            )
1877        else:
1878            # Process all at once
1879            return self._find_mass_features_ph_single(
1880                data_thres, dims, persistence_threshold
1881            )
1882
1883    def _find_mass_features_ph_single(self, data_thres, dims, persistence_threshold):
1884        """Process all thresholded data in a single pass (no partitioning)."""
1885        # Build factors
1886        factors = {
1887            dim: pd.factorize(data_thres[dim], sort=True)[1].astype(np.float32)
1888            for dim in dims
1889        }
1890
1891        # Build indexes
1892        index = {
1893            dim: np.searchsorted(factors[dim], data_thres[dim]).astype(np.float32)
1894            for dim in factors
1895        }
1896
1897        # Smooth and process
1898        mass_features_df = self._process_partition_ph(
1899            data_thres, index, dims, persistence_threshold
1900        )
1901
1902        # Roll up features to remove duplicates
1903        mass_features_df = self.roll_up_dataframe(
1904            df=mass_features_df,
1905            sort_by="persistence",
1906            dims=["mz", "scan_time"],
1907            tol=[
1908                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
1909                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
1910            ],
1911            relative=[True, False],
1912        )
1913        mass_features_df = mass_features_df.reset_index(drop=True)
1914
1915        # Populate mass_features attribute
1916        self._populate_mass_features(mass_features_df)
1917
1918    def _find_mass_features_ph_partition(self, data_thres, dims, persistence_threshold):
1919        """Partition the persistent homology mass feature detection for large datasets.
1920
1921        This method splits the input data into overlapping scan partitions, processes each partition to detect mass features
1922        using persistent homology, rolls up duplicates within and across partitions, and populates the mass_features attribute.
1923
1924        Parameters
1925        ----------
1926        data_thres : pd.DataFrame
1927            The thresholded input data containing mass spectrometry information.
1928        dims : list
1929            List of dimension names (e.g., ["mz", "scan_time"]) used for feature detection.
1930        persistence_threshold : float
1931            Minimum persistence value required for a detected mass feature to be retained.
1932
1933        Returns
1934        -------
1935        None
1936            Populates the mass_features attribute of the object with detected mass features.
1937        """
1938        all_mass_features = []
1939
1940        # Split scans into partitions
1941        unique_scans = sorted(data_thres["scan"].unique())
1942        unique_scans_n = len(unique_scans)
1943
1944        # Calculate partition size in scans based on goal
1945        partition_size_goal = 5000
1946        scans_per_partition = max(
1947            1, partition_size_goal // (len(data_thres) // unique_scans_n)
1948        )
1949        if scans_per_partition == 0:
1950            scans_per_partition = 1
1951
1952        # Make partitions based on scans, with overlapping in partitioned scans
1953        scan_overlap = 4
1954        partition_scans = []
1955        for i in range(0, unique_scans_n, scans_per_partition):
1956            start_idx = max(0, i - scan_overlap)
1957            end_idx = min(
1958                unique_scans_n - 1, i + scans_per_partition - 1 + scan_overlap
1959            )
1960            scans_group = [int(s) for s in unique_scans[start_idx : end_idx + 1]]
1961            partition_scans.append(scans_group)
1962
1963        # Set index to scan for faster filtering
1964        data_thres = data_thres.set_index("scan")
1965        for scans in partition_scans:
1966            # Select the data for this partition's scans (overlap between partitions handled above)
1967            partition_data = data_thres.loc[scans].reset_index(drop=False).copy()
1968
1969            if len(partition_data) == 0:
1970                continue
1971
1972            # Build factors for this partition
1973            factors = {
1974                dim: pd.factorize(partition_data[dim], sort=True)[1].astype(np.float32)
1975                for dim in dims
1976            }
1977
1978            # Build indexes
1979            index = {
1980                dim: np.searchsorted(factors[dim], partition_data[dim]).astype(
1981                    np.float32
1982                )
1983                for dim in factors
1984            }
1985
1986            # Process partition
1987            partition_features = self._process_partition_ph(
1988                partition_data, index, dims, persistence_threshold
1989            )
1990
1991            if len(partition_features) == 0:
1992                continue
1993
1994            # Roll up within partition
1995            partition_features = self.roll_up_dataframe(
1996                df=partition_features,
1997                sort_by="persistence",
1998                dims=["mz", "scan_time"],
1999                tol=[
2000                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2001                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2002                ],
2003                relative=[True, False],
2004            )
2005            partition_features = partition_features.reset_index(drop=True)
2006
2007            if len(partition_features) > 0:
2008                all_mass_features.append(partition_features)
2009
2010        # Combine results from all partitions
2011        if all_mass_features:
2012            combined_features = pd.concat(all_mass_features, ignore_index=True)
2013
2014            # Sort by persistence
2015            combined_features = combined_features.sort_values(
2016                by="persistence", ascending=False
2017            ).reset_index(drop=True)
2018
2019            # Remove duplicates from overlapping regions
2020            combined_features = self.roll_up_dataframe(
2021                df=combined_features,
2022                sort_by="persistence",
2023                dims=["mz", "scan_time"],
2024                tol=[
2025                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2026                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2027                ],
2028                relative=[True, False],
2029            )
2030
2031            # Reset index; rows remain ordered by persistence
2032            combined_features = combined_features.reset_index(drop=True)
2033
2034            # Populate mass_features attribute
2035            self._populate_mass_features(combined_features)
2036        else:
2037            self.mass_features = {}
2038
2039    def _process_partition_ph(self, partition_data, index, dims, persistence_threshold):
2040        """Process a single partition with persistent homology."""
2041        # Smooth data
2042        iterations = self.parameters.lc_ms.ph_smooth_it
2043        smooth_radius = [
2044            self.parameters.lc_ms.ph_smooth_radius_mz,
2045            self.parameters.lc_ms.ph_smooth_radius_scan,
2046        ]
2047
2048        index_array = np.vstack([index[dim] for dim in dims]).T
2049        V = partition_data["intensity"].values
2050        resid = np.inf
2051
2052        for i in range(iterations):
2053            # Previous iteration
2054            V_prev = V.copy()
2055            resid_prev = resid
2056            V = self.sparse_mean_filter(index_array, V, radius=smooth_radius)
2057
2058            # Calculate residual with previous iteration
2059            resid = np.sqrt(np.mean(np.square(V - V_prev)))
2060
2061            # Evaluate convergence
2062            if i > 0:
2063                # Percent change in residual
2064                test = np.abs(resid - resid_prev) / resid_prev
2065
2066                # Exit criteria
2067                if test <= 0:
2068                    break
2069
2070        # Overwrite values
2071        partition_data = partition_data.copy()
2072        partition_data["intensity"] = V
2073
2074        # Use persistent homology to find regions of interest
2075        pidx, pers = self.sparse_upper_star(index_array, V)
2076        pidx = pidx[pers > 1]
2077        pers = pers[pers > 1]
2078
2079        if len(pidx) == 0:
2080            return pd.DataFrame()
2081
2082        # Get peaks
2083        peaks = partition_data.iloc[pidx, :].reset_index(drop=True)
2084
2085        # Add persistence column
2086        peaks["persistence"] = pers
2087        mass_features = peaks.sort_values(
2088            by="persistence", ascending=False
2089        ).reset_index(drop=True)
2090
2091        # Filter by persistence threshold
2092        mass_features = mass_features.loc[
2093            mass_features["persistence"] > persistence_threshold, :
2094        ].reset_index(drop=True)
2095
2096        return mass_features
2097
2098    def _populate_mass_features(self, mass_features_df):
2099        """Populate the mass_features attribute from a DataFrame.
2100
2101        Parameters
2102        ----------
2103        mass_features_df : pd.DataFrame
2104            DataFrame containing mass feature information.
2105            Note that the order of this DataFrame will determine the order of mass features in the mass_features attribute.
2106
2107        Returns
2108        -------
2109        None, but assigns the mass_features attribute to the object.
2110        """
2111        # Rename scan column to apex_scan
2112        mass_features_df = mass_features_df.rename(
2113            columns={"scan": "apex_scan", "scan_time": "retention_time"}
2114        )
2115
2116        # Populate mass_features attribute
2117        self.mass_features = {}
2118        for row in mass_features_df.itertuples():
2119            row_dict = mass_features_df.iloc[row.Index].to_dict()
2120            lcms_feature = LCMSMassFeature(self, **row_dict)
2121            self.mass_features[lcms_feature.id] = lcms_feature
2122
2123        if self.parameters.lc_ms.verbose_processing:
2124            print("Found " + str(len(mass_features_df)) + " initial mass features")
2125
2126    def find_mass_features_ph_centroid(self, ms_level=1):
2127        """Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.
2128
2129        Parameters
2130        ----------
2131        ms_level : int, optional
2132            The MS level to use. Default is 1.
2133
2134        Raises
2135        ------
2136        ValueError
2137            If no MS level data is found on the object.
2138
2139        Returns
2140        -------
2141        None, but assigns the mass_features attribute to the object.
2142        """
2143        # Check that ms_level is a key in self._ms_unprocessed
2144        if ms_level not in self._ms_unprocessed.keys():
2145            raise ValueError(
2146                "No MS level "
2147                + str(ms_level)
2148                + " data found, did you instantiate with parser specific to MS level?"
2149            )
2150
2151        # Work with reference instead of copy
2152        data = self._ms_unprocessed[ms_level]
2153
2154        # Calculate threshold first to avoid multiple operations
2155        max_intensity = data["intensity"].max()
2156        threshold = self.parameters.lc_ms.ph_inten_min_rel * max_intensity
2157
2158        # Create single filter condition and apply to required columns only
2159        valid_mask = data["intensity"].notna() & (data["intensity"] > threshold)
2160        required_cols = ["mz", "intensity", "scan"]
2161        data_thres = data.loc[valid_mask, required_cols].copy()
2162        data_thres["persistence"] = data_thres["intensity"]
2163
2164        # Merge with required scan data
2165        scan_subset = self.scan_df[["scan", "scan_time"]]
2166        mf_df = data_thres.merge(scan_subset, on="scan", how="inner")
2167        del data_thres, scan_subset
2168
2169        # Order by scan_time and then mz so features close in retention time are processed together
2170        # It is fine if data from one scan spans partitions; features are rolled up across partitions later
2171        mf_df = mf_df.sort_values(
2172            by=["scan_time", "mz"], ascending=[True, True]
2173        ).reset_index(drop=True)
2174        partition_size = 10000
2175        partitions = [
2176            mf_df.iloc[i : i + partition_size].reset_index(drop=True)
2177            for i in range(0, len(mf_df), partition_size)
2178        ]
2179        del mf_df
2180
2181        # Run roll_up_dataframe on each partition
2182        rolled_partitions = []
2183        for part in partitions:
2184            rolled = self.roll_up_dataframe(
2185                df=part,
2186                sort_by="persistence",
2187                dims=["mz", "scan_time"],
2188                tol=[
2189                    self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2190                    self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2191                ],
2192                relative=[True, False],
2193            )
2194            rolled_partitions.append(rolled)
2195        del partitions
2196
2197
2198
2199        # Combine results and run a final roll-up to merge features near partition boundaries
2200        mf_df_final = pd.concat(rolled_partitions, ignore_index=True)
2201        del rolled_partitions
2202
2203        # Reorder by persistence before final roll-up
2204        mf_df_final = mf_df_final.sort_values(
2205            by="persistence", ascending=False
2206        ).reset_index(drop=True)
2207
2208        mf_df_final = self.roll_up_dataframe(
2209            df=mf_df_final,
2210            sort_by="persistence",
2211            dims=["mz", "scan_time"],
2212            tol=[
2213                self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2214                self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2215            ],
2216            relative=[True, False],
2217        )
2218        # reset index
2219        mf_df_final = mf_df_final.reset_index(drop=True)
2220
2221        # Combine rename and sort operations
2222        mass_features = (
2223            mf_df_final.rename(
2224                columns={"scan": "apex_scan", "scan_time": "retention_time"}
2225            )
2226            .sort_values(by="persistence", ascending=False)
2227            .reset_index(drop=True)
2228        )
2229        del mf_df_final  # Free memory
2230
2231        # Order by persistence and reset index
2232        mass_features = mass_features.sort_values(
2233            by="persistence", ascending=False
2234        ).reset_index(drop=True)
2235
2236        self.mass_features = {}
2237        for idx, row in mass_features.iterrows():
2238            row_dict = row.to_dict()
2239            lcms_feature = LCMSMassFeature(self, **row_dict)
2240            self.mass_features[lcms_feature.id] = lcms_feature
2241
2242        if self.parameters.lc_ms.verbose_processing:
2243            print("Found " + str(len(mass_features)) + " initial mass features")
2244
2245    def cluster_mass_features(self, drop_children=True, sort_by="persistence"):
2246        """Cluster mass features
2247
2248        Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.
2249
2250        Parameters
2251        ----------
2252        drop_children : bool, optional
2253            Whether to drop the mass features that are not cluster parents. Default is True.
2254        sort_by : str, optional
2255            The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
2256
2257        Raises
2258        ------
2259        ValueError
2260            If no mass features are found.
2261            If too many mass features are found.
2262
2263        Returns
2264        -------
2265        None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
2266        """
2267        if self.mass_features is None:
2268            raise ValueError("No mass features found, run find_mass_features() first")
2269        if len(self.mass_features) > 400000:
2270            raise ValueError(
2271                "Too many mass features of interest found, run find_mass_features() with a higher intensity threshold"
2272            )
2273        dims = ["mz", "scan_time"]
2274        mf_df_og = self.mass_features_to_df()
2275        mf_df = mf_df_og.copy()
2276
2277        tol = [
2278            self.parameters.lc_ms.mass_feature_cluster_mz_tolerance_rel,
2279            self.parameters.lc_ms.mass_feature_cluster_rt_tolerance,
2280        ]  # mz, in relative; scan_time in minutes
2281        relative = [True, False]
2282
2283        # Roll up mass features based on their proximity in the declared dimensions
2284        mf_df_new = self.roll_up_dataframe(
2285            df=mf_df, sort_by=sort_by, dims=dims, tol=tol, relative=relative
2286        )
2287
2288        mf_df["cluster_parent"] = np.where(
2289            np.isin(mf_df.index, mf_df_new.index), True, False
2290        )
2291
2292        # get mass feature ids of features that are not cluster parents
2293        cluster_daughters = mf_df[~mf_df["cluster_parent"]].index.values
2294        if drop_children is True:
2295            # Drop mass features that are not cluster parents from self
2296            self.mass_features = {
2297                k: v
2298                for k, v in self.mass_features.items()
2299                if k not in cluster_daughters
2300            }
2301        else:
2302            return cluster_daughters

Methods for performing calculations related to 2D peak picking via persistent homology on LCMS data.

Notes

This class is intended to be used as a mixin for the LCMSBase class.

Methods
  • sparse_mean_filter(idx, V, radius=[0, 1, 1]). Sparse implementation of a mean filter.
  • embed_unique_indices(a). Creates an array of indices, sorted by unique element.
  • sparse_upper_star(idx, V). Sparse implementation of an upper star filtration.
  • check_if_grid(data). Check if the data is gridded in mz space.
  • grid_data(data). Grid the data in the mz dimension.
  • find_mass_features_ph(ms_level=1, grid=True). Find mass features within an LCMSBase object using persistent homology.
  • find_mass_features_ph_centroid(ms_level=1). Find mass features from centroided data.
  • roll_up_dataframe(df, sort_by, tol, relative, dims). Roll up nearby features onto the highest-persistence parent within given tolerances.
  • cluster_mass_features(drop_children=True). Cluster regions of interest.
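
A minimal end-to-end sketch of the intended workflow, assuming `lcms_obj` is an already-initialized LCMSBase-derived object holding unprocessed MS1 data (the object name and the exact call sequence are illustrative, not prescribed by the API):

    # Detect features with persistent homology, then roll nearby features up into cluster parents
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
    lcms_obj.cluster_mass_features(drop_children=True, sort_by="persistence")
    print(len(lcms_obj.mass_features), "mass features retained after clustering")
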
@staticmethod
def sparse_mean_filter(idx, V, radius=[0, 1, 1]):

Sparse implementation of a mean filter.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
  • radius (float or list): Radius of the sparse filter in each dimension. Values less than zero indicate no connectivity in that dimension.
Returns
  • ~numpy.array: Filtered intensities (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
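
A small, self-contained sketch of the static filter on made-up data (the import path is assumed from this module; values are illustrative):

    import numpy as np
    from corems.mass_spectra.calc.lc_calc import PHCalculations  # assumed import path

    # Four points on a 2 x 2 index grid (mz index, scan index) with one outlier intensity
    idx = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float64)
    V = np.array([10.0, 12.0, 8.0, 100.0])

    # With radius [1, 1] every point is within Chebyshev distance 1 of the others,
    # so each intensity is replaced by the neighborhood mean
    V_smooth = PHCalculations.sparse_mean_filter(idx, V, radius=[1, 1])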

@staticmethod
def embed_unique_indices(a):

Creates an array of indices, sorted by unique element.

Parameters
  • a (~numpy.array): Array of unique elements (Mx1).
Returns
  • ~numpy.array: Array of indices (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos. This is a static method.
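
For illustration, duplicate values receive distinct fractional suffixes so that downstream steps (such as the upper star filtration) can treat them as unique (import path assumed as above):

    import numpy as np
    from corems.mass_spectra.calc.lc_calc import PHCalculations  # assumed import path

    a = np.array([5, 5, 5, 7])
    print(PHCalculations.embed_unique_indices(a))
    # -> approximately [5.0, 5.1, 5.2, 7.0]; each repeat of 5 gets its own decimal tag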

@staticmethod
def roll_up_dataframe( df: pandas.core.frame.DataFrame, sort_by: str, tol: list, relative: list, dims: list, memory_opt_threshold: int = 10000):

Subset data by rolling up into apex in appropriate dimensions.

Parameters
  • df (pd.DataFrame): The input data containing "dims" columns and the "sort_by" column.
  • sort_by (str): The column to sort the data by, this will determine which mass features get rolled up into a parent mass feature (i.e., the mass feature with the highest value in the sort_by column).
  • dims (list): A list of dimension names (column names in the data DataFrame) to roll up the mass features by.
  • tol (list): A list of tolerances for each dimension. The length of the list must match the number of dimensions. The tolerances can be relative (as a fraction of the maximum value in that dimension) or absolute (in the units of that dimension). If relative is True, the tolerance will be multiplied by the maximum value in that dimension. If relative is False, the tolerance will be used as is.
  • relative (list): A list of booleans indicating whether the tolerance for each dimension is relative (True) or absolute (False).
  • memory_opt_threshold (int, optional): Minimum number of rows to trigger memory-optimized processing. Default is 10000.
Returns
  • pd.DataFrame: A DataFrame with only the rolled up mass features, with the original index and columns.
Raises
  • ValueError: If the input data is not a pandas DataFrame. If the input data does not have columns for each of the dimensions in "dims". If the length of "dims", "tol", and "relative" do not match.
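
A hedged, self-contained sketch: two features that fall within both tolerances collapse onto the higher-persistence parent (tolerance values below are illustrative, not defaults; import path assumed as above):

    import pandas as pd
    from corems.mass_spectra.calc.lc_calc import PHCalculations  # assumed import path

    features = pd.DataFrame(
        {
            "mz": [200.0000, 200.0002, 350.1000],
            "scan_time": [5.01, 5.02, 12.50],
            "persistence": [1.0e6, 2.0e5, 5.0e5],
        }
    )

    parents = PHCalculations.roll_up_dataframe(
        df=features,
        sort_by="persistence",
        dims=["mz", "scan_time"],
        tol=[5e-6, 0.1],          # relative m/z tolerance, absolute scan_time tolerance (minutes)
        relative=[True, False],
    )
    # Expect the 200.0002 row to roll up into the 200.0000 parent, leaving two rows
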
def sparse_upper_star(self, idx, V):

Sparse implementation of an upper star filtration.

Parameters
  • idx (~numpy.array): Edge indices for each dimension (MxN).
  • V (~numpy.array): Array of intensity data (Mx1).
Returns
  • idx (~numpy.array): Index of filtered points (Mx1).
  • persistence (~numpy.array): Persistence of each filtered point (Mx1).
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
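
A minimal sketch on a one-dimensional index set, assuming `lcms_obj` is an object that mixes in PHCalculations; the intensity trace has two local maxima that should survive the persistence cutoff:

    import numpy as np

    idx = np.arange(6, dtype=np.float64).reshape(-1, 1)   # one index per point
    V = np.array([1.0, 5.0, 2.0, 9.0, 3.0, 1.0])          # local maxima at positions 1 and 3

    pidx, pers = lcms_obj.sparse_upper_star(idx, V)        # lcms_obj is an assumed LCMS object
    keep = pers > 1                                        # same cutoff applied internally
    print(pidx[keep], pers[keep])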

def check_if_grid(self, data):

Check if the data are gridded in mz space.

Parameters
  • data (DataFrame): DataFrame containing the mass spectrometry data. Needs to have mz and scan columns.
Returns
  • bool: True if the data is gridded in the mz direction, False otherwise.
Notes

This function is used within the grid_data function and the find_mass_features function and is not intended to be called directly.
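
An illustrative check on ungridded, centroid-like data, assuming `lcms_obj` is an initialized LCMS object; because the m/z values differ slightly between scans, the smallest across-scan m/z gap is smaller than any within-scan gap and the check returns False:

    import pandas as pd

    raw = pd.DataFrame(
        {
            "scan": [1, 1, 2, 2],
            "mz": [100.0000, 200.0000, 100.0004, 200.0003],
            "intensity": [10.0, 20.0, 12.0, 18.0],
        }
    )
    print(lcms_obj.check_if_grid(raw))   # False: m/z values are not shared across scans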

def grid_data(self, data, attempts=5):

Grid the data in the mz dimension.

Data must be gridded prior to persistent homology calculations and computing the average mass spectrum.

Parameters
  • data (DataFrame): The input data containing mz, scan, scan_time, and intensity columns.
  • attempts (int, optional): The number of attempts to grid the data. Default is 5.
Returns
  • DataFrame: The gridded data with mz, scan, scan_time, and intensity columns.
Raises
  • ValueError: If gridding fails after the specified number of attempts.
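
Continuing the sketch above, gridding snaps the slightly misaligned m/z values onto a shared axis so the check passes afterwards (`raw` and `lcms_obj` as in the check_if_grid example):

    gridded = lcms_obj.grid_data(raw, attempts=5)
    assert lcms_obj.check_if_grid(gridded)   # scans now share a common m/z grid
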
def find_mass_features_ph(self, ms_level=1, grid=True):

Find mass features within an LCMSBase object using persistent homology.

Assigns the mass_features attribute to the object (a dictionary of LCMSMassFeature objects, keyed by mass feature id)

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
  • grid (bool, optional): If True, will regrid the data before running the persistent homology calculations (after checking if the data is gridded). Default is True.
Raises
  • ValueError: If no MS level data is found on the object. If data is not gridded and grid is False.
Returns
  • None, but assigns the mass_features attribute to the object.
Notes

This function has been adapted from the original implementation in the Deimos package: https://github.com/pnnl/deimos
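
A minimal call sketch, assuming `lcms_obj` was built from a profile-mode parser and holds unprocessed MS1 data; the threshold values assigned here are illustrative only:

    lcms_obj.parameters.lc_ms.ph_inten_min_rel = 0.001    # illustrative relative intensity threshold
    lcms_obj.parameters.lc_ms.ph_persis_min_rel = 0.001   # illustrative relative persistence threshold
    lcms_obj.find_mass_features_ph(ms_level=1, grid=True)
    print(len(lcms_obj.mass_features), "features found before clustering")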

def find_mass_features_ph_centroid(self, ms_level=1):

Find mass features within an LCMSBase object using a persistent homology-type approach but with centroided data.

Parameters
  • ms_level (int, optional): The MS level to use. Default is 1.
Raises
  • ValueError: If no MS level data is found on the object.
Returns
  • None, but assigns the mass_features attribute to the object.
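
The equivalent entry point for centroided data, where each centroid's intensity is used directly as its persistence (`lcms_obj` assumed as above):

    lcms_obj.find_mass_features_ph_centroid(ms_level=1)
    print(len(lcms_obj.mass_features), "features found from centroided data")
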
def cluster_mass_features(self, drop_children=True, sort_by='persistence'):

Cluster mass features

Based on their proximity in the mz and scan_time dimensions, prioritizes the mass features with the highest persistence.

Parameters
  • drop_children (bool, optional): Whether to drop the mass features that are not cluster parents. Default is True.
  • sort_by (str, optional): The column to sort the mass features by, this will determine which mass features get rolled up into a parent mass feature. Default is "persistence".
Raises
  • ValueError: If no mass features are found. If too many mass features are found.
Returns
  • None if drop_children is True, otherwise returns a list of mass feature ids that are not cluster parents.
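
A short usage sketch (`lcms_obj` assumed as above): keep only cluster parents by default, or inspect which features would be rolled up by passing drop_children=False:

    children = lcms_obj.cluster_mass_features(drop_children=False, sort_by="persistence")
    print(len(children), "features are children of a higher-persistence parent")
    lcms_obj.cluster_mass_features(drop_children=True)   # now actually drop them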