corems.mass_spectra.factory.lc_class

   1from pathlib import Path
   2
   3import numpy as np
   4import pandas as pd
   5import warnings
   6import matplotlib.pyplot as plt
   7
   8from corems.encapsulation.factory.parameters import LCMSParameters
   9from corems.mass_spectra.calc.lc_calc import LCCalculations, PHCalculations
  10from corems.molecular_id.search.lcms_spectral_search import LCMSSpectralSearch
  11from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  12from corems.mass_spectra.calc.lc_calc import find_closest
  13
  14
  15class MassSpectraBase:
  16    """Base class for mass spectra objects.
  17
  18    Parameters
  19    -----------
  20    file_location : str or Path
  21        The location of the file containing the mass spectra data.
  22    analyzer : str, optional
  23        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  24    instrument_label : str, optional
  25        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  26    sample_name : str, optional
  27        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  28    spectra_parser : object, optional
  29        The spectra parser object used to create the mass spectra object. Defaults to None.
  30
  31    Attributes
  32    -----------
  33    spectra_parser_class : class
  34        The class of the spectra parser used to create the mass spectra object.
  35    file_location : str or Path
  36        The location of the file containing the mass spectra data.
  37    sample_name : str
  38        The name of the sample; defaults to the file name if not provided to the parser.
  39    analyzer : str
  40        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
  41    instrument_label : str
  42        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
  43    _scan_info : dict
  44        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
  45        scan text, and scan window (lower and upper).
  46        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
  47    _ms : dict
  48        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
  49    _ms_unprocessed: dictionary of pandas.DataFrames or None
  50        A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking.
  51        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
  52
  53    Methods
  54    --------
  55    * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True).
  56        Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans
  57    * get_time_of_scan_id(scan).
  58        Returns the scan time for the specified scan number.
  59    """
  60
  61    def __init__(
  62        self,
  63        file_location,
  64        analyzer="Unknown",
  65        instrument_label="Unknown",
  66        sample_name=None,
  67        spectra_parser=None,
  68    ):
  69        if isinstance(file_location, str):
  70            file_location = Path(file_location)
  71        else:
  72            file_location = file_location
  73        if not file_location.exists():
  74            raise FileExistsError("File does not exist: " + str(file_location))
  75
  76        if sample_name:
  77            self.sample_name = sample_name
  78        else:
  79            self.sample_name = file_location.stem
  80
  81        self.file_location = file_location
  82        self.analyzer = analyzer
  83        self.instrument_label = instrument_label
  84
  85        # Add the spectra parser class to the object if it is not None
  86        if spectra_parser is not None:
  87            self.spectra_parser_class = spectra_parser.__class__
  88            self.spectra_parser = spectra_parser
  89            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
  90            if (
  91                self.sample_name is not None
  92                and self.sample_name != self.spectra_parser.sample_name
  93            ):
  94                warnings.warn(
  95                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
  96                    UserWarning,
  97                )
  98            if self.analyzer != self.spectra_parser.analyzer:
  99                warnings.warn(
 100                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
 101                    UserWarning,
 102                )
 103            if self.instrument_label != self.spectra_parser.instrument_label:
 104                warnings.warn(
 105                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
 106                    UserWarning,
 107                )
 108            if self.file_location != self.spectra_parser.file_location:
 109                warnings.warn(
 110                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
 111                    UserWarning,
 112                )
 113
 114        # Instantiate empty dictionaries for scan information and mass spectra
 115        self._scan_info = {}
 116        self._ms = {}
 117        self._ms_unprocessed = {}
 118
 119    def add_mass_spectrum(self, mass_spec):
 120        """Adds a mass spectrum to the dataset.
 121
 122        Parameters
 123        -----------
 124        mass_spec : MassSpectrum
 125            The corems MassSpectrum object to be added to the dataset.
 126
 127        Notes
 128        -----
 129        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
 130        """
 131        # check if mass_spec has a scan_number attribute
 132        if not hasattr(mass_spec, "scan_number"):
 133            raise ValueError(
 134                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
 135            )
 136        self._ms[mass_spec.scan_number] = mass_spec
 137
 138    def add_mass_spectra(
 139        self,
 140        scan_list,
 141        spectrum_mode=None,
 142        ms_level=1,
 143        use_parser=True,
 144        auto_process=True,
 145        ms_params=None,
 146    ):
 147        """Add mass spectra to _ms dictionary, from a list of scans or single scan
 148
 149        Notes
 150        -----
 151        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
 152
 153
 154        Parameters
 155        -----------
 156        scan_list : list of ints
 157            List of scans to use to populate _ms slot
 158        spectrum_mode : str or None
 159            The spectrum mode to use for the mass spectra.
 160            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 161            Defaults to None.
 162        ms_level : int, optional
 163            The MS level to use for the mass spectra.
 164            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
 165            Defaults to 1.
 166        using_parser : bool
 167            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
 168        auto_process : bool
 169            Whether to auto-process the mass spectra.  Defaults to True.
 170        ms_params : MSParameters or None
 171            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
 172
 173        Raises
 174        ------
 175        TypeError
 176            If scan_list is not a list of ints
 177        ValueError
 178            If polarity is not 'positive' or 'negative'
 179            If ms_level is not 1 or 2
 180        """
 181
 182        # check if scan_list is a list or a single int; if single int, convert to list
 183        if isinstance(scan_list, int):
 184            scan_list = [scan_list]
 185        if not isinstance(scan_list, list):
 186            raise TypeError("scan_list must be a list of integers")
 187        for scan in scan_list:
 188            if not isinstance(scan, int):
 189                raise TypeError("scan_list must be a list of integers")
 190
 191        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 192        if self.polarity == "negative":
 193            polarity = -1
 194        elif self.polarity == "positive":
 195            polarity = 1
 196        else:
 197            raise ValueError(
 198                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
 199            )
 200
 201        # is not using_parser, check that ms1 and ms2 are not None
 202        if not use_parser:
 203            if ms_level not in self._ms_unprocessed.keys():
 204                raise ValueError(
 205                    "ms_level {} not found in _ms_unprocessed dictionary".format(
 206                        ms_level
 207                    )
 208                )
 209
 210        scan_list = list(set(scan_list))
 211        scan_list.sort()
 212        if not use_parser:
 213            if self._ms_unprocessed[ms_level] is None:
 214                raise ValueError(
 215                    "No unprocessed data found for ms_level {}".format(ms_level)
 216                )
 217            if (
 218                len(
 219                    np.setdiff1d(
 220                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
 221                    )
 222                )
 223                > 0
 224            ):
 225                raise ValueError(
 226                    "Not all scans in scan_list are present in the unprocessed data"
 227                )
 228            # Prepare the ms_df for parsing
 229            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
 230
 231        for scan in scan_list:
 232            ms = None
 233            if spectrum_mode is None:
 234                # get spectrum mode from _scan_info
 235                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
 236            else:
 237                spectrum_mode_scan = spectrum_mode
 238            # Instantiate the mass spectrum object using the parser or the unprocessed data
 239            if not use_parser:
 240                my_ms_df = ms_df.loc[scan]
 241                if spectrum_mode_scan == "profile":
 242                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
 243                    ms = ms_from_array_profile(
 244                        my_ms_df.mz,
 245                        my_ms_df.intensity,
 246                        self.file_location,
 247                        polarity=polarity,
 248                        auto_process=False,
 249                    )
 250                else:
 251                    raise ValueError(
 252                        "Only profile mode is supported for unprocessed data"
 253                    )
 254            if use_parser:
 255                ms = self.spectra_parser.get_mass_spectrum_from_scan(
 256                    scan_number=scan,
 257                    spectrum_mode=spectrum_mode_scan,
 258                    auto_process=False,
 259                )
 260
 261            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 262            if ms is not None:
 263                if ms_params is not None:
 264                    ms.parameters = ms_params
 265                ms.scan_number = scan
 266                if auto_process:
 267                    ms.process_mass_spec()
 268                self.add_mass_spectrum(ms)
 269
 270    def get_time_of_scan_id(self, scan):
 271        """Returns the scan time for the specified scan number.
 272
 273        Parameters
 274        -----------
 275        scan : int
 276            The scan number of the desired scan time.
 277
 278        Returns
 279        --------
 280        float
 281            The scan time for the specified scan number (in minutes).
 282
 283        Raises
 284        ------
 285        ValueError
 286            If no scan time is found for the specified scan number.
 287        """
 288        # Check if _retenion_time_list is empty and raise error if so
 289        if len(self._retention_time_list) == 0:
 290            raise ValueError("No retention times found in dataset")
 291        rt = self._retention_time_list[self._scans_number_list.index(scan)]
 292        return rt
 293
 294    @property
 295    def scan_df(self):
 296        """
 297        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
 298        """
 299        scan_df = pd.DataFrame.from_dict(self._scan_info)
 300        return scan_df
 301        
 302    @property
 303    def ms(self):
 304        """
 305        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
 306        """
 307        return self._ms
 308
 309    
 310    @scan_df.setter
 311    def scan_df(self, df):
 312        """
 313        Sets the scan data for the dataset.
 314
 315        Parameters
 316        -----------
 317        df : pandas.DataFrame
 318            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
 319            precursor m/z, scan text, and scan window (lower and upper).
 320        """
 321        self._scan_info = df.to_dict()
 322
 323    def __getitem__(self, scan_number):
 324        return self._ms.get(scan_number)
 325
 326
 327class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
 328    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
 329
 330    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
 331
 332    Parameters
 333    -----------
 334    file_location : str or Path
 335        The location of the file containing the mass spectra data.
 336    analyzer : str, optional
 337        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 338    instrument_label : str, optional
 339        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 340    sample_name : str, optional
 341        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 342    spectra_parser : object, optional
 343        The spectra parser object used to create the mass spectra object. Defaults to None.
 344
 345    Attributes
 346    -----------
 347    polarity : str
 348        The polarity of the ionization mode used for the dataset.
 349    _parameters : LCMSParameters
 350        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
 351    _retention_time_list : numpy.ndarray
 352        An array of retention times for the dataset.
 353    _scans_number_list : list
 354        A list of scan numbers for the dataset.
 355    _tic_list : numpy.ndarray
 356        An array of total ion current (TIC) values for the dataset.
 357    eics : dict
 358        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
 359        Key is the mz of the EIC. Initialized as an empty dictionary.
 360    mass_features : dictionary of LCMSMassFeature objects
 361        A dictionary containing mass features for the dataset.
 362        Key is mass feature ID. Initialized as an empty dictionary.
 363    spectral_search_results : dictionary of MS2SearchResults objects
 364        A dictionary containing spectral search results for the dataset.
 365        Key is scan number : precursor mz. Initialized as an empty dictionary.
 366
 367    Methods
 368    --------
 369    * get_parameters_json().
 370        Returns the parameters used for the LC-MS analysis in JSON format.
 371    * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 372        Adds which MS2 scans are associated with each mass feature to the
 373        mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
 374    * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 375        Adds the MS1 spectra associated with each mass feature to the
 376        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
 377    * mass_features_to_df()
 378        Returns a pandas dataframe summarizing the mass features in the dataset.
 379    * set_tic_list_from_data(overwrite=False)
 380        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
 381    * set_retention_time_from_data(overwrite=False)
 382        Sets the retention time list from the data in the _ms dictionary.
 383    * set_scans_number_from_data(overwrite=False)
 384        Sets the scan number list from the data in the _ms dictionary.
 385    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
 386        Generates plot of M/Z features comparing scan time vs M/Z value
 387    """
 388
 389    def __init__(
 390        self,
 391        file_location,
 392        analyzer="Unknown",
 393        instrument_label="Unknown",
 394        sample_name=None,
 395        spectra_parser=None,
 396    ):
 397        super().__init__(
 398            file_location, analyzer, instrument_label, sample_name, spectra_parser
 399        )
 400        self.polarity = ""
 401        self._parameters = LCMSParameters()
 402        self._retention_time_list = []
 403        self._scans_number_list = []
 404        self._tic_list = []
 405        self.eics = {}
 406        self.mass_features = {}
 407        self.spectral_search_results = {}
 408
 409    def get_parameters_json(self):
 410        """Returns the parameters stored for the LC-MS object in JSON format.
 411
 412        Returns
 413        --------
 414        str
 415            The parameters used for the LC-MS analysis in JSON format.
 416        """
 417        return self.parameters.to_json()
 418
 419    def remove_unprocessed_data(self, ms_level=None):
 420        """Removes the unprocessed data from the LCMSBase object.
 421
 422        Parameters
 423        -----------
 424        ms_level : int, optional
 425            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
 426
 427        Raises
 428        ------
 429        ValueError
 430            If ms_level is not 1 or 2.
 431
 432        Notes
 433        -----
 434        This method is useful for freeing up memory after the data has been processed.
 435        """
 436        if ms_level is None:
 437            for ms_level in self._ms_unprocessed.keys():
 438                self._ms_unprocessed[ms_level] = None
 439        if ms_level not in [1, 2]:
 440            raise ValueError("ms_level must be 1 or 2")
 441        self._ms_unprocessed[ms_level] = None
 442
 443    def add_associated_ms2_dda(
 444        self,
 445        auto_process=True,
 446        use_parser=True,
 447        spectrum_mode=None,
 448        ms_params_key="ms2",
 449        scan_filter=None,
 450    ):
 451        """Add MS2 spectra associated with mass features to the dataset.
 452
 453        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
 454
 455        Parameters
 456        -----------
 457        auto_process : bool, optional
 458            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
 459        use_parser : bool, optional
 460            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
 461        spectrum_mode : str or None, optional
 462            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 463            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 464            Defaults to None. (faster if defined, otherwise will check each scan)
 465        ms_params_key : string, optional
 466            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
 467            Defaults to 'ms2'.
 468        scan_filter : str
 469            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
 470            "hcd" will pull out only HCD scans.
 471
 472        Raises
 473        ------
 474        ValueError
 475            If mass_features is not set, must run find_mass_features() first.
 476            If no MS2 scans are found in the dataset.
 477            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
 478        """
 479        # Check if mass_features is set, raise error if not
 480        if self.mass_features is None:
 481            raise ValueError(
 482                "mass_features not set, must run find_mass_features() first"
 483            )
 484
 485        # reconfigure ms_params to get the correct mass spectrum parameters from the key
 486        ms_params = self.parameters.mass_spectrum[ms_params_key]
 487
 488        mf_df = self.mass_features_to_df().copy()
 489        # Find ms2 scans that have a precursor m/z value
 490        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
 491        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
 492        # drop ms2 scans that have no tic
 493        ms2_scans = ms2_scans[ms2_scans.tic > 0]
 494        if ms2_scans is None:
 495            raise ValueError("No DDA scans found in dataset")
 496
 497        if scan_filter is not None:
 498            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
 499        # set tolerance in rt space (in minutes) and mz space (in daltons)
 500        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
 501        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
 502
 503        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
 504        dda_scans = []
 505        for i, row in mf_df.iterrows():
 506            ms2_scans_filtered = ms2_scans[
 507                ms2_scans.scan_time.between(
 508                    row.scan_time - time_tol, row.scan_time + time_tol
 509                )
 510            ]
 511            ms2_scans_filtered = ms2_scans_filtered[
 512                ms2_scans_filtered.precursor_mz.between(
 513                    row.mz - mz_tol, row.mz + mz_tol
 514                )
 515            ]
 516            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
 517            self.mass_features[i].ms2_scan_numbers = (
 518                ms2_scans_filtered.scan.tolist()
 519                + self.mass_features[i].ms2_scan_numbers
 520            )
 521        # add to _ms attribute
 522        self.add_mass_spectra(
 523            scan_list=list(set(dda_scans)),
 524            auto_process=auto_process,
 525            spectrum_mode=spectrum_mode,
 526            use_parser=use_parser,
 527            ms_params=ms_params,
 528        )
 529        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
 530        for mf_id in self.mass_features:
 531            if self.mass_features[mf_id].ms2_scan_numbers is not None:
 532                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
 533                    if dda_scan in self._ms.keys():
 534                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
 535                            dda_scan
 536                        ]
 537
 538    def add_associated_ms1(
 539        self, auto_process=True, use_parser=True, spectrum_mode=None
 540    ):
 541        """Add MS1 spectra associated with mass features to the dataset.
 542
 543        Parameters
 544        -----------
 545        auto_process : bool, optional
 546            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
 547        use_parser : bool, optional
 548            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
 549        spectrum_mode : str or None, optional
 550            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 551            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 552            Defaults to None. (faster if defined, otherwise will check each scan)
 553
 554        Raises
 555        ------
 556        ValueError
 557            If mass_features is not set, must run find_mass_features() first.
 558            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
 559            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
 560            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
 561        """
 562        # Check if mass_features is set, raise error if not
 563        if self.mass_features is None:
 564            raise ValueError(
 565                "mass_features not set, must run find_mass_features() first"
 566            )
 567        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
 568
 569        if scans_to_average == 1:
 570            # Add to LCMSobj
 571            self.add_mass_spectra(
 572                scan_list=[
 573                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
 574                ],
 575                auto_process=auto_process,
 576                use_parser=use_parser,
 577                spectrum_mode=spectrum_mode,
 578                ms_params=self.parameters.mass_spectrum["ms1"],
 579            )
 580
 581        elif (
 582            (scans_to_average - 1) % 2
 583        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
 584            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
 585            # Check if all apex scans are profile mode, raise error if not
 586            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
 587                raise ValueError("All apex scans must be profile mode for averaging")
 588
 589            # First get sets of scans to average
 590            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
 591                ms1_idx_start = ms1_scans.index(apex_scan) - int(
 592                    (scans_to_average - 1) / 2
 593                )
 594                if ms1_idx_start < 0:
 595                    ms1_idx_start = 0
 596                ms1_idx_end = (
 597                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
 598                )
 599                if ms1_idx_end > (len(ms1_scans) - 1):
 600                    ms1_idx_end = len(ms1_scans) - 1
 601                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
 602                return scan_list
 603
 604            ms1_scans = self.ms1_scans
 605            scans_lists = [
 606                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
 607                for apex_scan in apex_scans
 608            ]
 609
 610            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 611            if self.polarity == "negative":
 612                polarity = -1
 613            elif self.polarity == "positive":
 614                polarity = 1
 615
 616            if not use_parser:
 617                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
 618                ms1_unprocessed = self._ms_unprocessed[1].copy()
 619                # Set the index on _ms_unprocessed[1] to scan number
 620                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
 621                self._ms_unprocessed[1] = ms1_unprocessed
 622
 623                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
 624                scans_lists_flat = list(
 625                    set([scan for sublist in scans_lists for scan in sublist])
 626                )
 627                if (
 628                    len(
 629                        np.setdiff1d(
 630                            np.sort(scans_lists_flat),
 631                            np.sort(ms1_unprocessed.index.values),
 632                        )
 633                    )
 634                    > 0
 635                ):
 636                    raise ValueError(
 637                        "Not all scans to average are present in the unprocessed data"
 638                    )
 639
 640            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
 641                # Get unprocessed mass spectrum from scans
 642                ms = self.get_average_mass_spectrum(
 643                    scan_list=scan_list_average,
 644                    apex_scan=apex_scan,
 645                    spectrum_mode="profile",
 646                    ms_level=1,
 647                    auto_process=auto_process,
 648                    use_parser=use_parser,
 649                    perform_checks=False,
 650                    polarity=polarity,
 651                    ms_params=self.parameters.mass_spectrum["ms1"],
 652                )
 653                # Add mass spectrum to LCMS object and associated with mass feature
 654                self.add_mass_spectrum(ms)
 655
 656            if not use_parser:
 657                # Reset the index on _ms_unprocessed[1] to not be scan number
 658                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
 659                self._ms_unprocessed[1] = ms1_unprocessed
 660        else:
 661            raise ValueError(
 662                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
 663            )
 664
 665        # Associate the ms1 spectra with the mass features
 666        for mf_id in self.mass_features:
 667            self.mass_features[mf_id].mass_spectrum = self._ms[
 668                self.mass_features[mf_id].apex_scan
 669            ]
 670            self.mass_features[mf_id].update_mz()
 671
 672        # Re-process clustering if persistent homology is selected to remove duplicate mass features after adding and processing MS1 spectra
 673        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
 674            self.cluster_mass_features(drop_children=True, sort_by="persistence")
 675
 676    def mass_features_to_df(self):
 677        """Returns a pandas dataframe summarizing the mass features.
 678
 679        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
 680        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
 681
 682
 683        Returns
 684        --------
 685        pandas.DataFrame
 686            A pandas dataframe of mass features with the following columns:
 687            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
 688        """
 689
 690        def mass_spectrum_to_string(
 691            mass_spec, normalize=True, min_normalized_abun=0.01
 692        ):
 693            """Converts a mass spectrum to a string of m/z:abundance pairs.
 694
 695            Parameters
 696            -----------
 697            mass_spec : MassSpectrum
 698                A MassSpectrum object to be converted to a string.
 699            normalize : bool, optional
 700                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
 701            min_normalized_abun : float, optional
 702                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
 703
 704            Returns
 705            --------
 706            str
 707                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
 708            """
 709            mz_np = mass_spec.to_dataframe()["m/z"].values
 710            abun_np = mass_spec.to_dataframe()["Peak Height"].values
 711            if normalize:
 712                abun_np = abun_np / abun_np.max()
 713            mz_abun = np.column_stack((mz_np, abun_np))
 714            if normalize:
 715                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
 716            mz_abun_str = [
 717                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
 718                for mz, abun in mz_abun
 719            ]
 720            return "; ".join(mz_abun_str)
 721
 722        cols_in_df = [
 723            "id",
 724            "_apex_scan",
 725            "start_scan",
 726            "final_scan",
 727            "_retention_time",
 728            "_intensity",
 729            "_persistence",
 730            "_area",
 731            "_dispersity_index",
 732            "_tailing_factor",
 733            "monoisotopic_mf_id",
 734            "isotopologue_type",
 735            "mass_spectrum_deconvoluted_parent",
 736        ]
 737        df_mf_list = []
 738        for mf_id in self.mass_features.keys():
 739            # Find cols_in_df that are in single_mf
 740            df_keys = list(
 741                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
 742            )
 743            dict_mf = {}
 744            for key in df_keys:
 745                dict_mf[key] = getattr(self.mass_features[mf_id], key)
 746            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
 747                # Add MS2 spectra info
 748                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
 749                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
 750            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
 751                dict_mf["associated_mass_features"] = ", ".join(
 752                    map(
 753                        str,
 754                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
 755                    )
 756                )
 757            if self.mass_features[mf_id]._half_height_width is not None:
 758                dict_mf["half_height_width"] = self.mass_features[
 759                    mf_id
 760                ].half_height_width
 761            # Check if EIC for mass feature is set
 762            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
 763            df_mf_single["mz"] = self.mass_features[mf_id].mz
 764            df_mf_list.append(df_mf_single)
 765        df_mf = pd.concat(df_mf_list)
 766
 767        # rename _area to area and id to mf_id
 768        df_mf = df_mf.rename(
 769            columns={
 770                "_area": "area",
 771                "id": "mf_id",
 772                "_apex_scan": "apex_scan",
 773                "_retention_time": "scan_time",
 774                "_intensity": "intensity",
 775                "_persistence": "persistence",
 776                "_dispersity_index": "dispersity_index",
 777                "_tailing_factor": "tailing_factor",
 778            }
 779        )
 780
 781        # reorder columns
 782        col_order = [
 783            "mf_id",
 784            "scan_time",
 785            "mz",
 786            "apex_scan",
 787            "start_scan",
 788            "final_scan",
 789            "intensity",
 790            "persistence",
 791            "area",
 792            "half_height_width",
 793            "tailing_factor",
 794            "dispersity_index",
 795            "monoisotopic_mf_id",
 796            "isotopologue_type",
 797            "mass_spectrum_deconvoluted_parent",
 798            "associated_mass_features",
 799            "ms2_spectrum",
 800        ]
 801        # drop columns that are not in col_order
 802        cols_to_order = [col for col in col_order if col in df_mf.columns]
 803        df_mf = df_mf[cols_to_order]
 804
 805        # reset index to mf_id
 806        df_mf = df_mf.set_index("mf_id")
 807        df_mf.index.name = "mf_id"
 808
 809        return df_mf
 810
 811    def mass_features_ms1_annot_to_df(self):
 812        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
 813
 814        Returns
 815        --------
 816        pandas.DataFrame
 817            A pandas dataframe of MS1 annotations for the mass features in the dataset.
 818            The index is set to mf_id (mass feature ID)
 819
 820        Raises
 821        ------
 822        Warning
 823            If no MS1 annotations were found for the mass features in the dataset.
 824        """
 825        annot_df_list_ms1 = []
 826        for mf_id in self.mass_features.keys():
 827            if self.mass_features[mf_id].mass_spectrum is None:
 828                pass
 829            else:
 830                # Add ms1 annotations to ms1 annotation list
 831                if (
 832                    np.abs(
 833                        (
 834                            self.mass_features[mf_id].ms1_peak.mz_exp
 835                            - self.mass_features[mf_id].mz
 836                        )
 837                    )
 838                    < 0.01
 839                ):
 840                    # Get the molecular formula from the mass spectrum
 841                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
 842                    # Subset to pull out only the peak associated with the mass feature
 843                    annot_df = annot_df[
 844                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
 845                    ].copy()
 846
 847                    # Remove the index column and add column for mf_id
 848                    annot_df = annot_df.drop(columns=["Index"])
 849                    annot_df["mf_id"] = mf_id
 850                    annot_df_list_ms1.append(annot_df)
 851
 852        if len(annot_df_list_ms1) > 0:
 853            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
 854            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
 855            annot_ms1_df_full.index.name = "mf_id"
 856
 857        else:
 858            annot_ms1_df_full = None
 859            # Warn that no ms1 annotations were found
 860            warnings.warn(
 861                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
 862                UserWarning,
 863            )
 864
 865        return annot_ms1_df_full
 866
 867    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
 868        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
 869
 870        Parameters
 871        -----------
 872        molecular_metadata :  dict of MolecularMetadata objects
 873            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
 874
 875        Returns
 876        --------
 877        pandas.DataFrame
 878            A pandas dataframe of MS2 annotations for the mass features in the dataset,
 879            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
 880
 881        Raises
 882        ------
 883        Warning
 884            If no MS2 annotations were found for the mass features in the dataset.
 885        """
 886        annot_df_list_ms2 = []
 887        for mf_id in self.mass_features.keys():
 888            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
 889                # Add ms2 annotations to ms2 annotation list
 890                for result in self.mass_features[mf_id].ms2_similarity_results:
 891                    annot_df_ms2 = result.to_dataframe()
 892                    annot_df_ms2["mf_id"] = mf_id
 893                    annot_df_list_ms2.append(annot_df_ms2)
 894
 895        if len(annot_df_list_ms2) > 0:
 896            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
 897            if molecular_metadata is not None:
 898                molecular_metadata_df = pd.concat(
 899                    [
 900                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
 901                        for k, v in molecular_metadata.items()
 902                    ],
 903                    ignore_index=True,
 904                )
 905                molecular_metadata_df = molecular_metadata_df.rename(
 906                    columns={"id": "ref_mol_id"}
 907                )
 908                annot_ms2_df_full = annot_ms2_df_full.merge(
 909                    molecular_metadata_df, on="ref_mol_id", how="left"
 910                )
 911            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
 912                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
 913            ).copy()
 914            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
 915            annot_ms2_df_full.index.name = "mf_id"
 916        else:
 917            annot_ms2_df_full = None
 918            # Warn that no ms2 annotations were found
 919            warnings.warn(
 920                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
 921                UserWarning,
 922            )
 923
 924        return annot_ms2_df_full
 925
 926    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 927        """Returns a figure displaying 
 928            (1) thresholded, unprocessed data
 929            (2) the m/z features
 930            (3) which m/z features are associated with MS2 spectra
 931
 932        Parameters
 933        -----------
 934        binsize :  float
 935            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 936        mf_plot : boolean
 937            Indicates whether to plot the m/z features. Defaults to True.
 938        ms2_plot : boolean
 939            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 940        return_fig : boolean
 941            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 942
 943        Returns
 944        --------
 945        matplotlib.pyplot.Figure
 946            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 947            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 948            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 949            features with associated with MS2 spectra are plotted, they are displayed in red.
 950
 951        Raises
 952        ------
 953        Warning
 954            If m/z features are set to be plot but aren't in the dataset.
 955            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 956            were found for the m/z features in the dataset.
 957        """
 958        if mf_plot:
 959            # Check if mass_features is set, raise error if not
 960            if self.mass_features is None:
 961                raise ValueError(
 962                    "mass_features not set, must run find_mass_features() first"
 963                )
 964            ## call mass feature data
 965            mf_df = self.mass_features_to_df()
 966
 967        if ms2_plot:
 968            if not mf_plot:
 969                # Check if mass_features is set, raise error if not
 970                if self.mass_features is None:
 971                    raise ValueError(
 972                        "mass_features not set, must run find_mass_features() first"
 973                    )
 974
 975            ## call m/z feature data
 976            mf_df = self.mass_features_to_df()
 977
 978            # Check if ms2_spectrum is set, raise error if not
 979            if 'ms2_spectrum' not in mf_df.columns:
 980                raise ValueError(                
 981                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 982                )
 983    
 984        ## threshold and grid unprocessed data
 985        df = self._ms_unprocessed[1].copy()
 986        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 987        threshold = ph_int_min_thresh * df.intensity.max()
 988        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 989        df = self.grid_data(df_thres)
 990    
 991        ## format unprocessed data for plotting
 992        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 993        mz_grid = np.arange(0, np.max(df.mz), binsize)
 994        mz_data = np.array(df.mz)
 995        df['mz_bin'] = find_closest(mz_grid, mz_data)
 996        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 997        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 998
 999        ## generate figure
1000        fig = plt.figure()
1001        plt.scatter(
1002            unproc_df.scan_time,
1003            unproc_df.mz_bin*binsize,
1004            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1005            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1006            cmap = 'Greys_r',
1007            s = 1
1008        )
1009
1010        if mf_plot:
1011            if ms2_plot:
1012                plt.scatter(
1013                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1014                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1015                    c = 'c',
1016                    s = 4,
1017                    label = 'M/Z features without MS2'
1018                )
1019            else:
1020                plt.scatter(
1021                    mf_df.scan_time,
1022                    mf_df.mz,
1023                    c = 'c',
1024                    s = 4,
1025                    label = 'M/Z features'
1026                )
1027
1028        if ms2_plot: 
1029            plt.scatter(
1030                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1031                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1032                c = 'r',
1033                s = 2,
1034                label = 'M/Z features with MS2'
1035            )
1036
1037        if mf_plot == True or ms2_plot == True:
1038            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1039        plt.xlabel('Scan time')
1040        plt.ylabel('m/z')
1041        plt.ylim(0, np.ceil(np.max(df.mz)))
1042        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1043        plt.title('Composite Feature Map')
1044
1045        if return_fig:
1046            plt.close(fig)
1047            return fig
1048
1049        else:
1050            plt.show()
1051
1052    def __len__(self):
1053        """
1054        Returns the number of mass spectra in the dataset.
1055
1056        Returns
1057        --------
1058        int
1059            The number of mass spectra in the dataset.
1060        """
1061        return len(self._ms)
1062
1063    def __getitem__(self, scan_number):
1064        """
1065        Returns the mass spectrum corresponding to the specified scan number.
1066
1067        Parameters
1068        -----------
1069        scan_number : int
1070            The scan number of the desired mass spectrum.
1071
1072        Returns
1073        --------
1074        MassSpectrum
1075            The mass spectrum corresponding to the specified scan number.
1076        """
1077        return self._ms.get(scan_number)
1078
1079    def __iter__(self):
1080        """Returns an iterator over the mass spectra in the dataset.
1081
1082        Returns
1083        --------
1084        iterator
1085            An iterator over the mass spectra in the dataset.
1086        """
1087        return iter(self._ms.values())
1088
1089    def set_tic_list_from_data(self, overwrite=False):
1090        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1091
1092        Parameters
1093        -----------
1094        overwrite : bool, optional
1095            If True, overwrites the TIC list if it is already set. Defaults to False.
1096
1097        Notes
1098        -----
1099        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1100
1101        Raises
1102        ------
1103        ValueError
1104            If no mass spectra are found in the dataset.
1105            If the TIC list is already set and overwrite is False.
1106        """
1107        # Check if _ms is empty and raise error if so
1108        if len(self._ms) == 0:
1109            raise ValueError("No mass spectra found in dataset")
1110
1111        # Check if tic_list is already set and raise error if so
1112        if len(self.tic) > 0 and not overwrite:
1113            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1114
1115        self.tic = [self._ms.get(i).tic for i in self.scans_number]
1116
1117    def set_retention_time_from_data(self, overwrite=False):
1118        """Sets the retention time list from the data in the _ms dictionary.
1119
1120        Parameters
1121        -----------
1122        overwrite : bool, optional
1123            If True, overwrites the retention time list if it is already set. Defaults to False.
1124
1125        Notes
1126        -----
1127        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1128
1129        Raises
1130        ------
1131        ValueError
1132            If no mass spectra are found in the dataset.
1133            If the retention time list is already set and overwrite is False.
1134        """
1135        # Check if _ms is empty and raise error if so
1136        if len(self._ms) == 0:
1137            raise ValueError("No mass spectra found in dataset")
1138
1139        # Check if retention_time_list is already set and raise error if so
1140        if len(self.retention_time) > 0 and not overwrite:
1141            raise ValueError(
1142                "Retention time list already set, use overwrite=True to overwrite"
1143            )
1144
1145        retention_time_list = []
1146        for key_ms in sorted(self._ms.keys()):
1147            retention_time_list.append(self._ms.get(key_ms).retention_time)
1148        self.retention_time = retention_time_list
1149
1150    def set_scans_number_from_data(self, overwrite=False):
1151        """Sets the scan number list from the data in the _ms dictionary.
1152
1153        Notes
1154        -----
1155        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1156
1157        Raises
1158        ------
1159        ValueError
1160            If no mass spectra are found in the dataset.
1161            If the scan number list is already set and overwrite is False.
1162        """
1163        # Check if _ms is empty and raise error if so
1164        if len(self._ms) == 0:
1165            raise ValueError("No mass spectra found in dataset")
1166
1167        # Check if scans_number_list is already set and raise error if so
1168        if len(self.scans_number) > 0 and not overwrite:
1169            raise ValueError(
1170                "Scan number list already set, use overwrite=True to overwrite"
1171            )
1172
1173        self.scans_number = sorted(self._ms.keys())
1174
1175    @property
1176    def ms1_scans(self):
1177        """
1178        list : A list of MS1 scan numbers for the dataset.
1179        """
1180        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()
1181
1182    @property
1183    def parameters(self):
1184        """
1185        LCMSParameters : The parameters used for the LC-MS analysis.
1186        """
1187        return self._parameters
1188
1189    @parameters.setter
1190    def parameters(self, paramsinstance):
1191        """
1192        Sets the parameters used for the LC-MS analysis.
1193
1194        Parameters
1195        -----------
1196        paramsinstance : LCMSParameters
1197            The parameters used for the LC-MS analysis.
1198        """
1199        self._parameters = paramsinstance
1200
1201    @property
1202    def scans_number(self):
1203        """
1204        list : A list of scan numbers for the dataset.
1205        """
1206        return self._scans_number_list
1207
1208    @scans_number.setter
1209    def scans_number(self, scan_numbers_list):
1210        """
1211        Sets the scan numbers for the dataset.
1212
1213        Parameters
1214        -----------
1215        scan_numbers_list : list
1216            A list of scan numbers for the dataset.
1217        """
1218        self._scans_number_list = scan_numbers_list
1219
1220    @property
1221    def retention_time(self):
1222        """
1223        numpy.ndarray : An array of retention times for the dataset.
1224        """
1225        return self._retention_time_list
1226
1227    @retention_time.setter
1228    def retention_time(self, rt_list):
1229        """
1230        Sets the retention times for the dataset.
1231
1232        Parameters
1233        -----------
1234        rt_list : list
1235            A list of retention times for the dataset.
1236        """
1237        self._retention_time_list = np.array(rt_list)
1238
1239    @property
1240    def tic(self):
1241        """
1242        numpy.ndarray : An array of TIC values for the dataset.
1243        """
1244        return self._tic_list
1245
1246    @tic.setter
1247    def tic(self, tic_list):
1248        """
1249        Sets the TIC values for the dataset.
1250
1251        Parameters
1252        -----------
1253        tic_list : list
1254            A list of TIC values for the dataset.
1255        """
1256        self._tic_list = np.array(tic_list)
class MassSpectraBase:
 16class MassSpectraBase:
 17    """Base class for mass spectra objects.
 18
 19    Parameters
 20    -----------
 21    file_location : str or Path
 22        The location of the file containing the mass spectra data.
 23    analyzer : str, optional
 24        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 25    instrument_label : str, optional
 26        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 27    sample_name : str, optional
 28        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 29    spectra_parser : object, optional
 30        The spectra parser object used to create the mass spectra object. Defaults to None.
 31
 32    Attributes
 33    -----------
 34    spectra_parser_class : class
 35        The class of the spectra parser used to create the mass spectra object.
 36    file_location : str or Path
 37        The location of the file containing the mass spectra data.
 38    sample_name : str
 39        The name of the sample; defaults to the file name if not provided to the parser.
 40    analyzer : str
 41        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
 42    instrument_label : str
 43        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
 44    _scan_info : dict
 45        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
 46        scan text, and scan window (lower and upper).
 47        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
 48    _ms : dict
 49        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
 50    _ms_unprocessed: dictionary of pandas.DataFrames or None
        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
 52        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
 53
 54    Methods
 55    --------
 56    * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True).
        Add mass spectra (or a single mass spectrum) to _ms slot, from a list of scans
 58    * get_time_of_scan_id(scan).
 59        Returns the scan time for the specified scan number.
 60    """
 61
 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}
119
120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec
138
139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        using_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)
270
271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt
294
295    @property
296    def scan_df(self):
297        """
298        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
299        """
300        scan_df = pd.DataFrame.from_dict(self._scan_info)
301        return scan_df
302        
303    @property
304    def ms(self):
305        """
306        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
307        """
308        return self._ms
309
310    
311    @scan_df.setter
312    def scan_df(self, df):
313        """
314        Sets the scan data for the dataset.
315
316        Parameters
317        -----------
318        df : pandas.DataFrame
319            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
320            precursor m/z, scan text, and scan window (lower and upper).
321        """
322        self._scan_info = df.to_dict()
323
324    def __getitem__(self, scan_number):
325        return self._ms.get(scan_number)

Base class for mass spectra objects.

Parameters
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  • instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  • sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  • spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
  • spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
  • analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
  • instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
  • _scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
  • _ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
  • _ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
Methods
  • add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None). Add mass spectra (or a single mass spectrum) to the _ms dictionary, from a list of scans
  • get_time_of_scan_id(scan). Returns the scan time for the specified scan number.
MassSpectraBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None, spectra_parser=None)
 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}
file_location
analyzer
instrument_label
def add_mass_spectrum(self, mass_spec):
120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec

Adds a mass spectrum to the dataset.

Parameters
  • mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.
Notes

This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.

def add_mass_spectra( self, scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None):
139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        using_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)

Add mass spectra to _ms dictionary, from a list of scans or single scan

Notes

The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.

Parameters
  • scan_list (list of ints): List of scans to use to populate _ms slot
  • spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
  • ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
  • use_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
  • auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
  • ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.
Raises
  • TypeError: If scan_list is not a list of ints
  • ValueError: If polarity is not 'positive' or 'negative'; if ms_level is not 1 or 2
def get_time_of_scan_id(self, scan):
271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt

Returns the scan time for the specified scan number.

Parameters
  • scan (int): The scan number of the desired scan time.
Returns
  • float: The scan time for the specified scan number (in minutes).
Raises
  • ValueError: If no scan time is found for the specified scan number.
scan_df

pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).

ms

dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles

 328class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
 329    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
 330
 331    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
 332
 333    Parameters
 334    -----------
 335    file_location : str or Path
 336        The location of the file containing the mass spectra data.
 337    analyzer : str, optional
 338        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 339    instrument_label : str, optional
 340        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 341    sample_name : str, optional
 342        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 343    spectra_parser : object, optional
 344        The spectra parser object used to create the mass spectra object. Defaults to None.
 345
 346    Attributes
 347    -----------
 348    polarity : str
 349        The polarity of the ionization mode used for the dataset.
 350    _parameters : LCMSParameters
 351        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
 352    _retention_time_list : numpy.ndarray
 353        An array of retention times for the dataset.
 354    _scans_number_list : list
 355        A list of scan numbers for the dataset.
 356    _tic_list : numpy.ndarray
 357        An array of total ion current (TIC) values for the dataset.
 358    eics : dict
 359        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
 360        Key is the mz of the EIC. Initialized as an empty dictionary.
 361    mass_features : dictionary of LCMSMassFeature objects
 362        A dictionary containing mass features for the dataset.
 363        Key is mass feature ID. Initialized as an empty dictionary.
 364    spectral_search_results : dictionary of MS2SearchResults objects
 365        A dictionary containing spectral search results for the dataset.
 366        Key is scan number : precursor mz. Initialized as an empty dictionary.
 367
 368    Methods
 369    --------
 370    * get_parameters_json().
 371        Returns the parameters used for the LC-MS analysis in JSON format.
 372    * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 373        Adds which MS2 scans are associated with each mass feature to the
 374        mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
 375    * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 376        Adds the MS1 spectra associated with each mass feature to the
 377        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
 378    * mass_features_to_df()
 379        Returns a pandas dataframe summarizing the mass features in the dataset.
 380    * set_tic_list_from_data(overwrite=False)
 381        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
 382    * set_retention_time_from_data(overwrite=False)
 383        Sets the retention time list from the data in the _ms dictionary.
 384    * set_scans_number_from_data(overwrite=False)
 385        Sets the scan number list from the data in the _ms dictionary.
 386    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
 387        Generates plot of M/Z features comparing scan time vs M/Z value
 388    """
 389
 390    def __init__(
 391        self,
 392        file_location,
 393        analyzer="Unknown",
 394        instrument_label="Unknown",
 395        sample_name=None,
 396        spectra_parser=None,
 397    ):
 398        super().__init__(
 399            file_location, analyzer, instrument_label, sample_name, spectra_parser
 400        )
 401        self.polarity = ""
 402        self._parameters = LCMSParameters()
 403        self._retention_time_list = []
 404        self._scans_number_list = []
 405        self._tic_list = []
 406        self.eics = {}
 407        self.mass_features = {}
 408        self.spectral_search_results = {}
 409
 410    def get_parameters_json(self):
 411        """Returns the parameters stored for the LC-MS object in JSON format.
 412
 413        Returns
 414        --------
 415        str
 416            The parameters used for the LC-MS analysis in JSON format.
 417        """
 418        return self.parameters.to_json()
 419
 420    def remove_unprocessed_data(self, ms_level=None):
 421        """Removes the unprocessed data from the LCMSBase object.
 422
 423        Parameters
 424        -----------
 425        ms_level : int, optional
 426            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
 427
 428        Raises
 429        ------
 430        ValueError
 431            If ms_level is not 1 or 2.
 432
 433        Notes
 434        -----
 435        This method is useful for freeing up memory after the data has been processed.
 436        """
 437        if ms_level is None:
 438            for ms_level in self._ms_unprocessed.keys():
 439                self._ms_unprocessed[ms_level] = None
 440        if ms_level not in [1, 2]:
 441            raise ValueError("ms_level must be 1 or 2")
 442        self._ms_unprocessed[ms_level] = None
 443
 444    def add_associated_ms2_dda(
 445        self,
 446        auto_process=True,
 447        use_parser=True,
 448        spectrum_mode=None,
 449        ms_params_key="ms2",
 450        scan_filter=None,
 451    ):
 452        """Add MS2 spectra associated with mass features to the dataset.
 453
 454        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
 455
 456        Parameters
 457        -----------
 458        auto_process : bool, optional
 459            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
 460        use_parser : bool, optional
 461            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
 462        spectrum_mode : str or None, optional
 463            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 464            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 465            Defaults to None. (faster if defined, otherwise will check each scan)
 466        ms_params_key : string, optional
 467            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
 468            Defaults to 'ms2'.
 469        scan_filter : str
 470            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
 471            "hcd" will pull out only HCD scans.
 472
 473        Raises
 474        ------
 475        ValueError
 476            If mass_features is not set, must run find_mass_features() first.
 477            If no MS2 scans are found in the dataset.
 478            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
 479        """
 480        # Check if mass_features is set, raise error if not
 481        if self.mass_features is None:
 482            raise ValueError(
 483                "mass_features not set, must run find_mass_features() first"
 484            )
 485
 486        # reconfigure ms_params to get the correct mass spectrum parameters from the key
 487        ms_params = self.parameters.mass_spectrum[ms_params_key]
 488
 489        mf_df = self.mass_features_to_df().copy()
 490        # Find ms2 scans that have a precursor m/z value
 491        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
 492        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
 493        # drop ms2 scans that have no tic
 494        ms2_scans = ms2_scans[ms2_scans.tic > 0]
 495        if ms2_scans is None:
 496            raise ValueError("No DDA scans found in dataset")
 497
 498        if scan_filter is not None:
 499            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
 500        # set tolerance in rt space (in minutes) and mz space (in daltons)
 501        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
 502        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
 503
 504        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
 505        dda_scans = []
 506        for i, row in mf_df.iterrows():
 507            ms2_scans_filtered = ms2_scans[
 508                ms2_scans.scan_time.between(
 509                    row.scan_time - time_tol, row.scan_time + time_tol
 510                )
 511            ]
 512            ms2_scans_filtered = ms2_scans_filtered[
 513                ms2_scans_filtered.precursor_mz.between(
 514                    row.mz - mz_tol, row.mz + mz_tol
 515                )
 516            ]
 517            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
 518            self.mass_features[i].ms2_scan_numbers = (
 519                ms2_scans_filtered.scan.tolist()
 520                + self.mass_features[i].ms2_scan_numbers
 521            )
 522        # add to _ms attribute
 523        self.add_mass_spectra(
 524            scan_list=list(set(dda_scans)),
 525            auto_process=auto_process,
 526            spectrum_mode=spectrum_mode,
 527            use_parser=use_parser,
 528            ms_params=ms_params,
 529        )
 530        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
 531        for mf_id in self.mass_features:
 532            if self.mass_features[mf_id].ms2_scan_numbers is not None:
 533                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
 534                    if dda_scan in self._ms.keys():
 535                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
 536                            dda_scan
 537                        ]
 538
 539    def add_associated_ms1(
 540        self, auto_process=True, use_parser=True, spectrum_mode=None
 541    ):
 542        """Add MS1 spectra associated with mass features to the dataset.
 543
 544        Parameters
 545        -----------
 546        auto_process : bool, optional
 547            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
 548        use_parser : bool, optional
 549            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
 550        spectrum_mode : str or None, optional
 551            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 552            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 553            Defaults to None. (faster if defined, otherwise will check each scan)
 554
 555        Raises
 556        ------
 557        ValueError
 558            If mass_features is not set, must run find_mass_features() first.
 559            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
 560            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
 561            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
 562        """
 563        # Check if mass_features is set, raise error if not
 564        if self.mass_features is None:
 565            raise ValueError(
 566                "mass_features not set, must run find_mass_features() first"
 567            )
 568        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
 569
 570        if scans_to_average == 1:
 571            # Add to LCMSobj
 572            self.add_mass_spectra(
 573                scan_list=[
 574                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
 575                ],
 576                auto_process=auto_process,
 577                use_parser=use_parser,
 578                spectrum_mode=spectrum_mode,
 579                ms_params=self.parameters.mass_spectrum["ms1"],
 580            )
 581
 582        elif (
 583            (scans_to_average - 1) % 2
 584        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
 585            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
 586            # Check if all apex scans are profile mode, raise error if not
 587            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
 588                raise ValueError("All apex scans must be profile mode for averaging")
 589
 590            # First get sets of scans to average
 591            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
 592                ms1_idx_start = ms1_scans.index(apex_scan) - int(
 593                    (scans_to_average - 1) / 2
 594                )
 595                if ms1_idx_start < 0:
 596                    ms1_idx_start = 0
 597                ms1_idx_end = (
 598                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
 599                )
 600                if ms1_idx_end > (len(ms1_scans) - 1):
 601                    ms1_idx_end = len(ms1_scans) - 1
 602                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
 603                return scan_list
 604
 605            ms1_scans = self.ms1_scans
 606            scans_lists = [
 607                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
 608                for apex_scan in apex_scans
 609            ]
 610
 611            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 612            if self.polarity == "negative":
 613                polarity = -1
 614            elif self.polarity == "positive":
 615                polarity = 1
 616
 617            if not use_parser:
 618                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
 619                ms1_unprocessed = self._ms_unprocessed[1].copy()
 620                # Set the index on _ms_unprocessed[1] to scan number
 621                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
 622                self._ms_unprocessed[1] = ms1_unprocessed
 623
 624                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
 625                scans_lists_flat = list(
 626                    set([scan for sublist in scans_lists for scan in sublist])
 627                )
 628                if (
 629                    len(
 630                        np.setdiff1d(
 631                            np.sort(scans_lists_flat),
 632                            np.sort(ms1_unprocessed.index.values),
 633                        )
 634                    )
 635                    > 0
 636                ):
 637                    raise ValueError(
 638                        "Not all scans to average are present in the unprocessed data"
 639                    )
 640
 641            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
 642                # Get unprocessed mass spectrum from scans
 643                ms = self.get_average_mass_spectrum(
 644                    scan_list=scan_list_average,
 645                    apex_scan=apex_scan,
 646                    spectrum_mode="profile",
 647                    ms_level=1,
 648                    auto_process=auto_process,
 649                    use_parser=use_parser,
 650                    perform_checks=False,
 651                    polarity=polarity,
 652                    ms_params=self.parameters.mass_spectrum["ms1"],
 653                )
 654                # Add mass spectrum to LCMS object and associated with mass feature
 655                self.add_mass_spectrum(ms)
 656
 657            if not use_parser:
 658                # Reset the index on _ms_unprocessed[1] to not be scan number
 659                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
 660                self._ms_unprocessed[1] = ms1_unprocessed
 661        else:
 662            raise ValueError(
 663                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
 664            )
 665
 666        # Associate the ms1 spectra with the mass features
 667        for mf_id in self.mass_features:
 668            self.mass_features[mf_id].mass_spectrum = self._ms[
 669                self.mass_features[mf_id].apex_scan
 670            ]
 671            self.mass_features[mf_id].update_mz()
 672
 673        # Re-process clustering if persistent homology is selected to remove duplicate mass features after adding and processing MS1 spectra
 674        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
 675            self.cluster_mass_features(drop_children=True, sort_by="persistence")
 676
 677    def mass_features_to_df(self):
 678        """Returns a pandas dataframe summarizing the mass features.
 679
 680        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
 681        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
 682
 683
 684        Returns
 685        --------
 686        pandas.DataFrame
 687            A pandas dataframe of mass features with the following columns:
 688            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
 689        """
 690
 691        def mass_spectrum_to_string(
 692            mass_spec, normalize=True, min_normalized_abun=0.01
 693        ):
 694            """Converts a mass spectrum to a string of m/z:abundance pairs.
 695
 696            Parameters
 697            -----------
 698            mass_spec : MassSpectrum
 699                A MassSpectrum object to be converted to a string.
 700            normalize : bool, optional
 701                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
 702            min_normalized_abun : float, optional
 703                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
 704
 705            Returns
 706            --------
 707            str
 708                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
 709            """
 710            mz_np = mass_spec.to_dataframe()["m/z"].values
 711            abun_np = mass_spec.to_dataframe()["Peak Height"].values
 712            if normalize:
 713                abun_np = abun_np / abun_np.max()
 714            mz_abun = np.column_stack((mz_np, abun_np))
 715            if normalize:
 716                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
 717            mz_abun_str = [
 718                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
 719                for mz, abun in mz_abun
 720            ]
 721            return "; ".join(mz_abun_str)
 722
 723        cols_in_df = [
 724            "id",
 725            "_apex_scan",
 726            "start_scan",
 727            "final_scan",
 728            "_retention_time",
 729            "_intensity",
 730            "_persistence",
 731            "_area",
 732            "_dispersity_index",
 733            "_tailing_factor",
 734            "monoisotopic_mf_id",
 735            "isotopologue_type",
 736            "mass_spectrum_deconvoluted_parent",
 737        ]
 738        df_mf_list = []
 739        for mf_id in self.mass_features.keys():
 740            # Find cols_in_df that are in single_mf
 741            df_keys = list(
 742                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
 743            )
 744            dict_mf = {}
 745            for key in df_keys:
 746                dict_mf[key] = getattr(self.mass_features[mf_id], key)
 747            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
 748                # Add MS2 spectra info
 749                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
 750                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
 751            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
 752                dict_mf["associated_mass_features"] = ", ".join(
 753                    map(
 754                        str,
 755                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
 756                    )
 757                )
 758            if self.mass_features[mf_id]._half_height_width is not None:
 759                dict_mf["half_height_width"] = self.mass_features[
 760                    mf_id
 761                ].half_height_width
 762            # Check if EIC for mass feature is set
 763            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
 764            df_mf_single["mz"] = self.mass_features[mf_id].mz
 765            df_mf_list.append(df_mf_single)
 766        df_mf = pd.concat(df_mf_list)
 767
 768        # rename _area to area and id to mf_id
 769        df_mf = df_mf.rename(
 770            columns={
 771                "_area": "area",
 772                "id": "mf_id",
 773                "_apex_scan": "apex_scan",
 774                "_retention_time": "scan_time",
 775                "_intensity": "intensity",
 776                "_persistence": "persistence",
 777                "_dispersity_index": "dispersity_index",
 778                "_tailing_factor": "tailing_factor",
 779            }
 780        )
 781
 782        # reorder columns
 783        col_order = [
 784            "mf_id",
 785            "scan_time",
 786            "mz",
 787            "apex_scan",
 788            "start_scan",
 789            "final_scan",
 790            "intensity",
 791            "persistence",
 792            "area",
 793            "half_height_width",
 794            "tailing_factor",
 795            "dispersity_index",
 796            "monoisotopic_mf_id",
 797            "isotopologue_type",
 798            "mass_spectrum_deconvoluted_parent",
 799            "associated_mass_features",
 800            "ms2_spectrum",
 801        ]
 802        # drop columns that are not in col_order
 803        cols_to_order = [col for col in col_order if col in df_mf.columns]
 804        df_mf = df_mf[cols_to_order]
 805
 806        # reset index to mf_id
 807        df_mf = df_mf.set_index("mf_id")
 808        df_mf.index.name = "mf_id"
 809
 810        return df_mf
 811
 812    def mass_features_ms1_annot_to_df(self):
 813        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
 814
 815        Returns
 816        --------
 817        pandas.DataFrame
 818            A pandas dataframe of MS1 annotations for the mass features in the dataset.
 819            The index is set to mf_id (mass feature ID)
 820
 821        Raises
 822        ------
 823        Warning
 824            If no MS1 annotations were found for the mass features in the dataset.
 825        """
 826        annot_df_list_ms1 = []
 827        for mf_id in self.mass_features.keys():
 828            if self.mass_features[mf_id].mass_spectrum is None:
 829                pass
 830            else:
 831                # Add ms1 annotations to ms1 annotation list
 832                if (
 833                    np.abs(
 834                        (
 835                            self.mass_features[mf_id].ms1_peak.mz_exp
 836                            - self.mass_features[mf_id].mz
 837                        )
 838                    )
 839                    < 0.01
 840                ):
 841                    # Get the molecular formula from the mass spectrum
 842                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
 843                    # Subset to pull out only the peak associated with the mass feature
 844                    annot_df = annot_df[
 845                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
 846                    ].copy()
 847
 848                    # Remove the index column and add column for mf_id
 849                    annot_df = annot_df.drop(columns=["Index"])
 850                    annot_df["mf_id"] = mf_id
 851                    annot_df_list_ms1.append(annot_df)
 852
 853        if len(annot_df_list_ms1) > 0:
 854            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
 855            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
 856            annot_ms1_df_full.index.name = "mf_id"
 857
 858        else:
 859            annot_ms1_df_full = None
 860            # Warn that no ms1 annotations were found
 861            warnings.warn(
 862                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
 863                UserWarning,
 864            )
 865
 866        return annot_ms1_df_full
 867
 868    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
 869        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
 870
 871        Parameters
 872        -----------
 873        molecular_metadata :  dict of MolecularMetadata objects
 874            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
 875
 876        Returns
 877        --------
 878        pandas.DataFrame
 879            A pandas dataframe of MS2 annotations for the mass features in the dataset,
 880            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
 881
 882        Raises
 883        ------
 884        Warning
 885            If no MS2 annotations were found for the mass features in the dataset.
 886        """
 887        annot_df_list_ms2 = []
 888        for mf_id in self.mass_features.keys():
 889            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
 890                # Add ms2 annotations to ms2 annotation list
 891                for result in self.mass_features[mf_id].ms2_similarity_results:
 892                    annot_df_ms2 = result.to_dataframe()
 893                    annot_df_ms2["mf_id"] = mf_id
 894                    annot_df_list_ms2.append(annot_df_ms2)
 895
 896        if len(annot_df_list_ms2) > 0:
 897            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
 898            if molecular_metadata is not None:
 899                molecular_metadata_df = pd.concat(
 900                    [
 901                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
 902                        for k, v in molecular_metadata.items()
 903                    ],
 904                    ignore_index=True,
 905                )
 906                molecular_metadata_df = molecular_metadata_df.rename(
 907                    columns={"id": "ref_mol_id"}
 908                )
 909                annot_ms2_df_full = annot_ms2_df_full.merge(
 910                    molecular_metadata_df, on="ref_mol_id", how="left"
 911                )
 912            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
 913                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
 914            ).copy()
 915            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
 916            annot_ms2_df_full.index.name = "mf_id"
 917        else:
 918            annot_ms2_df_full = None
 919            # Warn that no ms2 annotations were found
 920            warnings.warn(
 921                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
 922                UserWarning,
 923            )
 924
 925        return annot_ms2_df_full
 926
 927    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 928        """Returns a figure displaying 
 929            (1) thresholded, unprocessed data
 930            (2) the m/z features
 931            (3) which m/z features are associated with MS2 spectra
 932
 933        Parameters
 934        -----------
 935        binsize :  float
 936            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 937        mf_plot : boolean
 938            Indicates whether to plot the m/z features. Defaults to True.
 939        ms2_plot : boolean
 940            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 941        return_fig : boolean
 942            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 943
 944        Returns
 945        --------
 946        matplotlib.pyplot.Figure
 947            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 948            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 949            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 950            features with associated with MS2 spectra are plotted, they are displayed in red.
 951
 952        Raises
 953        ------
 954        Warning
 955            If m/z features are set to be plot but aren't in the dataset.
 956            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 957            were found for the m/z features in the dataset.
 958        """
 959        if mf_plot:
 960            # Check if mass_features is set, raise error if not
 961            if self.mass_features is None:
 962                raise ValueError(
 963                    "mass_features not set, must run find_mass_features() first"
 964                )
 965            ## call mass feature data
 966            mf_df = self.mass_features_to_df()
 967
 968        if ms2_plot:
 969            if not mf_plot:
 970                # Check if mass_features is set, raise error if not
 971                if self.mass_features is None:
 972                    raise ValueError(
 973                        "mass_features not set, must run find_mass_features() first"
 974                    )
 975
 976            ## call m/z feature data
 977            mf_df = self.mass_features_to_df()
 978
 979            # Check if ms2_spectrum is set, raise error if not
 980            if 'ms2_spectrum' not in mf_df.columns:
 981                raise ValueError(                
 982                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 983                )
 984    
 985        ## threshold and grid unprocessed data
 986        df = self._ms_unprocessed[1].copy()
 987        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 988        threshold = ph_int_min_thresh * df.intensity.max()
 989        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 990        df = self.grid_data(df_thres)
 991    
 992        ## format unprocessed data for plotting
 993        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 994        mz_grid = np.arange(0, np.max(df.mz), binsize)
 995        mz_data = np.array(df.mz)
 996        df['mz_bin'] = find_closest(mz_grid, mz_data)
 997        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 998        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 999
1000        ## generate figure
1001        fig = plt.figure()
1002        plt.scatter(
1003            unproc_df.scan_time,
1004            unproc_df.mz_bin*binsize,
1005            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1006            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1007            cmap = 'Greys_r',
1008            s = 1
1009        )
1010
1011        if mf_plot:
1012            if ms2_plot:
1013                plt.scatter(
1014                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1015                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1016                    c = 'c',
1017                    s = 4,
1018                    label = 'M/Z features without MS2'
1019                )
1020            else:
1021                plt.scatter(
1022                    mf_df.scan_time,
1023                    mf_df.mz,
1024                    c = 'c',
1025                    s = 4,
1026                    label = 'M/Z features'
1027                )
1028
1029        if ms2_plot: 
1030            plt.scatter(
1031                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1032                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1033                c = 'r',
1034                s = 2,
1035                label = 'M/Z features with MS2'
1036            )
1037
1038        if mf_plot == True or ms2_plot == True:
1039            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1040        plt.xlabel('Scan time')
1041        plt.ylabel('m/z')
1042        plt.ylim(0, np.ceil(np.max(df.mz)))
1043        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1044        plt.title('Composite Feature Map')
1045
1046        if return_fig:
1047            plt.close(fig)
1048            return fig
1049
1050        else:
1051            plt.show()
1052
1053    def __len__(self):
1054        """
1055        Returns the number of mass spectra in the dataset.
1056
1057        Returns
1058        --------
1059        int
1060            The number of mass spectra in the dataset.
1061        """
1062        return len(self._ms)
1063
1064    def __getitem__(self, scan_number):
1065        """
1066        Returns the mass spectrum corresponding to the specified scan number.
1067
1068        Parameters
1069        -----------
1070        scan_number : int
1071            The scan number of the desired mass spectrum.
1072
1073        Returns
1074        --------
1075        MassSpectrum
1076            The mass spectrum corresponding to the specified scan number.
1077        """
1078        return self._ms.get(scan_number)
1079
1080    def __iter__(self):
1081        """Returns an iterator over the mass spectra in the dataset.
1082
1083        Returns
1084        --------
1085        iterator
1086            An iterator over the mass spectra in the dataset.
1087        """
1088        return iter(self._ms.values())
1089
1090    def set_tic_list_from_data(self, overwrite=False):
1091        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1092
1093        Parameters
1094        -----------
1095        overwrite : bool, optional
1096            If True, overwrites the TIC list if it is already set. Defaults to False.
1097
1098        Notes
1099        -----
1100        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1101
1102        Raises
1103        ------
1104        ValueError
1105            If no mass spectra are found in the dataset.
1106            If the TIC list is already set and overwrite is False.
1107        """
1108        # Check if _ms is empty and raise error if so
1109        if len(self._ms) == 0:
1110            raise ValueError("No mass spectra found in dataset")
1111
1112        # Check if tic_list is already set and raise error if so
1113        if len(self.tic) > 0 and not overwrite:
1114            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1115
1116        self.tic = [self._ms.get(i).tic for i in self.scans_number]
1117
1118    def set_retention_time_from_data(self, overwrite=False):
1119        """Sets the retention time list from the data in the _ms dictionary.
1120
1121        Parameters
1122        -----------
1123        overwrite : bool, optional
1124            If True, overwrites the retention time list if it is already set. Defaults to False.
1125
1126        Notes
1127        -----
1128        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1129
1130        Raises
1131        ------
1132        ValueError
1133            If no mass spectra are found in the dataset.
1134            If the retention time list is already set and overwrite is False.
1135        """
1136        # Check if _ms is empty and raise error if so
1137        if len(self._ms) == 0:
1138            raise ValueError("No mass spectra found in dataset")
1139
1140        # Check if retention_time_list is already set and raise error if so
1141        if len(self.retention_time) > 0 and not overwrite:
1142            raise ValueError(
1143                "Retention time list already set, use overwrite=True to overwrite"
1144            )
1145
1146        retention_time_list = []
1147        for key_ms in sorted(self._ms.keys()):
1148            retention_time_list.append(self._ms.get(key_ms).retention_time)
1149        self.retention_time = retention_time_list
1150
1151    def set_scans_number_from_data(self, overwrite=False):
1152        """Sets the scan number list from the data in the _ms dictionary.
1153
1154        Notes
1155        -----
1156        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1157
1158        Raises
1159        ------
1160        ValueError
1161            If no mass spectra are found in the dataset.
1162            If the scan number list is already set and overwrite is False.
1163        """
1164        # Check if _ms is empty and raise error if so
1165        if len(self._ms) == 0:
1166            raise ValueError("No mass spectra found in dataset")
1167
1168        # Check if scans_number_list is already set and raise error if so
1169        if len(self.scans_number) > 0 and not overwrite:
1170            raise ValueError(
1171                "Scan number list already set, use overwrite=True to overwrite"
1172            )
1173
1174        self.scans_number = sorted(self._ms.keys())
1175
1176    @property
1177    def ms1_scans(self):
1178        """
1179        list : A list of MS1 scan numbers for the dataset.
1180        """
1181        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()
1182
1183    @property
1184    def parameters(self):
1185        """
1186        LCMSParameters : The parameters used for the LC-MS analysis.
1187        """
1188        return self._parameters
1189
1190    @parameters.setter
1191    def parameters(self, paramsinstance):
1192        """
1193        Sets the parameters used for the LC-MS analysis.
1194
1195        Parameters
1196        -----------
1197        paramsinstance : LCMSParameters
1198            The parameters used for the LC-MS analysis.
1199        """
1200        self._parameters = paramsinstance
1201
1202    @property
1203    def scans_number(self):
1204        """
1205        list : A list of scan numbers for the dataset.
1206        """
1207        return self._scans_number_list
1208
1209    @scans_number.setter
1210    def scans_number(self, scan_numbers_list):
1211        """
1212        Sets the scan numbers for the dataset.
1213
1214        Parameters
1215        -----------
1216        scan_numbers_list : list
1217            A list of scan numbers for the dataset.
1218        """
1219        self._scans_number_list = scan_numbers_list
1220
1221    @property
1222    def retention_time(self):
1223        """
1224        numpy.ndarray : An array of retention times for the dataset.
1225        """
1226        return self._retention_time_list
1227
1228    @retention_time.setter
1229    def retention_time(self, rt_list):
1230        """
1231        Sets the retention times for the dataset.
1232
1233        Parameters
1234        -----------
1235        rt_list : list
1236            A list of retention times for the dataset.
1237        """
1238        self._retention_time_list = np.array(rt_list)
1239
1240    @property
1241    def tic(self):
1242        """
1243        numpy.ndarray : An array of TIC values for the dataset.
1244        """
1245        return self._tic_list
1246
1247    @tic.setter
1248    def tic(self, tic_list):
1249        """
1250        Sets the TIC values for the dataset.
1251
1252        Parameters
1253        -----------
1254        tic_list : list
1255            A list of TIC values for the dataset.
1256        """
1257        self._tic_list = np.array(tic_list)

A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.

This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.

Parameters
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  • instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  • sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  • spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
  • polarity (str): The polarity of the ionization mode used for the dataset.
  • _parameters (LCMSParameters): The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
  • _retention_time_list (numpy.ndarray): An array of retention times for the dataset.
  • _scans_number_list (list): A list of scan numbers for the dataset.
  • _tic_list (numpy.ndarray): An array of total ion current (TIC) values for the dataset.
  • eics (dict): A dictionary containing extracted ion chromatograms (EICs) for the dataset. Key is the mz of the EIC. Initialized as an empty dictionary.
  • mass_features (dictionary of LCMSMassFeature objects): A dictionary containing mass features for the dataset. Key is mass feature ID. Initialized as an empty dictionary.
  • spectral_search_results (dictionary of MS2SearchResults objects): A dictionary containing spectral search results for the dataset. Key is scan number : precursor mz. Initialized as an empty dictionary.
Methods
  • get_parameters_json() Returns the parameters used for the LC-MS analysis in JSON format.
  • add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None) Adds which MS2 scans are associated with each mass feature to the mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
  • add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) Adds the MS1 spectra associated with each mass feature to the mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
  • mass_features_to_df() Returns a pandas dataframe summarizing the mass features in the dataset.
  • set_tic_list_from_data(overwrite=False) Sets the TIC list from the mass spectrum objects within the _ms dictionary.
  • set_retention_time_from_data(overwrite=False) Sets the retention time list from the data in the _ms dictionary.
  • set_scans_number_from_data(overwrite=False) Sets the scan number list from the data in the _ms dictionary.
  • plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) Generates plot of M/Z features comparing scan time vs M/Z value
LCMSBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None, spectra_parser=None)
390    def __init__(
391        self,
392        file_location,
393        analyzer="Unknown",
394        instrument_label="Unknown",
395        sample_name=None,
396        spectra_parser=None,
397    ):
398        super().__init__(
399            file_location, analyzer, instrument_label, sample_name, spectra_parser
400        )
401        self.polarity = ""
402        self._parameters = LCMSParameters()
403        self._retention_time_list = []
404        self._scans_number_list = []
405        self._tic_list = []
406        self.eics = {}
407        self.mass_features = {}
408        self.spectral_search_results = {}
polarity
eics
mass_features
spectral_search_results
def get_parameters_json(self):
410    def get_parameters_json(self):
411        """Returns the parameters stored for the LC-MS object in JSON format.
412
413        Returns
414        --------
415        str
416            The parameters used for the LC-MS analysis in JSON format.
417        """
418        return self.parameters.to_json()

Returns the parameters stored for the LC-MS object in JSON format.

Returns
  • str: The parameters used for the LC-MS analysis in JSON format.
def remove_unprocessed_data(self, ms_level=None):
420    def remove_unprocessed_data(self, ms_level=None):
421        """Removes the unprocessed data from the LCMSBase object.
422
423        Parameters
424        -----------
425        ms_level : int, optional
426            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
427
428        Raises
429        ------
430        ValueError
431            If ms_level is not 1 or 2.
432
433        Notes
434        -----
435        This method is useful for freeing up memory after the data has been processed.
436        """
437        if ms_level is None:
438            for ms_level in self._ms_unprocessed.keys():
439                self._ms_unprocessed[ms_level] = None
440        if ms_level not in [1, 2]:
441            raise ValueError("ms_level must be 1 or 2")
442        self._ms_unprocessed[ms_level] = None

Removes the unprocessed data from the LCMSBase object.

Parameters
  • ms_level (int, optional): The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
Raises
  • ValueError: If ms_level is not 1 or 2.
Notes

This method is useful for freeing up memory after the data has been processed.

def add_associated_ms2_dda( self, auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None):
444    def add_associated_ms2_dda(
445        self,
446        auto_process=True,
447        use_parser=True,
448        spectrum_mode=None,
449        ms_params_key="ms2",
450        scan_filter=None,
451    ):
452        """Add MS2 spectra associated with mass features to the dataset.
453
454        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
455
456        Parameters
457        -----------
458        auto_process : bool, optional
459            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
460        use_parser : bool, optional
461            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
462        spectrum_mode : str or None, optional
463            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
464            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
465            Defaults to None. (faster if defined, otherwise will check each scan)
466        ms_params_key : string, optional
467            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
468            Defaults to 'ms2'.
469        scan_filter : str
470            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
471            "hcd" will pull out only HCD scans.
472
473        Raises
474        ------
475        ValueError
476            If mass_features is not set, must run find_mass_features() first.
477            If no MS2 scans are found in the dataset.
478            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
479        """
480        # Check if mass_features is set, raise error if not
481        if self.mass_features is None:
482            raise ValueError(
483                "mass_features not set, must run find_mass_features() first"
484            )
485
486        # reconfigure ms_params to get the correct mass spectrum parameters from the key
487        ms_params = self.parameters.mass_spectrum[ms_params_key]
488
489        mf_df = self.mass_features_to_df().copy()
490        # Find ms2 scans that have a precursor m/z value
491        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
492        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
493        # drop ms2 scans that have no tic
494        ms2_scans = ms2_scans[ms2_scans.tic > 0]
495        if ms2_scans is None:
496            raise ValueError("No DDA scans found in dataset")
497
498        if scan_filter is not None:
499            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
500        # set tolerance in rt space (in minutes) and mz space (in daltons)
501        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
502        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
503
504        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
505        dda_scans = []
506        for i, row in mf_df.iterrows():
507            ms2_scans_filtered = ms2_scans[
508                ms2_scans.scan_time.between(
509                    row.scan_time - time_tol, row.scan_time + time_tol
510                )
511            ]
512            ms2_scans_filtered = ms2_scans_filtered[
513                ms2_scans_filtered.precursor_mz.between(
514                    row.mz - mz_tol, row.mz + mz_tol
515                )
516            ]
517            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
518            self.mass_features[i].ms2_scan_numbers = (
519                ms2_scans_filtered.scan.tolist()
520                + self.mass_features[i].ms2_scan_numbers
521            )
522        # add to _ms attribute
523        self.add_mass_spectra(
524            scan_list=list(set(dda_scans)),
525            auto_process=auto_process,
526            spectrum_mode=spectrum_mode,
527            use_parser=use_parser,
528            ms_params=ms_params,
529        )
530        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
531        for mf_id in self.mass_features:
532            if self.mass_features[mf_id].ms2_scan_numbers is not None:
533                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
534                    if dda_scan in self._ms.keys():
535                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
536                            dda_scan
537                        ]

Add MS2 spectra associated with mass features to the dataset.

Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)

Parameters
  • auto_process (bool, optional): If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
  • use_parser (bool, optional): If True, invoke the spectra parser to get the MS2 spectra. Default is True.
  • spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
  • ms_params_key (string, optional): The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
  • scan_filter (str): A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. "hcd" will pull out only HCD scans.
Raises
  • ValueError: If mass_features is not set, must run find_mass_features() first. If no MS2 scans are found in the dataset. If no precursor m/z values are found in MS2 scans, not a DDA dataset.
def add_associated_ms1(self, auto_process=True, use_parser=True, spectrum_mode=None):
539    def add_associated_ms1(
540        self, auto_process=True, use_parser=True, spectrum_mode=None
541    ):
542        """Add MS1 spectra associated with mass features to the dataset.
543
544        Parameters
545        -----------
546        auto_process : bool, optional
547            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
548        use_parser : bool, optional
549            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
550        spectrum_mode : str or None, optional
551            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
552            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
553            Defaults to None. (faster if defined, otherwise will check each scan)
554
555        Raises
556        ------
557        ValueError
558            If mass_features is not set, must run find_mass_features() first.
559            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
560            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
561            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
562        """
563        # Check if mass_features is set, raise error if not
564        if self.mass_features is None:
565            raise ValueError(
566                "mass_features not set, must run find_mass_features() first"
567            )
568        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
569
570        if scans_to_average == 1:
571            # Add to LCMSobj
572            self.add_mass_spectra(
573                scan_list=[
574                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
575                ],
576                auto_process=auto_process,
577                use_parser=use_parser,
578                spectrum_mode=spectrum_mode,
579                ms_params=self.parameters.mass_spectrum["ms1"],
580            )
581
582        elif (
583            (scans_to_average - 1) % 2
584        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
585            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
586            # Check if all apex scans are profile mode, raise error if not
587            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
588                raise ValueError("All apex scans must be profile mode for averaging")
589
590            # First get sets of scans to average
591            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
592                ms1_idx_start = ms1_scans.index(apex_scan) - int(
593                    (scans_to_average - 1) / 2
594                )
595                if ms1_idx_start < 0:
596                    ms1_idx_start = 0
597                ms1_idx_end = (
598                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
599                )
600                if ms1_idx_end > (len(ms1_scans) - 1):
601                    ms1_idx_end = len(ms1_scans) - 1
602                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
603                return scan_list
604
605            ms1_scans = self.ms1_scans
606            scans_lists = [
607                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
608                for apex_scan in apex_scans
609            ]
610
611            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
612            if self.polarity == "negative":
613                polarity = -1
614            elif self.polarity == "positive":
615                polarity = 1
616
617            if not use_parser:
618                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
619                ms1_unprocessed = self._ms_unprocessed[1].copy()
620                # Set the index on _ms_unprocessed[1] to scan number
621                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
622                self._ms_unprocessed[1] = ms1_unprocessed
623
624                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
625                scans_lists_flat = list(
626                    set([scan for sublist in scans_lists for scan in sublist])
627                )
628                if (
629                    len(
630                        np.setdiff1d(
631                            np.sort(scans_lists_flat),
632                            np.sort(ms1_unprocessed.index.values),
633                        )
634                    )
635                    > 0
636                ):
637                    raise ValueError(
638                        "Not all scans to average are present in the unprocessed data"
639                    )
640
641            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
642                # Get unprocessed mass spectrum from scans
643                ms = self.get_average_mass_spectrum(
644                    scan_list=scan_list_average,
645                    apex_scan=apex_scan,
646                    spectrum_mode="profile",
647                    ms_level=1,
648                    auto_process=auto_process,
649                    use_parser=use_parser,
650                    perform_checks=False,
651                    polarity=polarity,
652                    ms_params=self.parameters.mass_spectrum["ms1"],
653                )
654                # Add mass spectrum to LCMS object and associated with mass feature
655                self.add_mass_spectrum(ms)
656
657            if not use_parser:
658                # Reset the index on _ms_unprocessed[1] to not be scan number
659                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
660                self._ms_unprocessed[1] = ms1_unprocessed
661        else:
662            raise ValueError(
663                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
664            )
665
666        # Associate the ms1 spectra with the mass features
667        for mf_id in self.mass_features:
668            self.mass_features[mf_id].mass_spectrum = self._ms[
669                self.mass_features[mf_id].apex_scan
670            ]
671            self.mass_features[mf_id].update_mz()
672
673        # Re-process clustering if persistent homology is selected to remove duplicate mass features after adding and processing MS1 spectra
674        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
675            self.cluster_mass_features(drop_children=True, sort_by="persistence")

Add MS1 spectra associated with mass features to the dataset.

Parameters
  • auto_process (bool, optional): If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
  • use_parser (bool, optional): If True, invoke the spectra parser to get the MS1 spectra. Default is True.
  • spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
Raises
  • ValueError: If mass_features is not set, must run find_mass_features() first. If apex scans are not profile mode, all apex scans must be profile mode for averaging. If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
def mass_features_to_df(self):
677    def mass_features_to_df(self):
678        """Returns a pandas dataframe summarizing the mass features.
679
680        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
681        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
682
683
684        Returns
685        --------
686        pandas.DataFrame
687            A pandas dataframe of mass features with the following columns:
688            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
689        """
690
691        def mass_spectrum_to_string(
692            mass_spec, normalize=True, min_normalized_abun=0.01
693        ):
694            """Converts a mass spectrum to a string of m/z:abundance pairs.
695
696            Parameters
697            -----------
698            mass_spec : MassSpectrum
699                A MassSpectrum object to be converted to a string.
700            normalize : bool, optional
701                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
702            min_normalized_abun : float, optional
703                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
704
705            Returns
706            --------
707            str
708                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
709            """
710            mz_np = mass_spec.to_dataframe()["m/z"].values
711            abun_np = mass_spec.to_dataframe()["Peak Height"].values
712            if normalize:
713                abun_np = abun_np / abun_np.max()
714            mz_abun = np.column_stack((mz_np, abun_np))
715            if normalize:
716                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
717            mz_abun_str = [
718                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
719                for mz, abun in mz_abun
720            ]
721            return "; ".join(mz_abun_str)
722
723        cols_in_df = [
724            "id",
725            "_apex_scan",
726            "start_scan",
727            "final_scan",
728            "_retention_time",
729            "_intensity",
730            "_persistence",
731            "_area",
732            "_dispersity_index",
733            "_tailing_factor",
734            "monoisotopic_mf_id",
735            "isotopologue_type",
736            "mass_spectrum_deconvoluted_parent",
737        ]
738        df_mf_list = []
739        for mf_id in self.mass_features.keys():
740            # Find cols_in_df that are in single_mf
741            df_keys = list(
742                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
743            )
744            dict_mf = {}
745            for key in df_keys:
746                dict_mf[key] = getattr(self.mass_features[mf_id], key)
747            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
748                # Add MS2 spectra info
749                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
750                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
751            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
752                dict_mf["associated_mass_features"] = ", ".join(
753                    map(
754                        str,
755                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
756                    )
757                )
758            if self.mass_features[mf_id]._half_height_width is not None:
759                dict_mf["half_height_width"] = self.mass_features[
760                    mf_id
761                ].half_height_width
762            # Check if EIC for mass feature is set
763            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
764            df_mf_single["mz"] = self.mass_features[mf_id].mz
765            df_mf_list.append(df_mf_single)
766        df_mf = pd.concat(df_mf_list)
767
768        # rename _area to area and id to mf_id
769        df_mf = df_mf.rename(
770            columns={
771                "_area": "area",
772                "id": "mf_id",
773                "_apex_scan": "apex_scan",
774                "_retention_time": "scan_time",
775                "_intensity": "intensity",
776                "_persistence": "persistence",
777                "_dispersity_index": "dispersity_index",
778                "_tailing_factor": "tailing_factor",
779            }
780        )
781
782        # reorder columns
783        col_order = [
784            "mf_id",
785            "scan_time",
786            "mz",
787            "apex_scan",
788            "start_scan",
789            "final_scan",
790            "intensity",
791            "persistence",
792            "area",
793            "half_height_width",
794            "tailing_factor",
795            "dispersity_index",
796            "monoisotopic_mf_id",
797            "isotopologue_type",
798            "mass_spectrum_deconvoluted_parent",
799            "associated_mass_features",
800            "ms2_spectrum",
801        ]
802        # drop columns that are not in col_order
803        cols_to_order = [col for col in col_order if col in df_mf.columns]
804        df_mf = df_mf[cols_to_order]
805
806        # reset index to mf_id
807        df_mf = df_mf.set_index("mf_id")
808        df_mf.index.name = "mf_id"
809
810        return df_mf

Returns a pandas dataframe summarizing the mass features.

The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID).

Returns
  • pandas.DataFrame: A pandas dataframe of mass features with the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
def mass_features_ms1_annot_to_df(self):
812    def mass_features_ms1_annot_to_df(self):
813        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
814
815        Returns
816        --------
817        pandas.DataFrame
818            A pandas dataframe of MS1 annotations for the mass features in the dataset.
819            The index is set to mf_id (mass feature ID)
820
821        Raises
822        ------
823        Warning
824            If no MS1 annotations were found for the mass features in the dataset.
825        """
826        annot_df_list_ms1 = []
827        for mf_id in self.mass_features.keys():
828            if self.mass_features[mf_id].mass_spectrum is None:
829                pass
830            else:
831                # Add ms1 annotations to ms1 annotation list
832                if (
833                    np.abs(
834                        (
835                            self.mass_features[mf_id].ms1_peak.mz_exp
836                            - self.mass_features[mf_id].mz
837                        )
838                    )
839                    < 0.01
840                ):
841                    # Get the molecular formula from the mass spectrum
842                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
843                    # Subset to pull out only the peak associated with the mass feature
844                    annot_df = annot_df[
845                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
846                    ].copy()
847
848                    # Remove the index column and add column for mf_id
849                    annot_df = annot_df.drop(columns=["Index"])
850                    annot_df["mf_id"] = mf_id
851                    annot_df_list_ms1.append(annot_df)
852
853        if len(annot_df_list_ms1) > 0:
854            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
855            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
856            annot_ms1_df_full.index.name = "mf_id"
857
858        else:
859            annot_ms1_df_full = None
860            # Warn that no ms1 annotations were found
861            warnings.warn(
862                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
863                UserWarning,
864            )
865
866        return annot_ms1_df_full

Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.

Returns
  • pandas.DataFrame: A pandas dataframe of MS1 annotations for the mass features in the dataset. The index is set to mf_id (mass feature ID)
Raises
  • Warning: If no MS1 annotations were found for the mass features in the dataset.
def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
868    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
869        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
870
871        Parameters
872        -----------
873        molecular_metadata :  dict of MolecularMetadata objects
874            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
875
876        Returns
877        --------
878        pandas.DataFrame
879            A pandas dataframe of MS2 annotations for the mass features in the dataset,
880            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
881
882        Raises
883        ------
884        Warning
885            If no MS2 annotations were found for the mass features in the dataset.
886        """
887        annot_df_list_ms2 = []
888        for mf_id in self.mass_features.keys():
889            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
890                # Add ms2 annotations to ms2 annotation list
891                for result in self.mass_features[mf_id].ms2_similarity_results:
892                    annot_df_ms2 = result.to_dataframe()
893                    annot_df_ms2["mf_id"] = mf_id
894                    annot_df_list_ms2.append(annot_df_ms2)
895
896        if len(annot_df_list_ms2) > 0:
897            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
898            if molecular_metadata is not None:
899                molecular_metadata_df = pd.concat(
900                    [
901                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
902                        for k, v in molecular_metadata.items()
903                    ],
904                    ignore_index=True,
905                )
906                molecular_metadata_df = molecular_metadata_df.rename(
907                    columns={"id": "ref_mol_id"}
908                )
909                annot_ms2_df_full = annot_ms2_df_full.merge(
910                    molecular_metadata_df, on="ref_mol_id", how="left"
911                )
912            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
913                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
914            ).copy()
915            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
916            annot_ms2_df_full.index.name = "mf_id"
917        else:
918            annot_ms2_df_full = None
919            # Warn that no ms2 annotations were found
920            warnings.warn(
921                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
922                UserWarning,
923            )
924
925        return annot_ms2_df_full

Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.

Parameters
  • molecular_metadata (dict of MolecularMetadata objects): A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.
Returns
  • pandas.DataFrame: A pandas dataframe of MS2 annotations for the mass features in the dataset, and optionally molecular metadata. The index is set to mf_id (mass feature ID)
Raises
  • Warning: If no MS2 annotations were found for the mass features in the dataset.
def plot_composite_mz_features( self, binsize=0.0001, ph_int_min_thresh=0.001, mf_plot=True, ms2_plot=True, return_fig=False):
 927    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 928        """Returns a figure displaying 
 929            (1) thresholded, unprocessed data
 930            (2) the m/z features
 931            (3) which m/z features are associated with MS2 spectra
 932
 933        Parameters
 934        -----------
 935        binsize :  float
 936            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 937        mf_plot : boolean
 938            Indicates whether to plot the m/z features. Defaults to True.
 939        ms2_plot : boolean
 940            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 941        return_fig : boolean
 942            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 943
 944        Returns
 945        --------
 946        matplotlib.pyplot.Figure
 947            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 948            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 949            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 950            features with associated with MS2 spectra are plotted, they are displayed in red.
 951
 952        Raises
 953        ------
 954        Warning
 955            If m/z features are set to be plot but aren't in the dataset.
 956            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 957            were found for the m/z features in the dataset.
 958        """
 959        if mf_plot:
 960            # Check if mass_features is set, raise error if not
 961            if self.mass_features is None:
 962                raise ValueError(
 963                    "mass_features not set, must run find_mass_features() first"
 964                )
 965            ## call mass feature data
 966            mf_df = self.mass_features_to_df()
 967
 968        if ms2_plot:
 969            if not mf_plot:
 970                # Check if mass_features is set, raise error if not
 971                if self.mass_features is None:
 972                    raise ValueError(
 973                        "mass_features not set, must run find_mass_features() first"
 974                    )
 975
 976            ## call m/z feature data
 977            mf_df = self.mass_features_to_df()
 978
 979            # Check if ms2_spectrum is set, raise error if not
 980            if 'ms2_spectrum' not in mf_df.columns:
 981                raise ValueError(                
 982                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 983                )
 984    
 985        ## threshold and grid unprocessed data
 986        df = self._ms_unprocessed[1].copy()
 987        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 988        threshold = ph_int_min_thresh * df.intensity.max()
 989        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 990        df = self.grid_data(df_thres)
 991    
 992        ## format unprocessed data for plotting
 993        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 994        mz_grid = np.arange(0, np.max(df.mz), binsize)
 995        mz_data = np.array(df.mz)
 996        df['mz_bin'] = find_closest(mz_grid, mz_data)
 997        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 998        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 999
1000        ## generate figure
1001        fig = plt.figure()
1002        plt.scatter(
1003            unproc_df.scan_time,
1004            unproc_df.mz_bin*binsize,
1005            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1006            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1007            cmap = 'Greys_r',
1008            s = 1
1009        )
1010
1011        if mf_plot:
1012            if ms2_plot:
1013                plt.scatter(
1014                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1015                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1016                    c = 'c',
1017                    s = 4,
1018                    label = 'M/Z features without MS2'
1019                )
1020            else:
1021                plt.scatter(
1022                    mf_df.scan_time,
1023                    mf_df.mz,
1024                    c = 'c',
1025                    s = 4,
1026                    label = 'M/Z features'
1027                )
1028
1029        if ms2_plot: 
1030            plt.scatter(
1031                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1032                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1033                c = 'r',
1034                s = 2,
1035                label = 'M/Z features with MS2'
1036            )
1037
1038        if mf_plot == True or ms2_plot == True:
1039            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1040        plt.xlabel('Scan time')
1041        plt.ylabel('m/z')
1042        plt.ylim(0, np.ceil(np.max(df.mz)))
1043        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1044        plt.title('Composite Feature Map')
1045
1046        if return_fig:
1047            plt.close(fig)
1048            return fig
1049
1050        else:
1051            plt.show()

Returns a figure displaying (1) thresholded, unprocessed data (2) the m/z features (3) which m/z features are associated with MS2 spectra

Parameters
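  • binsize (float): Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
  • ph_int_min_thresh (float): Fraction of the maximum unprocessed intensity used as the plotting threshold; only data above this threshold are shown. Defaults to 0.001.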
  • mf_plot (boolean): Indicates whether to plot the m/z features. Defaults to True.
  • ms2_plot (boolean): Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
  • return_fig (boolean): Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
Returns
  • matplotlib.pyplot.Figure: A figure with the thresholded, unprocessed data on an axis of m/z value with respect to scan time. Unprocessed data is displayed in gray scale with darker colors indicating higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z features associated with MS2 spectra are plotted, they are displayed in red.
Raises
  • Warning: If m/z features are set to be plotted but aren't in the dataset. If m/z features with associated MS2 data are set to be plotted but no MS2 annotations were found for the m/z features in the dataset.
def set_tic_list_from_data(self, overwrite=False):
1090    def set_tic_list_from_data(self, overwrite=False):
1091        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1092
1093        Parameters
1094        -----------
1095        overwrite : bool, optional
1096            If True, overwrites the TIC list if it is already set. Defaults to False.
1097
1098        Notes
1099        -----
1100        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1101
1102        Raises
1103        ------
1104        ValueError
1105            If no mass spectra are found in the dataset.
1106            If the TIC list is already set and overwrite is False.
1107        """
1108        # Check if _ms is empty and raise error if so
1109        if len(self._ms) == 0:
1110            raise ValueError("No mass spectra found in dataset")
1111
1112        # Check if tic_list is already set and raise error if so
1113        if len(self.tic) > 0 and not overwrite:
1114            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1115
1116        self.tic = [self._ms.get(i).tic for i in self.scans_number]

Sets the TIC list from the mass spectrum objects within the _ms dictionary.

Parameters
  • overwrite (bool, optional): If True, overwrites the TIC list if it is already set. Defaults to False.
Notes

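The TIC list is built in the order of scans_number, so every scan number listed there must have a mass spectrum in the _ms dictionary. An empty _ms dictionary raises a ValueError (see Raises) rather than producing an empty TIC list.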

Raises
  • ValueError: If no mass spectra are found in the dataset. If the TIC list is already set and overwrite is False.
def set_retention_time_from_data(self, overwrite=False):
1118    def set_retention_time_from_data(self, overwrite=False):
1119        """Sets the retention time list from the data in the _ms dictionary.
1120
1121        Parameters
1122        -----------
1123        overwrite : bool, optional
1124            If True, overwrites the retention time list if it is already set. Defaults to False.
1125
1126        Notes
1127        -----
1128        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1129
1130        Raises
1131        ------
1132        ValueError
1133            If no mass spectra are found in the dataset.
1134            If the retention time list is already set and overwrite is False.
1135        """
1136        # Check if _ms is empty and raise error if so
1137        if len(self._ms) == 0:
1138            raise ValueError("No mass spectra found in dataset")
1139
1140        # Check if retention_time_list is already set and raise error if so
1141        if len(self.retention_time) > 0 and not overwrite:
1142            raise ValueError(
1143                "Retention time list already set, use overwrite=True to overwrite"
1144            )
1145
1146        retention_time_list = []
1147        for key_ms in sorted(self._ms.keys()):
1148            retention_time_list.append(self._ms.get(key_ms).retention_time)
1149        self.retention_time = retention_time_list

Sets the retention time list from the data in the _ms dictionary.

Parameters
  • overwrite (bool, optional): If True, overwrites the retention time list if it is already set. Defaults to False.
Notes

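An empty _ms dictionary raises a ValueError (see Raises) rather than producing an empty retention time list. Otherwise the retention times are collected in the sorted order of the _ms dictionary keys.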

Raises
  • ValueError: If no mass spectra are found in the dataset. If the retention time list is already set and overwrite is False.
def set_scans_number_from_data(self, overwrite=False):
1151    def set_scans_number_from_data(self, overwrite=False):
1152        """Sets the scan number list from the data in the _ms dictionary.
1153
1154        Notes
1155        -----
1156        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1157
1158        Raises
1159        ------
1160        ValueError
1161            If no mass spectra are found in the dataset.
1162            If the scan number list is already set and overwrite is False.
1163        """
1164        # Check if _ms is empty and raise error if so
1165        if len(self._ms) == 0:
1166            raise ValueError("No mass spectra found in dataset")
1167
1168        # Check if scans_number_list is already set and raise error if so
1169        if len(self.scans_number) > 0 and not overwrite:
1170            raise ValueError(
1171                "Scan number list already set, use overwrite=True to overwrite"
1172            )
1173
1174        self.scans_number = sorted(self._ms.keys())

Sets the scan number list from the data in the _ms dictionary.

Notes

An empty _ms dictionary raises a ValueError (see Raises) rather than producing an empty scan number list. Otherwise the scan number list is set to the sorted keys of the _ms dictionary.

Raises
  • ValueError: If no mass spectra are found in the dataset. If the scan number list is already set and overwrite is False.
ms1_scans

list : A list of MS1 scan numbers for the dataset.

parameters

LCMSParameters : The parameters used for the LC-MS analysis.

scans_number

list : A list of scan numbers for the dataset.

retention_time

numpy.ndarray : An array of retention times for the dataset.

tic

numpy.ndarray : An array of TIC values for the dataset.