corems.mass_spectra.factory.lc_class

   1from pathlib import Path
   2
   3import numpy as np
   4import pandas as pd
   5import warnings
   6import matplotlib.pyplot as plt
   7
   8from corems.encapsulation.factory.parameters import LCMSParameters
   9from corems.mass_spectra.calc.lc_calc import LCCalculations, PHCalculations
  10from corems.molecular_id.search.lcms_spectral_search import LCMSSpectralSearch
  11from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  12from corems.mass_spectra.calc.lc_calc import find_closest
  13
  14
  15class MassSpectraBase:
  16    """Base class for mass spectra objects.
  17
  18    Parameters
  19    -----------
  20    file_location : str or Path
  21        The location of the file containing the mass spectra data.
  22    analyzer : str, optional
  23        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  24    instrument_label : str, optional
  25        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  26    sample_name : str, optional
  27        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  28    spectra_parser : object, optional
  29        The spectra parser object used to create the mass spectra object. Defaults to None.
  30
  31    Attributes
  32    -----------
  33    spectra_parser_class : class
  34        The class of the spectra parser used to create the mass spectra object.
  35    file_location : str or Path
  36        The location of the file containing the mass spectra data.
  37    sample_name : str
  38        The name of the sample; defaults to the file name if not provided to the parser.
  39    analyzer : str
  40        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
  41    instrument_label : str
  42        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
  43    _scan_info : dict
  44        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
  45        scan text, and scan window (lower and upper).
  46        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
  47    _ms : dict
  48        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
  49    _ms_unprocessed: dictionary of pandas.DataFrames or None
  50        A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking.
  51        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
  52
  53    Methods
  54    --------
  55    * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True).
  56        Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans
  57    * get_time_of_scan_id(scan).
  58        Returns the scan time for the specified scan number.
  59    """
  60
  61    def __init__(
  62        self,
  63        file_location,
  64        analyzer="Unknown",
  65        instrument_label="Unknown",
  66        sample_name=None,
  67        spectra_parser=None,
  68    ):
  69        if isinstance(file_location, str):
  70            file_location = Path(file_location)
  71        else:
  72            file_location = file_location
  73        if not file_location.exists():
  74            raise FileExistsError("File does not exist: " + str(file_location))
  75
  76        if sample_name:
  77            self.sample_name = sample_name
  78        else:
  79            self.sample_name = file_location.stem
  80
  81        self.file_location = file_location
  82        self.analyzer = analyzer
  83        self.instrument_label = instrument_label
  84
  85        # Add the spectra parser class to the object if it is not None
  86        if spectra_parser is not None:
  87            self.spectra_parser_class = spectra_parser.__class__
  88            self.spectra_parser = spectra_parser
  89            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
  90            if (
  91                self.sample_name is not None
  92                and self.sample_name != self.spectra_parser.sample_name
  93            ):
  94                warnings.warn(
  95                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
  96                    UserWarning,
  97                )
  98            if self.analyzer != self.spectra_parser.analyzer:
  99                warnings.warn(
 100                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
 101                    UserWarning,
 102                )
 103            if self.instrument_label != self.spectra_parser.instrument_label:
 104                warnings.warn(
 105                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
 106                    UserWarning,
 107                )
 108            if self.file_location != self.spectra_parser.file_location:
 109                warnings.warn(
 110                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
 111                    UserWarning,
 112                )
 113
 114        # Instantiate empty dictionaries for scan information and mass spectra
 115        self._scan_info = {}
 116        self._ms = {}
 117        self._ms_unprocessed = {}
 118
 119    def add_mass_spectrum(self, mass_spec):
 120        """Adds a mass spectrum to the dataset.
 121
 122        Parameters
 123        -----------
 124        mass_spec : MassSpectrum
 125            The corems MassSpectrum object to be added to the dataset.
 126
 127        Notes
 128        -----
 129        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
 130        """
 131        # check if mass_spec has a scan_number attribute
 132        if not hasattr(mass_spec, "scan_number"):
 133            raise ValueError(
 134                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
 135            )
 136        self._ms[mass_spec.scan_number] = mass_spec
 137
 138    def add_mass_spectra(
 139        self,
 140        scan_list,
 141        spectrum_mode=None,
 142        ms_level=1,
 143        use_parser=True,
 144        auto_process=True,
 145        ms_params=None,
 146    ):
 147        """Add mass spectra to _ms dictionary, from a list of scans or single scan
 148
 149        Notes
 150        -----
 151        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
 152
 153
 154        Parameters
 155        -----------
 156        scan_list : list of ints
 157            List of scans to use to populate _ms slot
 158        spectrum_mode : str or None
 159            The spectrum mode to use for the mass spectra.
 160            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 161            Defaults to None.
 162        ms_level : int, optional
 163            The MS level to use for the mass spectra.
 164            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
 165            Defaults to 1.
 166        using_parser : bool
 167            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
 168        auto_process : bool
 169            Whether to auto-process the mass spectra.  Defaults to True.
 170        ms_params : MSParameters or None
 171            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
 172
 173        Raises
 174        ------
 175        TypeError
 176            If scan_list is not a list of ints
 177        ValueError
 178            If polarity is not 'positive' or 'negative'
 179            If ms_level is not 1 or 2
 180        """
 181
 182        # check if scan_list is a list or a single int; if single int, convert to list
 183        if isinstance(scan_list, int):
 184            scan_list = [scan_list]
 185        if not isinstance(scan_list, list):
 186            raise TypeError("scan_list must be a list of integers")
 187        for scan in scan_list:
 188            if not isinstance(scan, int):
 189                raise TypeError("scan_list must be a list of integers")
 190
 191        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 192        if self.polarity == "negative":
 193            polarity = -1
 194        elif self.polarity == "positive":
 195            polarity = 1
 196        else:
 197            raise ValueError(
 198                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
 199            )
 200
 201        # is not using_parser, check that ms1 and ms2 are not None
 202        if not use_parser:
 203            if ms_level not in self._ms_unprocessed.keys():
 204                raise ValueError(
 205                    "ms_level {} not found in _ms_unprocessed dictionary".format(
 206                        ms_level
 207                    )
 208                )
 209
 210        scan_list = list(set(scan_list))
 211        scan_list.sort()
 212        if not use_parser:
 213            if self._ms_unprocessed[ms_level] is None:
 214                raise ValueError(
 215                    "No unprocessed data found for ms_level {}".format(ms_level)
 216                )
 217            if (
 218                len(
 219                    np.setdiff1d(
 220                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
 221                    )
 222                )
 223                > 0
 224            ):
 225                raise ValueError(
 226                    "Not all scans in scan_list are present in the unprocessed data"
 227                )
 228            # Prepare the ms_df for parsing
 229            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
 230
 231        for scan in scan_list:
 232            ms = None
 233            if spectrum_mode is None:
 234                # get spectrum mode from _scan_info
 235                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
 236            else:
 237                spectrum_mode_scan = spectrum_mode
 238            # Instantiate the mass spectrum object using the parser or the unprocessed data
 239            if not use_parser:
 240                my_ms_df = ms_df.loc[scan]
 241                if spectrum_mode_scan == "profile":
 242                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
 243                    ms = ms_from_array_profile(
 244                        my_ms_df.mz,
 245                        my_ms_df.intensity,
 246                        self.file_location,
 247                        polarity=polarity,
 248                        auto_process=False,
 249                    )
 250                else:
 251                    raise ValueError(
 252                        "Only profile mode is supported for unprocessed data"
 253                    )
 254            if use_parser:
 255                ms = self.spectra_parser.get_mass_spectrum_from_scan(
 256                    scan_number=scan,
 257                    spectrum_mode=spectrum_mode_scan,
 258                    auto_process=False,
 259                )
 260
 261            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 262            if ms is not None:
 263                if ms_params is not None:
 264                    ms.parameters = ms_params
 265                ms.scan_number = scan
 266                if auto_process:
 267                    ms.process_mass_spec()
 268                self.add_mass_spectrum(ms)
 269
 270    def get_time_of_scan_id(self, scan):
 271        """Returns the scan time for the specified scan number.
 272
 273        Parameters
 274        -----------
 275        scan : int
 276            The scan number of the desired scan time.
 277
 278        Returns
 279        --------
 280        float
 281            The scan time for the specified scan number (in minutes).
 282
 283        Raises
 284        ------
 285        ValueError
 286            If no scan time is found for the specified scan number.
 287        """
 288        # Check if _retenion_time_list is empty and raise error if so
 289        if len(self._retention_time_list) == 0:
 290            raise ValueError("No retention times found in dataset")
 291        rt = self._retention_time_list[self._scans_number_list.index(scan)]
 292        return rt
 293
 294    @property
 295    def scan_df(self):
 296        """
 297        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
 298        """
 299        scan_df = pd.DataFrame.from_dict(self._scan_info)
 300        return scan_df
 301        
 302    @property
 303    def ms(self):
 304        """
 305        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
 306        """
 307        return self._ms
 308
 309    
 310    @scan_df.setter
 311    def scan_df(self, df):
 312        """
 313        Sets the scan data for the dataset.
 314
 315        Parameters
 316        -----------
 317        df : pandas.DataFrame
 318            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
 319            precursor m/z, scan text, and scan window (lower and upper).
 320        """
 321        self._scan_info = df.to_dict()
 322
 323    def __getitem__(self, scan_number):
 324        return self._ms.get(scan_number)
 325
 326
 327class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
 328    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
 329
 330    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
 331
 332    Parameters
 333    -----------
 334    file_location : str or Path
 335        The location of the file containing the mass spectra data.
 336    analyzer : str, optional
 337        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 338    instrument_label : str, optional
 339        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 340    sample_name : str, optional
 341        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 342    spectra_parser : object, optional
 343        The spectra parser object used to create the mass spectra object. Defaults to None.
 344
 345    Attributes
 346    -----------
 347    polarity : str
 348        The polarity of the ionization mode used for the dataset.
 349    _parameters : LCMSParameters
 350        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
 351    _retention_time_list : numpy.ndarray
 352        An array of retention times for the dataset.
 353    _scans_number_list : list
 354        A list of scan numbers for the dataset.
 355    _tic_list : numpy.ndarray
 356        An array of total ion current (TIC) values for the dataset.
 357    eics : dict
 358        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
 359        Key is the mz of the EIC. Initialized as an empty dictionary.
 360    mass_features : dictionary of LCMSMassFeature objects
 361        A dictionary containing mass features for the dataset.
 362        Key is mass feature ID. Initialized as an empty dictionary.
 363    spectral_search_results : dictionary of MS2SearchResults objects
 364        A dictionary containing spectral search results for the dataset.
 365        Key is scan number : precursor mz. Initialized as an empty dictionary.
 366
 367    Methods
 368    --------
 369    * get_parameters_json().
 370        Returns the parameters used for the LC-MS analysis in JSON format.
 371    * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 372        Adds which MS2 scans are associated with each mass feature to the
 373        mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
 374    * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 375        Adds the MS1 spectra associated with each mass feature to the
 376        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
 377    * mass_features_to_df()
 378        Returns a pandas dataframe summarizing the mass features in the dataset.
 379    * set_tic_list_from_data(overwrite=False)
 380        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
 381    * set_retention_time_from_data(overwrite=False)
 382        Sets the retention time list from the data in the _ms dictionary.
 383    * set_scans_number_from_data(overwrite=False)
 384        Sets the scan number list from the data in the _ms dictionary.
 385    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
 386        Generates plot of M/Z features comparing scan time vs M/Z value
 387    """
 388
 389    def __init__(
 390        self,
 391        file_location,
 392        analyzer="Unknown",
 393        instrument_label="Unknown",
 394        sample_name=None,
 395        spectra_parser=None,
 396    ):
 397        super().__init__(
 398            file_location, analyzer, instrument_label, sample_name, spectra_parser
 399        )
 400        self.polarity = ""
 401        self._parameters = LCMSParameters()
 402        self._retention_time_list = []
 403        self._scans_number_list = []
 404        self._tic_list = []
 405        self.eics = {}
 406        self.mass_features = {}
 407        self.spectral_search_results = {}
 408
 409    def get_parameters_json(self):
 410        """Returns the parameters stored for the LC-MS object in JSON format.
 411
 412        Returns
 413        --------
 414        str
 415            The parameters used for the LC-MS analysis in JSON format.
 416        """
 417        return self.parameters.to_json()
 418
 419    def remove_unprocessed_data(self, ms_level=None):
 420        """Removes the unprocessed data from the LCMSBase object.
 421
 422        Parameters
 423        -----------
 424        ms_level : int, optional
 425            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
 426
 427        Raises
 428        ------
 429        ValueError
 430            If ms_level is not 1 or 2.
 431
 432        Notes
 433        -----
 434        This method is useful for freeing up memory after the data has been processed.
 435        """
 436        if ms_level is None:
 437            for ms_level in self._ms_unprocessed.keys():
 438                self._ms_unprocessed[ms_level] = None
 439        if ms_level not in [1, 2]:
 440            raise ValueError("ms_level must be 1 or 2")
 441        self._ms_unprocessed[ms_level] = None
 442
 443    def add_associated_ms2_dda(
 444        self,
 445        auto_process=True,
 446        use_parser=True,
 447        spectrum_mode=None,
 448        ms_params_key="ms2",
 449        scan_filter=None,
 450    ):
 451        """Add MS2 spectra associated with mass features to the dataset.
 452
 453        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
 454
 455        Parameters
 456        -----------
 457        auto_process : bool, optional
 458            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
 459        use_parser : bool, optional
 460            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
 461        spectrum_mode : str or None, optional
 462            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 463            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 464            Defaults to None. (faster if defined, otherwise will check each scan)
 465        ms_params_key : string, optional
 466            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
 467            Defaults to 'ms2'.
 468        scan_filter : str
 469            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
 470            "hcd" will pull out only HCD scans.
 471
 472        Raises
 473        ------
 474        ValueError
 475            If mass_features is not set, must run find_mass_features() first.
 476            If no MS2 scans are found in the dataset.
 477            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
 478        """
 479        # Check if mass_features is set, raise error if not
 480        if self.mass_features is None:
 481            raise ValueError(
 482                "mass_features not set, must run find_mass_features() first"
 483            )
 484
 485        # reconfigure ms_params to get the correct mass spectrum parameters from the key
 486        ms_params = self.parameters.mass_spectrum[ms_params_key]
 487
 488        mf_df = self.mass_features_to_df().copy()
 489        # Find ms2 scans that have a precursor m/z value
 490        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
 491        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
 492        # drop ms2 scans that have no tic
 493        ms2_scans = ms2_scans[ms2_scans.tic > 0]
 494        if ms2_scans is None:
 495            raise ValueError("No DDA scans found in dataset")
 496
 497        if scan_filter is not None:
 498            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
 499        # set tolerance in rt space (in minutes) and mz space (in daltons)
 500        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
 501        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
 502
 503        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
 504        dda_scans = []
 505        for i, row in mf_df.iterrows():
 506            ms2_scans_filtered = ms2_scans[
 507                ms2_scans.scan_time.between(
 508                    row.scan_time - time_tol, row.scan_time + time_tol
 509                )
 510            ]
 511            ms2_scans_filtered = ms2_scans_filtered[
 512                ms2_scans_filtered.precursor_mz.between(
 513                    row.mz - mz_tol, row.mz + mz_tol
 514                )
 515            ]
 516            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
 517            self.mass_features[i].ms2_scan_numbers = (
 518                ms2_scans_filtered.scan.tolist()
 519                + self.mass_features[i].ms2_scan_numbers
 520            )
 521        # add to _ms attribute
 522        self.add_mass_spectra(
 523            scan_list=list(set(dda_scans)),
 524            auto_process=auto_process,
 525            spectrum_mode=spectrum_mode,
 526            use_parser=use_parser,
 527            ms_params=ms_params,
 528        )
 529        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
 530        for mf_id in self.mass_features:
 531            if self.mass_features[mf_id].ms2_scan_numbers is not None:
 532                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
 533                    if dda_scan in self._ms.keys():
 534                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
 535                            dda_scan
 536                        ]
 537
 538    def add_associated_ms1(
 539        self, auto_process=True, use_parser=True, spectrum_mode=None
 540    ):
 541        """Add MS1 spectra associated with mass features to the dataset.
 542
 543        Parameters
 544        -----------
 545        auto_process : bool, optional
 546            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
 547        use_parser : bool, optional
 548            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
 549        spectrum_mode : str or None, optional
 550            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 551            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 552            Defaults to None. (faster if defined, otherwise will check each scan)
 553
 554        Raises
 555        ------
 556        ValueError
 557            If mass_features is not set, must run find_mass_features() first.
 558            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
 559            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
 560            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
 561        """
 562        # Check if mass_features is set, raise error if not
 563        if self.mass_features is None:
 564            raise ValueError(
 565                "mass_features not set, must run find_mass_features() first"
 566            )
 567        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
 568
 569        if scans_to_average == 1:
 570            # Add to LCMSobj
 571            self.add_mass_spectra(
 572                scan_list=[
 573                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
 574                ],
 575                auto_process=auto_process,
 576                use_parser=use_parser,
 577                spectrum_mode=spectrum_mode,
 578                ms_params=self.parameters.mass_spectrum["ms1"],
 579            )
 580
 581        elif (
 582            (scans_to_average - 1) % 2
 583        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
 584            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
 585            # Check if all apex scans are profile mode, raise error if not
 586            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
 587                raise ValueError("All apex scans must be profile mode for averaging")
 588
 589            # First get sets of scans to average
 590            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
 591                ms1_idx_start = ms1_scans.index(apex_scan) - int(
 592                    (scans_to_average - 1) / 2
 593                )
 594                if ms1_idx_start < 0:
 595                    ms1_idx_start = 0
 596                ms1_idx_end = (
 597                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
 598                )
 599                if ms1_idx_end > (len(ms1_scans) - 1):
 600                    ms1_idx_end = len(ms1_scans) - 1
 601                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
 602                return scan_list
 603
 604            ms1_scans = self.ms1_scans
 605            scans_lists = [
 606                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
 607                for apex_scan in apex_scans
 608            ]
 609
 610            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 611            if self.polarity == "negative":
 612                polarity = -1
 613            elif self.polarity == "positive":
 614                polarity = 1
 615
 616            if not use_parser:
 617                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
 618                ms1_unprocessed = self._ms_unprocessed[1].copy()
 619                # Set the index on _ms_unprocessed[1] to scan number
 620                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
 621                self._ms_unprocessed[1] = ms1_unprocessed
 622
 623                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
 624                scans_lists_flat = list(
 625                    set([scan for sublist in scans_lists for scan in sublist])
 626                )
 627                if (
 628                    len(
 629                        np.setdiff1d(
 630                            np.sort(scans_lists_flat),
 631                            np.sort(ms1_unprocessed.index.values),
 632                        )
 633                    )
 634                    > 0
 635                ):
 636                    raise ValueError(
 637                        "Not all scans to average are present in the unprocessed data"
 638                    )
 639
 640            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
 641                # Get unprocessed mass spectrum from scans
 642                ms = self.get_average_mass_spectrum(
 643                    scan_list=scan_list_average,
 644                    apex_scan=apex_scan,
 645                    spectrum_mode="profile",
 646                    ms_level=1,
 647                    auto_process=auto_process,
 648                    use_parser=use_parser,
 649                    perform_checks=False,
 650                    polarity=polarity,
 651                    ms_params=self.parameters.mass_spectrum["ms1"],
 652                )
 653                # Add mass spectrum to LCMS object and associated with mass feature
 654                self.add_mass_spectrum(ms)
 655
 656            if not use_parser:
 657                # Reset the index on _ms_unprocessed[1] to not be scan number
 658                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
 659                self._ms_unprocessed[1] = ms1_unprocessed
 660        else:
 661            raise ValueError(
 662                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
 663            )
 664
 665        # Associate the ms1 spectra with the mass features
 666        for mf_id in self.mass_features:
 667            self.mass_features[mf_id].mass_spectrum = self._ms[
 668                self.mass_features[mf_id].apex_scan
 669            ]
 670            self.mass_features[mf_id].update_mz()
 671
 672    def mass_features_to_df(self):
 673        """Returns a pandas dataframe summarizing the mass features.
 674
 675        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
 676        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
 677
 678
 679        Returns
 680        --------
 681        pandas.DataFrame
 682            A pandas dataframe of mass features with the following columns:
 683            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
 684        """
 685
 686        def mass_spectrum_to_string(
 687            mass_spec, normalize=True, min_normalized_abun=0.01
 688        ):
 689            """Converts a mass spectrum to a string of m/z:abundance pairs.
 690
 691            Parameters
 692            -----------
 693            mass_spec : MassSpectrum
 694                A MassSpectrum object to be converted to a string.
 695            normalize : bool, optional
 696                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
 697            min_normalized_abun : float, optional
 698                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
 699
 700            Returns
 701            --------
 702            str
 703                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
 704            """
 705            mz_np = mass_spec.to_dataframe()["m/z"].values
 706            abun_np = mass_spec.to_dataframe()["Peak Height"].values
 707            if normalize:
 708                abun_np = abun_np / abun_np.max()
 709            mz_abun = np.column_stack((mz_np, abun_np))
 710            if normalize:
 711                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
 712            mz_abun_str = [
 713                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
 714                for mz, abun in mz_abun
 715            ]
 716            return "; ".join(mz_abun_str)
 717
 718        cols_in_df = [
 719            "id",
 720            "_apex_scan",
 721            "start_scan",
 722            "final_scan",
 723            "_retention_time",
 724            "_intensity",
 725            "_persistence",
 726            "_area",
 727            "_dispersity_index",
 728            "_tailing_factor",
 729            "monoisotopic_mf_id",
 730            "isotopologue_type",
 731            "mass_spectrum_deconvoluted_parent",
 732        ]
 733        df_mf_list = []
 734        for mf_id in self.mass_features.keys():
 735            # Find cols_in_df that are in single_mf
 736            df_keys = list(
 737                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
 738            )
 739            dict_mf = {}
 740            for key in df_keys:
 741                dict_mf[key] = getattr(self.mass_features[mf_id], key)
 742            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
 743                # Add MS2 spectra info
 744                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
 745                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
 746            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
 747                dict_mf["associated_mass_features"] = ", ".join(
 748                    map(
 749                        str,
 750                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
 751                    )
 752                )
 753            if self.mass_features[mf_id]._half_height_width is not None:
 754                dict_mf["half_height_width"] = self.mass_features[
 755                    mf_id
 756                ].half_height_width
 757            # Check if EIC for mass feature is set
 758            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
 759            df_mf_single["mz"] = self.mass_features[mf_id].mz
 760            df_mf_list.append(df_mf_single)
 761        df_mf = pd.concat(df_mf_list)
 762
 763        # rename _area to area and id to mf_id
 764        df_mf = df_mf.rename(
 765            columns={
 766                "_area": "area",
 767                "id": "mf_id",
 768                "_apex_scan": "apex_scan",
 769                "_retention_time": "scan_time",
 770                "_intensity": "intensity",
 771                "_persistence": "persistence",
 772                "_dispersity_index": "dispersity_index",
 773                "_tailing_factor": "tailing_factor",
 774            }
 775        )
 776
 777        # reorder columns
 778        col_order = [
 779            "mf_id",
 780            "scan_time",
 781            "mz",
 782            "apex_scan",
 783            "start_scan",
 784            "final_scan",
 785            "intensity",
 786            "persistence",
 787            "area",
 788            "half_height_width",
 789            "tailing_factor",
 790            "dispersity_index",
 791            "monoisotopic_mf_id",
 792            "isotopologue_type",
 793            "mass_spectrum_deconvoluted_parent",
 794            "associated_mass_features",
 795            "ms2_spectrum",
 796        ]
 797        # drop columns that are not in col_order
 798        cols_to_order = [col for col in col_order if col in df_mf.columns]
 799        df_mf = df_mf[cols_to_order]
 800
 801        # reset index to mf_id
 802        df_mf = df_mf.set_index("mf_id")
 803        df_mf.index.name = "mf_id"
 804
 805        return df_mf
 806
 807    def mass_features_ms1_annot_to_df(self):
 808        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
 809
 810        Returns
 811        --------
 812        pandas.DataFrame
 813            A pandas dataframe of MS1 annotations for the mass features in the dataset.
 814            The index is set to mf_id (mass feature ID)
 815
 816        Raises
 817        ------
 818        Warning
 819            If no MS1 annotations were found for the mass features in the dataset.
 820        """
 821        annot_df_list_ms1 = []
 822        for mf_id in self.mass_features.keys():
 823            if self.mass_features[mf_id].mass_spectrum is None:
 824                pass
 825            else:
 826                # Add ms1 annotations to ms1 annotation list
 827                if (
 828                    np.abs(
 829                        (
 830                            self.mass_features[mf_id].ms1_peak.mz_exp
 831                            - self.mass_features[mf_id].mz
 832                        )
 833                    )
 834                    < 0.01
 835                ):
 836                    # Get the molecular formula from the mass spectrum
 837                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
 838                    # Subset to pull out only the peak associated with the mass feature
 839                    annot_df = annot_df[
 840                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
 841                    ].copy()
 842
 843                    # Remove the index column and add column for mf_id
 844                    annot_df = annot_df.drop(columns=["Index"])
 845                    annot_df["mf_id"] = mf_id
 846                    annot_df_list_ms1.append(annot_df)
 847
 848        if len(annot_df_list_ms1) > 0:
 849            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
 850            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
 851            annot_ms1_df_full.index.name = "mf_id"
 852
 853        else:
 854            annot_ms1_df_full = None
 855            # Warn that no ms1 annotations were found
 856            warnings.warn(
 857                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
 858                UserWarning,
 859            )
 860
 861        return annot_ms1_df_full
 862
 863    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
 864        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
 865
 866        Parameters
 867        -----------
 868        molecular_metadata :  dict of MolecularMetadata objects
 869            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
 870
 871        Returns
 872        --------
 873        pandas.DataFrame
 874            A pandas dataframe of MS2 annotations for the mass features in the dataset,
 875            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
 876
 877        Raises
 878        ------
 879        Warning
 880            If no MS2 annotations were found for the mass features in the dataset.
 881        """
 882        annot_df_list_ms2 = []
 883        for mf_id in self.mass_features.keys():
 884            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
 885                # Add ms2 annotations to ms2 annotation list
 886                for result in self.mass_features[mf_id].ms2_similarity_results:
 887                    annot_df_ms2 = result.to_dataframe()
 888                    annot_df_ms2["mf_id"] = mf_id
 889                    annot_df_list_ms2.append(annot_df_ms2)
 890
 891        if len(annot_df_list_ms2) > 0:
 892            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
 893            if molecular_metadata is not None:
 894                molecular_metadata_df = pd.concat(
 895                    [
 896                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
 897                        for k, v in molecular_metadata.items()
 898                    ],
 899                    ignore_index=True,
 900                )
 901                molecular_metadata_df = molecular_metadata_df.rename(
 902                    columns={"id": "ref_mol_id"}
 903                )
 904                annot_ms2_df_full = annot_ms2_df_full.merge(
 905                    molecular_metadata_df, on="ref_mol_id", how="left"
 906                )
 907            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
 908                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
 909            ).copy()
 910            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
 911            annot_ms2_df_full.index.name = "mf_id"
 912        else:
 913            annot_ms2_df_full = None
 914            # Warn that no ms2 annotations were found
 915            warnings.warn(
 916                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
 917                UserWarning,
 918            )
 919
 920        return annot_ms2_df_full
 921
 922    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 923        """Returns a figure displaying 
 924            (1) thresholded, unprocessed data
 925            (2) the m/z features
 926            (3) which m/z features are associated with MS2 spectra
 927
 928        Parameters
 929        -----------
 930        binsize :  float
 931            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 932        mf_plot : boolean
 933            Indicates whether to plot the m/z features. Defaults to True.
 934        ms2_plot : boolean
 935            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 936        return_fig : boolean
 937            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 938
 939        Returns
 940        --------
 941        matplotlib.pyplot.Figure
 942            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 943            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 944            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 945            features with associated with MS2 spectra are plotted, they are displayed in red.
 946
 947        Raises
 948        ------
 949        Warning
 950            If m/z features are set to be plot but aren't in the dataset.
 951            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 952            were found for the m/z features in the dataset.
 953        """
 954        if mf_plot:
 955            # Check if mass_features is set, raise error if not
 956            if self.mass_features is None:
 957                raise ValueError(
 958                    "mass_features not set, must run find_mass_features() first"
 959                )
 960            ## call mass feature data
 961            mf_df = self.mass_features_to_df()
 962
 963        if ms2_plot:
 964            if not mf_plot:
 965                # Check if mass_features is set, raise error if not
 966                if self.mass_features is None:
 967                    raise ValueError(
 968                        "mass_features not set, must run find_mass_features() first"
 969                    )
 970
 971            ## call m/z feature data
 972            mf_df = self.mass_features_to_df()
 973
 974            # Check if ms2_spectrum is set, raise error if not
 975            if 'ms2_spectrum' not in mf_df.columns:
 976                raise ValueError(                
 977                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 978                )
 979    
 980        ## threshold and grid unprocessed data
 981        df = self._ms_unprocessed[1].copy()
 982        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 983        threshold = ph_int_min_thresh * df.intensity.max()
 984        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 985        df = self.grid_data(df_thres)
 986    
 987        ## format unprocessed data for plotting
 988        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 989        mz_grid = np.arange(0, np.max(df.mz), binsize)
 990        mz_data = np.array(df.mz)
 991        df['mz_bin'] = find_closest(mz_grid, mz_data)
 992        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 993        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 994
 995        ## generate figure
 996        fig = plt.figure()
 997        plt.scatter(
 998            unproc_df.scan_time,
 999            unproc_df.mz_bin*binsize,
1000            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1001            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1002            cmap = 'Greys_r',
1003            s = 1
1004        )
1005
1006        if mf_plot:
1007            if ms2_plot:
1008                plt.scatter(
1009                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1010                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1011                    c = 'c',
1012                    s = 4,
1013                    label = 'M/Z features without MS2'
1014                )
1015            else:
1016                plt.scatter(
1017                    mf_df.scan_time,
1018                    mf_df.mz,
1019                    c = 'c',
1020                    s = 4,
1021                    label = 'M/Z features'
1022                )
1023
1024        if ms2_plot: 
1025            plt.scatter(
1026                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1027                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1028                c = 'r',
1029                s = 2,
1030                label = 'M/Z features with MS2'
1031            )
1032
1033        if mf_plot == True or ms2_plot == True:
1034            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1035        plt.xlabel('Scan time')
1036        plt.ylabel('m/z')
1037        plt.ylim(0, np.ceil(np.max(df.mz)))
1038        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1039        plt.title('Composite Feature Map')
1040
1041        if return_fig:
1042            plt.close(fig)
1043            return fig
1044
1045        else:
1046            plt.show()
1047
1048    def __len__(self):
1049        """
1050        Returns the number of mass spectra in the dataset.
1051
1052        Returns
1053        --------
1054        int
1055            The number of mass spectra in the dataset.
1056        """
1057        return len(self._ms)
1058
1059    def __getitem__(self, scan_number):
1060        """
1061        Returns the mass spectrum corresponding to the specified scan number.
1062
1063        Parameters
1064        -----------
1065        scan_number : int
1066            The scan number of the desired mass spectrum.
1067
1068        Returns
1069        --------
1070        MassSpectrum
1071            The mass spectrum corresponding to the specified scan number.
1072        """
1073        return self._ms.get(scan_number)
1074
1075    def __iter__(self):
1076        """Returns an iterator over the mass spectra in the dataset.
1077
1078        Returns
1079        --------
1080        iterator
1081            An iterator over the mass spectra in the dataset.
1082        """
1083        return iter(self._ms.values())
1084
1085    def set_tic_list_from_data(self, overwrite=False):
1086        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1087
1088        Parameters
1089        -----------
1090        overwrite : bool, optional
1091            If True, overwrites the TIC list if it is already set. Defaults to False.
1092
1093        Notes
1094        -----
1095        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1096
1097        Raises
1098        ------
1099        ValueError
1100            If no mass spectra are found in the dataset.
1101            If the TIC list is already set and overwrite is False.
1102        """
1103        # Check if _ms is empty and raise error if so
1104        if len(self._ms) == 0:
1105            raise ValueError("No mass spectra found in dataset")
1106
1107        # Check if tic_list is already set and raise error if so
1108        if len(self.tic) > 0 and not overwrite:
1109            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1110
1111        self.tic = [self._ms.get(i).tic for i in self.scans_number]
1112
1113    def set_retention_time_from_data(self, overwrite=False):
1114        """Sets the retention time list from the data in the _ms dictionary.
1115
1116        Parameters
1117        -----------
1118        overwrite : bool, optional
1119            If True, overwrites the retention time list if it is already set. Defaults to False.
1120
1121        Notes
1122        -----
1123        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1124
1125        Raises
1126        ------
1127        ValueError
1128            If no mass spectra are found in the dataset.
1129            If the retention time list is already set and overwrite is False.
1130        """
1131        # Check if _ms is empty and raise error if so
1132        if len(self._ms) == 0:
1133            raise ValueError("No mass spectra found in dataset")
1134
1135        # Check if retention_time_list is already set and raise error if so
1136        if len(self.retention_time) > 0 and not overwrite:
1137            raise ValueError(
1138                "Retention time list already set, use overwrite=True to overwrite"
1139            )
1140
1141        retention_time_list = []
1142        for key_ms in sorted(self._ms.keys()):
1143            retention_time_list.append(self._ms.get(key_ms).retention_time)
1144        self.retention_time = retention_time_list
1145
1146    def set_scans_number_from_data(self, overwrite=False):
1147        """Sets the scan number list from the data in the _ms dictionary.
1148
1149        Notes
1150        -----
1151        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1152
1153        Raises
1154        ------
1155        ValueError
1156            If no mass spectra are found in the dataset.
1157            If the scan number list is already set and overwrite is False.
1158        """
1159        # Check if _ms is empty and raise error if so
1160        if len(self._ms) == 0:
1161            raise ValueError("No mass spectra found in dataset")
1162
1163        # Check if scans_number_list is already set and raise error if so
1164        if len(self.scans_number) > 0 and not overwrite:
1165            raise ValueError(
1166                "Scan number list already set, use overwrite=True to overwrite"
1167            )
1168
1169        self.scans_number = sorted(self._ms.keys())
1170
1171    @property
1172    def ms1_scans(self):
1173        """
1174        list : A list of MS1 scan numbers for the dataset.
1175        """
1176        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()
1177
1178    @property
1179    def parameters(self):
1180        """
1181        LCMSParameters : The parameters used for the LC-MS analysis.
1182        """
1183        return self._parameters
1184
1185    @parameters.setter
1186    def parameters(self, paramsinstance):
1187        """
1188        Sets the parameters used for the LC-MS analysis.
1189
1190        Parameters
1191        -----------
1192        paramsinstance : LCMSParameters
1193            The parameters used for the LC-MS analysis.
1194        """
1195        self._parameters = paramsinstance
1196
1197    @property
1198    def scans_number(self):
1199        """
1200        list : A list of scan numbers for the dataset.
1201        """
1202        return self._scans_number_list
1203
1204    @scans_number.setter
1205    def scans_number(self, scan_numbers_list):
1206        """
1207        Sets the scan numbers for the dataset.
1208
1209        Parameters
1210        -----------
1211        scan_numbers_list : list
1212            A list of scan numbers for the dataset.
1213        """
1214        self._scans_number_list = scan_numbers_list
1215
1216    @property
1217    def retention_time(self):
1218        """
1219        numpy.ndarray : An array of retention times for the dataset.
1220        """
1221        return self._retention_time_list
1222
1223    @retention_time.setter
1224    def retention_time(self, rt_list):
1225        """
1226        Sets the retention times for the dataset.
1227
1228        Parameters
1229        -----------
1230        rt_list : list
1231            A list of retention times for the dataset.
1232        """
1233        self._retention_time_list = np.array(rt_list)
1234
1235    @property
1236    def tic(self):
1237        """
1238        numpy.ndarray : An array of TIC values for the dataset.
1239        """
1240        return self._tic_list
1241
1242    @tic.setter
1243    def tic(self, tic_list):
1244        """
1245        Sets the TIC values for the dataset.
1246
1247        Parameters
1248        -----------
1249        tic_list : list
1250            A list of TIC values for the dataset.
1251        """
1252        self._tic_list = np.array(tic_list)
class MassSpectraBase:
 16class MassSpectraBase:
 17    """Base class for mass spectra objects.
 18
 19    Parameters
 20    -----------
 21    file_location : str or Path
 22        The location of the file containing the mass spectra data.
 23    analyzer : str, optional
 24        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 25    instrument_label : str, optional
 26        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 27    sample_name : str, optional
 28        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 29    spectra_parser : object, optional
 30        The spectra parser object used to create the mass spectra object. Defaults to None.
 31
 32    Attributes
 33    -----------
 34    spectra_parser_class : class
 35        The class of the spectra parser used to create the mass spectra object.
 36    file_location : str or Path
 37        The location of the file containing the mass spectra data.
 38    sample_name : str
 39        The name of the sample; defaults to the file name if not provided to the parser.
 40    analyzer : str
 41        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
 42    instrument_label : str
 43        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
 44    _scan_info : dict
 45        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
 46        scan text, and scan window (lower and upper).
 47        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
 48    _ms : dict
 49        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
 50    _ms_unprocessed: dictionary of pandas.DataFrames or None
  51        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
 52        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
 53
 54    Methods
 55    --------
  56    * add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None).
  57        Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans
 58    * get_time_of_scan_id(scan).
 59        Returns the scan time for the specified scan number.
 60    """
 61
 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}
119
120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec
138
139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        using_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)
270
271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt
294
295    @property
296    def scan_df(self):
297        """
298        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
299        """
300        scan_df = pd.DataFrame.from_dict(self._scan_info)
301        return scan_df
302        
303    @property
304    def ms(self):
305        """
306        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
307        """
308        return self._ms
309
310    
311    @scan_df.setter
312    def scan_df(self, df):
313        """
314        Sets the scan data for the dataset.
315
316        Parameters
317        -----------
318        df : pandas.DataFrame
319            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
320            precursor m/z, scan text, and scan window (lower and upper).
321        """
322        self._scan_info = df.to_dict()
323
324    def __getitem__(self, scan_number):
325        return self._ms.get(scan_number)

Base class for mass spectra objects.

Parameters
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  • instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  • sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  • spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
  • spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
  • analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
  • instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
  • _scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
  • _ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
  • _ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
Methods
  • add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True). Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans
  • get_time_of_scan_id(scan). Returns the scan time for the specified scan number.
MassSpectraBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None, spectra_parser=None)
 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}
file_location
analyzer
instrument_label
def add_mass_spectrum(self, mass_spec):
120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec

Adds a mass spectrum to the dataset.

Parameters
  • mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.
Notes

This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.

def add_mass_spectra( self, scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None):
139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        using_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)

Add mass spectra to _ms dictionary, from a list of scans or single scan

Notes

The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.

Parameters
  • scan_list (list of ints): List of scans to use to populate _ms slot
  • spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
  • ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
  • use_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
  • auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
  • ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.
Raises
  • TypeError: If scan_list is not a list of ints
  • ValueError: If polarity is not 'positive' or 'negative'; if ms_level is not 1 or 2
def get_time_of_scan_id(self, scan):
271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt

Returns the scan time for the specified scan number.

Parameters
  • scan (int): The scan number of the desired scan time.
Returns
  • float: The scan time for the specified scan number (in minutes).
Raises
  • ValueError: If no scan time is found for the specified scan number.
scan_df

pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).

ms

dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles

 328class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
 329    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
 330
 331    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
 332
 333    Parameters
 334    -----------
 335    file_location : str or Path
 336        The location of the file containing the mass spectra data.
 337    analyzer : str, optional
 338        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 339    instrument_label : str, optional
 340        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 341    sample_name : str, optional
 342        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 343    spectra_parser : object, optional
 344        The spectra parser object used to create the mass spectra object. Defaults to None.
 345
 346    Attributes
 347    -----------
 348    polarity : str
 349        The polarity of the ionization mode used for the dataset.
 350    _parameters : LCMSParameters
 351        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
 352    _retention_time_list : numpy.ndarray
 353        An array of retention times for the dataset.
 354    _scans_number_list : list
 355        A list of scan numbers for the dataset.
 356    _tic_list : numpy.ndarray
 357        An array of total ion current (TIC) values for the dataset.
 358    eics : dict
 359        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
 360        Key is the mz of the EIC. Initialized as an empty dictionary.
 361    mass_features : dictionary of LCMSMassFeature objects
 362        A dictionary containing mass features for the dataset.
 363        Key is mass feature ID. Initialized as an empty dictionary.
 364    spectral_search_results : dictionary of MS2SearchResults objects
 365        A dictionary containing spectral search results for the dataset.
 366        Key is scan number : precursor mz. Initialized as an empty dictionary.
 367
 368    Methods
 369    --------
 370    * get_parameters_json().
 371        Returns the parameters used for the LC-MS analysis in JSON format.
 372    * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 373        Adds which MS2 scans are associated with each mass feature to the
 374        mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
 375    * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True)
 376        Adds the MS1 spectra associated with each mass feature to the
 377        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
 378    * mass_features_to_df()
 379        Returns a pandas dataframe summarizing the mass features in the dataset.
 380    * set_tic_list_from_data(overwrite=False)
 381        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
 382    * set_retention_time_from_data(overwrite=False)
 383        Sets the retention time list from the data in the _ms dictionary.
 384    * set_scans_number_from_data(overwrite=False)
 385        Sets the scan number list from the data in the _ms dictionary.
 386    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
 387        Generates plot of M/Z features comparing scan time vs M/Z value
 388    """
 389
 390    def __init__(
 391        self,
 392        file_location,
 393        analyzer="Unknown",
 394        instrument_label="Unknown",
 395        sample_name=None,
 396        spectra_parser=None,
 397    ):
 398        super().__init__(
 399            file_location, analyzer, instrument_label, sample_name, spectra_parser
 400        )
 401        self.polarity = ""
 402        self._parameters = LCMSParameters()
 403        self._retention_time_list = []
 404        self._scans_number_list = []
 405        self._tic_list = []
 406        self.eics = {}
 407        self.mass_features = {}
 408        self.spectral_search_results = {}
 409
 410    def get_parameters_json(self):
 411        """Returns the parameters stored for the LC-MS object in JSON format.
 412
 413        Returns
 414        --------
 415        str
 416            The parameters used for the LC-MS analysis in JSON format.
 417        """
 418        return self.parameters.to_json()
 419
 420    def remove_unprocessed_data(self, ms_level=None):
 421        """Removes the unprocessed data from the LCMSBase object.
 422
 423        Parameters
 424        -----------
 425        ms_level : int, optional
 426            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
 427
 428        Raises
 429        ------
 430        ValueError
 431            If ms_level is not 1 or 2.
 432
 433        Notes
 434        -----
 435        This method is useful for freeing up memory after the data has been processed.
 436        """
 437        if ms_level is None:
 438            for ms_level in self._ms_unprocessed.keys():
 439                self._ms_unprocessed[ms_level] = None
 440        if ms_level not in [1, 2]:
 441            raise ValueError("ms_level must be 1 or 2")
 442        self._ms_unprocessed[ms_level] = None
 443
 444    def add_associated_ms2_dda(
 445        self,
 446        auto_process=True,
 447        use_parser=True,
 448        spectrum_mode=None,
 449        ms_params_key="ms2",
 450        scan_filter=None,
 451    ):
 452        """Add MS2 spectra associated with mass features to the dataset.
 453
 454        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
 455
 456        Parameters
 457        -----------
 458        auto_process : bool, optional
 459            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
 460        use_parser : bool, optional
 461            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
 462        spectrum_mode : str or None, optional
 463            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 464            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 465            Defaults to None. (faster if defined, otherwise will check each scan)
 466        ms_params_key : string, optional
 467            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
 468            Defaults to 'ms2'.
 469        scan_filter : str
 470            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
 471            "hcd" will pull out only HCD scans.
 472
 473        Raises
 474        ------
 475        ValueError
 476            If mass_features is not set, must run find_mass_features() first.
 477            If no MS2 scans are found in the dataset.
 478            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
 479        """
 480        # Check if mass_features is set, raise error if not
 481        if self.mass_features is None:
 482            raise ValueError(
 483                "mass_features not set, must run find_mass_features() first"
 484            )
 485
 486        # reconfigure ms_params to get the correct mass spectrum parameters from the key
 487        ms_params = self.parameters.mass_spectrum[ms_params_key]
 488
 489        mf_df = self.mass_features_to_df().copy()
 490        # Find ms2 scans that have a precursor m/z value
 491        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
 492        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
 493        # drop ms2 scans that have no tic
 494        ms2_scans = ms2_scans[ms2_scans.tic > 0]
 495        if ms2_scans is None:
 496            raise ValueError("No DDA scans found in dataset")
 497
 498        if scan_filter is not None:
 499            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
 500        # set tolerance in rt space (in minutes) and mz space (in daltons)
 501        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
 502        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
 503
 504        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
 505        dda_scans = []
 506        for i, row in mf_df.iterrows():
 507            ms2_scans_filtered = ms2_scans[
 508                ms2_scans.scan_time.between(
 509                    row.scan_time - time_tol, row.scan_time + time_tol
 510                )
 511            ]
 512            ms2_scans_filtered = ms2_scans_filtered[
 513                ms2_scans_filtered.precursor_mz.between(
 514                    row.mz - mz_tol, row.mz + mz_tol
 515                )
 516            ]
 517            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
 518            self.mass_features[i].ms2_scan_numbers = (
 519                ms2_scans_filtered.scan.tolist()
 520                + self.mass_features[i].ms2_scan_numbers
 521            )
 522        # add to _ms attribute
 523        self.add_mass_spectra(
 524            scan_list=list(set(dda_scans)),
 525            auto_process=auto_process,
 526            spectrum_mode=spectrum_mode,
 527            use_parser=use_parser,
 528            ms_params=ms_params,
 529        )
 530        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
 531        for mf_id in self.mass_features:
 532            if self.mass_features[mf_id].ms2_scan_numbers is not None:
 533                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
 534                    if dda_scan in self._ms.keys():
 535                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
 536                            dda_scan
 537                        ]
 538
 539    def add_associated_ms1(
 540        self, auto_process=True, use_parser=True, spectrum_mode=None
 541    ):
 542        """Add MS1 spectra associated with mass features to the dataset.
 543
 544        Parameters
 545        -----------
 546        auto_process : bool, optional
 547            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
 548        use_parser : bool, optional
 549            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
 550        spectrum_mode : str or None, optional
 551            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 552            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 553            Defaults to None. (faster if defined, otherwise will check each scan)
 554
 555        Raises
 556        ------
 557        ValueError
 558            If mass_features is not set, must run find_mass_features() first.
 559            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
 560            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
 561            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
 562        """
 563        # Check if mass_features is set, raise error if not
 564        if self.mass_features is None:
 565            raise ValueError(
 566                "mass_features not set, must run find_mass_features() first"
 567            )
 568        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
 569
 570        if scans_to_average == 1:
 571            # Add to LCMSobj
 572            self.add_mass_spectra(
 573                scan_list=[
 574                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
 575                ],
 576                auto_process=auto_process,
 577                use_parser=use_parser,
 578                spectrum_mode=spectrum_mode,
 579                ms_params=self.parameters.mass_spectrum["ms1"],
 580            )
 581
 582        elif (
 583            (scans_to_average - 1) % 2
 584        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
 585            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
 586            # Check if all apex scans are profile mode, raise error if not
 587            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
 588                raise ValueError("All apex scans must be profile mode for averaging")
 589
 590            # First get sets of scans to average
 591            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
 592                ms1_idx_start = ms1_scans.index(apex_scan) - int(
 593                    (scans_to_average - 1) / 2
 594                )
 595                if ms1_idx_start < 0:
 596                    ms1_idx_start = 0
 597                ms1_idx_end = (
 598                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
 599                )
 600                if ms1_idx_end > (len(ms1_scans) - 1):
 601                    ms1_idx_end = len(ms1_scans) - 1
 602                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
 603                return scan_list
 604
 605            ms1_scans = self.ms1_scans
 606            scans_lists = [
 607                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
 608                for apex_scan in apex_scans
 609            ]
 610
 611            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 612            if self.polarity == "negative":
 613                polarity = -1
 614            elif self.polarity == "positive":
 615                polarity = 1
 616
 617            if not use_parser:
 618                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
 619                ms1_unprocessed = self._ms_unprocessed[1].copy()
 620                # Set the index on _ms_unprocessed[1] to scan number
 621                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
 622                self._ms_unprocessed[1] = ms1_unprocessed
 623
 624                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
 625                scans_lists_flat = list(
 626                    set([scan for sublist in scans_lists for scan in sublist])
 627                )
 628                if (
 629                    len(
 630                        np.setdiff1d(
 631                            np.sort(scans_lists_flat),
 632                            np.sort(ms1_unprocessed.index.values),
 633                        )
 634                    )
 635                    > 0
 636                ):
 637                    raise ValueError(
 638                        "Not all scans to average are present in the unprocessed data"
 639                    )
 640
 641            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
 642                # Get unprocessed mass spectrum from scans
 643                ms = self.get_average_mass_spectrum(
 644                    scan_list=scan_list_average,
 645                    apex_scan=apex_scan,
 646                    spectrum_mode="profile",
 647                    ms_level=1,
 648                    auto_process=auto_process,
 649                    use_parser=use_parser,
 650                    perform_checks=False,
 651                    polarity=polarity,
 652                    ms_params=self.parameters.mass_spectrum["ms1"],
 653                )
 654                # Add mass spectrum to LCMS object and associated with mass feature
 655                self.add_mass_spectrum(ms)
 656
 657            if not use_parser:
 658                # Reset the index on _ms_unprocessed[1] to not be scan number
 659                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
 660                self._ms_unprocessed[1] = ms1_unprocessed
 661        else:
 662            raise ValueError(
 663                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
 664            )
 665
 666        # Associate the ms1 spectra with the mass features
 667        for mf_id in self.mass_features:
 668            self.mass_features[mf_id].mass_spectrum = self._ms[
 669                self.mass_features[mf_id].apex_scan
 670            ]
 671            self.mass_features[mf_id].update_mz()
 672
 673    def mass_features_to_df(self):
 674        """Returns a pandas dataframe summarizing the mass features.
 675
 676        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
 677        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
 678
 679
 680        Returns
 681        --------
 682        pandas.DataFrame
 683            A pandas dataframe of mass features with the following columns:
 684            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
 685        """
 686
 687        def mass_spectrum_to_string(
 688            mass_spec, normalize=True, min_normalized_abun=0.01
 689        ):
 690            """Converts a mass spectrum to a string of m/z:abundance pairs.
 691
 692            Parameters
 693            -----------
 694            mass_spec : MassSpectrum
 695                A MassSpectrum object to be converted to a string.
 696            normalize : bool, optional
 697                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
 698            min_normalized_abun : float, optional
 699                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
 700
 701            Returns
 702            --------
 703            str
 704                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
 705            """
 706            mz_np = mass_spec.to_dataframe()["m/z"].values
 707            abun_np = mass_spec.to_dataframe()["Peak Height"].values
 708            if normalize:
 709                abun_np = abun_np / abun_np.max()
 710            mz_abun = np.column_stack((mz_np, abun_np))
 711            if normalize:
 712                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
 713            mz_abun_str = [
 714                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
 715                for mz, abun in mz_abun
 716            ]
 717            return "; ".join(mz_abun_str)
 718
 719        cols_in_df = [
 720            "id",
 721            "_apex_scan",
 722            "start_scan",
 723            "final_scan",
 724            "_retention_time",
 725            "_intensity",
 726            "_persistence",
 727            "_area",
 728            "_dispersity_index",
 729            "_tailing_factor",
 730            "monoisotopic_mf_id",
 731            "isotopologue_type",
 732            "mass_spectrum_deconvoluted_parent",
 733        ]
 734        df_mf_list = []
 735        for mf_id in self.mass_features.keys():
 736            # Find cols_in_df that are in single_mf
 737            df_keys = list(
 738                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
 739            )
 740            dict_mf = {}
 741            for key in df_keys:
 742                dict_mf[key] = getattr(self.mass_features[mf_id], key)
 743            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
 744                # Add MS2 spectra info
 745                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
 746                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
 747            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
 748                dict_mf["associated_mass_features"] = ", ".join(
 749                    map(
 750                        str,
 751                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
 752                    )
 753                )
 754            if self.mass_features[mf_id]._half_height_width is not None:
 755                dict_mf["half_height_width"] = self.mass_features[
 756                    mf_id
 757                ].half_height_width
 758            # Check if EIC for mass feature is set
 759            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
 760            df_mf_single["mz"] = self.mass_features[mf_id].mz
 761            df_mf_list.append(df_mf_single)
 762        df_mf = pd.concat(df_mf_list)
 763
 764        # rename _area to area and id to mf_id
 765        df_mf = df_mf.rename(
 766            columns={
 767                "_area": "area",
 768                "id": "mf_id",
 769                "_apex_scan": "apex_scan",
 770                "_retention_time": "scan_time",
 771                "_intensity": "intensity",
 772                "_persistence": "persistence",
 773                "_dispersity_index": "dispersity_index",
 774                "_tailing_factor": "tailing_factor",
 775            }
 776        )
 777
 778        # reorder columns
 779        col_order = [
 780            "mf_id",
 781            "scan_time",
 782            "mz",
 783            "apex_scan",
 784            "start_scan",
 785            "final_scan",
 786            "intensity",
 787            "persistence",
 788            "area",
 789            "half_height_width",
 790            "tailing_factor",
 791            "dispersity_index",
 792            "monoisotopic_mf_id",
 793            "isotopologue_type",
 794            "mass_spectrum_deconvoluted_parent",
 795            "associated_mass_features",
 796            "ms2_spectrum",
 797        ]
 798        # drop columns that are not in col_order
 799        cols_to_order = [col for col in col_order if col in df_mf.columns]
 800        df_mf = df_mf[cols_to_order]
 801
 802        # reset index to mf_id
 803        df_mf = df_mf.set_index("mf_id")
 804        df_mf.index.name = "mf_id"
 805
 806        return df_mf
 807
 808    def mass_features_ms1_annot_to_df(self):
 809        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
 810
 811        Returns
 812        --------
 813        pandas.DataFrame
 814            A pandas dataframe of MS1 annotations for the mass features in the dataset.
 815            The index is set to mf_id (mass feature ID)
 816
 817        Raises
 818        ------
 819        Warning
 820            If no MS1 annotations were found for the mass features in the dataset.
 821        """
 822        annot_df_list_ms1 = []
 823        for mf_id in self.mass_features.keys():
 824            if self.mass_features[mf_id].mass_spectrum is None:
 825                pass
 826            else:
 827                # Add ms1 annotations to ms1 annotation list
 828                if (
 829                    np.abs(
 830                        (
 831                            self.mass_features[mf_id].ms1_peak.mz_exp
 832                            - self.mass_features[mf_id].mz
 833                        )
 834                    )
 835                    < 0.01
 836                ):
 837                    # Get the molecular formula from the mass spectrum
 838                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
 839                    # Subset to pull out only the peak associated with the mass feature
 840                    annot_df = annot_df[
 841                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
 842                    ].copy()
 843
 844                    # Remove the index column and add column for mf_id
 845                    annot_df = annot_df.drop(columns=["Index"])
 846                    annot_df["mf_id"] = mf_id
 847                    annot_df_list_ms1.append(annot_df)
 848
 849        if len(annot_df_list_ms1) > 0:
 850            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
 851            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
 852            annot_ms1_df_full.index.name = "mf_id"
 853
 854        else:
 855            annot_ms1_df_full = None
 856            # Warn that no ms1 annotations were found
 857            warnings.warn(
 858                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
 859                UserWarning,
 860            )
 861
 862        return annot_ms1_df_full
 863
 864    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
 865        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
 866
 867        Parameters
 868        -----------
 869        molecular_metadata :  dict of MolecularMetadata objects
 870            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
 871
 872        Returns
 873        --------
 874        pandas.DataFrame
 875            A pandas dataframe of MS2 annotations for the mass features in the dataset,
 876            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
 877
 878        Raises
 879        ------
 880        Warning
 881            If no MS2 annotations were found for the mass features in the dataset.
 882        """
 883        annot_df_list_ms2 = []
 884        for mf_id in self.mass_features.keys():
 885            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
 886                # Add ms2 annotations to ms2 annotation list
 887                for result in self.mass_features[mf_id].ms2_similarity_results:
 888                    annot_df_ms2 = result.to_dataframe()
 889                    annot_df_ms2["mf_id"] = mf_id
 890                    annot_df_list_ms2.append(annot_df_ms2)
 891
 892        if len(annot_df_list_ms2) > 0:
 893            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
 894            if molecular_metadata is not None:
 895                molecular_metadata_df = pd.concat(
 896                    [
 897                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
 898                        for k, v in molecular_metadata.items()
 899                    ],
 900                    ignore_index=True,
 901                )
 902                molecular_metadata_df = molecular_metadata_df.rename(
 903                    columns={"id": "ref_mol_id"}
 904                )
 905                annot_ms2_df_full = annot_ms2_df_full.merge(
 906                    molecular_metadata_df, on="ref_mol_id", how="left"
 907                )
 908            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
 909                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
 910            ).copy()
 911            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
 912            annot_ms2_df_full.index.name = "mf_id"
 913        else:
 914            annot_ms2_df_full = None
 915            # Warn that no ms2 annotations were found
 916            warnings.warn(
 917                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
 918                UserWarning,
 919            )
 920
 921        return annot_ms2_df_full
 922
 923    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 924        """Returns a figure displaying 
 925            (1) thresholded, unprocessed data
 926            (2) the m/z features
 927            (3) which m/z features are associated with MS2 spectra
 928
 929        Parameters
 930        -----------
 931        binsize :  float
 932            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 933        mf_plot : boolean
 934            Indicates whether to plot the m/z features. Defaults to True.
 935        ms2_plot : boolean
 936            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 937        return_fig : boolean
 938            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 939
 940        Returns
 941        --------
 942        matplotlib.pyplot.Figure
 943            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 944            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 945            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 946            features with associated with MS2 spectra are plotted, they are displayed in red.
 947
 948        Raises
 949        ------
 950        Warning
 951            If m/z features are set to be plot but aren't in the dataset.
 952            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 953            were found for the m/z features in the dataset.
 954        """
 955        if mf_plot:
 956            # Check if mass_features is set, raise error if not
 957            if self.mass_features is None:
 958                raise ValueError(
 959                    "mass_features not set, must run find_mass_features() first"
 960                )
 961            ## call mass feature data
 962            mf_df = self.mass_features_to_df()
 963
 964        if ms2_plot:
 965            if not mf_plot:
 966                # Check if mass_features is set, raise error if not
 967                if self.mass_features is None:
 968                    raise ValueError(
 969                        "mass_features not set, must run find_mass_features() first"
 970                    )
 971
 972            ## call m/z feature data
 973            mf_df = self.mass_features_to_df()
 974
 975            # Check if ms2_spectrum is set, raise error if not
 976            if 'ms2_spectrum' not in mf_df.columns:
 977                raise ValueError(                
 978                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 979                )
 980    
 981        ## threshold and grid unprocessed data
 982        df = self._ms_unprocessed[1].copy()
 983        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 984        threshold = ph_int_min_thresh * df.intensity.max()
 985        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 986        df = self.grid_data(df_thres)
 987    
 988        ## format unprocessed data for plotting
 989        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 990        mz_grid = np.arange(0, np.max(df.mz), binsize)
 991        mz_data = np.array(df.mz)
 992        df['mz_bin'] = find_closest(mz_grid, mz_data)
 993        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 994        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 995
 996        ## generate figure
 997        fig = plt.figure()
 998        plt.scatter(
 999            unproc_df.scan_time,
1000            unproc_df.mz_bin*binsize,
1001            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1002            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1003            cmap = 'Greys_r',
1004            s = 1
1005        )
1006
1007        if mf_plot:
1008            if ms2_plot:
1009                plt.scatter(
1010                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1011                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1012                    c = 'c',
1013                    s = 4,
1014                    label = 'M/Z features without MS2'
1015                )
1016            else:
1017                plt.scatter(
1018                    mf_df.scan_time,
1019                    mf_df.mz,
1020                    c = 'c',
1021                    s = 4,
1022                    label = 'M/Z features'
1023                )
1024
1025        if ms2_plot: 
1026            plt.scatter(
1027                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1028                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1029                c = 'r',
1030                s = 2,
1031                label = 'M/Z features with MS2'
1032            )
1033
1034        if mf_plot == True or ms2_plot == True:
1035            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1036        plt.xlabel('Scan time')
1037        plt.ylabel('m/z')
1038        plt.ylim(0, np.ceil(np.max(df.mz)))
1039        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1040        plt.title('Composite Feature Map')
1041
1042        if return_fig:
1043            plt.close(fig)
1044            return fig
1045
1046        else:
1047            plt.show()
1048
1049    def __len__(self):
1050        """
1051        Returns the number of mass spectra in the dataset.
1052
1053        Returns
1054        --------
1055        int
1056            The number of mass spectra in the dataset.
1057        """
1058        return len(self._ms)
1059
1060    def __getitem__(self, scan_number):
1061        """
1062        Returns the mass spectrum corresponding to the specified scan number.
1063
1064        Parameters
1065        -----------
1066        scan_number : int
1067            The scan number of the desired mass spectrum.
1068
1069        Returns
1070        --------
1071        MassSpectrum
1072            The mass spectrum corresponding to the specified scan number.
1073        """
1074        return self._ms.get(scan_number)
1075
1076    def __iter__(self):
1077        """Returns an iterator over the mass spectra in the dataset.
1078
1079        Returns
1080        --------
1081        iterator
1082            An iterator over the mass spectra in the dataset.
1083        """
1084        return iter(self._ms.values())
1085
1086    def set_tic_list_from_data(self, overwrite=False):
1087        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1088
1089        Parameters
1090        -----------
1091        overwrite : bool, optional
1092            If True, overwrites the TIC list if it is already set. Defaults to False.
1093
1094        Notes
1095        -----
1096        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1097
1098        Raises
1099        ------
1100        ValueError
1101            If no mass spectra are found in the dataset.
1102            If the TIC list is already set and overwrite is False.
1103        """
1104        # Check if _ms is empty and raise error if so
1105        if len(self._ms) == 0:
1106            raise ValueError("No mass spectra found in dataset")
1107
1108        # Check if tic_list is already set and raise error if so
1109        if len(self.tic) > 0 and not overwrite:
1110            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1111
1112        self.tic = [self._ms.get(i).tic for i in self.scans_number]
1113
1114    def set_retention_time_from_data(self, overwrite=False):
1115        """Sets the retention time list from the data in the _ms dictionary.
1116
1117        Parameters
1118        -----------
1119        overwrite : bool, optional
1120            If True, overwrites the retention time list if it is already set. Defaults to False.
1121
1122        Notes
1123        -----
1124        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1125
1126        Raises
1127        ------
1128        ValueError
1129            If no mass spectra are found in the dataset.
1130            If the retention time list is already set and overwrite is False.
1131        """
1132        # Check if _ms is empty and raise error if so
1133        if len(self._ms) == 0:
1134            raise ValueError("No mass spectra found in dataset")
1135
1136        # Check if retention_time_list is already set and raise error if so
1137        if len(self.retention_time) > 0 and not overwrite:
1138            raise ValueError(
1139                "Retention time list already set, use overwrite=True to overwrite"
1140            )
1141
1142        retention_time_list = []
1143        for key_ms in sorted(self._ms.keys()):
1144            retention_time_list.append(self._ms.get(key_ms).retention_time)
1145        self.retention_time = retention_time_list
1146
1147    def set_scans_number_from_data(self, overwrite=False):
1148        """Sets the scan number list from the data in the _ms dictionary.
1149
1150        Notes
1151        -----
1152        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1153
1154        Raises
1155        ------
1156        ValueError
1157            If no mass spectra are found in the dataset.
1158            If the scan number list is already set and overwrite is False.
1159        """
1160        # Check if _ms is empty and raise error if so
1161        if len(self._ms) == 0:
1162            raise ValueError("No mass spectra found in dataset")
1163
1164        # Check if scans_number_list is already set and raise error if so
1165        if len(self.scans_number) > 0 and not overwrite:
1166            raise ValueError(
1167                "Scan number list already set, use overwrite=True to overwrite"
1168            )
1169
1170        self.scans_number = sorted(self._ms.keys())
1171
1172    @property
1173    def ms1_scans(self):
1174        """
1175        list : A list of MS1 scan numbers for the dataset.
1176        """
1177        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()
1178
1179    @property
1180    def parameters(self):
1181        """
1182        LCMSParameters : The parameters used for the LC-MS analysis.
1183        """
1184        return self._parameters
1185
1186    @parameters.setter
1187    def parameters(self, paramsinstance):
1188        """
1189        Sets the parameters used for the LC-MS analysis.
1190
1191        Parameters
1192        -----------
1193        paramsinstance : LCMSParameters
1194            The parameters used for the LC-MS analysis.
1195        """
1196        self._parameters = paramsinstance
1197
1198    @property
1199    def scans_number(self):
1200        """
1201        list : A list of scan numbers for the dataset.
1202        """
1203        return self._scans_number_list
1204
1205    @scans_number.setter
1206    def scans_number(self, scan_numbers_list):
1207        """
1208        Sets the scan numbers for the dataset.
1209
1210        Parameters
1211        -----------
1212        scan_numbers_list : list
1213            A list of scan numbers for the dataset.
1214        """
1215        self._scans_number_list = scan_numbers_list
1216
1217    @property
1218    def retention_time(self):
1219        """
1220        numpy.ndarray : An array of retention times for the dataset.
1221        """
1222        return self._retention_time_list
1223
1224    @retention_time.setter
1225    def retention_time(self, rt_list):
1226        """
1227        Sets the retention times for the dataset.
1228
1229        Parameters
1230        -----------
1231        rt_list : list
1232            A list of retention times for the dataset.
1233        """
1234        self._retention_time_list = np.array(rt_list)
1235
1236    @property
1237    def tic(self):
1238        """
1239        numpy.ndarray : An array of TIC values for the dataset.
1240        """
1241        return self._tic_list
1242
1243    @tic.setter
1244    def tic(self, tic_list):
1245        """
1246        Sets the TIC values for the dataset.
1247
1248        Parameters
1249        -----------
1250        tic_list : list
1251            A list of TIC values for the dataset.
1252        """
1253        self._tic_list = np.array(tic_list)

A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.

This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.

Parameters
  • file_location (str or Path): The location of the file containing the mass spectra data.
  • analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  • instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  • sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  • spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
  • polarity (str): The polarity of the ionization mode used for the dataset.
  • _parameters (LCMSParameters): The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
  • _retention_time_list (numpy.ndarray): An array of retention times for the dataset.
  • _scans_number_list (list): A list of scan numbers for the dataset.
  • _tic_list (numpy.ndarray): An array of total ion current (TIC) values for the dataset.
  • eics (dict): A dictionary containing extracted ion chromatograms (EICs) for the dataset. Key is the mz of the EIC. Initialized as an empty dictionary.
  • mass_features (dictionary of LCMSMassFeature objects): A dictionary containing mass features for the dataset. Key is mass feature ID. Initialized as an empty dictionary.
  • spectral_search_results (dictionary of MS2SearchResults objects): A dictionary containing spectral search results for the dataset. Key is scan number : precursor mz. Initialized as an empty dictionary.
Methods
  • get_parameters_json(). Returns the parameters used for the LC-MS analysis in JSON format.
  • add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None) Adds which MS2 scans are associated with each mass feature to the mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
  • add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) Adds the MS1 spectra associated with each mass feature to the mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
  • mass_features_to_df() Returns a pandas dataframe summarizing the mass features in the dataset.
  • set_tic_list_from_data(overwrite=False) Sets the TIC list from the mass spectrum objects within the _ms dictionary.
  • set_retention_time_from_data(overwrite=False) Sets the retention time list from the data in the _ms dictionary.
  • set_scans_number_from_data(overwrite=False) Sets the scan number list from the data in the _ms dictionary.
  • plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) Generates plot of M/Z features comparing scan time vs M/Z value
LCMSBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None, spectra_parser=None)
390    def __init__(
391        self,
392        file_location,
393        analyzer="Unknown",
394        instrument_label="Unknown",
395        sample_name=None,
396        spectra_parser=None,
397    ):
398        super().__init__(
399            file_location, analyzer, instrument_label, sample_name, spectra_parser
400        )
401        self.polarity = ""
402        self._parameters = LCMSParameters()
403        self._retention_time_list = []
404        self._scans_number_list = []
405        self._tic_list = []
406        self.eics = {}
407        self.mass_features = {}
408        self.spectral_search_results = {}
polarity
eics
mass_features
spectral_search_results
def get_parameters_json(self):
410    def get_parameters_json(self):
411        """Returns the parameters stored for the LC-MS object in JSON format.
412
413        Returns
414        --------
415        str
416            The parameters used for the LC-MS analysis in JSON format.
417        """
418        return self.parameters.to_json()

Returns the parameters stored for the LC-MS object in JSON format.

Returns
  • str: The parameters used for the LC-MS analysis in JSON format.
def remove_unprocessed_data(self, ms_level=None):
420    def remove_unprocessed_data(self, ms_level=None):
421        """Removes the unprocessed data from the LCMSBase object.
422
423        Parameters
424        -----------
425        ms_level : int, optional
426            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
427
428        Raises
429        ------
430        ValueError
431            If ms_level is not 1 or 2.
432
433        Notes
434        -----
435        This method is useful for freeing up memory after the data has been processed.
436        """
437        if ms_level is None:
438            for ms_level in self._ms_unprocessed.keys():
439                self._ms_unprocessed[ms_level] = None
440        if ms_level not in [1, 2]:
441            raise ValueError("ms_level must be 1 or 2")
442        self._ms_unprocessed[ms_level] = None

Removes the unprocessed data from the LCMSBase object.

Parameters
  • ms_level (int, optional): The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
Raises
  • ValueError: If ms_level is not 1 or 2.
Notes

This method is useful for freeing up memory after the data has been processed.

def add_associated_ms2_dda( self, auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None):
444    def add_associated_ms2_dda(
445        self,
446        auto_process=True,
447        use_parser=True,
448        spectrum_mode=None,
449        ms_params_key="ms2",
450        scan_filter=None,
451    ):
452        """Add MS2 spectra associated with mass features to the dataset.
453
454        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
455
456        Parameters
457        -----------
458        auto_process : bool, optional
459            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
460        use_parser : bool, optional
461            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
462        spectrum_mode : str or None, optional
463            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
464            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
465            Defaults to None. (faster if defined, otherwise will check each scan)
466        ms_params_key : string, optional
467            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
468            Defaults to 'ms2'.
469        scan_filter : str
470            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
471            "hcd" will pull out only HCD scans.
472
473        Raises
474        ------
475        ValueError
476            If mass_features is not set, must run find_mass_features() first.
477            If no MS2 scans are found in the dataset.
478            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
479        """
480        # Check if mass_features is set, raise error if not
481        if self.mass_features is None:
482            raise ValueError(
483                "mass_features not set, must run find_mass_features() first"
484            )
485
486        # reconfigure ms_params to get the correct mass spectrum parameters from the key
487        ms_params = self.parameters.mass_spectrum[ms_params_key]
488
489        mf_df = self.mass_features_to_df().copy()
490        # Find ms2 scans that have a precursor m/z value
491        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
492        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
493        # drop ms2 scans that have no tic
494        ms2_scans = ms2_scans[ms2_scans.tic > 0]
495        if ms2_scans is None:
496            raise ValueError("No DDA scans found in dataset")
497
498        if scan_filter is not None:
499            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
500        # set tolerance in rt space (in minutes) and mz space (in daltons)
501        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
502        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
503
504        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
505        dda_scans = []
506        for i, row in mf_df.iterrows():
507            ms2_scans_filtered = ms2_scans[
508                ms2_scans.scan_time.between(
509                    row.scan_time - time_tol, row.scan_time + time_tol
510                )
511            ]
512            ms2_scans_filtered = ms2_scans_filtered[
513                ms2_scans_filtered.precursor_mz.between(
514                    row.mz - mz_tol, row.mz + mz_tol
515                )
516            ]
517            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
518            self.mass_features[i].ms2_scan_numbers = (
519                ms2_scans_filtered.scan.tolist()
520                + self.mass_features[i].ms2_scan_numbers
521            )
522        # add to _ms attribute
523        self.add_mass_spectra(
524            scan_list=list(set(dda_scans)),
525            auto_process=auto_process,
526            spectrum_mode=spectrum_mode,
527            use_parser=use_parser,
528            ms_params=ms_params,
529        )
530        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
531        for mf_id in self.mass_features:
532            if self.mass_features[mf_id].ms2_scan_numbers is not None:
533                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
534                    if dda_scan in self._ms.keys():
535                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
536                            dda_scan
537                        ]

Add MS2 spectra associated with mass features to the dataset.

Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)

Parameters
  • auto_process (bool, optional): If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
  • use_parser (bool, optional): If True, invoke the spectra parser to get the MS2 spectra. Default is True.
  • spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
  • ms_params_key (string, optional): The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
  • scan_filter (str): A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. "hcd" will pull out only HCD scans.
Raises
  • ValueError: If mass_features is not set, must run find_mass_features() first. If no MS2 scans are found in the dataset. If no precursor m/z values are found in MS2 scans, not a DDA dataset.
def add_associated_ms1(self, auto_process=True, use_parser=True, spectrum_mode=None):
539    def add_associated_ms1(
540        self, auto_process=True, use_parser=True, spectrum_mode=None
541    ):
542        """Add MS1 spectra associated with mass features to the dataset.
543
544        Parameters
545        -----------
546        auto_process : bool, optional
547            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
548        use_parser : bool, optional
549            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
550        spectrum_mode : str or None, optional
551            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
552            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
553            Defaults to None. (faster if defined, otherwise will check each scan)
554
555        Raises
556        ------
557        ValueError
558            If mass_features is not set, must run find_mass_features() first.
559            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
560            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
561            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
562        """
563        # Check if mass_features is set, raise error if not
564        if self.mass_features is None:
565            raise ValueError(
566                "mass_features not set, must run find_mass_features() first"
567            )
568        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
569
570        if scans_to_average == 1:
571            # Add to LCMSobj
572            self.add_mass_spectra(
573                scan_list=[
574                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
575                ],
576                auto_process=auto_process,
577                use_parser=use_parser,
578                spectrum_mode=spectrum_mode,
579                ms_params=self.parameters.mass_spectrum["ms1"],
580            )
581
582        elif (
583            (scans_to_average - 1) % 2
584        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
585            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
586            # Check if all apex scans are profile mode, raise error if not
587            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
588                raise ValueError("All apex scans must be profile mode for averaging")
589
590            # First get sets of scans to average
591            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
592                ms1_idx_start = ms1_scans.index(apex_scan) - int(
593                    (scans_to_average - 1) / 2
594                )
595                if ms1_idx_start < 0:
596                    ms1_idx_start = 0
597                ms1_idx_end = (
598                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
599                )
600                if ms1_idx_end > (len(ms1_scans) - 1):
601                    ms1_idx_end = len(ms1_scans) - 1
602                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
603                return scan_list
604
605            ms1_scans = self.ms1_scans
606            scans_lists = [
607                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
608                for apex_scan in apex_scans
609            ]
610
611            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
612            if self.polarity == "negative":
613                polarity = -1
614            elif self.polarity == "positive":
615                polarity = 1
616
617            if not use_parser:
618                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
619                ms1_unprocessed = self._ms_unprocessed[1].copy()
620                # Set the index on _ms_unprocessed[1] to scan number
621                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
622                self._ms_unprocessed[1] = ms1_unprocessed
623
624                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
625                scans_lists_flat = list(
626                    set([scan for sublist in scans_lists for scan in sublist])
627                )
628                if (
629                    len(
630                        np.setdiff1d(
631                            np.sort(scans_lists_flat),
632                            np.sort(ms1_unprocessed.index.values),
633                        )
634                    )
635                    > 0
636                ):
637                    raise ValueError(
638                        "Not all scans to average are present in the unprocessed data"
639                    )
640
641            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
642                # Get unprocessed mass spectrum from scans
643                ms = self.get_average_mass_spectrum(
644                    scan_list=scan_list_average,
645                    apex_scan=apex_scan,
646                    spectrum_mode="profile",
647                    ms_level=1,
648                    auto_process=auto_process,
649                    use_parser=use_parser,
650                    perform_checks=False,
651                    polarity=polarity,
652                    ms_params=self.parameters.mass_spectrum["ms1"],
653                )
654                # Add mass spectrum to LCMS object and associated with mass feature
655                self.add_mass_spectrum(ms)
656
657            if not use_parser:
658                # Reset the index on _ms_unprocessed[1] to not be scan number
659                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
660                self._ms_unprocessed[1] = ms1_unprocessed
661        else:
662            raise ValueError(
663                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
664            )
665
666        # Associate the ms1 spectra with the mass features
667        for mf_id in self.mass_features:
668            self.mass_features[mf_id].mass_spectrum = self._ms[
669                self.mass_features[mf_id].apex_scan
670            ]
671            self.mass_features[mf_id].update_mz()

Add MS1 spectra associated with mass features to the dataset.

Parameters
  • auto_process (bool, optional): If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
  • use_parser (bool, optional): If True, invoke the spectra parser to get the MS1 spectra. Default is True.
  • spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
Raises
  • ValueError: If mass_features is not set, must run find_mass_features() first. If apex scans are not profile mode, all apex scans must be profile mode for averaging. If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
def mass_features_to_df(self):
673    def mass_features_to_df(self):
674        """Returns a pandas dataframe summarizing the mass features.
675
676        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
677        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
678
679
680        Returns
681        --------
682        pandas.DataFrame
683            A pandas dataframe of mass features with the following columns:
684            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
685        """
686
687        def mass_spectrum_to_string(
688            mass_spec, normalize=True, min_normalized_abun=0.01
689        ):
690            """Converts a mass spectrum to a string of m/z:abundance pairs.
691
692            Parameters
693            -----------
694            mass_spec : MassSpectrum
695                A MassSpectrum object to be converted to a string.
696            normalize : bool, optional
697                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
698            min_normalized_abun : float, optional
699                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
700
701            Returns
702            --------
703            str
704                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
705            """
706            mz_np = mass_spec.to_dataframe()["m/z"].values
707            abun_np = mass_spec.to_dataframe()["Peak Height"].values
708            if normalize:
709                abun_np = abun_np / abun_np.max()
710            mz_abun = np.column_stack((mz_np, abun_np))
711            if normalize:
712                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
713            mz_abun_str = [
714                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
715                for mz, abun in mz_abun
716            ]
717            return "; ".join(mz_abun_str)
718
719        cols_in_df = [
720            "id",
721            "_apex_scan",
722            "start_scan",
723            "final_scan",
724            "_retention_time",
725            "_intensity",
726            "_persistence",
727            "_area",
728            "_dispersity_index",
729            "_tailing_factor",
730            "monoisotopic_mf_id",
731            "isotopologue_type",
732            "mass_spectrum_deconvoluted_parent",
733        ]
734        df_mf_list = []
735        for mf_id in self.mass_features.keys():
736            # Find cols_in_df that are in single_mf
737            df_keys = list(
738                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
739            )
740            dict_mf = {}
741            for key in df_keys:
742                dict_mf[key] = getattr(self.mass_features[mf_id], key)
743            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
744                # Add MS2 spectra info
745                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
746                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
747            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
748                dict_mf["associated_mass_features"] = ", ".join(
749                    map(
750                        str,
751                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
752                    )
753                )
754            if self.mass_features[mf_id]._half_height_width is not None:
755                dict_mf["half_height_width"] = self.mass_features[
756                    mf_id
757                ].half_height_width
758            # Check if EIC for mass feature is set
759            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
760            df_mf_single["mz"] = self.mass_features[mf_id].mz
761            df_mf_list.append(df_mf_single)
762        df_mf = pd.concat(df_mf_list)
763
764        # rename _area to area and id to mf_id
765        df_mf = df_mf.rename(
766            columns={
767                "_area": "area",
768                "id": "mf_id",
769                "_apex_scan": "apex_scan",
770                "_retention_time": "scan_time",
771                "_intensity": "intensity",
772                "_persistence": "persistence",
773                "_dispersity_index": "dispersity_index",
774                "_tailing_factor": "tailing_factor",
775            }
776        )
777
778        # reorder columns
779        col_order = [
780            "mf_id",
781            "scan_time",
782            "mz",
783            "apex_scan",
784            "start_scan",
785            "final_scan",
786            "intensity",
787            "persistence",
788            "area",
789            "half_height_width",
790            "tailing_factor",
791            "dispersity_index",
792            "monoisotopic_mf_id",
793            "isotopologue_type",
794            "mass_spectrum_deconvoluted_parent",
795            "associated_mass_features",
796            "ms2_spectrum",
797        ]
798        # drop columns that are not in col_order
799        cols_to_order = [col for col in col_order if col in df_mf.columns]
800        df_mf = df_mf[cols_to_order]
801
802        # reset index to mf_id
803        df_mf = df_mf.set_index("mf_id")
804        df_mf.index.name = "mf_id"
805
806        return df_mf

Returns a pandas dataframe summarizing the mass features.

The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID).

Returns
  • pandas.DataFrame: A pandas dataframe of mass features with the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
def mass_features_ms1_annot_to_df(self):
808    def mass_features_ms1_annot_to_df(self):
809        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
810
811        Returns
812        --------
813        pandas.DataFrame
814            A pandas dataframe of MS1 annotations for the mass features in the dataset.
815            The index is set to mf_id (mass feature ID)
816
817        Raises
818        ------
819        Warning
820            If no MS1 annotations were found for the mass features in the dataset.
821        """
822        annot_df_list_ms1 = []
823        for mf_id in self.mass_features.keys():
824            if self.mass_features[mf_id].mass_spectrum is None:
825                pass
826            else:
827                # Add ms1 annotations to ms1 annotation list
828                if (
829                    np.abs(
830                        (
831                            self.mass_features[mf_id].ms1_peak.mz_exp
832                            - self.mass_features[mf_id].mz
833                        )
834                    )
835                    < 0.01
836                ):
837                    # Get the molecular formula from the mass spectrum
838                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
839                    # Subset to pull out only the peak associated with the mass feature
840                    annot_df = annot_df[
841                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
842                    ].copy()
843
844                    # Remove the index column and add column for mf_id
845                    annot_df = annot_df.drop(columns=["Index"])
846                    annot_df["mf_id"] = mf_id
847                    annot_df_list_ms1.append(annot_df)
848
849        if len(annot_df_list_ms1) > 0:
850            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
851            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
852            annot_ms1_df_full.index.name = "mf_id"
853
854        else:
855            annot_ms1_df_full = None
856            # Warn that no ms1 annotations were found
857            warnings.warn(
858                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
859                UserWarning,
860            )
861
862        return annot_ms1_df_full

Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.

Returns
  • pandas.DataFrame: A pandas dataframe of MS1 annotations for the mass features in the dataset. The index is set to mf_id (mass feature ID)
Raises
  • Warning: If no MS1 annotations were found for the mass features in the dataset.
def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
864    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
865        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
866
867        Parameters
868        -----------
869        molecular_metadata :  dict of MolecularMetadata objects
870            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
871
872        Returns
873        --------
874        pandas.DataFrame
875            A pandas dataframe of MS2 annotations for the mass features in the dataset,
876            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
877
878        Raises
879        ------
880        Warning
881            If no MS2 annotations were found for the mass features in the dataset.
882        """
883        annot_df_list_ms2 = []
884        for mf_id in self.mass_features.keys():
885            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
886                # Add ms2 annotations to ms2 annotation list
887                for result in self.mass_features[mf_id].ms2_similarity_results:
888                    annot_df_ms2 = result.to_dataframe()
889                    annot_df_ms2["mf_id"] = mf_id
890                    annot_df_list_ms2.append(annot_df_ms2)
891
892        if len(annot_df_list_ms2) > 0:
893            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
894            if molecular_metadata is not None:
895                molecular_metadata_df = pd.concat(
896                    [
897                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
898                        for k, v in molecular_metadata.items()
899                    ],
900                    ignore_index=True,
901                )
902                molecular_metadata_df = molecular_metadata_df.rename(
903                    columns={"id": "ref_mol_id"}
904                )
905                annot_ms2_df_full = annot_ms2_df_full.merge(
906                    molecular_metadata_df, on="ref_mol_id", how="left"
907                )
908            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
909                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
910            ).copy()
911            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
912            annot_ms2_df_full.index.name = "mf_id"
913        else:
914            annot_ms2_df_full = None
915            # Warn that no ms2 annotations were found
916            warnings.warn(
917                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
918                UserWarning,
919            )
920
921        return annot_ms2_df_full

Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.

Parameters
  • molecular_metadata (dict of MolecularMetadata objects): A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.
Returns
  • pandas.DataFrame: A pandas dataframe of MS2 annotations for the mass features in the dataset, and optionally molecular metadata. The index is set to mf_id (mass feature ID)
Raises
  • Warning: If no MS2 annotations were found for the mass features in the dataset.
def plot_composite_mz_features( self, binsize=0.0001, ph_int_min_thresh=0.001, mf_plot=True, ms2_plot=True, return_fig=False):
 923    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 924        """Returns a figure displaying 
 925            (1) thresholded, unprocessed data
 926            (2) the m/z features
 927            (3) which m/z features are associated with MS2 spectra
 928
 929        Parameters
 930        -----------
 931        binsize :  float
 932            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 933        mf_plot : boolean
 934            Indicates whether to plot the m/z features. Defaults to True.
 935        ms2_plot : boolean
 936            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 937        return_fig : boolean
 938            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 939
 940        Returns
 941        --------
 942        matplotlib.pyplot.Figure
 943            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 944            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 945            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 946            features with associated with MS2 spectra are plotted, they are displayed in red.
 947
 948        Raises
 949        ------
 950        Warning
 951            If m/z features are set to be plot but aren't in the dataset.
 952            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 953            were found for the m/z features in the dataset.
 954        """
 955        if mf_plot:
 956            # Check if mass_features is set, raise error if not
 957            if self.mass_features is None:
 958                raise ValueError(
 959                    "mass_features not set, must run find_mass_features() first"
 960                )
 961            ## call mass feature data
 962            mf_df = self.mass_features_to_df()
 963
 964        if ms2_plot:
 965            if not mf_plot:
 966                # Check if mass_features is set, raise error if not
 967                if self.mass_features is None:
 968                    raise ValueError(
 969                        "mass_features not set, must run find_mass_features() first"
 970                    )
 971
 972            ## call m/z feature data
 973            mf_df = self.mass_features_to_df()
 974
 975            # Check if ms2_spectrum is set, raise error if not
 976            if 'ms2_spectrum' not in mf_df.columns:
 977                raise ValueError(                
 978                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 979                )
 980    
 981        ## threshold and grid unprocessed data
 982        df = self._ms_unprocessed[1].copy()
 983        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 984        threshold = ph_int_min_thresh * df.intensity.max()
 985        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 986        df = self.grid_data(df_thres)
 987    
 988        ## format unprocessed data for plotting
 989        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 990        mz_grid = np.arange(0, np.max(df.mz), binsize)
 991        mz_data = np.array(df.mz)
 992        df['mz_bin'] = find_closest(mz_grid, mz_data)
 993        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 994        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 995
 996        ## generate figure
 997        fig = plt.figure()
 998        plt.scatter(
 999            unproc_df.scan_time,
1000            unproc_df.mz_bin*binsize,
1001            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1002            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1003            cmap = 'Greys_r',
1004            s = 1
1005        )
1006
1007        if mf_plot:
1008            if ms2_plot:
1009                plt.scatter(
1010                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1011                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1012                    c = 'c',
1013                    s = 4,
1014                    label = 'M/Z features without MS2'
1015                )
1016            else:
1017                plt.scatter(
1018                    mf_df.scan_time,
1019                    mf_df.mz,
1020                    c = 'c',
1021                    s = 4,
1022                    label = 'M/Z features'
1023                )
1024
1025        if ms2_plot: 
1026            plt.scatter(
1027                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1028                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1029                c = 'r',
1030                s = 2,
1031                label = 'M/Z features with MS2'
1032            )
1033
1034        if mf_plot == True or ms2_plot == True:
1035            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1036        plt.xlabel('Scan time')
1037        plt.ylabel('m/z')
1038        plt.ylim(0, np.ceil(np.max(df.mz)))
1039        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1040        plt.title('Composite Feature Map')
1041
1042        if return_fig:
1043            plt.close(fig)
1044            return fig
1045
1046        else:
1047            plt.show()

Returns a figure displaying (1) thresholded, unprocessed data (2) the m/z features (3) which m/z features are associated with MS2 spectra

Parameters
  • binsize (float): Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
  • mf_plot (boolean): Indicates whether to plot the m/z features. Defaults to True.
  • ms2_plot (boolean): Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
  • return_fig (boolean): Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
Returns
  • matplotlib.pyplot.Figure: A figure with the thresholded, unprocessed data on an axis of m/z value with respect to scan time. Unprocessed data is displayed in gray scale with darker colors indicating higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z features associated with MS2 spectra are plotted, they are displayed in red.
Raises
  • Warning: If m/z features are set to be plot but aren't in the dataset. If m/z features with associated MS2 data are set to be plot but no MS2 annotations were found for the m/z features in the dataset.
def set_tic_list_from_data(self, overwrite=False):
1086    def set_tic_list_from_data(self, overwrite=False):
1087        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1088
1089        Parameters
1090        -----------
1091        overwrite : bool, optional
1092            If True, overwrites the TIC list if it is already set. Defaults to False.
1093
1094        Notes
1095        -----
1096        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1097
1098        Raises
1099        ------
1100        ValueError
1101            If no mass spectra are found in the dataset.
1102            If the TIC list is already set and overwrite is False.
1103        """
1104        # Check if _ms is empty and raise error if so
1105        if len(self._ms) == 0:
1106            raise ValueError("No mass spectra found in dataset")
1107
1108        # Check if tic_list is already set and raise error if so
1109        if len(self.tic) > 0 and not overwrite:
1110            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1111
1112        self.tic = [self._ms.get(i).tic for i in self.scans_number]

Sets the TIC list from the mass spectrum objects within the _ms dictionary.

Parameters
  • overwrite (bool, optional): If True, overwrites the TIC list if it is already set. Defaults to False.
Notes

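The TIC list is built in scans_number order, so every scan number in scans_number must have a mass spectrum in the _ms dictionary; an incomplete _ms dictionary does not produce an empty TIC list.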

Raises
  • ValueError: If no mass spectra are found in the dataset. If the TIC list is already set and overwrite is False.
def set_retention_time_from_data(self, overwrite=False):
1114    def set_retention_time_from_data(self, overwrite=False):
1115        """Sets the retention time list from the data in the _ms dictionary.
1116
1117        Parameters
1118        -----------
1119        overwrite : bool, optional
1120            If True, overwrites the retention time list if it is already set. Defaults to False.
1121
1122        Notes
1123        -----
1124        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1125
1126        Raises
1127        ------
1128        ValueError
1129            If no mass spectra are found in the dataset.
1130            If the retention time list is already set and overwrite is False.
1131        """
1132        # Check if _ms is empty and raise error if so
1133        if len(self._ms) == 0:
1134            raise ValueError("No mass spectra found in dataset")
1135
1136        # Check if retention_time_list is already set and raise error if so
1137        if len(self.retention_time) > 0 and not overwrite:
1138            raise ValueError(
1139                "Retention time list already set, use overwrite=True to overwrite"
1140            )
1141
1142        retention_time_list = []
1143        for key_ms in sorted(self._ms.keys()):
1144            retention_time_list.append(self._ms.get(key_ms).retention_time)
1145        self.retention_time = retention_time_list

Sets the retention time list from the data in the _ms dictionary.

Parameters
  • overwrite (bool, optional): If True, overwrites the retention time list if it is already set. Defaults to False.
Notes

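Retention times are collected, in ascending scan-number order, from the mass spectra present in the _ms dictionary. An empty _ms dictionary raises ValueError (see Raises) rather than setting an empty list, and an incomplete dictionary yields retention times only for the scans it contains.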

Raises
  • ValueError: If no mass spectra are found in the dataset. If the retention time list is already set and overwrite is False.
def set_scans_number_from_data(self, overwrite=False):
1147    def set_scans_number_from_data(self, overwrite=False):
1148        """Sets the scan number list from the data in the _ms dictionary.
1149
1150        Notes
1151        -----
1152        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1153
1154        Raises
1155        ------
1156        ValueError
1157            If no mass spectra are found in the dataset.
1158            If the scan number list is already set and overwrite is False.
1159        """
1160        # Check if _ms is empty and raise error if so
1161        if len(self._ms) == 0:
1162            raise ValueError("No mass spectra found in dataset")
1163
1164        # Check if scans_number_list is already set and raise error if so
1165        if len(self.scans_number) > 0 and not overwrite:
1166            raise ValueError(
1167                "Scan number list already set, use overwrite=True to overwrite"
1168            )
1169
1170        self.scans_number = sorted(self._ms.keys())

Sets the scan number list from the data in the _ms dictionary.

Notes

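The scan number list is set to the sorted keys of the _ms dictionary, so it contains only the scans present there. An empty _ms dictionary raises ValueError (see Raises) rather than setting an empty list.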

Raises
  • ValueError: If no mass spectra are found in the dataset. If the scan number list is already set and overwrite is False.
ms1_scans

list : A list of MS1 scan numbers for the dataset.

parameters

LCMSParameters : The parameters used for the LC-MS analysis.

scans_number

list : A list of scan numbers for the dataset.

retention_time

numpy.ndarray : An array of retention times for the dataset.

tic

numpy.ndarray : An array of TIC values for the dataset.