corems.mass_spectra.factory.lc_class

View Source

   1from pathlib import Path
   2
   3import numpy as np
   4import pandas as pd
   5import warnings
   6import matplotlib.pyplot as plt
   7
   8from corems.encapsulation.factory.parameters import LCMSParameters
   9from corems.mass_spectra.calc.lc_calc import LCCalculations, PHCalculations
  10from corems.molecular_id.search.lcms_spectral_search import LCMSSpectralSearch
  11from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
  12from corems.mass_spectra.calc.lc_calc import find_closest
  13
  14
  15class MassSpectraBase:
  16    """Base class for mass spectra objects.
  17
  18    Parameters
  19    -----------
  20    file_location : str or Path
  21        The location of the file containing the mass spectra data.
  22    analyzer : str, optional
  23        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
  24    instrument_label : str, optional
  25        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
  26    sample_name : str, optional
  27        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
  28    spectra_parser : object, optional
  29        The spectra parser object used to create the mass spectra object. Defaults to None.
  30
  31    Attributes
  32    -----------
  33    spectra_parser_class : class
  34        The class of the spectra parser used to create the mass spectra object.
  35    file_location : str or Path
  36        The location of the file containing the mass spectra data.
  37    sample_name : str
  38        The name of the sample; defaults to the file name if not provided to the parser.
  39    analyzer : str
  40        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
  41    instrument_label : str
  42        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
  43    _scan_info : dict
  44        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
  45        scan text, and scan window (lower and upper).
  46        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
  47    _ms : dict
  48        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
  49    _ms_unprocessed: dictionary of pandas.DataFrames or None
  50        A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking.
  51        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
  52
  53    Methods
  54    --------
  55    * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True).
  56        Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans
  57    * get_time_of_scan_id(scan).
  58        Returns the scan time for the specified scan number.
  59    """
  60
  61    def __init__(
  62        self,
  63        file_location,
  64        analyzer="Unknown",
  65        instrument_label="Unknown",
  66        sample_name=None,
  67        spectra_parser=None,
  68    ):
  69        if isinstance(file_location, str):
  70            file_location = Path(file_location)
  71        else:
  72            file_location = file_location
  73        if not file_location.exists():
  74            raise FileExistsError("File does not exist: " + str(file_location))
  75
  76        if sample_name:
  77            self.sample_name = sample_name
  78        else:
  79            self.sample_name = file_location.stem
  80
  81        self.file_location = file_location
  82        self.analyzer = analyzer
  83        self.instrument_label = instrument_label
  84
  85        # Add the spectra parser class to the object if it is not None
  86        if spectra_parser is not None:
  87            self.spectra_parser_class = spectra_parser.__class__
  88            self.spectra_parser = spectra_parser
  89            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
  90            if (
  91                self.sample_name is not None
  92                and self.sample_name != self.spectra_parser.sample_name
  93            ):
  94                warnings.warn(
  95                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
  96                    UserWarning,
  97                )
  98            if self.analyzer != self.spectra_parser.analyzer:
  99                warnings.warn(
 100                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
 101                    UserWarning,
 102                )
 103            if self.instrument_label != self.spectra_parser.instrument_label:
 104                warnings.warn(
 105                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
 106                    UserWarning,
 107                )
 108            if self.file_location != self.spectra_parser.file_location:
 109                warnings.warn(
 110                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
 111                    UserWarning,
 112                )
 113
 114        # Instantiate empty dictionaries for scan information and mass spectra
 115        self._scan_info = {}
 116        self._ms = {}
 117        self._ms_unprocessed = {}
 118
 119    def add_mass_spectrum(self, mass_spec):
 120        """Adds a mass spectrum to the dataset.
 121
 122        Parameters
 123        -----------
 124        mass_spec : MassSpectrum
 125            The corems MassSpectrum object to be added to the dataset.
 126
 127        Notes
 128        -----
 129        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
 130        """
 131        # check if mass_spec has a scan_number attribute
 132        if not hasattr(mass_spec, "scan_number"):
 133            raise ValueError(
 134                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
 135            )
 136        self._ms[mass_spec.scan_number] = mass_spec
 137
 138    def add_mass_spectra(
 139        self,
 140        scan_list,
 141        spectrum_mode=None,
 142        ms_level=1,
 143        use_parser=True,
 144        auto_process=True,
 145        ms_params=None,
 146    ):
 147        """Add mass spectra to _ms dictionary, from a list of scans or single scan
 148
 149        Notes
 150        -----
 151        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
 152
 153
 154        Parameters
 155        -----------
 156        scan_list : list of ints
 157            List of scans to use to populate _ms slot
 158        spectrum_mode : str or None
 159            The spectrum mode to use for the mass spectra.
 160            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 161            Defaults to None.
 162        ms_level : int, optional
 163            The MS level to use for the mass spectra.
 164            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
 165            Defaults to 1.
 166        using_parser : bool
 167            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
 168        auto_process : bool
 169            Whether to auto-process the mass spectra.  Defaults to True.
 170        ms_params : MSParameters or None
 171            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
 172
 173        Raises
 174        ------
 175        TypeError
 176            If scan_list is not a list of ints
 177        ValueError
 178            If polarity is not 'positive' or 'negative'
 179            If ms_level is not 1 or 2
 180        """
 181
 182        # check if scan_list is a list or a single int; if single int, convert to list
 183        if isinstance(scan_list, int):
 184            scan_list = [scan_list]
 185        if not isinstance(scan_list, list):
 186            raise TypeError("scan_list must be a list of integers")
 187        for scan in scan_list:
 188            if not isinstance(scan, int):
 189                raise TypeError("scan_list must be a list of integers")
 190
 191        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 192        if self.polarity == "negative":
 193            polarity = -1
 194        elif self.polarity == "positive":
 195            polarity = 1
 196        else:
 197            raise ValueError(
 198                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
 199            )
 200
 201        # is not using_parser, check that ms1 and ms2 are not None
 202        if not use_parser:
 203            if ms_level not in self._ms_unprocessed.keys():
 204                raise ValueError(
 205                    "ms_level {} not found in _ms_unprocessed dictionary".format(
 206                        ms_level
 207                    )
 208                )
 209
 210        scan_list = list(set(scan_list))
 211        scan_list.sort()
 212        if not use_parser:
 213            if self._ms_unprocessed[ms_level] is None:
 214                raise ValueError(
 215                    "No unprocessed data found for ms_level {}".format(ms_level)
 216                )
 217            if (
 218                len(
 219                    np.setdiff1d(
 220                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
 221                    )
 222                )
 223                > 0
 224            ):
 225                raise ValueError(
 226                    "Not all scans in scan_list are present in the unprocessed data"
 227                )
 228            # Prepare the ms_df for parsing
 229            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
 230
 231        for scan in scan_list:
 232            ms = None
 233            if spectrum_mode is None:
 234                # get spectrum mode from _scan_info
 235                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
 236            else:
 237                spectrum_mode_scan = spectrum_mode
 238            # Instantiate the mass spectrum object using the parser or the unprocessed data
 239            if not use_parser:
 240                my_ms_df = ms_df.loc[scan]
 241                if spectrum_mode_scan == "profile":
 242                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
 243                    ms = ms_from_array_profile(
 244                        my_ms_df.mz,
 245                        my_ms_df.intensity,
 246                        self.file_location,
 247                        polarity=polarity,
 248                        auto_process=False,
 249                    )
 250                else:
 251                    raise ValueError(
 252                        "Only profile mode is supported for unprocessed data"
 253                    )
 254            if use_parser:
 255                ms = self.spectra_parser.get_mass_spectrum_from_scan(
 256                    scan_number=scan,
 257                    spectrum_mode=spectrum_mode_scan,
 258                    auto_process=False,
 259                )
 260
 261            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
 262            if ms is not None:
 263                if ms_params is not None:
 264                    ms.parameters = ms_params
 265                ms.scan_number = scan
 266                if auto_process:
 267                    ms.process_mass_spec()
 268                self.add_mass_spectrum(ms)
 269
 270    def get_time_of_scan_id(self, scan):
 271        """Returns the scan time for the specified scan number.
 272
 273        Parameters
 274        -----------
 275        scan : int
 276            The scan number of the desired scan time.
 277
 278        Returns
 279        --------
 280        float
 281            The scan time for the specified scan number (in minutes).
 282
 283        Raises
 284        ------
 285        ValueError
 286            If no scan time is found for the specified scan number.
 287        """
 288        # Check if _retenion_time_list is empty and raise error if so
 289        if len(self._retention_time_list) == 0:
 290            raise ValueError("No retention times found in dataset")
 291        rt = self._retention_time_list[self._scans_number_list.index(scan)]
 292        return rt
 293
 294    @property
 295    def scan_df(self):
 296        """
 297        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
 298        """
 299        scan_df = pd.DataFrame.from_dict(self._scan_info)
 300        return scan_df
 301        
 302    @property
 303    def ms(self):
 304        """
 305        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
 306        """
 307        return self._ms
 308
 309    
 310    @scan_df.setter
 311    def scan_df(self, df):
 312        """
 313        Sets the scan data for the dataset.
 314
 315        Parameters
 316        -----------
 317        df : pandas.DataFrame
 318            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
 319            precursor m/z, scan text, and scan window (lower and upper).
 320        """
 321        self._scan_info = df.to_dict()
 322
 323    def __getitem__(self, scan_number):
 324        return self._ms.get(scan_number)
 325
 326
 327class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
 328    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
 329
 330    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
 331
 332    Parameters
 333    -----------
 334    file_location : str or Path
 335        The location of the file containing the mass spectra data.
 336    analyzer : str, optional
 337        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 338    instrument_label : str, optional
 339        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 340    sample_name : str, optional
 341        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 342    spectra_parser : object, optional
 343        The spectra parser object used to create the mass spectra object. Defaults to None.
 344
 345    Attributes
 346    -----------
 347    polarity : str
 348        The polarity of the ionization mode used for the dataset.
 349    _parameters : LCMSParameters
 350        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
 351    _retention_time_list : numpy.ndarray
 352        An array of retention times for the dataset.
 353    _scans_number_list : list
 354        A list of scan numbers for the dataset.
 355    _tic_list : numpy.ndarray
 356        An array of total ion current (TIC) values for the dataset.
 357    eics : dict
 358        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
 359        Key is the mz of the EIC. Initialized as an empty dictionary.
 360    mass_features : dictionary of LCMSMassFeature objects
 361        A dictionary containing mass features for the dataset.
 362        Key is mass feature ID. Initialized as an empty dictionary.
 363    spectral_search_results : dictionary of MS2SearchResults objects
 364        A dictionary containing spectral search results for the dataset.
 365        Key is scan number : precursor mz. Initialized as an empty dictionary.
 366
 367    Methods
 368    --------
 369    * get_parameters_json().
 370        Returns the parameters used for the LC-MS analysis in JSON format.
  371    * add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key="ms2", scan_filter=None)
 372        Adds which MS2 scans are associated with each mass feature to the
 373        mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
  374    * add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None)
 375        Adds the MS1 spectra associated with each mass feature to the
 376        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
 377    * mass_features_to_df()
 378        Returns a pandas dataframe summarizing the mass features in the dataset.
 379    * set_tic_list_from_data(overwrite=False)
 380        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
 381    * set_retention_time_from_data(overwrite=False)
 382        Sets the retention time list from the data in the _ms dictionary.
 383    * set_scans_number_from_data(overwrite=False)
 384        Sets the scan number list from the data in the _ms dictionary.
 385    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
 386        Generates plot of M/Z features comparing scan time vs M/Z value
 387    """
 388
 389    def __init__(
 390        self,
 391        file_location,
 392        analyzer="Unknown",
 393        instrument_label="Unknown",
 394        sample_name=None,
 395        spectra_parser=None,
 396    ):
 397        super().__init__(
 398            file_location, analyzer, instrument_label, sample_name, spectra_parser
 399        )
 400        self.polarity = ""
 401        self._parameters = LCMSParameters()
 402        self._retention_time_list = []
 403        self._scans_number_list = []
 404        self._tic_list = []
 405        self.eics = {}
 406        self.mass_features = {}
 407        self.spectral_search_results = {}
 408
 409    def get_parameters_json(self):
 410        """Returns the parameters stored for the LC-MS object in JSON format.
 411
 412        Returns
 413        --------
 414        str
 415            The parameters used for the LC-MS analysis in JSON format.
 416        """
 417        return self.parameters.to_json()
 418
 419    def remove_unprocessed_data(self, ms_level=None):
 420        """Removes the unprocessed data from the LCMSBase object.
 421
 422        Parameters
 423        -----------
 424        ms_level : int, optional
 425            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
 426
 427        Raises
 428        ------
 429        ValueError
 430            If ms_level is not 1 or 2.
 431
 432        Notes
 433        -----
 434        This method is useful for freeing up memory after the data has been processed.
 435        """
 436        if ms_level is None:
 437            for ms_level in self._ms_unprocessed.keys():
 438                self._ms_unprocessed[ms_level] = None
 439        if ms_level not in [1, 2]:
 440            raise ValueError("ms_level must be 1 or 2")
 441        self._ms_unprocessed[ms_level] = None
 442
 443    def add_associated_ms2_dda(
 444        self,
 445        auto_process=True,
 446        use_parser=True,
 447        spectrum_mode=None,
 448        ms_params_key="ms2",
 449        scan_filter=None,
 450    ):
 451        """Add MS2 spectra associated with mass features to the dataset.
 452
 453        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
 454
 455        Parameters
 456        -----------
 457        auto_process : bool, optional
 458            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
 459        use_parser : bool, optional
 460            If True, envoke the spectra parser to get the MS2 spectra. Default is True.
 461        spectrum_mode : str or None, optional
 462            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 463            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 464            Defaults to None. (faster if defined, otherwise will check each scan)
 465        ms_params_key : string, optional
 466            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
 467            Defaults to 'ms2'.
 468        scan_filter : str
 469            A string to filter the scans to add to the _ms dictionary.  If None, all scans are added.  Defaults to None.
 470            "hcd" will pull out only HCD scans.
 471
 472        Raises
 473        ------
 474        ValueError
 475            If mass_features is not set, must run find_mass_features() first.
 476            If no MS2 scans are found in the dataset.
 477            If no precursor m/z values are found in MS2 scans, not a DDA dataset.
 478        """
 479        # Check if mass_features is set, raise error if not
 480        if self.mass_features is None:
 481            raise ValueError(
 482                "mass_features not set, must run find_mass_features() first"
 483            )
 484
 485        # reconfigure ms_params to get the correct mass spectrum parameters from the key
 486        ms_params = self.parameters.mass_spectrum[ms_params_key]
 487
 488        mf_df = self.mass_features_to_df().copy()
 489        # Find ms2 scans that have a precursor m/z value
 490        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
 491        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
 492        # drop ms2 scans that have no tic
 493        ms2_scans = ms2_scans[ms2_scans.tic > 0]
 494        if ms2_scans is None:
 495            raise ValueError("No DDA scans found in dataset")
 496
 497        if scan_filter is not None:
 498            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
 499        # set tolerance in rt space (in minutes) and mz space (in daltons)
 500        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
 501        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance
 502
 503        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
 504        dda_scans = []
 505        for i, row in mf_df.iterrows():
 506            ms2_scans_filtered = ms2_scans[
 507                ms2_scans.scan_time.between(
 508                    row.scan_time - time_tol, row.scan_time + time_tol
 509                )
 510            ]
 511            ms2_scans_filtered = ms2_scans_filtered[
 512                ms2_scans_filtered.precursor_mz.between(
 513                    row.mz - mz_tol, row.mz + mz_tol
 514                )
 515            ]
 516            dda_scans = dda_scans + ms2_scans_filtered.scan.tolist()
 517            self.mass_features[i].ms2_scan_numbers = (
 518                ms2_scans_filtered.scan.tolist()
 519                + self.mass_features[i].ms2_scan_numbers
 520            )
 521        # add to _ms attribute
 522        self.add_mass_spectra(
 523            scan_list=list(set(dda_scans)),
 524            auto_process=auto_process,
 525            spectrum_mode=spectrum_mode,
 526            use_parser=use_parser,
 527            ms_params=ms_params,
 528        )
 529        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
 530        for mf_id in self.mass_features:
 531            if self.mass_features[mf_id].ms2_scan_numbers is not None:
 532                for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
 533                    if dda_scan in self._ms.keys():
 534                        self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
 535                            dda_scan
 536                        ]
 537
 538    def add_associated_ms1(
 539        self, auto_process=True, use_parser=True, spectrum_mode=None
 540    ):
 541        """Add MS1 spectra associated with mass features to the dataset.
 542
 543        Parameters
 544        -----------
 545        auto_process : bool, optional
 546            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
 547        use_parser : bool, optional
 548            If True, envoke the spectra parser to get the MS1 spectra. Default is True.
 549        spectrum_mode : str or None, optional
 550            The spectrum mode to use for the mass spectra.  If None, method will use the spectrum mode
 551            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
 552            Defaults to None. (faster if defined, otherwise will check each scan)
 553
 554        Raises
 555        ------
 556        ValueError
 557            If mass_features is not set, must run find_mass_features() first.
 558            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
 559            If number of scans to average is not  1 or an integer with an integer median (i.e. 3, 5, 7, 9).
 560            If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
 561        """
 562        # Check if mass_features is set, raise error if not
 563        if self.mass_features is None:
 564            raise ValueError(
 565                "mass_features not set, must run find_mass_features() first"
 566            )
 567        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
 568
 569        if scans_to_average == 1:
 570            # Add to LCMSobj
 571            self.add_mass_spectra(
 572                scan_list=[
 573                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
 574                ],
 575                auto_process=auto_process,
 576                use_parser=use_parser,
 577                spectrum_mode=spectrum_mode,
 578                ms_params=self.parameters.mass_spectrum["ms1"],
 579            )
 580
 581        elif (
 582            (scans_to_average - 1) % 2
 583        ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
 584            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
 585            # Check if all apex scans are profile mode, raise error if not
 586            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
 587                raise ValueError("All apex scans must be profile mode for averaging")
 588
 589            # First get sets of scans to average
 590            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
 591                ms1_idx_start = ms1_scans.index(apex_scan) - int(
 592                    (scans_to_average - 1) / 2
 593                )
 594                if ms1_idx_start < 0:
 595                    ms1_idx_start = 0
 596                ms1_idx_end = (
 597                    ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1
 598                )
 599                if ms1_idx_end > (len(ms1_scans) - 1):
 600                    ms1_idx_end = len(ms1_scans) - 1
 601                scan_list = ms1_scans[ms1_idx_start:ms1_idx_end]
 602                return scan_list
 603
 604            ms1_scans = self.ms1_scans
 605            scans_lists = [
 606                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
 607                for apex_scan in apex_scans
 608            ]
 609
 610            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
 611            if self.polarity == "negative":
 612                polarity = -1
 613            elif self.polarity == "positive":
 614                polarity = 1
 615
 616            if not use_parser:
 617                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
 618                ms1_unprocessed = self._ms_unprocessed[1].copy()
 619                # Set the index on _ms_unprocessed[1] to scan number
 620                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
 621                self._ms_unprocessed[1] = ms1_unprocessed
 622
 623                # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1]
 624                scans_lists_flat = list(
 625                    set([scan for sublist in scans_lists for scan in sublist])
 626                )
 627                if (
 628                    len(
 629                        np.setdiff1d(
 630                            np.sort(scans_lists_flat),
 631                            np.sort(ms1_unprocessed.index.values),
 632                        )
 633                    )
 634                    > 0
 635                ):
 636                    raise ValueError(
 637                        "Not all scans to average are present in the unprocessed data"
 638                    )
 639
 640            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
 641                # Get unprocessed mass spectrum from scans
 642                ms = self.get_average_mass_spectrum(
 643                    scan_list=scan_list_average,
 644                    apex_scan=apex_scan,
 645                    spectrum_mode="profile",
 646                    ms_level=1,
 647                    auto_process=auto_process,
 648                    use_parser=use_parser,
 649                    perform_checks=False,
 650                    polarity=polarity,
 651                    ms_params=self.parameters.mass_spectrum["ms1"],
 652                )
 653                # Add mass spectrum to LCMS object and associated with mass feature
 654                self.add_mass_spectrum(ms)
 655
 656            if not use_parser:
 657                # Reset the index on _ms_unprocessed[1] to not be scan number
 658                ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
 659                self._ms_unprocessed[1] = ms1_unprocessed
 660        else:
 661            raise ValueError(
 662                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
 663            )
 664
 665        # Associate the ms1 spectra with the mass features
 666        for mf_id in self.mass_features:
 667            self.mass_features[mf_id].mass_spectrum = self._ms[
 668                self.mass_features[mf_id].apex_scan
 669            ]
 670            self.mass_features[mf_id].update_mz()
 671
 672        # Re-process clustering if persistent homology is selected to remove duplicate mass features after adding and processing MS1 spectra
 673        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
 674            self.cluster_mass_features(drop_children=True, sort_by="persistence")
 675
 676    def mass_features_to_df(self):
 677        """Returns a pandas dataframe summarizing the mass features.
 678
 679        The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity,
 680        persistence, area, monoisotopic_mf_id, and isotopologue_type.  The index is set to mf_id (mass feature ID).
 681
 682
 683        Returns
 684        --------
 685        pandas.DataFrame
 686            A pandas dataframe of mass features with the following columns:
 687            mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
 688        """
 689
 690        def mass_spectrum_to_string(
 691            mass_spec, normalize=True, min_normalized_abun=0.01
 692        ):
 693            """Converts a mass spectrum to a string of m/z:abundance pairs.
 694
 695            Parameters
 696            -----------
 697            mass_spec : MassSpectrum
 698                A MassSpectrum object to be converted to a string.
 699            normalize : bool, optional
 700                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
 701            min_normalized_abun : float, optional
 702                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.
 703
 704            Returns
 705            --------
 706            str
 707                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
 708            """
 709            mz_np = mass_spec.to_dataframe()["m/z"].values
 710            abun_np = mass_spec.to_dataframe()["Peak Height"].values
 711            if normalize:
 712                abun_np = abun_np / abun_np.max()
 713            mz_abun = np.column_stack((mz_np, abun_np))
 714            if normalize:
 715                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
 716            mz_abun_str = [
 717                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
 718                for mz, abun in mz_abun
 719            ]
 720            return "; ".join(mz_abun_str)
 721
 722        cols_in_df = [
 723            "id",
 724            "_apex_scan",
 725            "start_scan",
 726            "final_scan",
 727            "_retention_time",
 728            "_intensity",
 729            "_persistence",
 730            "_area",
 731            "_dispersity_index",
 732            "_tailing_factor",
 733            "monoisotopic_mf_id",
 734            "isotopologue_type",
 735            "mass_spectrum_deconvoluted_parent",
 736        ]
 737        df_mf_list = []
 738        for mf_id in self.mass_features.keys():
 739            # Find cols_in_df that are in single_mf
 740            df_keys = list(
 741                set(cols_in_df).intersection(self.mass_features[mf_id].__dir__())
 742            )
 743            dict_mf = {}
 744            for key in df_keys:
 745                dict_mf[key] = getattr(self.mass_features[mf_id], key)
 746            if len(self.mass_features[mf_id].ms2_scan_numbers) > 0:
 747                # Add MS2 spectra info
 748                best_ms2_spectrum = self.mass_features[mf_id].best_ms2
 749                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
 750            if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0:
 751                dict_mf["associated_mass_features"] = ", ".join(
 752                    map(
 753                        str,
 754                        self.mass_features[mf_id].associated_mass_features_deconvoluted,
 755                    )
 756                )
 757            if self.mass_features[mf_id]._half_height_width is not None:
 758                dict_mf["half_height_width"] = self.mass_features[
 759                    mf_id
 760                ].half_height_width
 761            # Check if EIC for mass feature is set
 762            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
 763            df_mf_single["mz"] = self.mass_features[mf_id].mz
 764            df_mf_list.append(df_mf_single)
 765        df_mf = pd.concat(df_mf_list)
 766
 767        # rename _area to area and id to mf_id
 768        df_mf = df_mf.rename(
 769            columns={
 770                "_area": "area",
 771                "id": "mf_id",
 772                "_apex_scan": "apex_scan",
 773                "_retention_time": "scan_time",
 774                "_intensity": "intensity",
 775                "_persistence": "persistence",
 776                "_dispersity_index": "dispersity_index",
 777                "_tailing_factor": "tailing_factor",
 778            }
 779        )
 780
 781        # reorder columns
 782        col_order = [
 783            "mf_id",
 784            "scan_time",
 785            "mz",
 786            "apex_scan",
 787            "start_scan",
 788            "final_scan",
 789            "intensity",
 790            "persistence",
 791            "area",
 792            "half_height_width",
 793            "tailing_factor",
 794            "dispersity_index",
 795            "monoisotopic_mf_id",
 796            "isotopologue_type",
 797            "mass_spectrum_deconvoluted_parent",
 798            "associated_mass_features",
 799            "ms2_spectrum",
 800        ]
 801        # drop columns that are not in col_order
 802        cols_to_order = [col for col in col_order if col in df_mf.columns]
 803        df_mf = df_mf[cols_to_order]
 804
 805        # reset index to mf_id
 806        df_mf = df_mf.set_index("mf_id")
 807        df_mf.index.name = "mf_id"
 808
 809        return df_mf
 810
 811    def mass_features_ms1_annot_to_df(self):
 812        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
 813
 814        Returns
 815        --------
 816        pandas.DataFrame
 817            A pandas dataframe of MS1 annotations for the mass features in the dataset.
 818            The index is set to mf_id (mass feature ID)
 819
 820        Raises
 821        ------
 822        Warning
 823            If no MS1 annotations were found for the mass features in the dataset.
 824        """
 825        annot_df_list_ms1 = []
 826        for mf_id in self.mass_features.keys():
 827            if self.mass_features[mf_id].mass_spectrum is None:
 828                pass
 829            else:
 830                # Add ms1 annotations to ms1 annotation list
 831                if (
 832                    np.abs(
 833                        (
 834                            self.mass_features[mf_id].ms1_peak.mz_exp
 835                            - self.mass_features[mf_id].mz
 836                        )
 837                    )
 838                    < 0.01
 839                ):
 840                    # Get the molecular formula from the mass spectrum
 841                    annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe()
 842                    # Subset to pull out only the peak associated with the mass feature
 843                    annot_df = annot_df[
 844                        annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index
 845                    ].copy()
 846
 847                    # Remove the index column and add column for mf_id
 848                    annot_df = annot_df.drop(columns=["Index"])
 849                    annot_df["mf_id"] = mf_id
 850                    annot_df_list_ms1.append(annot_df)
 851
 852        if len(annot_df_list_ms1) > 0:
 853            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
 854            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
 855            annot_ms1_df_full.index.name = "mf_id"
 856
 857        else:
 858            annot_ms1_df_full = None
 859            # Warn that no ms1 annotations were found
 860            warnings.warn(
 861                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
 862                UserWarning,
 863            )
 864
 865        return annot_ms1_df_full
 866
 867    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
 868        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
 869
 870        Parameters
 871        -----------
 872        molecular_metadata :  dict of MolecularMetadata objects
 873            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.  Defaults to None.
 874
 875        Returns
 876        --------
 877        pandas.DataFrame
 878            A pandas dataframe of MS2 annotations for the mass features in the dataset,
 879            and optionally molecular metadata. The index is set to mf_id (mass feature ID)
 880
 881        Raises
 882        ------
 883        Warning
 884            If no MS2 annotations were found for the mass features in the dataset.
 885        """
 886        annot_df_list_ms2 = []
 887        for mf_id in self.mass_features.keys():
 888            if len(self.mass_features[mf_id].ms2_similarity_results) > 0:
 889                # Add ms2 annotations to ms2 annotation list
 890                for result in self.mass_features[mf_id].ms2_similarity_results:
 891                    annot_df_ms2 = result.to_dataframe()
 892                    annot_df_ms2["mf_id"] = mf_id
 893                    annot_df_list_ms2.append(annot_df_ms2)
 894
 895        if len(annot_df_list_ms2) > 0:
 896            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
 897            if molecular_metadata is not None:
 898                molecular_metadata_df = pd.concat(
 899                    [
 900                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
 901                        for k, v in molecular_metadata.items()
 902                    ],
 903                    ignore_index=True,
 904                )
 905                molecular_metadata_df = molecular_metadata_df.rename(
 906                    columns={"id": "ref_mol_id"}
 907                )
 908                annot_ms2_df_full = annot_ms2_df_full.merge(
 909                    molecular_metadata_df, on="ref_mol_id", how="left"
 910                )
 911            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
 912                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
 913            ).copy()
 914            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
 915            annot_ms2_df_full.index.name = "mf_id"
 916        else:
 917            annot_ms2_df_full = None
 918            # Warn that no ms2 annotations were found
 919            warnings.warn(
 920                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
 921                UserWarning,
 922            )
 923
 924        return annot_ms2_df_full
 925
 926    def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False):
 927        """Returns a figure displaying 
 928            (1) thresholded, unprocessed data
 929            (2) the m/z features
 930            (3) which m/z features are associated with MS2 spectra
 931
 932        Parameters
 933        -----------
 934        binsize :  float
 935            Desired binsize for the m/z axis of the composite feature map.  Defaults to 1e-4.
 936        mf_plot : boolean
 937            Indicates whether to plot the m/z features. Defaults to True.
 938        ms2_plot : boolean
 939            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
 940        return_fig : boolean
 941            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
 942
 943        Returns
 944        --------
 945        matplotlib.pyplot.Figure
 946            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 
 947            scan time. Unprocessed data is displayed in gray scale with darker colors indicating 
 948            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
 949            features with associated with MS2 spectra are plotted, they are displayed in red.
 950
 951        Raises
 952        ------
 953        Warning
 954            If m/z features are set to be plot but aren't in the dataset.
 955            If m/z features with associated MS2 data are set to be plot but no MS2 annotations 
 956            were found for the m/z features in the dataset.
 957        """
 958        if mf_plot:
 959            # Check if mass_features is set, raise error if not
 960            if self.mass_features is None:
 961                raise ValueError(
 962                    "mass_features not set, must run find_mass_features() first"
 963                )
 964            ## call mass feature data
 965            mf_df = self.mass_features_to_df()
 966
 967        if ms2_plot:
 968            if not mf_plot:
 969                # Check if mass_features is set, raise error if not
 970                if self.mass_features is None:
 971                    raise ValueError(
 972                        "mass_features not set, must run find_mass_features() first"
 973                    )
 974
 975            ## call m/z feature data
 976            mf_df = self.mass_features_to_df()
 977
 978            # Check if ms2_spectrum is set, raise error if not
 979            if 'ms2_spectrum' not in mf_df.columns:
 980                raise ValueError(                
 981                    "ms2_spectrum not set, must run add_associated_ms2_dda() first"            
 982                )
 983    
 984        ## threshold and grid unprocessed data
 985        df = self._ms_unprocessed[1].copy()
 986        df = df.dropna(subset=['intensity']).reset_index(drop = True)
 987        threshold = ph_int_min_thresh * df.intensity.max()
 988        df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy()
 989        df = self.grid_data(df_thres)
 990    
 991        ## format unprocessed data for plotting
 992        df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan')
 993        mz_grid = np.arange(0, np.max(df.mz), binsize)
 994        mz_data = np.array(df.mz)
 995        df['mz_bin'] = find_closest(mz_grid, mz_data)
 996        df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum)
 997        unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True)
 998
 999        ## generate figure
1000        fig = plt.figure()
1001        plt.scatter(
1002            unproc_df.scan_time,
1003            unproc_df.mz_bin*binsize,
1004            c = unproc_df.ab_bin/np.max(unproc_df.ab_bin),
1005            alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 
1006            cmap = 'Greys_r',
1007            s = 1
1008        )
1009
1010        if mf_plot:
1011            if ms2_plot:
1012                plt.scatter(
1013                    mf_df[mf_df.ms2_spectrum.isna()].scan_time,
1014                    mf_df[mf_df.ms2_spectrum.isna()].mz,
1015                    c = 'c',
1016                    s = 4,
1017                    label = 'M/Z features without MS2'
1018                )
1019            else:
1020                plt.scatter(
1021                    mf_df.scan_time,
1022                    mf_df.mz,
1023                    c = 'c',
1024                    s = 4,
1025                    label = 'M/Z features'
1026                )
1027
1028        if ms2_plot: 
1029            plt.scatter(
1030                mf_df[~mf_df.ms2_spectrum.isna()].scan_time,
1031                mf_df[~mf_df.ms2_spectrum.isna()].mz,
1032                c = 'r',
1033                s = 2,
1034                label = 'M/Z features with MS2'
1035            )
1036
1037        if mf_plot == True or ms2_plot == True:
1038            plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2)
1039        plt.xlabel('Scan time')
1040        plt.ylabel('m/z')
1041        plt.ylim(0, np.ceil(np.max(df.mz)))
1042        plt.xlim(0, np.ceil(np.max(df.scan_time)))
1043        plt.title('Composite Feature Map')
1044
1045        if return_fig:
1046            plt.close(fig)
1047            return fig
1048
1049        else:
1050            plt.show()
1051
1052    def __len__(self):
1053        """
1054        Returns the number of mass spectra in the dataset.
1055
1056        Returns
1057        --------
1058        int
1059            The number of mass spectra in the dataset.
1060        """
1061        return len(self._ms)
1062
1063    def __getitem__(self, scan_number):
1064        """
1065        Returns the mass spectrum corresponding to the specified scan number.
1066
1067        Parameters
1068        -----------
1069        scan_number : int
1070            The scan number of the desired mass spectrum.
1071
1072        Returns
1073        --------
1074        MassSpectrum
1075            The mass spectrum corresponding to the specified scan number.
1076        """
1077        return self._ms.get(scan_number)
1078
1079    def __iter__(self):
1080        """Returns an iterator over the mass spectra in the dataset.
1081
1082        Returns
1083        --------
1084        iterator
1085            An iterator over the mass spectra in the dataset.
1086        """
1087        return iter(self._ms.values())
1088
1089    def set_tic_list_from_data(self, overwrite=False):
1090        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.
1091
1092        Parameters
1093        -----------
1094        overwrite : bool, optional
1095            If True, overwrites the TIC list if it is already set. Defaults to False.
1096
1097        Notes
1098        -----
1099        If the _ms dictionary is incomplete, sets the TIC list to an empty list.
1100
1101        Raises
1102        ------
1103        ValueError
1104            If no mass spectra are found in the dataset.
1105            If the TIC list is already set and overwrite is False.
1106        """
1107        # Check if _ms is empty and raise error if so
1108        if len(self._ms) == 0:
1109            raise ValueError("No mass spectra found in dataset")
1110
1111        # Check if tic_list is already set and raise error if so
1112        if len(self.tic) > 0 and not overwrite:
1113            raise ValueError("TIC list already set, use overwrite=True to overwrite")
1114
1115        self.tic = [self._ms.get(i).tic for i in self.scans_number]
1116
1117    def set_retention_time_from_data(self, overwrite=False):
1118        """Sets the retention time list from the data in the _ms dictionary.
1119
1120        Parameters
1121        -----------
1122        overwrite : bool, optional
1123            If True, overwrites the retention time list if it is already set. Defaults to False.
1124
1125        Notes
1126        -----
1127        If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
1128
1129        Raises
1130        ------
1131        ValueError
1132            If no mass spectra are found in the dataset.
1133            If the retention time list is already set and overwrite is False.
1134        """
1135        # Check if _ms is empty and raise error if so
1136        if len(self._ms) == 0:
1137            raise ValueError("No mass spectra found in dataset")
1138
1139        # Check if retention_time_list is already set and raise error if so
1140        if len(self.retention_time) > 0 and not overwrite:
1141            raise ValueError(
1142                "Retention time list already set, use overwrite=True to overwrite"
1143            )
1144
1145        retention_time_list = []
1146        for key_ms in sorted(self._ms.keys()):
1147            retention_time_list.append(self._ms.get(key_ms).retention_time)
1148        self.retention_time = retention_time_list
1149
1150    def set_scans_number_from_data(self, overwrite=False):
1151        """Sets the scan number list from the data in the _ms dictionary.
1152
1153        Notes
1154        -----
1155        If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
1156
1157        Raises
1158        ------
1159        ValueError
1160            If no mass spectra are found in the dataset.
1161            If the scan number list is already set and overwrite is False.
1162        """
1163        # Check if _ms is empty and raise error if so
1164        if len(self._ms) == 0:
1165            raise ValueError("No mass spectra found in dataset")
1166
1167        # Check if scans_number_list is already set and raise error if so
1168        if len(self.scans_number) > 0 and not overwrite:
1169            raise ValueError(
1170                "Scan number list already set, use overwrite=True to overwrite"
1171            )
1172
1173        self.scans_number = sorted(self._ms.keys())
1174
1175    @property
1176    def ms1_scans(self):
1177        """
1178        list : A list of MS1 scan numbers for the dataset.
1179        """
1180        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()
1181
1182    @property
1183    def parameters(self):
1184        """
1185        LCMSParameters : The parameters used for the LC-MS analysis.
1186        """
1187        return self._parameters
1188
1189    @parameters.setter
1190    def parameters(self, paramsinstance):
1191        """
1192        Sets the parameters used for the LC-MS analysis.
1193
1194        Parameters
1195        -----------
1196        paramsinstance : LCMSParameters
1197            The parameters used for the LC-MS analysis.
1198        """
1199        self._parameters = paramsinstance
1200
1201    @property
1202    def scans_number(self):
1203        """
1204        list : A list of scan numbers for the dataset.
1205        """
1206        return self._scans_number_list
1207
1208    @scans_number.setter
1209    def scans_number(self, scan_numbers_list):
1210        """
1211        Sets the scan numbers for the dataset.
1212
1213        Parameters
1214        -----------
1215        scan_numbers_list : list
1216            A list of scan numbers for the dataset.
1217        """
1218        self._scans_number_list = scan_numbers_list
1219
1220    @property
1221    def retention_time(self):
1222        """
1223        numpy.ndarray : An array of retention times for the dataset.
1224        """
1225        return self._retention_time_list
1226
1227    @retention_time.setter
1228    def retention_time(self, rt_list):
1229        """
1230        Sets the retention times for the dataset.
1231
1232        Parameters
1233        -----------
1234        rt_list : list
1235            A list of retention times for the dataset.
1236        """
1237        self._retention_time_list = np.array(rt_list)
1238
1239    @property
1240    def tic(self):
1241        """
1242        numpy.ndarray : An array of TIC values for the dataset.
1243        """
1244        return self._tic_list
1245
1246    @tic.setter
1247    def tic(self, tic_list):
1248        """
1249        Sets the TIC values for the dataset.
1250
1251        Parameters
1252        -----------
1253        tic_list : list
1254            A list of TIC values for the dataset.
1255        """
1256        self._tic_list = np.array(tic_list)

class MassSpectraBase: View Source

 16class MassSpectraBase:
 17    """Base class for mass spectra objects.
 18
 19    Parameters
 20    -----------
 21    file_location : str or Path
 22        The location of the file containing the mass spectra data.
 23    analyzer : str, optional
 24        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
 25    instrument_label : str, optional
 26        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
 27    sample_name : str, optional
 28        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
 29    spectra_parser : object, optional
 30        The spectra parser object used to create the mass spectra object. Defaults to None.
 31
 32    Attributes
 33    -----------
 34    spectra_parser_class : class
 35        The class of the spectra parser used to create the mass spectra object.
 36    file_location : str or Path
 37        The location of the file containing the mass spectra data.
 38    sample_name : str
 39        The name of the sample; defaults to the file name if not provided to the parser.
 40    analyzer : str
 41        The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
 42    instrument_label : str
 43        The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
 44    _scan_info : dict
 45        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
 46        scan text, and scan window (lower and upper).
 47        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
 48    _ms : dict
 49        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
 50    _ms_unprocessed: dictionary of pandas.DataFrames or None
 51        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
 52        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
 53
 54    Methods
 55    --------
 56    * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True).
 57        Add mass spectra (or single mass spectrum) to _ms slot, from a list of scans
 58    * get_time_of_scan_id(scan).
 59        Returns the scan time for the specified scan number.
 60    """
 61
 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}
119
120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec
138
139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        use_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)
270
271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt
294
295    @property
296    def scan_df(self):
297        """
298        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
299        """
300        scan_df = pd.DataFrame.from_dict(self._scan_info)
301        return scan_df
302        
303    @property
304    def ms(self):
305        """
306        dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles
307        """
308        return self._ms
309
310    
311    @scan_df.setter
312    def scan_df(self, df):
313        """
314        Sets the scan data for the dataset.
315
316        Parameters
317        -----------
318        df : pandas.DataFrame
319            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
320            precursor m/z, scan text, and scan window (lower and upper).
321        """
322        self._scan_info = df.to_dict()
323
324    def __getitem__(self, scan_number):
325        return self._ms.get(scan_number)

Base class for mass spectra objects.

Parameters

file_location (str or Path): The location of the file containing the mass spectra data.
analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.

Attributes

spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
file_location (str or Path): The location of the file containing the mass spectra data.
sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
_scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
_ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
_ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Initialized as an empty dictionary.

Methods

add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None). Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans.
get_time_of_scan_id(scan). Returns the scan time for the specified scan number.

MassSpectraBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None, spectra_parser=None) View Source

 62    def __init__(
 63        self,
 64        file_location,
 65        analyzer="Unknown",
 66        instrument_label="Unknown",
 67        sample_name=None,
 68        spectra_parser=None,
 69    ):
 70        if isinstance(file_location, str):
 71            file_location = Path(file_location)
 72        else:
 73            file_location = file_location
 74        if not file_location.exists():
 75            raise FileExistsError("File does not exist: " + str(file_location))
 76
 77        if sample_name:
 78            self.sample_name = sample_name
 79        else:
 80            self.sample_name = file_location.stem
 81
 82        self.file_location = file_location
 83        self.analyzer = analyzer
 84        self.instrument_label = instrument_label
 85
 86        # Add the spectra parser class to the object if it is not None
 87        if spectra_parser is not None:
 88            self.spectra_parser_class = spectra_parser.__class__
 89            self.spectra_parser = spectra_parser
 90            # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not
 91            if (
 92                self.sample_name is not None
 93                and self.sample_name != self.spectra_parser.sample_name
 94            ):
 95                warnings.warn(
 96                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
 97                    UserWarning,
 98                )
 99            if self.analyzer != self.spectra_parser.analyzer:
100                warnings.warn(
101                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
102                    UserWarning,
103                )
104            if self.instrument_label != self.spectra_parser.instrument_label:
105                warnings.warn(
106                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
107                    UserWarning,
108                )
109            if self.file_location != self.spectra_parser.file_location:
110                warnings.warn(
111                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
112                    UserWarning,
113                )
114
115        # Instantiate empty dictionaries for scan information and mass spectra
116        self._scan_info = {}
117        self._ms = {}
118        self._ms_unprocessed = {}

file_location

analyzer

instrument_label

def add_mass_spectrum(self, mass_spec): View Source

120    def add_mass_spectrum(self, mass_spec):
121        """Adds a mass spectrum to the dataset.
122
123        Parameters
124        -----------
125        mass_spec : MassSpectrum
126            The corems MassSpectrum object to be added to the dataset.
127
128        Notes
129        -----
130        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
131        """
132        # check if mass_spec has a scan_number attribute
133        if not hasattr(mass_spec, "scan_number"):
134            raise ValueError(
135                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
136            )
137        self._ms[mass_spec.scan_number] = mass_spec

Adds a mass spectrum to the dataset.

Parameters

mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.

Notes

This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.

def add_mass_spectra( self, scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None): View Source

139    def add_mass_spectra(
140        self,
141        scan_list,
142        spectrum_mode=None,
143        ms_level=1,
144        use_parser=True,
145        auto_process=True,
146        ms_params=None,
147    ):
148        """Add mass spectra to _ms dictionary, from a list of scans or single scan
149
150        Notes
151        -----
152        The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
153
154
155        Parameters
156        -----------
157        scan_list : list of ints
158            List of scans to use to populate _ms slot
159        spectrum_mode : str or None
160            The spectrum mode to use for the mass spectra.
161            If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
162            Defaults to None.
163        ms_level : int, optional
164            The MS level to use for the mass spectra.
165            This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects.
166            Defaults to 1.
167        using_parser : bool
168            Whether to use the mass spectra parser to get the mass spectra.  Defaults to True.
169        auto_process : bool
170            Whether to auto-process the mass spectra.  Defaults to True.
171        ms_params : MSParameters or None
172            The mass spectrum parameters to use for the mass spectra.  If None, uses the globally set MSParameters.
173
174        Raises
175        ------
176        TypeError
177            If scan_list is not a list of ints
178        ValueError
179            If polarity is not 'positive' or 'negative'
180            If ms_level is not 1 or 2
181        """
182
183        # check if scan_list is a list or a single int; if single int, convert to list
184        if isinstance(scan_list, int):
185            scan_list = [scan_list]
186        if not isinstance(scan_list, list):
187            raise TypeError("scan_list must be a list of integers")
188        for scan in scan_list:
189            if not isinstance(scan, int):
190                raise TypeError("scan_list must be a list of integers")
191
192        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
193        if self.polarity == "negative":
194            polarity = -1
195        elif self.polarity == "positive":
196            polarity = 1
197        else:
198            raise ValueError(
199                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
200            )
201
202        # is not using_parser, check that ms1 and ms2 are not None
203        if not use_parser:
204            if ms_level not in self._ms_unprocessed.keys():
205                raise ValueError(
206                    "ms_level {} not found in _ms_unprocessed dictionary".format(
207                        ms_level
208                    )
209                )
210
211        scan_list = list(set(scan_list))
212        scan_list.sort()
213        if not use_parser:
214            if self._ms_unprocessed[ms_level] is None:
215                raise ValueError(
216                    "No unprocessed data found for ms_level {}".format(ms_level)
217                )
218            if (
219                len(
220                    np.setdiff1d(
221                        scan_list, self._ms_unprocessed[ms_level].scan.tolist()
222                    )
223                )
224                > 0
225            ):
226                raise ValueError(
227                    "Not all scans in scan_list are present in the unprocessed data"
228                )
229            # Prepare the ms_df for parsing
230            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)
231
232        for scan in scan_list:
233            ms = None
234            if spectrum_mode is None:
235                # get spectrum mode from _scan_info
236                spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"]
237            else:
238                spectrum_mode_scan = spectrum_mode
239            # Instantiate the mass spectrum object using the parser or the unprocessed data
240            if not use_parser:
241                my_ms_df = ms_df.loc[scan]
242                if spectrum_mode_scan == "profile":
243                    # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum
244                    ms = ms_from_array_profile(
245                        my_ms_df.mz,
246                        my_ms_df.intensity,
247                        self.file_location,
248                        polarity=polarity,
249                        auto_process=False,
250                    )
251                else:
252                    raise ValueError(
253                        "Only profile mode is supported for unprocessed data"
254                    )
255            if use_parser:
256                ms = self.spectra_parser.get_mass_spectrum_from_scan(
257                    scan_number=scan,
258                    spectrum_mode=spectrum_mode_scan,
259                    auto_process=False,
260                )
261
262            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
263            if ms is not None:
264                if ms_params is not None:
265                    ms.parameters = ms_params
266                ms.scan_number = scan
267                if auto_process:
268                    ms.process_mass_spec()
269                self.add_mass_spectrum(ms)

Add mass spectra to _ms dictionary, from a list of scans or single scan

Notes

The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.

Parameters

scan_list (list of ints): List of scans to use to populate _ms slot
spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
use_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.

Raises

TypeError: If scan_list is not a list of ints
ValueError: If polarity is not 'positive' or 'negative'; if ms_level is not 1 or 2.

def get_time_of_scan_id(self, scan): View Source

271    def get_time_of_scan_id(self, scan):
272        """Returns the scan time for the specified scan number.
273
274        Parameters
275        -----------
276        scan : int
277            The scan number of the desired scan time.
278
279        Returns
280        --------
281        float
282            The scan time for the specified scan number (in minutes).
283
284        Raises
285        ------
286        ValueError
287            If no scan time is found for the specified scan number.
288        """
289        # Check if _retenion_time_list is empty and raise error if so
290        if len(self._retention_time_list) == 0:
291            raise ValueError("No retention times found in dataset")
292        rt = self._retention_time_list[self._scans_number_list.index(scan)]
293        return rt

Returns the scan time for the specified scan number.

Parameters

scan (int): The scan number of the desired scan time.

Returns

float: The scan time for the specified scan number (in minutes).

Raises

ValueError: If no scan time is found for the specified scan number.

scan_df

pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).

dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles

corems.mass_spectra.factory.lc_class

Parameters

Attributes

Methods

Parameters

Notes

Notes

Parameters

Raises

Parameters

Returns

Raises

Parameters

Attributes

Methods

Returns

Parameters

Raises

Notes

Parameters

Raises

Parameters

Raises

Returns

Returns

Raises

Parameters

Returns

Raises

Parameters

Returns

Raises

Parameters

Notes

Raises

Parameters

Notes

Raises

Notes

Raises

Inherited Members