corems.mass_spectra.factory.lc_class
class MassSpectraBase:
    """Base class for mass spectra objects.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the file name (stem) if not provided. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    spectra_parser_class : class
        The class of the spectra parser used to create the mass spectra object.
        Only set when a spectra parser is provided.
    spectra_parser : object
        The spectra parser instance. Only set when a spectra parser is provided.
    file_location : str or Path
        The location of the file containing the mass spectra data.
    sample_name : str
        The name of the sample; defaults to the file name (stem) if not provided.
    analyzer : str
        The type of analyzer used to generate the mass spectra data.
    instrument_label : str
        The type of instrument used to generate the mass spectra data.
    _scan_info : dict
        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
        scan text, and scan window (lower and upper).
        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
    _ms : dict
        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
    _ms_unprocessed : dictionary of pandas.DataFrames
        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
        Key is ms_level, and value is a dataframe with columns for scan number, m/z, and intensity
        (or None once the data has been released). Initialized as an empty dictionary.

    Methods
    --------
    * add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None).
        Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans
    * get_time_of_scan_id(scan).
        Returns the scan time for the specified scan number.
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        # Only plain strings are converted; any other path-like object is used
        # as given and only needs to provide .exists() and .stem.
        if isinstance(file_location, str):
            file_location = Path(file_location)
        if not file_location.exists():
            # NOTE: FileNotFoundError would be the natural type here, but
            # FileExistsError is kept so existing callers that catch it keep working.
            raise FileExistsError("File does not exist: " + str(file_location))

        self.sample_name = sample_name if sample_name else file_location.stem
        self.file_location = file_location
        self.analyzer = analyzer
        self.instrument_label = instrument_label

        # Add the spectra parser (and its class) to the object if one was given,
        # and warn about every piece of metadata that disagrees with the parser.
        if spectra_parser is not None:
            self.spectra_parser_class = spectra_parser.__class__
            self.spectra_parser = spectra_parser
            metadata_checks = (
                ("sample_name", self.sample_name, "sample_name"),
                ("analyzer", self.analyzer, "analyzer"),
                ("instrument", self.instrument_label, "instrument_label"),
                ("file_location", self.file_location, "file_location"),
            )
            for label, own_value, parser_attr in metadata_checks:
                if own_value != getattr(spectra_parser, parser_attr):
                    warnings.warn(
                        f"{label} provided to MassSpectraBase object does not match "
                        f"{label} provided to spectra parser object",
                        UserWarning,
                    )

        # Instantiate empty dictionaries for scan information and mass spectra
        self._scan_info = {}
        self._ms = {}
        self._ms_unprocessed = {}

    def add_mass_spectrum(self, mass_spec):
        """Adds a mass spectrum to the dataset.

        Parameters
        -----------
        mass_spec : MassSpectrum
            The corems MassSpectrum object to be added to the dataset.

        Raises
        ------
        ValueError
            If mass_spec has no scan_number attribute.

        Notes
        -----
        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
        """
        # The scan number is the dictionary key, so it must be present
        if not hasattr(mass_spec, "scan_number"):
            raise ValueError(
                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
            )
        self._ms[mass_spec.scan_number] = mass_spec

    def add_mass_spectra(
        self,
        scan_list,
        spectrum_mode=None,
        ms_level=1,
        use_parser=True,
        auto_process=True,
        ms_params=None,
    ):
        """Add mass spectra to _ms dictionary, from a list of scans or single scan

        Parameters
        -----------
        scan_list : list of ints or int
            List of scans (or a single scan) to use to populate the _ms slot.
            NumPy integers are accepted and converted to built-in ints.
            Duplicates are removed and scans are processed in ascending order.
        spectrum_mode : str or None
            The spectrum mode to use for the mass spectra.
            If None, the mode of each scan is read from the "ms_format" column of scan_df (this allows for mixed types).
            Defaults to None.
        ms_level : int, optional
            The MS level; selects which entry of _ms_unprocessed is read when use_parser is False.
            Defaults to 1.
        use_parser : bool
            Whether to use the mass spectra parser to get the mass spectra. If False, spectra are
            built from the unprocessed data held in _ms_unprocessed[ms_level]. Defaults to True.
        auto_process : bool
            Whether to auto-process the mass spectra. Defaults to True.
        ms_params : MSParameters or None
            The mass spectrum parameters to assign to each mass spectrum. If None, the parameters
            of the created mass spectrum are left untouched.

        Raises
        ------
        TypeError
            If scan_list is not an int or a list of ints
        ValueError
            If polarity is not 'positive' or 'negative'
            If use_parser is False and ms_level has no unprocessed data, or not all scans are present in it
            If use_parser is False and the spectrum mode of a scan is not 'profile'
        """
        integer_types = (int, np.integer)

        # A single scan number is accepted as a convenience
        if isinstance(scan_list, integer_types):
            scan_list = [scan_list]
        if not isinstance(scan_list, list) or not all(
            isinstance(scan, integer_types) for scan in scan_list
        ):
            raise TypeError("scan_list must be a list of integers")
        # Built-in ints keep the _ms keys uniform regardless of the caller's integer type
        scan_list = sorted({int(scan) for scan in scan_list})

        # Polarity is set by subclasses; -1 for negative mode, 1 for positive mode.
        # getattr keeps the documented ValueError if it was never set.
        polarity = {"negative": -1, "positive": 1}.get(getattr(self, "polarity", None))
        if polarity is None:
            raise ValueError(
                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
            )

        ms_df = None
        if not use_parser:
            # Validate the unprocessed data once, before any spectrum is created
            if ms_level not in self._ms_unprocessed:
                raise ValueError(
                    "ms_level {} not found in _ms_unprocessed dictionary".format(
                        ms_level
                    )
                )
            unprocessed = self._ms_unprocessed[ms_level]
            if unprocessed is None:
                raise ValueError(
                    "No unprocessed data found for ms_level {}".format(ms_level)
                )
            if len(np.setdiff1d(scan_list, unprocessed.scan.tolist())) > 0:
                raise ValueError(
                    "Not all scans in scan_list are present in the unprocessed data"
                )
            # set_index returns a new frame, so the stored data is not modified
            ms_df = unprocessed.set_index("scan", drop=False)

        # scan_df is rebuilt from a dictionary on every access, so look it up once.
        # NOTE(review): the per-scan lookup is by index label, i.e. it assumes
        # scan_df is indexed by scan number - confirm against the parsers.
        scan_modes = None
        if spectrum_mode is None and scan_list:
            scan_modes = self.scan_df["ms_format"]

        for scan in scan_list:
            if spectrum_mode is None:
                spectrum_mode_scan = scan_modes.loc[scan]
            else:
                spectrum_mode_scan = spectrum_mode

            # Instantiate the mass spectrum object using the parser or the unprocessed data
            if use_parser:
                ms = self.spectra_parser.get_mass_spectrum_from_scan(
                    scan_number=scan,
                    spectrum_mode=spectrum_mode_scan,
                    auto_process=False,
                )
            else:
                if spectrum_mode_scan != "profile":
                    raise ValueError(
                        "Only profile mode is supported for unprocessed data"
                    )
                # List indexer: always a DataFrame, even for a scan with a single data point
                scan_rows = ms_df.loc[[scan]]
                ms = ms_from_array_profile(
                    scan_rows.mz,
                    scan_rows.intensity,
                    self.file_location,
                    polarity=polarity,
                    auto_process=False,
                )

            if ms is None:
                continue
            # Set the parameters first so that processing uses them
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = scan
            if auto_process:
                ms.process_mass_spec()
            self.add_mass_spectrum(ms)

    def get_time_of_scan_id(self, scan):
        """Returns the scan time for the specified scan number.

        Parameters
        -----------
        scan : int
            The scan number of the desired scan time.

        Returns
        --------
        float
            The scan time for the specified scan number (in minutes).

        Raises
        ------
        ValueError
            If the dataset holds no retention times, or no scan time is found for the specified scan number.
        """
        # The retention time and scan number lists are populated by subclasses;
        # treat their absence like an empty dataset.
        retention_times = getattr(self, "_retention_time_list", [])
        if len(retention_times) == 0:
            raise ValueError("No retention times found in dataset")
        scan_numbers = list(getattr(self, "_scans_number_list", []))
        try:
            position = scan_numbers.index(scan)
        except ValueError:
            raise ValueError(
                "No scan time found for scan number {}".format(scan)
            ) from None
        return retention_times[position]

    @property
    def scan_df(self):
        """
        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
        """
        return pd.DataFrame.from_dict(self._scan_info)

    @scan_df.setter
    def scan_df(self, df):
        """
        Sets the scan data for the dataset.

        Parameters
        -----------
        df : pandas.DataFrame
            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
            precursor m/z, scan text, and scan window (lower and upper).
        """
        self._scan_info = df.to_dict()

    @property
    def ms(self):
        """
        dictionary : keys are scan numbers and values are the associated mass spectrum objects
        """
        return self._ms

    def __getitem__(self, scan_number):
        """Return the mass spectrum stored for scan_number, or None if there is none."""
        return self._ms.get(scan_number)
336 analyzer : str, optional 337 The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'. 338 instrument_label : str, optional 339 The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'. 340 sample_name : str, optional 341 The name of the sample; defaults to the file name if not provided to the parser. Defaults to None. 342 spectra_parser : object, optional 343 The spectra parser object used to create the mass spectra object. Defaults to None. 344 345 Attributes 346 ----------- 347 polarity : str 348 The polarity of the ionization mode used for the dataset. 349 _parameters : LCMSParameters 350 The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters. 351 _retention_time_list : numpy.ndarray 352 An array of retention times for the dataset. 353 _scans_number_list : list 354 A list of scan numbers for the dataset. 355 _tic_list : numpy.ndarray 356 An array of total ion current (TIC) values for the dataset. 357 eics : dict 358 A dictionary containing extracted ion chromatograms (EICs) for the dataset. 359 Key is the mz of the EIC. Initialized as an empty dictionary. 360 mass_features : dictionary of LCMSMassFeature objects 361 A dictionary containing mass features for the dataset. 362 Key is mass feature ID. Initialized as an empty dictionary. 363 spectral_search_results : dictionary of MS2SearchResults objects 364 A dictionary containing spectral search results for the dataset. 365 Key is scan number : precursor mz. Initialized as an empty dictionary. 366 367 Methods 368 -------- 369 * get_parameters_json(). 370 Returns the parameters used for the LC-MS analysis in JSON format. 371 * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True) 372 Adds which MS2 scans are associated with each mass feature to the 373 mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary. 
374 * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) 375 Adds the MS1 spectra associated with each mass feature to the 376 mass_features dictionary and adds the MS1 spectra to the _ms dictionary. 377 * mass_features_to_df() 378 Returns a pandas dataframe summarizing the mass features in the dataset. 379 * set_tic_list_from_data(overwrite=False) 380 Sets the TIC list from the mass spectrum objects within the _ms dictionary. 381 * set_retention_time_from_data(overwrite=False) 382 Sets the retention time list from the data in the _ms dictionary. 383 * set_scans_number_from_data(overwrite=False) 384 Sets the scan number list from the data in the _ms dictionary. 385 * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) 386 Generates plot of M/Z features comparing scan time vs M/Z value 387 """ 388 389 def __init__( 390 self, 391 file_location, 392 analyzer="Unknown", 393 instrument_label="Unknown", 394 sample_name=None, 395 spectra_parser=None, 396 ): 397 super().__init__( 398 file_location, analyzer, instrument_label, sample_name, spectra_parser 399 ) 400 self.polarity = "" 401 self._parameters = LCMSParameters() 402 self._retention_time_list = [] 403 self._scans_number_list = [] 404 self._tic_list = [] 405 self.eics = {} 406 self.mass_features = {} 407 self.spectral_search_results = {} 408 409 def get_parameters_json(self): 410 """Returns the parameters stored for the LC-MS object in JSON format. 411 412 Returns 413 -------- 414 str 415 The parameters used for the LC-MS analysis in JSON format. 416 """ 417 return self.parameters.to_json() 418 419 def remove_unprocessed_data(self, ms_level=None): 420 """Removes the unprocessed data from the LCMSBase object. 421 422 Parameters 423 ----------- 424 ms_level : int, optional 425 The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels. 
426 427 Raises 428 ------ 429 ValueError 430 If ms_level is not 1 or 2. 431 432 Notes 433 ----- 434 This method is useful for freeing up memory after the data has been processed. 435 """ 436 if ms_level is None: 437 for ms_level in self._ms_unprocessed.keys(): 438 self._ms_unprocessed[ms_level] = None 439 if ms_level not in [1, 2]: 440 raise ValueError("ms_level must be 1 or 2") 441 self._ms_unprocessed[ms_level] = None 442 443 def add_associated_ms2_dda( 444 self, 445 auto_process=True, 446 use_parser=True, 447 spectrum_mode=None, 448 ms_params_key="ms2", 449 scan_filter=None, 450 ): 451 """Add MS2 spectra associated with mass features to the dataset. 452 453 Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject) 454 455 Parameters 456 ----------- 457 auto_process : bool, optional 458 If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True. 459 use_parser : bool, optional 460 If True, envoke the spectra parser to get the MS2 spectra. Default is True. 461 spectrum_mode : str or None, optional 462 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 463 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 464 Defaults to None. (faster if defined, otherwise will check each scan) 465 ms_params_key : string, optional 466 The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. 467 Defaults to 'ms2'. 468 scan_filter : str 469 A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. 470 "hcd" will pull out only HCD scans. 471 472 Raises 473 ------ 474 ValueError 475 If mass_features is not set, must run find_mass_features() first. 476 If no MS2 scans are found in the dataset. 477 If no precursor m/z values are found in MS2 scans, not a DDA dataset. 
478 """ 479 # Check if mass_features is set, raise error if not 480 if self.mass_features is None: 481 raise ValueError( 482 "mass_features not set, must run find_mass_features() first" 483 ) 484 485 # reconfigure ms_params to get the correct mass spectrum parameters from the key 486 ms_params = self.parameters.mass_spectrum[ms_params_key] 487 488 mf_df = self.mass_features_to_df().copy() 489 # Find ms2 scans that have a precursor m/z value 490 ms2_scans = self.scan_df[self.scan_df.ms_level == 2] 491 ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()] 492 # drop ms2 scans that have no tic 493 ms2_scans = ms2_scans[ms2_scans.tic > 0] 494 if ms2_scans is None: 495 raise ValueError("No DDA scans found in dataset") 496 497 if scan_filter is not None: 498 ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)] 499 # set tolerance in rt space (in minutes) and mz space (in daltons) 500 time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance 501 mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance 502 503 # for each mass feature, find the ms2 scans that are within the roi scan time and mz range 504 dda_scans = [] 505 for i, row in mf_df.iterrows(): 506 ms2_scans_filtered = ms2_scans[ 507 ms2_scans.scan_time.between( 508 row.scan_time - time_tol, row.scan_time + time_tol 509 ) 510 ] 511 ms2_scans_filtered = ms2_scans_filtered[ 512 ms2_scans_filtered.precursor_mz.between( 513 row.mz - mz_tol, row.mz + mz_tol 514 ) 515 ] 516 dda_scans = dda_scans + ms2_scans_filtered.scan.tolist() 517 self.mass_features[i].ms2_scan_numbers = ( 518 ms2_scans_filtered.scan.tolist() 519 + self.mass_features[i].ms2_scan_numbers 520 ) 521 # add to _ms attribute 522 self.add_mass_spectra( 523 scan_list=list(set(dda_scans)), 524 auto_process=auto_process, 525 spectrum_mode=spectrum_mode, 526 use_parser=use_parser, 527 ms_params=ms_params, 528 ) 529 # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute 530 for mf_id in self.mass_features: 531 if 
self.mass_features[mf_id].ms2_scan_numbers is not None: 532 for dda_scan in self.mass_features[mf_id].ms2_scan_numbers: 533 if dda_scan in self._ms.keys(): 534 self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[ 535 dda_scan 536 ] 537 538 def add_associated_ms1( 539 self, auto_process=True, use_parser=True, spectrum_mode=None 540 ): 541 """Add MS1 spectra associated with mass features to the dataset. 542 543 Parameters 544 ----------- 545 auto_process : bool, optional 546 If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True. 547 use_parser : bool, optional 548 If True, envoke the spectra parser to get the MS1 spectra. Default is True. 549 spectrum_mode : str or None, optional 550 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 551 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 552 Defaults to None. (faster if defined, otherwise will check each scan) 553 554 Raises 555 ------ 556 ValueError 557 If mass_features is not set, must run find_mass_features() first. 558 If apex scans are not profile mode, all apex scans must be profile mode for averaging. 559 If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). 560 If deconvolute is True and no EICs are found, did you run integrate_mass_features() first? 
561 """ 562 # Check if mass_features is set, raise error if not 563 if self.mass_features is None: 564 raise ValueError( 565 "mass_features not set, must run find_mass_features() first" 566 ) 567 scans_to_average = self.parameters.lc_ms.ms1_scans_to_average 568 569 if scans_to_average == 1: 570 # Add to LCMSobj 571 self.add_mass_spectra( 572 scan_list=[ 573 int(x) for x in self.mass_features_to_df().apex_scan.tolist() 574 ], 575 auto_process=auto_process, 576 use_parser=use_parser, 577 spectrum_mode=spectrum_mode, 578 ms_params=self.parameters.mass_spectrum["ms1"], 579 ) 580 581 elif ( 582 (scans_to_average - 1) % 2 583 ) == 0: # scans_to_average = 3, 5, 7 etc, mirror l/r around apex 584 apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist())) 585 # Check if all apex scans are profile mode, raise error if not 586 if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"): 587 raise ValueError("All apex scans must be profile mode for averaging") 588 589 # First get sets of scans to average 590 def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average): 591 ms1_idx_start = ms1_scans.index(apex_scan) - int( 592 (scans_to_average - 1) / 2 593 ) 594 if ms1_idx_start < 0: 595 ms1_idx_start = 0 596 ms1_idx_end = ( 597 ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1 598 ) 599 if ms1_idx_end > (len(ms1_scans) - 1): 600 ms1_idx_end = len(ms1_scans) - 1 601 scan_list = ms1_scans[ms1_idx_start:ms1_idx_end] 602 return scan_list 603 604 ms1_scans = self.ms1_scans 605 scans_lists = [ 606 get_scans_from_apex(ms1_scans, apex_scan, scans_to_average) 607 for apex_scan in apex_scans 608 ] 609 610 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 611 if self.polarity == "negative": 612 polarity = -1 613 elif self.polarity == "positive": 614 polarity = 1 615 616 if not use_parser: 617 # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once) 618 
ms1_unprocessed = self._ms_unprocessed[1].copy() 619 # Set the index on _ms_unprocessed[1] to scan number 620 ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False) 621 self._ms_unprocessed[1] = ms1_unprocessed 622 623 # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1] 624 scans_lists_flat = list( 625 set([scan for sublist in scans_lists for scan in sublist]) 626 ) 627 if ( 628 len( 629 np.setdiff1d( 630 np.sort(scans_lists_flat), 631 np.sort(ms1_unprocessed.index.values), 632 ) 633 ) 634 > 0 635 ): 636 raise ValueError( 637 "Not all scans to average are present in the unprocessed data" 638 ) 639 640 for scan_list_average, apex_scan in zip(scans_lists, apex_scans): 641 # Get unprocessed mass spectrum from scans 642 ms = self.get_average_mass_spectrum( 643 scan_list=scan_list_average, 644 apex_scan=apex_scan, 645 spectrum_mode="profile", 646 ms_level=1, 647 auto_process=auto_process, 648 use_parser=use_parser, 649 perform_checks=False, 650 polarity=polarity, 651 ms_params=self.parameters.mass_spectrum["ms1"], 652 ) 653 # Add mass spectrum to LCMS object and associated with mass feature 654 self.add_mass_spectrum(ms) 655 656 if not use_parser: 657 # Reset the index on _ms_unprocessed[1] to not be scan number 658 ms1_unprocessed = ms1_unprocessed.reset_index(drop=True) 659 self._ms_unprocessed[1] = ms1_unprocessed 660 else: 661 raise ValueError( 662 "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)" 663 ) 664 665 # Associate the ms1 spectra with the mass features 666 for mf_id in self.mass_features: 667 self.mass_features[mf_id].mass_spectrum = self._ms[ 668 self.mass_features[mf_id].apex_scan 669 ] 670 self.mass_features[mf_id].update_mz() 671 672 def mass_features_to_df(self): 673 """Returns a pandas dataframe summarizing the mass features. 
674 675 The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, 676 persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID). 677 678 679 Returns 680 -------- 681 pandas.DataFrame 682 A pandas dataframe of mass features with the following columns: 683 mf_id, mz, apex_scan, scan_time, intensity, persistence, area. 684 """ 685 686 def mass_spectrum_to_string( 687 mass_spec, normalize=True, min_normalized_abun=0.01 688 ): 689 """Converts a mass spectrum to a string of m/z:abundance pairs. 690 691 Parameters 692 ----------- 693 mass_spec : MassSpectrum 694 A MassSpectrum object to be converted to a string. 695 normalize : bool, optional 696 If True, normalizes the abundance values to a maximum of 1. Defaults to True. 697 min_normalized_abun : float, optional 698 The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01. 699 700 Returns 701 -------- 702 str 703 A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon. 
704 """ 705 mz_np = mass_spec.to_dataframe()["m/z"].values 706 abun_np = mass_spec.to_dataframe()["Peak Height"].values 707 if normalize: 708 abun_np = abun_np / abun_np.max() 709 mz_abun = np.column_stack((mz_np, abun_np)) 710 if normalize: 711 mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun] 712 mz_abun_str = [ 713 str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2)) 714 for mz, abun in mz_abun 715 ] 716 return "; ".join(mz_abun_str) 717 718 cols_in_df = [ 719 "id", 720 "_apex_scan", 721 "start_scan", 722 "final_scan", 723 "_retention_time", 724 "_intensity", 725 "_persistence", 726 "_area", 727 "_dispersity_index", 728 "_tailing_factor", 729 "monoisotopic_mf_id", 730 "isotopologue_type", 731 "mass_spectrum_deconvoluted_parent", 732 ] 733 df_mf_list = [] 734 for mf_id in self.mass_features.keys(): 735 # Find cols_in_df that are in single_mf 736 df_keys = list( 737 set(cols_in_df).intersection(self.mass_features[mf_id].__dir__()) 738 ) 739 dict_mf = {} 740 for key in df_keys: 741 dict_mf[key] = getattr(self.mass_features[mf_id], key) 742 if len(self.mass_features[mf_id].ms2_scan_numbers) > 0: 743 # Add MS2 spectra info 744 best_ms2_spectrum = self.mass_features[mf_id].best_ms2 745 dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum) 746 if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0: 747 dict_mf["associated_mass_features"] = ", ".join( 748 map( 749 str, 750 self.mass_features[mf_id].associated_mass_features_deconvoluted, 751 ) 752 ) 753 if self.mass_features[mf_id]._half_height_width is not None: 754 dict_mf["half_height_width"] = self.mass_features[ 755 mf_id 756 ].half_height_width 757 # Check if EIC for mass feature is set 758 df_mf_single = pd.DataFrame(dict_mf, index=[mf_id]) 759 df_mf_single["mz"] = self.mass_features[mf_id].mz 760 df_mf_list.append(df_mf_single) 761 df_mf = pd.concat(df_mf_list) 762 763 # rename _area to area and id to mf_id 764 df_mf = df_mf.rename( 765 columns={ 766 "_area": 
"area", 767 "id": "mf_id", 768 "_apex_scan": "apex_scan", 769 "_retention_time": "scan_time", 770 "_intensity": "intensity", 771 "_persistence": "persistence", 772 "_dispersity_index": "dispersity_index", 773 "_tailing_factor": "tailing_factor", 774 } 775 ) 776 777 # reorder columns 778 col_order = [ 779 "mf_id", 780 "scan_time", 781 "mz", 782 "apex_scan", 783 "start_scan", 784 "final_scan", 785 "intensity", 786 "persistence", 787 "area", 788 "half_height_width", 789 "tailing_factor", 790 "dispersity_index", 791 "monoisotopic_mf_id", 792 "isotopologue_type", 793 "mass_spectrum_deconvoluted_parent", 794 "associated_mass_features", 795 "ms2_spectrum", 796 ] 797 # drop columns that are not in col_order 798 cols_to_order = [col for col in col_order if col in df_mf.columns] 799 df_mf = df_mf[cols_to_order] 800 801 # reset index to mf_id 802 df_mf = df_mf.set_index("mf_id") 803 df_mf.index.name = "mf_id" 804 805 return df_mf 806 807 def mass_features_ms1_annot_to_df(self): 808 """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset. 809 810 Returns 811 -------- 812 pandas.DataFrame 813 A pandas dataframe of MS1 annotations for the mass features in the dataset. 814 The index is set to mf_id (mass feature ID) 815 816 Raises 817 ------ 818 Warning 819 If no MS1 annotations were found for the mass features in the dataset. 
820 """ 821 annot_df_list_ms1 = [] 822 for mf_id in self.mass_features.keys(): 823 if self.mass_features[mf_id].mass_spectrum is None: 824 pass 825 else: 826 # Add ms1 annotations to ms1 annotation list 827 if ( 828 np.abs( 829 ( 830 self.mass_features[mf_id].ms1_peak.mz_exp 831 - self.mass_features[mf_id].mz 832 ) 833 ) 834 < 0.01 835 ): 836 # Get the molecular formula from the mass spectrum 837 annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe() 838 # Subset to pull out only the peak associated with the mass feature 839 annot_df = annot_df[ 840 annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index 841 ].copy() 842 843 # Remove the index column and add column for mf_id 844 annot_df = annot_df.drop(columns=["Index"]) 845 annot_df["mf_id"] = mf_id 846 annot_df_list_ms1.append(annot_df) 847 848 if len(annot_df_list_ms1) > 0: 849 annot_ms1_df_full = pd.concat(annot_df_list_ms1) 850 annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id") 851 annot_ms1_df_full.index.name = "mf_id" 852 853 else: 854 annot_ms1_df_full = None 855 # Warn that no ms1 annotations were found 856 warnings.warn( 857 "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?", 858 UserWarning, 859 ) 860 861 return annot_ms1_df_full 862 863 def mass_features_ms2_annot_to_df(self, molecular_metadata=None): 864 """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset. 865 866 Parameters 867 ----------- 868 molecular_metadata : dict of MolecularMetadata objects 869 A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None. 870 871 Returns 872 -------- 873 pandas.DataFrame 874 A pandas dataframe of MS2 annotations for the mass features in the dataset, 875 and optionally molecular metadata. The index is set to mf_id (mass feature ID) 876 877 Raises 878 ------ 879 Warning 880 If no MS2 annotations were found for the mass features in the dataset. 
881 """ 882 annot_df_list_ms2 = [] 883 for mf_id in self.mass_features.keys(): 884 if len(self.mass_features[mf_id].ms2_similarity_results) > 0: 885 # Add ms2 annotations to ms2 annotation list 886 for result in self.mass_features[mf_id].ms2_similarity_results: 887 annot_df_ms2 = result.to_dataframe() 888 annot_df_ms2["mf_id"] = mf_id 889 annot_df_list_ms2.append(annot_df_ms2) 890 891 if len(annot_df_list_ms2) > 0: 892 annot_ms2_df_full = pd.concat(annot_df_list_ms2) 893 if molecular_metadata is not None: 894 molecular_metadata_df = pd.concat( 895 [ 896 pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() 897 for k, v in molecular_metadata.items() 898 ], 899 ignore_index=True, 900 ) 901 molecular_metadata_df = molecular_metadata_df.rename( 902 columns={"id": "ref_mol_id"} 903 ) 904 annot_ms2_df_full = annot_ms2_df_full.merge( 905 molecular_metadata_df, on="ref_mol_id", how="left" 906 ) 907 annot_ms2_df_full = annot_ms2_df_full.drop_duplicates( 908 subset=["mf_id", "query_spectrum_id", "ref_ms_id"] 909 ).copy() 910 annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id") 911 annot_ms2_df_full.index.name = "mf_id" 912 else: 913 annot_ms2_df_full = None 914 # Warn that no ms2 annotations were found 915 warnings.warn( 916 "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?", 917 UserWarning, 918 ) 919 920 return annot_ms2_df_full 921 922 def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False): 923 """Returns a figure displaying 924 (1) thresholded, unprocessed data 925 (2) the m/z features 926 (3) which m/z features are associated with MS2 spectra 927 928 Parameters 929 ----------- 930 binsize : float 931 Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4. 932 mf_plot : boolean 933 Indicates whether to plot the m/z features. Defaults to True. 
934 ms2_plot : boolean 935 Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True. 936 return_fig : boolean 937 Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False. 938 939 Returns 940 -------- 941 matplotlib.pyplot.Figure 942 A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 943 scan time. Unprocessed data is displayed in gray scale with darker colors indicating 944 higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z 945 features with associated with MS2 spectra are plotted, they are displayed in red. 946 947 Raises 948 ------ 949 Warning 950 If m/z features are set to be plot but aren't in the dataset. 951 If m/z features with associated MS2 data are set to be plot but no MS2 annotations 952 were found for the m/z features in the dataset. 953 """ 954 if mf_plot: 955 # Check if mass_features is set, raise error if not 956 if self.mass_features is None: 957 raise ValueError( 958 "mass_features not set, must run find_mass_features() first" 959 ) 960 ## call mass feature data 961 mf_df = self.mass_features_to_df() 962 963 if ms2_plot: 964 if not mf_plot: 965 # Check if mass_features is set, raise error if not 966 if self.mass_features is None: 967 raise ValueError( 968 "mass_features not set, must run find_mass_features() first" 969 ) 970 971 ## call m/z feature data 972 mf_df = self.mass_features_to_df() 973 974 # Check if ms2_spectrum is set, raise error if not 975 if 'ms2_spectrum' not in mf_df.columns: 976 raise ValueError( 977 "ms2_spectrum not set, must run add_associated_ms2_dda() first" 978 ) 979 980 ## threshold and grid unprocessed data 981 df = self._ms_unprocessed[1].copy() 982 df = df.dropna(subset=['intensity']).reset_index(drop = True) 983 threshold = ph_int_min_thresh * df.intensity.max() 984 df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy() 985 df = 
self.grid_data(df_thres) 986 987 ## format unprocessed data for plotting 988 df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan') 989 mz_grid = np.arange(0, np.max(df.mz), binsize) 990 mz_data = np.array(df.mz) 991 df['mz_bin'] = find_closest(mz_grid, mz_data) 992 df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum) 993 unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True) 994 995 ## generate figure 996 fig = plt.figure() 997 plt.scatter( 998 unproc_df.scan_time, 999 unproc_df.mz_bin*binsize, 1000 c = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1001 alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1002 cmap = 'Greys_r', 1003 s = 1 1004 ) 1005 1006 if mf_plot: 1007 if ms2_plot: 1008 plt.scatter( 1009 mf_df[mf_df.ms2_spectrum.isna()].scan_time, 1010 mf_df[mf_df.ms2_spectrum.isna()].mz, 1011 c = 'c', 1012 s = 4, 1013 label = 'M/Z features without MS2' 1014 ) 1015 else: 1016 plt.scatter( 1017 mf_df.scan_time, 1018 mf_df.mz, 1019 c = 'c', 1020 s = 4, 1021 label = 'M/Z features' 1022 ) 1023 1024 if ms2_plot: 1025 plt.scatter( 1026 mf_df[~mf_df.ms2_spectrum.isna()].scan_time, 1027 mf_df[~mf_df.ms2_spectrum.isna()].mz, 1028 c = 'r', 1029 s = 2, 1030 label = 'M/Z features with MS2' 1031 ) 1032 1033 if mf_plot == True or ms2_plot == True: 1034 plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2) 1035 plt.xlabel('Scan time') 1036 plt.ylabel('m/z') 1037 plt.ylim(0, np.ceil(np.max(df.mz))) 1038 plt.xlim(0, np.ceil(np.max(df.scan_time))) 1039 plt.title('Composite Feature Map') 1040 1041 if return_fig: 1042 plt.close(fig) 1043 return fig 1044 1045 else: 1046 plt.show() 1047 1048 def __len__(self): 1049 """ 1050 Returns the number of mass spectra in the dataset. 1051 1052 Returns 1053 -------- 1054 int 1055 The number of mass spectra in the dataset. 
1056 """ 1057 return len(self._ms) 1058 1059 def __getitem__(self, scan_number): 1060 """ 1061 Returns the mass spectrum corresponding to the specified scan number. 1062 1063 Parameters 1064 ----------- 1065 scan_number : int 1066 The scan number of the desired mass spectrum. 1067 1068 Returns 1069 -------- 1070 MassSpectrum 1071 The mass spectrum corresponding to the specified scan number. 1072 """ 1073 return self._ms.get(scan_number) 1074 1075 def __iter__(self): 1076 """Returns an iterator over the mass spectra in the dataset. 1077 1078 Returns 1079 -------- 1080 iterator 1081 An iterator over the mass spectra in the dataset. 1082 """ 1083 return iter(self._ms.values()) 1084 1085 def set_tic_list_from_data(self, overwrite=False): 1086 """Sets the TIC list from the mass spectrum objects within the _ms dictionary. 1087 1088 Parameters 1089 ----------- 1090 overwrite : bool, optional 1091 If True, overwrites the TIC list if it is already set. Defaults to False. 1092 1093 Notes 1094 ----- 1095 If the _ms dictionary is incomplete, sets the TIC list to an empty list. 1096 1097 Raises 1098 ------ 1099 ValueError 1100 If no mass spectra are found in the dataset. 1101 If the TIC list is already set and overwrite is False. 1102 """ 1103 # Check if _ms is empty and raise error if so 1104 if len(self._ms) == 0: 1105 raise ValueError("No mass spectra found in dataset") 1106 1107 # Check if tic_list is already set and raise error if so 1108 if len(self.tic) > 0 and not overwrite: 1109 raise ValueError("TIC list already set, use overwrite=True to overwrite") 1110 1111 self.tic = [self._ms.get(i).tic for i in self.scans_number] 1112 1113 def set_retention_time_from_data(self, overwrite=False): 1114 """Sets the retention time list from the data in the _ms dictionary. 1115 1116 Parameters 1117 ----------- 1118 overwrite : bool, optional 1119 If True, overwrites the retention time list if it is already set. Defaults to False. 
1120 1121 Notes 1122 ----- 1123 If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list. 1124 1125 Raises 1126 ------ 1127 ValueError 1128 If no mass spectra are found in the dataset. 1129 If the retention time list is already set and overwrite is False. 1130 """ 1131 # Check if _ms is empty and raise error if so 1132 if len(self._ms) == 0: 1133 raise ValueError("No mass spectra found in dataset") 1134 1135 # Check if retention_time_list is already set and raise error if so 1136 if len(self.retention_time) > 0 and not overwrite: 1137 raise ValueError( 1138 "Retention time list already set, use overwrite=True to overwrite" 1139 ) 1140 1141 retention_time_list = [] 1142 for key_ms in sorted(self._ms.keys()): 1143 retention_time_list.append(self._ms.get(key_ms).retention_time) 1144 self.retention_time = retention_time_list 1145 1146 def set_scans_number_from_data(self, overwrite=False): 1147 """Sets the scan number list from the data in the _ms dictionary. 1148 1149 Notes 1150 ----- 1151 If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list. 1152 1153 Raises 1154 ------ 1155 ValueError 1156 If no mass spectra are found in the dataset. 1157 If the scan number list is already set and overwrite is False. 1158 """ 1159 # Check if _ms is empty and raise error if so 1160 if len(self._ms) == 0: 1161 raise ValueError("No mass spectra found in dataset") 1162 1163 # Check if scans_number_list is already set and raise error if so 1164 if len(self.scans_number) > 0 and not overwrite: 1165 raise ValueError( 1166 "Scan number list already set, use overwrite=True to overwrite" 1167 ) 1168 1169 self.scans_number = sorted(self._ms.keys()) 1170 1171 @property 1172 def ms1_scans(self): 1173 """ 1174 list : A list of MS1 scan numbers for the dataset. 
1175 """ 1176 return self.scan_df[self.scan_df.ms_level == 1].index.tolist() 1177 1178 @property 1179 def parameters(self): 1180 """ 1181 LCMSParameters : The parameters used for the LC-MS analysis. 1182 """ 1183 return self._parameters 1184 1185 @parameters.setter 1186 def parameters(self, paramsinstance): 1187 """ 1188 Sets the parameters used for the LC-MS analysis. 1189 1190 Parameters 1191 ----------- 1192 paramsinstance : LCMSParameters 1193 The parameters used for the LC-MS analysis. 1194 """ 1195 self._parameters = paramsinstance 1196 1197 @property 1198 def scans_number(self): 1199 """ 1200 list : A list of scan numbers for the dataset. 1201 """ 1202 return self._scans_number_list 1203 1204 @scans_number.setter 1205 def scans_number(self, scan_numbers_list): 1206 """ 1207 Sets the scan numbers for the dataset. 1208 1209 Parameters 1210 ----------- 1211 scan_numbers_list : list 1212 A list of scan numbers for the dataset. 1213 """ 1214 self._scans_number_list = scan_numbers_list 1215 1216 @property 1217 def retention_time(self): 1218 """ 1219 numpy.ndarray : An array of retention times for the dataset. 1220 """ 1221 return self._retention_time_list 1222 1223 @retention_time.setter 1224 def retention_time(self, rt_list): 1225 """ 1226 Sets the retention times for the dataset. 1227 1228 Parameters 1229 ----------- 1230 rt_list : list 1231 A list of retention times for the dataset. 1232 """ 1233 self._retention_time_list = np.array(rt_list) 1234 1235 @property 1236 def tic(self): 1237 """ 1238 numpy.ndarray : An array of TIC values for the dataset. 1239 """ 1240 return self._tic_list 1241 1242 @tic.setter 1243 def tic(self, tic_list): 1244 """ 1245 Sets the TIC values for the dataset. 1246 1247 Parameters 1248 ----------- 1249 tic_list : list 1250 A list of TIC values for the dataset. 1251 """ 1252 self._tic_list = np.array(tic_list)
class MassSpectraBase:
    """Base class for mass spectra objects.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample. When empty or None, the stem of file_location is used.
        Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    spectra_parser_class : class
        The class of the spectra parser. Only set when a spectra parser is supplied.
    spectra_parser : object
        The spectra parser itself. Only set when a spectra parser is supplied.
    file_location : str or Path
        The location of the file containing the mass spectra data.
    sample_name : str
        The name of the sample (the file stem when no name was supplied).
    analyzer : str
        The type of analyzer used to generate the mass spectra data.
    instrument_label : str
        The type of instrument used to generate the mass spectra data.
    _scan_info : dict
        A dictionary containing the scan data. Exposed through the scan_df property,
        which returns it as a pandas DataFrame and can set it from a pandas DataFrame.
    _ms : dict
        A dictionary containing mass spectra for the dataset, keyed by scan number.
        Initialized as an empty dictionary.
    _ms_unprocessed : dict
        A dictionary of unprocessed mass spectra data, as an (optional) intermediate
        data product for peak picking. Key is ms_level, and value is a dataframe with
        columns for scan number, m/z, and intensity. Initialized as an empty dictionary.

    Methods
    --------
    * add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True,
      auto_process=True, ms_params=None).
        Add mass spectra (or a single mass spectrum) to the _ms dictionary from a list of scans.
    * get_time_of_scan_id(scan).
        Returns the scan time for the specified scan number.
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        # Only plain strings are promoted to Path; any other path-like object is
        # used untouched so that its own exists()/stem implementation is honoured.
        if isinstance(file_location, str):
            file_location = Path(file_location)
        if not file_location.exists():
            # NOTE(review): FileNotFoundError would describe this better, but the
            # exception type is kept because callers may already catch it.
            raise FileExistsError("File does not exist: " + str(file_location))

        self.sample_name = sample_name if sample_name else file_location.stem
        self.file_location = file_location
        self.analyzer = analyzer
        self.instrument_label = instrument_label

        # Keep the parser (and its class) and warn about every piece of metadata
        # that disagrees between this object and the parser.
        if spectra_parser is not None:
            self.spectra_parser_class = spectra_parser.__class__
            self.spectra_parser = spectra_parser
            # (attribute compared, label used in the warning text)
            compared = (
                ("sample_name", "sample_name"),
                ("analyzer", "analyzer"),
                ("instrument_label", "instrument"),
                ("file_location", "file_location"),
            )
            for attribute, label in compared:
                own_value = getattr(self, attribute)
                if attribute == "sample_name" and own_value is None:
                    continue
                if own_value != getattr(spectra_parser, attribute):
                    warnings.warn(
                        "{0} provided to MassSpectraBase object does not match {0} provided to spectra parser object".format(
                            label
                        ),
                        UserWarning,
                    )

        # Instantiate empty dictionaries for scan information and mass spectra
        self._scan_info = {}
        self._ms = {}
        self._ms_unprocessed = {}

    def add_mass_spectrum(self, mass_spec):
        """Adds a mass spectrum to the dataset.

        Parameters
        -----------
        mass_spec : MassSpectrum
            The corems MassSpectrum object to be added to the dataset.

        Raises
        ------
        ValueError
            If mass_spec has no scan_number attribute.

        Notes
        -----
        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
        """
        # The scan number is the dictionary key, so it must exist
        if not hasattr(mass_spec, "scan_number"):
            raise ValueError(
                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
            )
        self._ms[mass_spec.scan_number] = mass_spec

    def add_mass_spectra(
        self,
        scan_list,
        spectrum_mode=None,
        ms_level=1,
        use_parser=True,
        auto_process=True,
        ms_params=None,
    ):
        """Add mass spectra to _ms dictionary, from a list of scans or single scan

        Parameters
        -----------
        scan_list : list of ints, or int
            Scans to use to populate the _ms dictionary. Python ints and numpy
            integers are accepted; duplicates are dropped and scans are handled
            in ascending order.
        spectrum_mode : str or None
            The spectrum mode to use for the mass spectra.
            If None, the mode of each scan is read from the "ms_format" column of
            scan_df (this allows for mixed types). Defaults to None.
        ms_level : int, optional
            Key of the _ms_unprocessed entry to read from when use_parser is False.
            Defaults to 1.
        use_parser : bool
            Whether to use the mass spectra parser to get the mass spectra.
            If False, the spectra are built from _ms_unprocessed. Defaults to True.
        auto_process : bool
            Whether to auto-process the mass spectra. Defaults to True.
        ms_params : MSParameters or None
            Parameters assigned to each mass spectrum. If None, the parameters the
            spectrum was created with are left untouched.

        Raises
        ------
        TypeError
            If scan_list is not a list of ints (or a single int).
        ValueError
            If polarity is not 'positive' or 'negative'.
            If use_parser is False and ms_level has no entry in _ms_unprocessed,
            the entry is None, a requested scan is missing from it, or the
            spectrum mode of a scan is not 'profile'.
        """
        integer_types = (int, np.integer)

        # A single scan number is shorthand for a one-element list
        if isinstance(scan_list, integer_types):
            scan_list = [scan_list]
        if not isinstance(scan_list, list):
            raise TypeError("scan_list must be a list of integers")
        if not all(isinstance(scan, integer_types) for scan in scan_list):
            raise TypeError("scan_list must be a list of integers")

        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
        if self.polarity == "negative":
            polarity = -1
        elif self.polarity == "positive":
            polarity = 1
        else:
            raise ValueError(
                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
            )

        # De-duplicate, normalise numpy integers to built-in ints, and sort
        scan_list = sorted({int(scan) for scan in scan_list})

        ms_df = None
        if not use_parser:
            if ms_level not in self._ms_unprocessed:
                raise ValueError(
                    "ms_level {} not found in _ms_unprocessed dictionary".format(
                        ms_level
                    )
                )
            unprocessed = self._ms_unprocessed[ms_level]
            if unprocessed is None:
                raise ValueError(
                    "No unprocessed data found for ms_level {}".format(ms_level)
                )
            if len(np.setdiff1d(scan_list, unprocessed.scan.tolist())) > 0:
                raise ValueError(
                    "Not all scans in scan_list are present in the unprocessed data"
                )
            # Index by scan so the rows of one scan can be pulled out by label
            ms_df = unprocessed.copy().set_index("scan", drop=False)

        # scan_df is rebuilt from a dictionary on every access, so read it once
        scan_df = self.scan_df if spectrum_mode is None else None

        for scan in scan_list:
            if spectrum_mode is None:
                spectrum_mode_scan = scan_df.loc[scan, "ms_format"]
            else:
                spectrum_mode_scan = spectrum_mode

            # Instantiate the mass spectrum object using the parser or the unprocessed data
            if use_parser:
                ms = self.spectra_parser.get_mass_spectrum_from_scan(
                    scan_number=scan,
                    spectrum_mode=spectrum_mode_scan,
                    auto_process=False,
                )
            else:
                if spectrum_mode_scan != "profile":
                    raise ValueError(
                        "Only profile mode is supported for unprocessed data"
                    )
                # A list label keeps the result a DataFrame even when the scan has
                # a single row, so mz/intensity are always array-like
                scan_rows = ms_df.loc[[scan]]
                ms = ms_from_array_profile(
                    scan_rows.mz,
                    scan_rows.intensity,
                    self.file_location,
                    polarity=polarity,
                    auto_process=False,
                )

            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
            if ms is None:
                continue
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = scan
            if auto_process:
                ms.process_mass_spec()
            self.add_mass_spectrum(ms)

    def get_time_of_scan_id(self, scan):
        """Returns the scan time for the specified scan number.

        Parameters
        -----------
        scan : int
            The scan number of the desired scan time.

        Returns
        --------
        float
            The entry of the retention time list stored at the position of the
            scan in the scan number list.

        Raises
        ------
        ValueError
            If the retention time list is empty.
            If the scan number is not in the scan number list.
        """
        if len(self._retention_time_list) == 0:
            raise ValueError("No retention times found in dataset")
        try:
            position = self._scans_number_list.index(scan)
        except ValueError:
            # Replace the generic "x is not in list" with a message naming the scan
            raise ValueError(
                "No scan time found for scan number {}".format(scan)
            ) from None
        return self._retention_time_list[position]

    @property
    def scan_df(self):
        """
        pandas.DataFrame : The scan info data (_scan_info) as a pandas DataFrame. Built anew on every access.
        """
        return pd.DataFrame.from_dict(self._scan_info)

    @scan_df.setter
    def scan_df(self, df):
        """
        Sets the scan data for the dataset.

        Parameters
        -----------
        df : pandas.DataFrame
            A pandas DataFrame containing the scan data; stored as a dictionary in _scan_info.
        """
        self._scan_info = df.to_dict()

    @property
    def ms(self):
        """
        dictionary : The mass spectra of the dataset, keyed by scan number.
        """
        return self._ms

    def __getitem__(self, scan_number):
        """Returns the mass spectrum stored for scan_number, or None if there is none."""
        return self._ms.get(scan_number)
Base class for mass spectra objects.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
- file_location (str or Path): The location of the file containing the mass spectra data.
- sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
- analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
- instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
- _scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
- _ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
- _ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is a dataframe with columns for scan number, m/z, and intensity. Initialized as an empty dictionary.
Methods
- add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None). Add mass spectra (or a single mass spectrum) to the _ms dictionary, from a list of scans.
- get_time_of_scan_id(scan). Returns the scan time for the specified scan number.
def __init__(
    self,
    file_location,
    analyzer="Unknown",
    instrument_label="Unknown",
    sample_name=None,
    spectra_parser=None,
):
    """Store the dataset metadata and create the empty scan/spectrum containers.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data. Strings are
        converted to Path; any other object is used as given.
    analyzer : str, optional
        The type of analyzer used to generate the data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample. When empty or None, the stem of file_location is used.
    spectra_parser : object, optional
        The spectra parser used to create the object. When given, it is stored
        together with its class, and a UserWarning is issued for each of
        sample_name, analyzer, instrument_label and file_location that differs
        from the parser's value.

    Raises
    ------
    FileExistsError
        If file_location does not exist.
    """
    location = Path(file_location) if isinstance(file_location, str) else file_location
    if not location.exists():
        raise FileExistsError("File does not exist: " + str(location))

    self.sample_name = sample_name if sample_name else location.stem
    self.file_location = location
    self.analyzer = analyzer
    self.instrument_label = instrument_label

    if spectra_parser is not None:
        self.spectra_parser_class = spectra_parser.__class__
        self.spectra_parser = spectra_parser
        # (attribute compared, label used in the warning text), in check order
        compared = (
            ("sample_name", "sample_name"),
            ("analyzer", "analyzer"),
            ("instrument_label", "instrument"),
            ("file_location", "file_location"),
        )
        for attribute, label in compared:
            own_value = getattr(self, attribute)
            # Only the sample name is exempt from the comparison when it is None
            if attribute == "sample_name" and own_value is None:
                continue
            if own_value != getattr(self.spectra_parser, attribute):
                warnings.warn(
                    "{0} provided to MassSpectraBase object does not match {0} provided to spectra parser object".format(
                        label
                    ),
                    UserWarning,
                )

    # Containers filled in later: scan table, processed spectra, raw spectra
    self._scan_info = {}
    self._ms = {}
    self._ms_unprocessed = {}
def add_mass_spectrum(self, mass_spec):
    """Register one mass spectrum in the _ms dictionary under its scan number.

    Parameters
    -----------
    mass_spec : MassSpectrum
        The corems MassSpectrum object to be added to the dataset.

    Raises
    ------
    ValueError
        If mass_spec has no scan_number attribute.

    Notes
    -----
    This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
    """
    if hasattr(mass_spec, "scan_number"):
        # An existing entry for the same scan number is overwritten
        self._ms[mass_spec.scan_number] = mass_spec
        return
    raise ValueError(
        "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
    )
Adds a mass spectrum to the dataset.
Parameters
- mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.
Notes
This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
def add_mass_spectra(
    self,
    scan_list,
    spectrum_mode=None,
    ms_level=1,
    use_parser=True,
    auto_process=True,
    ms_params=None,
):
    """Add mass spectra to _ms dictionary, from a list of scans or single scan

    Parameters
    -----------
    scan_list : list of ints, or int
        Scans to use to populate the _ms dictionary. Python ints and numpy
        integers are accepted; duplicates are dropped and scans are handled
        in ascending order.
    spectrum_mode : str or None
        The spectrum mode to use for the mass spectra.
        If None, the mode of each scan is read from the "ms_format" column of
        scan_df (this allows for mixed types). Defaults to None.
    ms_level : int, optional
        Key of the _ms_unprocessed entry to read from when use_parser is False.
        Defaults to 1.
    use_parser : bool
        Whether to use the mass spectra parser to get the mass spectra.
        If False, the spectra are built from _ms_unprocessed. Defaults to True.
    auto_process : bool
        Whether to auto-process the mass spectra. Defaults to True.
    ms_params : MSParameters or None
        Parameters assigned to each mass spectrum. If None, the parameters the
        spectrum was created with are left untouched.

    Raises
    ------
    TypeError
        If scan_list is not a list of ints (or a single int).
    ValueError
        If polarity is not 'positive' or 'negative'.
        If use_parser is False and ms_level has no entry in _ms_unprocessed,
        the entry is None, a requested scan is missing from it, or the
        spectrum mode of a scan is not 'profile'.
    """
    integer_types = (int, np.integer)

    # A single scan number is shorthand for a one-element list
    if isinstance(scan_list, integer_types):
        scan_list = [scan_list]
    if not isinstance(scan_list, list):
        raise TypeError("scan_list must be a list of integers")
    if not all(isinstance(scan, integer_types) for scan in scan_list):
        raise TypeError("scan_list must be a list of integers")

    # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
    if self.polarity == "negative":
        polarity = -1
    elif self.polarity == "positive":
        polarity = 1
    else:
        raise ValueError(
            "Polarity not set for dataset, must be a either 'positive' or 'negative'"
        )

    # De-duplicate, normalise numpy integers to built-in ints, and sort
    scan_list = sorted({int(scan) for scan in scan_list})

    ms_df = None
    if not use_parser:
        if ms_level not in self._ms_unprocessed:
            raise ValueError(
                "ms_level {} not found in _ms_unprocessed dictionary".format(
                    ms_level
                )
            )
        unprocessed = self._ms_unprocessed[ms_level]
        if unprocessed is None:
            raise ValueError(
                "No unprocessed data found for ms_level {}".format(ms_level)
            )
        if len(np.setdiff1d(scan_list, unprocessed.scan.tolist())) > 0:
            raise ValueError(
                "Not all scans in scan_list are present in the unprocessed data"
            )
        # Index by scan so the rows of one scan can be pulled out by label
        ms_df = unprocessed.copy().set_index("scan", drop=False)

    # Read scan_df once up front instead of once per scan inside the loop
    scan_df = self.scan_df if spectrum_mode is None else None

    for scan in scan_list:
        if spectrum_mode is None:
            spectrum_mode_scan = scan_df.loc[scan, "ms_format"]
        else:
            spectrum_mode_scan = spectrum_mode

        # Instantiate the mass spectrum object using the parser or the unprocessed data
        if use_parser:
            ms = self.spectra_parser.get_mass_spectrum_from_scan(
                scan_number=scan,
                spectrum_mode=spectrum_mode_scan,
                auto_process=False,
            )
        else:
            if spectrum_mode_scan != "profile":
                raise ValueError(
                    "Only profile mode is supported for unprocessed data"
                )
            # A list label keeps the result a DataFrame even when the scan has
            # a single row, so mz/intensity are always array-like
            scan_rows = ms_df.loc[[scan]]
            ms = ms_from_array_profile(
                scan_rows.mz,
                scan_rows.intensity,
                self.file_location,
                polarity=polarity,
                auto_process=False,
            )

        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
        if ms is None:
            continue
        if ms_params is not None:
            ms.parameters = ms_params
        ms.scan_number = scan
        if auto_process:
            ms.process_mass_spec()
        self.add_mass_spectrum(ms)
Add mass spectra to _ms dictionary, from a list of scans or single scan
Notes
The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
Parameters
- scan_list (list of ints): List of scans to use to populate _ms slot
- spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
- ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
- use_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
- auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
- ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.
Raises
- TypeError: If scan_list is not a list of ints
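- ValueError: If polarity is not 'positive' or 'negative'; or, when use_parser is False, if ms_level has no entry in _ms_unprocessed, the unprocessed data for ms_level is None, a requested scan is missing from the unprocessed data, or the spectrum mode is not 'profile'.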
def get_time_of_scan_id(self, scan):
    """Returns the scan time for the specified scan number.

    Parameters
    -----------
    scan : int
        The scan number of the desired scan time.

    Returns
    --------
    float
        The entry of the retention time list stored at the position of the
        scan in the scan number list.

    Raises
    ------
    ValueError
        If the retention time list is empty.
        If the scan number is not in the scan number list.
    """
    if len(self._retention_time_list) == 0:
        raise ValueError("No retention times found in dataset")
    try:
        position = self._scans_number_list.index(scan)
    except ValueError:
        # Replace the generic "x is not in list" with a message naming the scan
        raise ValueError(
            "No scan time found for scan number {}".format(scan)
        ) from None
    return self._retention_time_list[position]
Returns the scan time for the specified scan number.
Parameters
- scan (int): The scan number of the desired scan time.
Returns
- float: The scan time for the specified scan number (in minutes).
Raises
- ValueError: If no scan time is found for the specified scan number.
class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.

    This class is not intended to be instantiated directly, but rather to be instantiated
    by an appropriate mass spectra parser using the get_lcms_obj() method.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    polarity : str
        The polarity of the ionization mode used for the dataset.
    _parameters : LCMSParameters
        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
    _retention_time_list : numpy.ndarray
        An array of retention times for the dataset (an empty list until it is set).
    _scans_number_list : list
        A list of scan numbers for the dataset.
    _tic_list : numpy.ndarray
        An array of total ion current (TIC) values for the dataset (an empty list until it is set).
    eics : dict
        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
        Key is the mz of the EIC. Initialized as an empty dictionary.
    mass_features : dictionary of LCMSMassFeature objects
        A dictionary containing mass features for the dataset.
        Key is mass feature ID. Initialized as an empty dictionary.
    spectral_search_results : dictionary of MS2SearchResults objects
        A dictionary containing spectral search results for the dataset.
        Key is scan number : precursor mz. Initialized as an empty dictionary.

    Methods
    --------
    * get_parameters_json().
        Returns the parameters used for the LC-MS analysis in JSON format.
    * remove_unprocessed_data(ms_level=None)
        Drops the unprocessed data for one MS level, or for all stored levels.
    * add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None)
        Records which MS2 scans are associated with each mass feature in the
        mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
    * add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None)
        Adds the MS1 spectra associated with each mass feature to the
        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
    * mass_features_to_df()
        Returns a pandas dataframe summarizing the mass features in the dataset.
    * mass_features_ms1_annot_to_df()
        Returns a pandas dataframe summarizing the MS1 annotations of the mass features.
    * mass_features_ms2_annot_to_df(molecular_metadata=None)
        Returns a pandas dataframe summarizing the MS2 annotations of the mass features.
    * set_tic_list_from_data(overwrite=False)
        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
    * set_retention_time_from_data(overwrite=False)
        Sets the retention time list from the data in the _ms dictionary.
    * set_scans_number_from_data(overwrite=False)
        Sets the scan number list from the data in the _ms dictionary.
    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
        Generates plot of M/Z features comparing scan time vs M/Z value
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        super().__init__(
            file_location, analyzer, instrument_label, sample_name, spectra_parser
        )
        self.polarity = ""
        self._parameters = LCMSParameters()
        self._retention_time_list = []
        self._scans_number_list = []
        self._tic_list = []
        self.eics = {}
        self.mass_features = {}
        self.spectral_search_results = {}

    def get_parameters_json(self):
        """Returns the parameters stored for the LC-MS object in JSON format.

        Returns
        --------
        str
            The parameters used for the LC-MS analysis in JSON format.
        """
        return self.parameters.to_json()

    def remove_unprocessed_data(self, ms_level=None):
        """Removes the unprocessed data from the LCMSBase object.

        Parameters
        -----------
        ms_level : int, optional
            The MS level to remove the unprocessed data for. If None, removes
            unprocessed data for all MS levels currently stored.

        Raises
        ------
        ValueError
            If ms_level is given and is not 1 or 2.

        Notes
        -----
        This method is useful for freeing up memory after the data has been processed.
        """
        if ms_level is None:
            # Clear every stored level and stop here. Falling through to the
            # validation below with the loop variable used to raise ValueError
            # for an empty dictionary or for levels other than 1 and 2.
            for stored_level in list(self._ms_unprocessed):
                self._ms_unprocessed[stored_level] = None
            return
        if ms_level not in (1, 2):
            raise ValueError("ms_level must be 1 or 2")
        self._ms_unprocessed[ms_level] = None

    def add_associated_ms2_dda(
        self,
        auto_process=True,
        use_parser=True,
        spectrum_mode=None,
        ms_params_key="ms2",
        scan_filter=None,
    ):
        """Add MS2 spectra associated with mass features to the dataset.

        Populates the ms2_scan_numbers and ms2_mass_spectra attributes of the
        mass features (in the mass_features dictionary on the LCMS object).

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS2 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
        ms_params_key : string, optional
            The key of the mass spectrum parameters to use for the mass spectra, accessed from the
            LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
        scan_filter : str, optional
            A string used to filter the scans (matched against scan_text) before they are added to
            the _ms dictionary. If None, all scans are considered. Defaults to None.
            "hcd" will pull out only HCD scans.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If no MS2 scans with a precursor m/z and a positive TIC are found in the dataset
            (i.e. not a DDA dataset).
        """
        # mass_features is initialised to an empty dict, so test for emptiness
        # rather than only for None
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )

        # Mass spectrum parameters to use for the MS2 spectra
        ms_params = self.parameters.mass_spectrum[ms_params_key]

        mf_df = self.mass_features_to_df()

        # Keep MS2 scans that have a precursor m/z value and a non-zero TIC
        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
        ms2_scans = ms2_scans[ms2_scans.tic > 0]
        # Filtering a DataFrame never yields None, so check for an empty frame
        if ms2_scans.empty:
            raise ValueError("No DDA scans found in dataset")

        if scan_filter is not None:
            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]

        # Tolerances in rt space (in minutes) and mz space (in daltons)
        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance

        # For each mass feature, find the MS2 scans within the rt and mz windows
        dda_scans = set()
        for row in mf_df.itertuples():
            in_rt_window = ms2_scans.scan_time.between(
                row.scan_time - time_tol, row.scan_time + time_tol
            )
            in_mz_window = ms2_scans.precursor_mz.between(
                row.mz - mz_tol, row.mz + mz_tol
            )
            matched_scans = ms2_scans.loc[in_rt_window & in_mz_window, "scan"].tolist()
            dda_scans.update(matched_scans)
            mass_feature = self.mass_features[row.Index]
            mass_feature.ms2_scan_numbers = (
                matched_scans + mass_feature.ms2_scan_numbers
            )

        # Add the spectra to the _ms attribute (sorted for a deterministic order)
        self.add_mass_spectra(
            scan_list=sorted(dda_scans),
            auto_process=auto_process,
            spectrum_mode=spectrum_mode,
            use_parser=use_parser,
            ms_params=ms_params,
        )

        # Link each loaded spectrum to the mass features that reference it
        for mass_feature in self.mass_features.values():
            if mass_feature.ms2_scan_numbers is None:
                continue
            for dda_scan in mass_feature.ms2_scan_numbers:
                if dda_scan in self._ms:
                    mass_feature.ms2_mass_spectra[dda_scan] = self._ms[dda_scan]

    @staticmethod
    def _scans_around_apex(ms1_scans, apex_scan, scans_to_average):
        """Return the window of MS1 scans centred on apex_scan.

        The window holds up to scans_to_average scans and is truncated at the
        start and end of ms1_scans.
        """
        half_window = (int(scans_to_average) - 1) // 2
        apex_idx = ms1_scans.index(apex_scan)
        start = max(apex_idx - half_window, 0)
        # Slice ends are exclusive: clamp to len(ms1_scans), not len - 1,
        # otherwise the final MS1 scan can never be part of a window
        stop = min(apex_idx + half_window + 1, len(ms1_scans))
        return ms1_scans[start:stop]

    def add_associated_ms1(
        self, auto_process=True, use_parser=True, spectrum_mode=None
    ):
        """Add MS1 spectra associated with mass features to the dataset.

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS1 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
            Only used when a single scan is used per mass feature; averaged spectra are
            always built in profile mode.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
            If number of scans to average is not 1 or a positive odd integer (i.e. 3, 5, 7, 9).
            If the polarity is neither 'positive' nor 'negative' when averaging scans.
            If use_parser is False and scans to average are missing from the unprocessed data.
        """
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )
        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
        ms1_params = self.parameters.mass_spectrum["ms1"]

        if scans_to_average == 1:
            # A single scan per mass feature: load the apex scans directly
            self.add_mass_spectra(
                scan_list=[
                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
                ],
                auto_process=auto_process,
                use_parser=use_parser,
                spectrum_mode=spectrum_mode,
                ms_params=ms1_params,
            )

        elif scans_to_average > 1 and (scans_to_average - 1) % 2 == 0:
            # scans_to_average = 3, 5, 7 etc, mirror left/right around the apex
            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
                raise ValueError("All apex scans must be profile mode for averaging")

            ms1_scans = self.ms1_scans
            scans_lists = [
                self._scans_around_apex(ms1_scans, apex_scan, scans_to_average)
                for apex_scan in apex_scans
            ]

            # -1 for negative mode, 1 for positive mode (for mass spectrum creation)
            polarity = {"negative": -1, "positive": 1}.get(self.polarity)
            if polarity is None:
                raise ValueError(
                    "polarity must be 'positive' or 'negative' to average MS1 scans"
                )

            if not use_parser:
                # Index the unprocessed MS1 data by scan number once up front
                # (saves time compared to doing this for every averaged spectrum)
                ms1_unprocessed = (
                    self._ms_unprocessed[1].copy().set_index("scan", drop=False)
                )
                self._ms_unprocessed[1] = ms1_unprocessed

            try:
                if not use_parser:
                    # Every scan to average must be present in the unprocessed data
                    needed_scans = {
                        scan for scan_list in scans_lists for scan in scan_list
                    }
                    if needed_scans - set(ms1_unprocessed.index.values):
                        raise ValueError(
                            "Not all scans to average are present in the unprocessed data"
                        )

                for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
                    ms = self.get_average_mass_spectrum(
                        scan_list=scan_list_average,
                        apex_scan=apex_scan,
                        spectrum_mode="profile",
                        ms_level=1,
                        auto_process=auto_process,
                        use_parser=use_parser,
                        perform_checks=False,
                        polarity=polarity,
                        ms_params=ms1_params,
                    )
                    self.add_mass_spectrum(ms)
            finally:
                if not use_parser:
                    # Restore the positional index even when averaging failed
                    self._ms_unprocessed[1] = ms1_unprocessed.reset_index(drop=True)
        else:
            raise ValueError(
                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
            )

        # Associate the ms1 spectra with the mass features
        for mass_feature in self.mass_features.values():
            mass_feature.mass_spectrum = self._ms[mass_feature.apex_scan]
            mass_feature.update_mz()

    @staticmethod
    def _mass_spectrum_to_string(mass_spec, normalize=True, min_normalized_abun=0.01):
        """Converts a mass spectrum to a string of m/z:abundance pairs.

        Parameters
        -----------
        mass_spec : MassSpectrum
            A MassSpectrum object to be converted to a string.
        normalize : bool, optional
            If True, normalizes the abundance values to a maximum of 1. Defaults to True.
        min_normalized_abun : float, optional
            The minimum normalized abundance value to include in the string,
            only used if normalize is True. Defaults to 0.01.

        Returns
        --------
        str
            A string of m/z:abundance pairs from the mass spectrum, separated by "; ".
            Empty if the spectrum has no peaks.
        """
        # Build the dataframe once and reuse it for both columns
        spectrum_df = mass_spec.to_dataframe()
        mz_np = spectrum_df["m/z"].values
        abun_np = spectrum_df["Peak Height"].values
        if mz_np.size == 0:
            # max() of an empty array raises; there is nothing to report anyway
            return ""
        if normalize:
            abun_np = abun_np / abun_np.max()
        mz_abun = np.column_stack((mz_np, abun_np))
        if normalize:
            mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
        return "; ".join(
            str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
            for mz, abun in mz_abun
        )

    def mass_features_to_df(self):
        """Returns a pandas dataframe summarizing the mass features.

        The dataframe is indexed by mf_id (mass feature ID). Depending on which
        attributes are available on the mass features, it contains the columns
        scan_time, mz, apex_scan, start_scan, final_scan, intensity, persistence,
        area, half_height_width, tailing_factor, dispersity_index,
        monoisotopic_mf_id, isotopologue_type, mass_spectrum_deconvoluted_parent,
        associated_mass_features and ms2_spectrum.

        Returns
        --------
        pandas.DataFrame
            A pandas dataframe of mass features, indexed by mf_id.

        Raises
        ------
        ValueError
            If there are no mass features in the dataset.
        """
        if not self.mass_features:
            # pd.concat would raise an opaque "No objects to concatenate" error
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )

        cols_in_df = [
            "id",
            "_apex_scan",
            "start_scan",
            "final_scan",
            "_retention_time",
            "_intensity",
            "_persistence",
            "_area",
            "_dispersity_index",
            "_tailing_factor",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
        ]
        df_mf_list = []
        for mf_id, mass_feature in self.mass_features.items():
            # Only report the attributes that exist on this mass feature
            available = set(dir(mass_feature))
            dict_mf = {
                key: getattr(mass_feature, key)
                for key in cols_in_df
                if key in available
            }
            if len(mass_feature.ms2_scan_numbers) > 0:
                # Add MS2 spectra info
                best_ms2_spectrum = mass_feature.best_ms2
                # NOTE(review): best_ms2 is presumably None while the MS2 spectra
                # have not been loaded yet; skip the column in that case - confirm
                if best_ms2_spectrum is not None:
                    dict_mf["ms2_spectrum"] = self._mass_spectrum_to_string(
                        best_ms2_spectrum
                    )
            if len(mass_feature.associated_mass_features_deconvoluted) > 0:
                dict_mf["associated_mass_features"] = ", ".join(
                    map(str, mass_feature.associated_mass_features_deconvoluted)
                )
            if mass_feature._half_height_width is not None:
                dict_mf["half_height_width"] = mass_feature.half_height_width
            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
            df_mf_single["mz"] = mass_feature.mz
            df_mf_list.append(df_mf_single)
        df_mf = pd.concat(df_mf_list)

        # Expose the private attribute names under their public column names
        df_mf = df_mf.rename(
            columns={
                "_area": "area",
                "id": "mf_id",
                "_apex_scan": "apex_scan",
                "_retention_time": "scan_time",
                "_intensity": "intensity",
                "_persistence": "persistence",
                "_dispersity_index": "dispersity_index",
                "_tailing_factor": "tailing_factor",
            }
        )

        # Reorder columns, keeping only those that are present
        col_order = [
            "mf_id",
            "scan_time",
            "mz",
            "apex_scan",
            "start_scan",
            "final_scan",
            "intensity",
            "persistence",
            "area",
            "half_height_width",
            "tailing_factor",
            "dispersity_index",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
            "associated_mass_features",
            "ms2_spectrum",
        ]
        df_mf = df_mf[[col for col in col_order if col in df_mf.columns]]

        # Index by mass feature ID (set_index also names the index "mf_id")
        return df_mf.set_index("mf_id")

    def mass_features_ms1_annot_to_df(self):
        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS1 annotations for the mass features in the dataset.
            The index is set to mf_id (mass feature ID). None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS1 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms1 = []
        for mf_id, mass_feature in self.mass_features.items():
            if mass_feature.mass_spectrum is None:
                continue
            # Only use the MS1 peak when it lies within 0.01 m/z of the mass feature
            if np.abs(mass_feature.ms1_peak.mz_exp - mass_feature.mz) >= 0.01:
                continue
            annot_df = mass_feature.mass_spectrum.to_dataframe()
            # Subset to pull out only the peak associated with the mass feature
            annot_df = annot_df[
                annot_df["Index"] == mass_feature.ms1_peak.index
            ].copy()
            # Remove the index column and add column for mf_id
            annot_df = annot_df.drop(columns=["Index"])
            annot_df["mf_id"] = mf_id
            annot_df_list_ms1.append(annot_df)

        if not annot_df_list_ms1:
            warnings.warn(
                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
                UserWarning,
            )
            return None

        return pd.concat(annot_df_list_ms1).set_index("mf_id")

    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.

        Parameters
        -----------
        molecular_metadata : dict of MolecularMetadata objects
            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS2 annotations for the mass features in the dataset,
            and optionally molecular metadata. The index is set to mf_id (mass feature ID).
            None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS2 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms2 = []
        for mf_id, mass_feature in self.mass_features.items():
            for result in mass_feature.ms2_similarity_results:
                annot_df_ms2 = result.to_dataframe()
                annot_df_ms2["mf_id"] = mf_id
                annot_df_list_ms2.append(annot_df_ms2)

        if not annot_df_list_ms2:
            warnings.warn(
                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
                UserWarning,
            )
            return None

        annot_ms2_df_full = pd.concat(annot_df_list_ms2)
        if molecular_metadata is not None:
            # One row per metadata object, built from its attributes
            molecular_metadata_df = pd.concat(
                [
                    pd.DataFrame.from_dict(meta.__dict__, orient="index").transpose()
                    for meta in molecular_metadata.values()
                ],
                ignore_index=True,
            )
            molecular_metadata_df = molecular_metadata_df.rename(
                columns={"id": "ref_mol_id"}
            )
            annot_ms2_df_full = annot_ms2_df_full.merge(
                molecular_metadata_df, on="ref_mol_id", how="left"
            )
        annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
            subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
        ).copy()
        return annot_ms2_df_full.set_index("mf_id")

    def plot_composite_mz_features(
        self,
        binsize=1e-4,
        ph_int_min_thresh=0.001,
        mf_plot=True,
        ms2_plot=True,
        return_fig=False,
    ):
        """Returns a figure displaying
            (1) thresholded, unprocessed data
            (2) the m/z features
            (3) which m/z features are associated with MS2 spectra

        Parameters
        -----------
        binsize : float
            Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
        ph_int_min_thresh : float
            Fraction of the maximum unprocessed intensity below which data points
            are not plotted. Defaults to 0.001.
        mf_plot : boolean
            Indicates whether to plot the m/z features. Defaults to True.
        ms2_plot : boolean
            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
        return_fig : boolean
            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.

        Returns
        --------
        matplotlib.pyplot.Figure
            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to
            scan time. Unprocessed data is displayed in gray scale. If m/z features are plotted,
            they are displayed in cyan. If m/z features associated with MS2 spectra are plotted,
            they are displayed in red. Only returned if return_fig is True.

        Raises
        ------
        ValueError
            If m/z features are set to be plotted but aren't in the dataset.
            If m/z features with associated MS2 data are set to be plotted but no MS2 spectra
            were associated with the m/z features in the dataset.
            If no unprocessed MS1 data is available.
        """
        mf_df = None
        if mf_plot or ms2_plot:
            if not self.mass_features:
                raise ValueError(
                    "mass_features not set, must run find_mass_features() first"
                )
            mf_df = self.mass_features_to_df()

        if ms2_plot and "ms2_spectrum" not in mf_df.columns:
            raise ValueError(
                "ms2_spectrum not set, must run add_associated_ms2_dda() first"
            )

        # Threshold and grid unprocessed data
        unprocessed_ms1 = self._ms_unprocessed.get(1)
        if unprocessed_ms1 is None:
            raise ValueError("No unprocessed MS1 data found in dataset")
        df = unprocessed_ms1.copy()
        df = df.dropna(subset=["intensity"]).reset_index(drop=True)
        threshold = ph_int_min_thresh * df.intensity.max()
        df_thres = df[df["intensity"] > threshold].reset_index(drop=True).copy()
        df = self.grid_data(df_thres)

        # Format unprocessed data for plotting: bin m/z values and sum the
        # intensities falling in the same (m/z bin, scan time) cell
        df = df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
        mz_grid = np.arange(0, np.max(df.mz), binsize)
        mz_data = np.array(df.mz)
        df["mz_bin"] = find_closest(mz_grid, mz_data)
        # The string alias avoids the pandas deprecation of builtin callables
        df["ab_bin"] = df.groupby(["mz_bin", "scan_time"]).intensity.transform("sum")
        unproc_df = df[["scan_time", "mz_bin", "ab_bin"]].drop_duplicates(
            ignore_index=True
        )

        # Generate figure
        fig = plt.figure()
        relative_abundance = unproc_df.ab_bin / np.max(unproc_df.ab_bin)
        plt.scatter(
            unproc_df.scan_time,
            unproc_df.mz_bin * binsize,
            c=relative_abundance,
            alpha=relative_abundance,
            cmap="Greys_r",
            s=1,
        )

        if ms2_plot:
            has_ms2 = ~mf_df.ms2_spectrum.isna()

        if mf_plot:
            if ms2_plot:
                plt.scatter(
                    mf_df[~has_ms2].scan_time,
                    mf_df[~has_ms2].mz,
                    c="c",
                    s=4,
                    label="M/Z features without MS2",
                )
            else:
                plt.scatter(
                    mf_df.scan_time,
                    mf_df.mz,
                    c="c",
                    s=4,
                    label="M/Z features",
                )

        if ms2_plot:
            plt.scatter(
                mf_df[has_ms2].scan_time,
                mf_df[has_ms2].mz,
                c="r",
                s=2,
                label="M/Z features with MS2",
            )

        if mf_plot or ms2_plot:
            plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.25), ncol=2)
        plt.xlabel("Scan time")
        plt.ylabel("m/z")
        plt.ylim(0, np.ceil(np.max(df.mz)))
        plt.xlim(0, np.ceil(np.max(df.scan_time)))
        plt.title("Composite Feature Map")

        if return_fig:
            # Close the figure so it is not also displayed by the backend
            plt.close(fig)
            return fig

        plt.show()

    def __len__(self):
        """
        Returns the number of mass spectra in the dataset.

        Returns
        --------
        int
            The number of mass spectra in the dataset.
        """
        return len(self._ms)

    def __getitem__(self, scan_number):
        """
        Returns the mass spectrum corresponding to the specified scan number.

        Parameters
        -----------
        scan_number : int
            The scan number of the desired mass spectrum.

        Returns
        --------
        MassSpectrum or None
            The mass spectrum corresponding to the specified scan number,
            or None if the scan number is not in the _ms dictionary.
        """
        return self._ms.get(scan_number)

    def __iter__(self):
        """Returns an iterator over the mass spectra in the dataset.

        Returns
        --------
        iterator
            An iterator over the mass spectra in the dataset.
        """
        return iter(self._ms.values())

    def set_tic_list_from_data(self, overwrite=False):
        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the TIC list if it is already set. Defaults to False.

        Notes
        -----
        TIC values are collected in the order of the scans_number list, so that
        list must be set first and every scan in it must be present in _ms.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the TIC list is already set and overwrite is False.
        """
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        if len(self.tic) > 0 and not overwrite:
            raise ValueError("TIC list already set, use overwrite=True to overwrite")

        self.tic = [self._ms.get(i).tic for i in self.scans_number]

    def set_retention_time_from_data(self, overwrite=False):
        """Sets the retention time list from the data in the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the retention time list if it is already set. Defaults to False.

        Notes
        -----
        Retention times are collected in ascending scan number order.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the retention time list is already set and overwrite is False.
        """
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        if len(self.retention_time) > 0 and not overwrite:
            raise ValueError(
                "Retention time list already set, use overwrite=True to overwrite"
            )

        self.retention_time = [
            self._ms.get(key_ms).retention_time for key_ms in sorted(self._ms.keys())
        ]

    def set_scans_number_from_data(self, overwrite=False):
        """Sets the scan number list from the data in the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the scan number list if it is already set. Defaults to False.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the scan number list is already set and overwrite is False.
        """
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        if len(self.scans_number) > 0 and not overwrite:
            raise ValueError(
                "Scan number list already set, use overwrite=True to overwrite"
            )

        self.scans_number = sorted(self._ms.keys())

    @property
    def ms1_scans(self):
        """
        list : A list of MS1 scan numbers for the dataset.
        """
        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()

    @property
    def parameters(self):
        """
        LCMSParameters : The parameters used for the LC-MS analysis.
        """
        return self._parameters

    @parameters.setter
    def parameters(self, paramsinstance):
        """
        Sets the parameters used for the LC-MS analysis.

        Parameters
        -----------
        paramsinstance : LCMSParameters
            The parameters used for the LC-MS analysis.
        """
        self._parameters = paramsinstance

    @property
    def scans_number(self):
        """
        list : A list of scan numbers for the dataset.
        """
        return self._scans_number_list

    @scans_number.setter
    def scans_number(self, scan_numbers_list):
        """
        Sets the scan numbers for the dataset.

        Parameters
        -----------
        scan_numbers_list : list
            A list of scan numbers for the dataset.
        """
        self._scans_number_list = scan_numbers_list

    @property
    def retention_time(self):
        """
        numpy.ndarray : An array of retention times for the dataset.
        """
        return self._retention_time_list

    @retention_time.setter
    def retention_time(self, rt_list):
        """
        Sets the retention times for the dataset.

        Parameters
        -----------
        rt_list : list
            A list of retention times for the dataset.
        """
        self._retention_time_list = np.array(rt_list)

    @property
    def tic(self):
        """
        numpy.ndarray : An array of TIC values for the dataset.
        """
        return self._tic_list

    @tic.setter
    def tic(self, tic_list):
        """
        Sets the TIC values for the dataset.

        Parameters
        -----------
        tic_list : list
            A list of TIC values for the dataset.
        """
        self._tic_list = np.array(tic_list)
A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- polarity (str): The polarity of the ionization mode used for the dataset.
- _parameters (LCMSParameters): The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
- _retention_time_list (numpy.ndarray): An array of retention times for the dataset.
- _scans_number_list (list): A list of scan numbers for the dataset.
- _tic_list (numpy.ndarray): An array of total ion current (TIC) values for the dataset.
- eics (dict): A dictionary containing extracted ion chromatograms (EICs) for the dataset. Key is the mz of the EIC. Initialized as an empty dictionary.
- mass_features (dictionary of LCMSMassFeature objects): A dictionary containing mass features for the dataset. Key is mass feature ID. Initialized as an empty dictionary.
- spectral_search_results (dictionary of MS2SearchResults objects): A dictionary containing spectral search results for the dataset. Key is scan number : precursor mz. Initialized as an empty dictionary.
Methods
- get_parameters_json(). Returns the parameters used for the LC-MS analysis in JSON format.
- add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key='ms2', scan_filter=None) Records which MS2 scans are associated with each mass feature in the mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
- add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None) Adds the MS1 spectra associated with each mass feature to the mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
- mass_features_to_df() Returns a pandas dataframe summarizing the mass features in the dataset.
- set_tic_list_from_data(overwrite=False) Sets the TIC list from the mass spectrum objects within the _ms dictionary.
- set_retention_time_from_data(overwrite=False) Sets the retention time list from the data in the _ms dictionary.
- set_scans_number_from_data(overwrite=False) Sets the scan number list from the data in the _ms dictionary.
- plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) Generates a plot of the m/z features, showing m/z value against scan time.
def __init__(
    self,
    file_location,
    analyzer="Unknown",
    instrument_label="Unknown",
    sample_name=None,
    spectra_parser=None,
):
    """Create the LC-MS object and initialise its empty containers.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the object. Defaults to None.
    """
    # File, sample and parser bookkeeping is delegated to the parent
    # initialiser; arguments are forwarded positionally, unchanged.
    super().__init__(
        file_location,
        analyzer,
        instrument_label,
        sample_name,
        spectra_parser,
    )

    # Polarity starts as an empty string until it is assigned elsewhere.
    self.polarity = ""
    # Processing parameters start from a fresh LCMSParameters instance.
    self._parameters = LCMSParameters()

    # Per-scan series, empty until populated.
    self._retention_time_list = []
    self._scans_number_list = []
    self._tic_list = []

    # Result containers, all initially empty dictionaries.
    self.eics = {}
    self.mass_features = {}
    self.spectral_search_results = {}
def get_parameters_json(self):
    """Serialise the parameters stored on the LC-MS object to JSON.

    Returns
    --------
    str
        Whatever ``self.parameters.to_json()`` returns, i.e. the parameters
        used for the LC-MS analysis in JSON format.
    """
    lcms_parameters = self.parameters
    return lcms_parameters.to_json()
Returns the parameters stored for the LC-MS object in JSON format.
Returns
- str: The parameters used for the LC-MS analysis in JSON format.
def remove_unprocessed_data(self, ms_level=None):
    """Removes the unprocessed data from the LCMSBase object.

    Parameters
    -----------
    ms_level : int, optional
        The MS level to remove the unprocessed data for. If None, removes
        unprocessed data for all MS levels currently stored.

    Raises
    ------
    ValueError
        If ms_level is given and is not 1 or 2.

    Notes
    -----
    This method is useful for freeing up memory after the data has been processed.
    Entries are set to None rather than deleted, so the keys remain present.
    """
    if ms_level is None:
        # Clear every stored level and stop here. Falling through to the
        # validation below would reject an empty dictionary (ms_level still
        # None) or a dictionary whose last key is not 1 or 2.
        for level in list(self._ms_unprocessed):
            self._ms_unprocessed[level] = None
        return

    if ms_level not in (1, 2):
        raise ValueError("ms_level must be 1 or 2")
    self._ms_unprocessed[ms_level] = None
Removes the unprocessed data from the LCMSBase object.
Parameters
- ms_level (int, optional): The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
Raises
- ValueError: If ms_level is not 1 or 2.
Notes
This method is useful for freeing up memory after the data has been processed.
def add_associated_ms2_dda(
    self,
    auto_process=True,
    use_parser=True,
    spectrum_mode=None,
    ms_params_key="ms2",
    scan_filter=None,
):
    """Add MS2 spectra associated with mass features to the dataset.

    Populates the ms2_scan_numbers and ms2_mass_spectra attributes of the
    objects in the mass_features dictionary, and adds the matching MS2 scans
    to the object's _ms dictionary via add_mass_spectra().

    Parameters
    -----------
    auto_process : bool, optional
        If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
    use_parser : bool, optional
        If True, invoke the spectra parser to get the MS2 spectra. Default is True.
    spectrum_mode : str or None, optional
        The spectrum mode to use for the mass spectra; forwarded unchanged to
        add_mass_spectra(). Defaults to None.
    ms_params_key : string, optional
        The key of the mass spectrum parameters to use for the mass spectra, accessed from the
        LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
    scan_filter : str
        A string used to filter MS2 scans on their scan_text (passed to
        pandas ``str.contains``). If None, no filtering is applied. Defaults to None.
        "hcd" will pull out only HCD scans.

    Raises
    ------
    ValueError
        If mass_features is empty or not set, must run find_mass_features() first.
        If no MS2 scans with a precursor m/z and a positive TIC are found in the dataset.
    """
    # An empty dictionary (the value set at construction) means no features
    # have been found yet, just like None would.
    if not self.mass_features:
        raise ValueError(
            "mass_features not set, must run find_mass_features() first"
        )

    # Select the mass spectrum parameter set requested by the caller.
    ms_params = self.parameters.mass_spectrum[ms_params_key]

    mf_df = self.mass_features_to_df().copy()

    # Candidate DDA scans: MS2 level, with a precursor m/z and a non-zero TIC.
    ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
    ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
    ms2_scans = ms2_scans[ms2_scans.tic > 0]
    # Boolean filtering never yields None, so test for an empty frame.
    if ms2_scans.empty:
        raise ValueError("No DDA scans found in dataset")

    if scan_filter is not None:
        ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]

    # Tolerances in retention-time space and m/z space.
    time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
    mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance

    # For each mass feature, collect the MS2 scans whose scan time and
    # precursor m/z fall within tolerance of the feature.
    dda_scans = set()
    for mf_id, row in mf_df.iterrows():
        in_time_window = ms2_scans[
            ms2_scans.scan_time.between(
                row.scan_time - time_tol, row.scan_time + time_tol
            )
        ]
        matched = in_time_window[
            in_time_window.precursor_mz.between(row.mz - mz_tol, row.mz + mz_tol)
        ]
        matched_scans = matched.scan.tolist()
        dda_scans.update(matched_scans)
        # Newly matched scans are placed ahead of any previously recorded ones.
        self.mass_features[mf_id].ms2_scan_numbers = (
            matched_scans + self.mass_features[mf_id].ms2_scan_numbers
        )

    # Load each distinct matched scan once into the _ms attribute.
    self.add_mass_spectra(
        scan_list=list(dda_scans),
        auto_process=auto_process,
        spectrum_mode=spectrum_mode,
        use_parser=use_parser,
        ms_params=ms_params,
    )

    # Link the loaded spectra back onto the mass features that reference them.
    for mass_feature in self.mass_features.values():
        if mass_feature.ms2_scan_numbers is not None:
            for dda_scan in mass_feature.ms2_scan_numbers:
                if dda_scan in self._ms:
                    mass_feature.ms2_mass_spectra[dda_scan] = self._ms[dda_scan]
Add MS2 spectra associated with mass features to the dataset.
Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
Parameters
- auto_process (bool, optional): If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, invoke the spectra parser to get the MS2 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
- ms_params_key (string, optional): The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
- scan_filter (str): A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. "hcd" will pull out only HCD scans.
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If no MS2 scans are found in the dataset. If no precursor m/z values are found in MS2 scans, not a DDA dataset.
def add_associated_ms1(
    self, auto_process=True, use_parser=True, spectrum_mode=None
):
    """Add MS1 spectra associated with mass features to the dataset.

    Depending on ``parameters.lc_ms.ms1_scans_to_average`` either the apex
    scan of every mass feature is added as-is (value 1), or an averaged
    spectrum built from a window of MS1 scans centred on the apex scan is
    added (odd values 3, 5, 7, ...). Each mass feature then gets its
    mass_spectrum attribute set to the spectrum stored under its apex scan,
    and update_mz() is called on it.

    Parameters
    -----------
    auto_process : bool, optional
        If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
    use_parser : bool, optional
        If True, invoke the spectra parser to get the MS1 spectra. If False, the averaging path
        reads from the unprocessed MS1 data held in _ms_unprocessed[1]. Default is True.
    spectrum_mode : str or None, optional
        The spectrum mode forwarded to add_mass_spectra() when a single scan is used.
        It is not used when averaging, which always requests "profile". Defaults to None.

    Raises
    ------
    ValueError
        If mass_features is empty or not set, must run find_mass_features() first.
        If apex scans are not profile mode, all apex scans must be profile mode for averaging.
        If polarity is neither "negative" nor "positive" when averaging.
        If use_parser is False and a scan to average is missing from the unprocessed data.
        If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9).
    """
    # An empty dictionary (the value set at construction) means no features yet.
    if not self.mass_features:
        raise ValueError(
            "mass_features not set, must run find_mass_features() first"
        )
    scans_to_average = self.parameters.lc_ms.ms1_scans_to_average

    if scans_to_average == 1:
        # No averaging: load the apex scans directly.
        self.add_mass_spectra(
            scan_list=[
                int(x) for x in self.mass_features_to_df().apex_scan.tolist()
            ],
            auto_process=auto_process,
            use_parser=use_parser,
            spectrum_mode=spectrum_mode,
            ms_params=self.parameters.mass_spectrum["ms1"],
        )

    elif (
        (scans_to_average - 1) % 2
    ) == 0:  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
        apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
        # Check if all apex scans are profile mode, raise error if not
        if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
            raise ValueError("All apex scans must be profile mode for averaging")

        # Number of scans taken on each side of the apex.
        half_window = int((scans_to_average - 1) / 2)

        def get_scans_from_apex(ms1_scans, apex_scan):
            """Return the MS1 scans in the window centred on apex_scan."""
            apex_idx = ms1_scans.index(apex_scan)
            window_start = max(apex_idx - half_window, 0)
            # The slice end is exclusive, so the upper clamp is len(ms1_scans);
            # clamping to len - 1 would drop the final MS1 scan, and with it
            # the apex itself when the apex is the last scan.
            window_end = min(apex_idx + half_window + 1, len(ms1_scans))
            return ms1_scans[window_start:window_end]

        ms1_scans = self.ms1_scans
        scans_lists = [
            get_scans_from_apex(ms1_scans, apex_scan) for apex_scan in apex_scans
        ]

        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
        if self.polarity == "negative":
            polarity = -1
        elif self.polarity == "positive":
            polarity = 1
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "polarity must be 'negative' or 'positive' to average MS1 scans, "
                f"got {self.polarity!r}"
            )

        if not use_parser:
            # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
            ms1_unprocessed = self._ms_unprocessed[1].copy()
            # Set the index on _ms_unprocessed[1] to scan number
            ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)
            self._ms_unprocessed[1] = ms1_unprocessed

            # Check that all the scans in scan_lists are indexes in self._ms_unprocessed[1]
            scans_lists_flat = list(
                {scan for sublist in scans_lists for scan in sublist}
            )
            if (
                len(
                    np.setdiff1d(
                        np.sort(scans_lists_flat),
                        np.sort(ms1_unprocessed.index.values),
                    )
                )
                > 0
            ):
                raise ValueError(
                    "Not all scans to average are present in the unprocessed data"
                )

        for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
            # Get the averaged mass spectrum for this window of scans
            ms = self.get_average_mass_spectrum(
                scan_list=scan_list_average,
                apex_scan=apex_scan,
                spectrum_mode="profile",
                ms_level=1,
                auto_process=auto_process,
                use_parser=use_parser,
                perform_checks=False,
                polarity=polarity,
                ms_params=self.parameters.mass_spectrum["ms1"],
            )
            # Add mass spectrum to the LCMS object
            self.add_mass_spectrum(ms)

        if not use_parser:
            # Reset the index on _ms_unprocessed[1] to not be scan number
            ms1_unprocessed = ms1_unprocessed.reset_index(drop=True)
            self._ms_unprocessed[1] = ms1_unprocessed
    else:
        raise ValueError(
            "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
        )

    # Associate the ms1 spectra with the mass features
    for mf_id in self.mass_features:
        self.mass_features[mf_id].mass_spectrum = self._ms[
            self.mass_features[mf_id].apex_scan
        ]
        self.mass_features[mf_id].update_mz()
Add MS1 spectra associated with mass features to the dataset.
Parameters
- auto_process (bool, optional): If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, invoke the spectra parser to get the MS1 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If apex scans are not profile mode, all apex scans must be profile mode for averaging. If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
def mass_features_to_df(self):
    """Returns a pandas dataframe summarizing the mass features.

    One row is built per entry of the mass_features dictionary. Columns are
    included only when present on the mass feature objects, and are ordered as:
    scan_time, mz, apex_scan, start_scan, final_scan, intensity, persistence,
    area, half_height_width, tailing_factor, dispersity_index,
    monoisotopic_mf_id, isotopologue_type, mass_spectrum_deconvoluted_parent,
    associated_mass_features, ms2_spectrum. The index is set to mf_id
    (mass feature ID).

    Returns
    --------
    pandas.DataFrame
        A pandas dataframe of mass features indexed by mf_id.
    """

    def mass_spectrum_to_string(
        mass_spec, normalize=True, min_normalized_abun=0.01
    ):
        """Converts a mass spectrum to a string of m/z:abundance pairs.

        Parameters
        -----------
        mass_spec : MassSpectrum
            A MassSpectrum object to be converted to a string.
        normalize : bool, optional
            If True, normalizes the abundance values to a maximum of 1. Defaults to True.
        min_normalized_abun : float, optional
            The minimum normalized abundance value to include in the string,
            only used if normalize is True. Defaults to 0.01.

        Returns
        --------
        str
            A string of m/z:abundance pairs from the mass spectrum, separated by "; ".
        """
        # Build the peak table once and read both columns from it.
        peak_table = mass_spec.to_dataframe()
        mz_np = peak_table["m/z"].values
        abun_np = peak_table["Peak Height"].values
        if normalize:
            abun_np = abun_np / abun_np.max()
        mz_abun = np.column_stack((mz_np, abun_np))
        if normalize:
            mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
        mz_abun_str = [
            str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
            for mz, abun in mz_abun
        ]
        return "; ".join(mz_abun_str)

    # Attributes copied verbatim from each mass feature when it has them.
    cols_in_df = [
        "id",
        "_apex_scan",
        "start_scan",
        "final_scan",
        "_retention_time",
        "_intensity",
        "_persistence",
        "_area",
        "_dispersity_index",
        "_tailing_factor",
        "monoisotopic_mf_id",
        "isotopologue_type",
        "mass_spectrum_deconvoluted_parent",
    ]
    df_mf_list = []
    for mf_id, mass_feature in self.mass_features.items():
        # Find cols_in_df that are available on this mass feature
        df_keys = list(set(cols_in_df).intersection(mass_feature.__dir__()))
        dict_mf = {key: getattr(mass_feature, key) for key in df_keys}
        if len(mass_feature.ms2_scan_numbers) > 0:
            # Add MS2 spectra info
            dict_mf["ms2_spectrum"] = mass_spectrum_to_string(mass_feature.best_ms2)
        if len(mass_feature.associated_mass_features_deconvoluted) > 0:
            dict_mf["associated_mass_features"] = ", ".join(
                map(str, mass_feature.associated_mass_features_deconvoluted)
            )
        if mass_feature._half_height_width is not None:
            dict_mf["half_height_width"] = mass_feature.half_height_width
        df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
        df_mf_single["mz"] = mass_feature.mz
        df_mf_list.append(df_mf_single)
    df_mf = pd.concat(df_mf_list)

    # Rename private attribute names to their public column names
    df_mf = df_mf.rename(
        columns={
            "_area": "area",
            "id": "mf_id",
            "_apex_scan": "apex_scan",
            "_retention_time": "scan_time",
            "_intensity": "intensity",
            "_persistence": "persistence",
            "_dispersity_index": "dispersity_index",
            "_tailing_factor": "tailing_factor",
        }
    )

    # Reorder columns, keeping only those that are actually present
    col_order = [
        "mf_id",
        "scan_time",
        "mz",
        "apex_scan",
        "start_scan",
        "final_scan",
        "intensity",
        "persistence",
        "area",
        "half_height_width",
        "tailing_factor",
        "dispersity_index",
        "monoisotopic_mf_id",
        "isotopologue_type",
        "mass_spectrum_deconvoluted_parent",
        "associated_mass_features",
        "ms2_spectrum",
    ]
    cols_to_order = [col for col in col_order if col in df_mf.columns]
    df_mf = df_mf[cols_to_order]

    # Use mf_id as the index
    df_mf = df_mf.set_index("mf_id")
    df_mf.index.name = "mf_id"

    return df_mf
Returns a pandas dataframe summarizing the mass features.
The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID).
Returns
- pandas.DataFrame: A pandas dataframe of mass features with the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
def mass_features_ms1_annot_to_df(self):
    """Summarise the MS1 annotations of the mass features as a dataframe.

    For every mass feature that has a mass spectrum and whose MS1 peak lies
    within 0.01 of the feature m/z, the row of the spectrum's dataframe that
    belongs to that peak is collected.

    Returns
    --------
    pandas.DataFrame or None
        The collected rows, indexed by mf_id (mass feature ID), or None when
        nothing was collected.

    Warns
    -----
    UserWarning
        If no MS1 annotations were found for the mass features in the dataset.
    """
    collected_rows = []
    for mf_id, mass_feature in self.mass_features.items():
        # Features without an MS1 spectrum carry no annotation.
        if mass_feature.mass_spectrum is None:
            continue

        # Skip features whose MS1 peak is not within 0.01 of the feature m/z.
        mz_offset = np.abs(mass_feature.ms1_peak.mz_exp - mass_feature.mz)
        if not (mz_offset < 0.01):
            continue

        # Keep only the spectrum row belonging to the feature's MS1 peak.
        peak_table = mass_feature.mass_spectrum.to_dataframe()
        is_feature_peak = peak_table["Index"] == mass_feature.ms1_peak.index
        peak_row = peak_table[is_feature_peak].copy()

        # Swap the peak index column for the mass feature ID.
        peak_row = peak_row.drop(columns=["Index"])
        peak_row["mf_id"] = mf_id
        collected_rows.append(peak_row)

    if not collected_rows:
        # Warn that no ms1 annotations were found
        warnings.warn(
            "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
            UserWarning,
        )
        return None

    annotations = pd.concat(collected_rows).set_index("mf_id")
    annotations.index.name = "mf_id"
    return annotations
Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
Returns
- pandas.DataFrame: A pandas dataframe of MS1 annotations for the mass features in the dataset. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS1 annotations were found for the mass features in the dataset.
def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
    """Summarise the MS2 annotations of the mass features as a dataframe.

    Parameters
    -----------
    molecular_metadata : dict of MolecularMetadata objects
        A dictionary of MolecularMetadata objects, keyed by metabref_mol_id.
        When given, the attributes of each object are left-joined onto the
        annotations via the ref_mol_id column. Defaults to None.

    Returns
    --------
    pandas.DataFrame or None
        The MS2 annotations (one row per distinct mf_id / query_spectrum_id /
        ref_ms_id combination), optionally with molecular metadata, indexed
        by mf_id (mass feature ID). None when no annotation exists.

    Warns
    -----
    UserWarning
        If no MS2 annotations were found for the mass features in the dataset.
    """
    # Gather one dataframe per similarity result, tagged with its mass feature.
    result_frames = []
    for mf_id, mass_feature in self.mass_features.items():
        for result in mass_feature.ms2_similarity_results:
            result_frame = result.to_dataframe()
            result_frame["mf_id"] = mf_id
            result_frames.append(result_frame)

    if not result_frames:
        # Warn that no ms2 annotations were found
        warnings.warn(
            "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
            UserWarning,
        )
        return None

    annotations = pd.concat(result_frames)

    if molecular_metadata is not None:
        # One single-row frame per metadata object, built from its attributes.
        metadata_rows = [
            pd.DataFrame.from_dict(metadata.__dict__, orient="index").transpose()
            for metadata in molecular_metadata.values()
        ]
        metadata_table = pd.concat(metadata_rows, ignore_index=True)
        metadata_table = metadata_table.rename(columns={"id": "ref_mol_id"})
        annotations = annotations.merge(metadata_table, on="ref_mol_id", how="left")

    deduplicated = annotations.drop_duplicates(
        subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
    ).copy()
    deduplicated = deduplicated.set_index("mf_id")
    deduplicated.index.name = "mf_id"
    return deduplicated
Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
Parameters
- molecular_metadata (dict of MolecularMetadata objects): A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.
Returns
- pandas.DataFrame: A pandas dataframe of MS2 annotations for the mass features in the dataset, and optionally molecular metadata. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS2 annotations were found for the mass features in the dataset.
def plot_composite_mz_features(
    self,
    binsize=1e-4,
    ph_int_min_thresh=0.001,
    mf_plot=True,
    ms2_plot=True,
    return_fig=False,
):
    """Plot a composite feature map of m/z against scan time.

    The figure displays
    (1) thresholded, unprocessed MS1 data
    (2) the m/z features
    (3) which m/z features are associated with MS2 spectra

    Parameters
    -----------
    binsize : float
        Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
    ph_int_min_thresh : float
        Fraction of the maximum unprocessed intensity used as threshold; only
        data points with an intensity above it are kept. Defaults to 0.001.
    mf_plot : boolean
        Indicates whether to plot the m/z features. Defaults to True.
    ms2_plot : boolean
        Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
    return_fig : boolean
        Indicates whether to show the composite feature map (False) or close it and
        return the figure object (True). Defaults to False.

    Returns
    --------
    matplotlib.pyplot.Figure or None
        When return_fig is True, a figure with the thresholded, unprocessed data on an axis
        of m/z value with respect to scan time. Unprocessed data is drawn with the 'Greys_r'
        colormap and an opacity equal to the relative binned intensity. If m/z features are
        plotted, they are displayed in cyan. If m/z features associated with MS2 spectra are
        plotted, they are displayed in red. When return_fig is False the figure is shown and
        None is returned.

    Raises
    ------
    ValueError
        If m/z features are to be plotted but mass_features is empty or not set.
        If ms2_plot is True but the mass feature table has no ms2_spectrum column.
    """
    mf_df = None
    if mf_plot or ms2_plot:
        # An empty dictionary (the value set at construction) means no
        # features have been found yet, just like None would.
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )
        # The same mass feature table serves both overlays.
        mf_df = self.mass_features_to_df()

    if ms2_plot and "ms2_spectrum" not in mf_df.columns:
        raise ValueError(
            "ms2_spectrum not set, must run add_associated_ms2_dda() first"
        )

    # Threshold and grid the unprocessed MS1 data
    df = self._ms_unprocessed[1].copy()
    df = df.dropna(subset=["intensity"]).reset_index(drop=True)
    threshold = ph_int_min_thresh * df.intensity.max()
    df_thres = df[df["intensity"] > threshold].reset_index(drop=True).copy()
    df = self.grid_data(df_thres)

    # Format unprocessed data for plotting: attach scan times and bin along m/z
    df = df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
    mz_grid = np.arange(0, np.max(df.mz), binsize)
    mz_data = np.array(df.mz)
    # NOTE(review): find_closest presumably returns, for each m/z, the index
    # of the nearest grid point (mz_bin is multiplied by binsize below to get
    # back to m/z) - confirm against lc_calc.find_closest.
    df["mz_bin"] = find_closest(mz_grid, mz_data)
    # The string alias is used because pandas deprecates passing the builtin
    # `sum` to transform.
    df["ab_bin"] = df.groupby(["mz_bin", "scan_time"]).intensity.transform("sum")
    unproc_df = df[["scan_time", "mz_bin", "ab_bin"]].drop_duplicates(
        ignore_index=True
    )

    # Generate figure; colour and opacity both follow the relative intensity
    fig = plt.figure()
    relative_abundance = unproc_df.ab_bin / np.max(unproc_df.ab_bin)
    plt.scatter(
        unproc_df.scan_time,
        unproc_df.mz_bin * binsize,
        c=relative_abundance,
        alpha=relative_abundance,
        cmap="Greys_r",
        s=1,
    )

    if ms2_plot:
        has_ms2 = ~mf_df.ms2_spectrum.isna()

    if mf_plot:
        if ms2_plot:
            # Only features lacking MS2 are cyan; those with MS2 are drawn in red below.
            plt.scatter(
                mf_df[~has_ms2].scan_time,
                mf_df[~has_ms2].mz,
                c="c",
                s=4,
                label="M/Z features without MS2",
            )
        else:
            plt.scatter(
                mf_df.scan_time,
                mf_df.mz,
                c="c",
                s=4,
                label="M/Z features",
            )

    if ms2_plot:
        plt.scatter(
            mf_df[has_ms2].scan_time,
            mf_df[has_ms2].mz,
            c="r",
            s=2,
            label="M/Z features with MS2",
        )

    if mf_plot or ms2_plot:
        plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.25), ncol=2)
    plt.xlabel("Scan time")
    plt.ylabel("m/z")
    plt.ylim(0, np.ceil(np.max(df.mz)))
    plt.xlim(0, np.ceil(np.max(df.scan_time)))
    plt.title("Composite Feature Map")

    if return_fig:
        # Close the figure so it is not also displayed by pyplot; the Figure
        # object is still returned to the caller.
        plt.close(fig)
        return fig

    else:
        plt.show()
Returns a figure displaying (1) thresholded, unprocessed data (2) the m/z features (3) which m/z features are associated with MS2 spectra
Parameters
- binsize (float): Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
- mf_plot (boolean): Indicates whether to plot the m/z features. Defaults to True.
- ms2_plot (boolean): Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
- return_fig (boolean): Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
Returns
- matplotlib.pyplot.Figure: A figure with the thresholded, unprocessed data on an axis of m/z value with respect to scan time. Unprocessed data is displayed in gray scale with darker colors indicating higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z features associated with MS2 spectra are plotted, they are displayed in red.
Raises
- ValueError: If m/z features are set to be plotted but aren't in the dataset. If m/z features with associated MS2 data are set to be plotted but no MS2 spectra were found for the m/z features in the dataset.
def set_tic_list_from_data(self, overwrite=False):
    """Populate the TIC list from the mass spectra held in the _ms dictionary.

    Parameters
    -----------
    overwrite : bool, optional
        If True, overwrites the TIC list if it is already set. Defaults to False.

    Notes
    -----
    One TIC value is read per entry of ``self.scans_number``, in that order,
    so every scan number listed there is expected to be a key of _ms.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the TIC list is already set and overwrite is False.
    """
    # Nothing to read from when no mass spectra have been loaded.
    if not self._ms:
        raise ValueError("No mass spectra found in dataset")

    # Refuse to clobber an existing list unless explicitly asked to.
    tic_already_set = len(self.tic) > 0
    if tic_already_set and not overwrite:
        raise ValueError("TIC list already set, use overwrite=True to overwrite")

    tic_values = []
    for scan_number in self.scans_number:
        tic_values.append(self._ms.get(scan_number).tic)
    self.tic = tic_values
Sets the TIC list from the mass spectrum objects within the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the TIC list if it is already set. Defaults to False.
Notes
If the _ms dictionary is incomplete, sets the TIC list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the TIC list is already set and overwrite is False.
def set_retention_time_from_data(self, overwrite=False):
    """Populate the retention time list from the mass spectra in the _ms dictionary.

    Parameters
    -----------
    overwrite : bool, optional
        If True, overwrites the retention time list if it is already set. Defaults to False.

    Notes
    -----
    Retention times are collected in ascending order of the _ms keys.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the retention time list is already set and overwrite is False.
    """
    # Nothing to read from when no mass spectra have been loaded.
    if not self._ms:
        raise ValueError("No mass spectra found in dataset")

    # Refuse to clobber an existing list unless explicitly asked to.
    retention_time_already_set = len(self.retention_time) > 0
    if retention_time_already_set and not overwrite:
        raise ValueError(
            "Retention time list already set, use overwrite=True to overwrite"
        )

    self.retention_time = [
        self._ms[scan_key].retention_time for scan_key in sorted(self._ms)
    ]
Sets the retention time list from the data in the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the retention time list if it is already set. Defaults to False.
Notes
If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the retention time list is already set and overwrite is False.
def set_scans_number_from_data(self, overwrite=False):
    """Populate the scan number list from the keys of the _ms dictionary.

    Parameters
    -----------
    overwrite : bool, optional
        If True, overwrites the scan number list if it is already set. Defaults to False.

    Notes
    -----
    The scan numbers are stored in ascending order.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the scan number list is already set and overwrite is False.
    """
    # Nothing to read from when no mass spectra have been loaded.
    if not self._ms:
        raise ValueError("No mass spectra found in dataset")

    # Refuse to clobber an existing list unless explicitly asked to.
    scans_already_set = len(self.scans_number) > 0
    if scans_already_set and not overwrite:
        raise ValueError(
            "Scan number list already set, use overwrite=True to overwrite"
        )

    ordered_scans = list(self._ms)
    ordered_scans.sort()
    self.scans_number = ordered_scans
Sets the scan number list from the data in the _ms dictionary.
Notes
If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the scan number list is already set and overwrite is False.
Inherited Members
- MassSpectraBase
- file_location
- analyzer
- instrument_label
- add_mass_spectrum
- add_mass_spectra
- get_time_of_scan_id
- scan_df
- ms
- corems.mass_spectra.calc.lc_calc.LCCalculations
- get_max_eic
- smooth_tic
- eic_centroid_detector
- find_nearest_scan
- add_peak_metrics
- get_average_mass_spectrum
- find_mass_features
- integrate_mass_features
- find_c13_mass_features
- deconvolute_ms1_mass_features