corems.mass_spectra.factory.lc_class
from pathlib import Path

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from corems.encapsulation.factory.parameters import LCMSParameters
from corems.mass_spectra.calc.lc_calc import LCCalculations, PHCalculations
from corems.molecular_id.search.lcms_spectral_search import LCMSSpectralSearch
from corems.mass_spectrum.input.numpyArray import ms_from_array_profile
from corems.mass_spectra.calc.lc_calc import find_closest


class MassSpectraBase:
    """Base class for mass spectra objects.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    spectra_parser_class : class
        The class of the spectra parser used to create the mass spectra object.
        Only set when a spectra parser is provided.
    file_location : str or Path
        The location of the file containing the mass spectra data.
    sample_name : str
        The name of the sample; defaults to the file name if not provided to the parser.
    analyzer : str
        The type of analyzer used to generate the mass spectra data.
    instrument_label : str
        The type of instrument used to generate the mass spectra data.
    _scan_info : dict
        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
        scan text, and scan window (lower and upper).
        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
    _ms : dict
        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
    _ms_unprocessed : dictionary of pandas.DataFrames
        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity.
        Initialized as an empty dictionary.

    Methods
    --------
    * add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None).
        Add mass spectra (or single mass spectrum) to _ms slot, from a list of scans
    * get_time_of_scan_id(scan).
        Returns the scan time for the specified scan number.
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        if isinstance(file_location, str):
            file_location = Path(file_location)
        if not file_location.exists():
            # NOTE: FileExistsError (rather than FileNotFoundError) is kept on purpose so
            # that existing callers catching this exception type keep working.
            raise FileExistsError("File does not exist: " + str(file_location))

        # Fall back to the file stem when no (or an empty) sample name is given
        self.sample_name = sample_name if sample_name else file_location.stem

        self.file_location = file_location
        self.analyzer = analyzer
        self.instrument_label = instrument_label

        # Add the spectra parser class to the object if it is not None
        if spectra_parser is not None:
            self.spectra_parser_class = spectra_parser.__class__
            self.spectra_parser = spectra_parser
            # Check that the metadata given here agrees with the parser's, warn (do not fail) if not
            if (
                self.sample_name is not None
                and self.sample_name != self.spectra_parser.sample_name
            ):
                warnings.warn(
                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
                    UserWarning,
                )
            if self.analyzer != self.spectra_parser.analyzer:
                warnings.warn(
                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
                    UserWarning,
                )
            if self.instrument_label != self.spectra_parser.instrument_label:
                warnings.warn(
                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
                    UserWarning,
                )
            if self.file_location != self.spectra_parser.file_location:
                warnings.warn(
                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
                    UserWarning,
                )

        # Instantiate empty dictionaries for scan information and mass spectra
        self._scan_info = {}
        self._ms = {}
        self._ms_unprocessed = {}

    def add_mass_spectrum(self, mass_spec):
        """Adds a mass spectrum to the dataset.

        Parameters
        -----------
        mass_spec : MassSpectrum
            The corems MassSpectrum object to be added to the dataset.

        Raises
        ------
        ValueError
            If mass_spec has no scan_number attribute.

        Notes
        -----
        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
        """
        # The scan number is the dictionary key, so it is mandatory
        if not hasattr(mass_spec, "scan_number"):
            raise ValueError(
                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
            )
        self._ms[mass_spec.scan_number] = mass_spec

    def add_mass_spectra(
        self,
        scan_list,
        spectrum_mode=None,
        ms_level=1,
        use_parser=True,
        auto_process=True,
        ms_params=None,
    ):
        """Add mass spectra to _ms dictionary, from a list of scans or single scan

        Notes
        -----
        This method reads self.polarity, which is not set by MassSpectraBase itself
        (LCMSBase sets it in its __init__).

        Parameters
        -----------
        scan_list : list of ints or int
            List of scans (or a single scan) to use to populate _ms slot.
            NumPy integer scan numbers are accepted and converted to int.
        spectrum_mode : str or None
            The spectrum mode to use for the mass spectra.
            If None, the spectrum mode of each scan is read from the "ms_format" column of scan_df (this allows for mixed types).
            Defaults to None.
        ms_level : int, optional
            The MS level of the scans. Only used when use_parser is False, to select the
            table of unprocessed data (_ms_unprocessed[ms_level]) the scans are read from.
            Defaults to 1.
        use_parser : bool
            Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
        auto_process : bool
            Whether to auto-process the mass spectra. Defaults to True.
        ms_params : MSParameters or None
            The mass spectrum parameters to set on each mass spectrum. If None, the parameters
            of the created mass spectrum objects are left untouched.

        Raises
        ------
        TypeError
            If scan_list is not a list of ints
        ValueError
            If polarity is not 'positive' or 'negative'
            If use_parser is False and ms_level has no (or empty) unprocessed data,
            or some scans are missing from the unprocessed data
            If use_parser is False and the spectrum mode of a scan is not 'profile'
        """
        int_types = (int, np.integer)

        # A single scan number is accepted and wrapped into a list
        if isinstance(scan_list, int_types):
            scan_list = [scan_list]
        if not isinstance(scan_list, list):
            raise TypeError("scan_list must be a list of integers")
        if not all(isinstance(scan, int_types) for scan in scan_list):
            raise TypeError("scan_list must be a list of integers")

        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
        if self.polarity == "negative":
            polarity = -1
        elif self.polarity == "positive":
            polarity = 1
        else:
            raise ValueError(
                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
            )

        # When bypassing the parser, the requested ms_level must have been loaded
        if not use_parser and ms_level not in self._ms_unprocessed.keys():
            raise ValueError(
                "ms_level {} not found in _ms_unprocessed dictionary".format(ms_level)
            )

        # De-duplicate and sort; int() normalises NumPy integers to plain ints
        scan_list = sorted({int(scan) for scan in scan_list})

        ms_df = None
        if not use_parser:
            if self._ms_unprocessed[ms_level] is None:
                raise ValueError(
                    "No unprocessed data found for ms_level {}".format(ms_level)
                )
            missing_scans = np.setdiff1d(
                scan_list, self._ms_unprocessed[ms_level].scan.tolist()
            )
            if len(missing_scans) > 0:
                raise ValueError(
                    "Not all scans in scan_list are present in the unprocessed data"
                )
            # Index by scan (keeping the column) so each scan can be pulled out with .loc
            ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False)

        # scan_df is rebuilt from _scan_info on every access, so build it once here
        scan_df = self.scan_df if spectrum_mode is None else None

        for scan in scan_list:
            if spectrum_mode is None:
                # get spectrum mode for this scan from the scan info
                spectrum_mode_scan = scan_df.loc[scan, "ms_format"]
            else:
                spectrum_mode_scan = spectrum_mode

            # Instantiate the mass spectrum object using the parser or the unprocessed data
            if use_parser:
                ms = self.spectra_parser.get_mass_spectrum_from_scan(
                    scan_number=scan,
                    spectrum_mode=spectrum_mode_scan,
                    auto_process=False,
                )
            else:
                if spectrum_mode_scan != "profile":
                    raise ValueError(
                        "Only profile mode is supported for unprocessed data"
                    )
                my_ms_df = ms_df.loc[scan]
                ms = ms_from_array_profile(
                    my_ms_df.mz,
                    my_ms_df.intensity,
                    self.file_location,
                    polarity=polarity,
                    auto_process=False,
                )

            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
            if ms is not None:
                if ms_params is not None:
                    ms.parameters = ms_params
                ms.scan_number = scan
                if auto_process:
                    ms.process_mass_spec()
                self.add_mass_spectrum(ms)

    def get_time_of_scan_id(self, scan):
        """Returns the scan time for the specified scan number.

        Notes
        -----
        Reads self._retention_time_list and self._scans_number_list, which are not set by
        MassSpectraBase itself (LCMSBase sets them in its __init__).

        Parameters
        -----------
        scan : int
            The scan number of the desired scan time.

        Returns
        --------
        float
            The scan time for the specified scan number (in minutes).

        Raises
        ------
        ValueError
            If no retention times are stored, or the scan number is not in the scan number list.
        """
        # Check if _retention_time_list is empty and raise error if so
        if len(self._retention_time_list) == 0:
            raise ValueError("No retention times found in dataset")
        # Retention times and scan numbers are parallel sequences
        rt = self._retention_time_list[self._scans_number_list.index(scan)]
        return rt

    @property
    def scan_df(self):
        """
        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).

        The DataFrame is rebuilt from the _scan_info dictionary on every access.
        """
        scan_df = pd.DataFrame.from_dict(self._scan_info)
        return scan_df

    @scan_df.setter
    def scan_df(self, df):
        """
        Sets the scan data for the dataset.

        Parameters
        -----------
        df : pandas.DataFrame
            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
            precursor m/z, scan text, and scan window (lower and upper).
        """
        self._scan_info = df.to_dict()

    @property
    def ms(self):
        """
        dictionary : keys are scan numbers and values are the associated mass spectrum objects
        """
        return self._ms

    def __getitem__(self, scan_number):
        """Return the mass spectrum stored for scan_number, or None if there is none."""
        return self._ms.get(scan_number)


class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.

    This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    polarity : str
        The polarity of the ionization mode used for the dataset. Initialized as an empty string.
    _parameters : LCMSParameters
        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
    _retention_time_list : list
        Retention times for the dataset. Initialized as an empty list.
    _scans_number_list : list
        A list of scan numbers for the dataset. Initialized as an empty list.
    _tic_list : list
        Total ion current (TIC) values for the dataset. Initialized as an empty list.
    eics : dict
        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
        Key is the mz of the EIC. Initialized as an empty dictionary.
    mass_features : dictionary of LCMSMassFeature objects
        A dictionary containing mass features for the dataset.
        Key is mass feature ID. Initialized as an empty dictionary.
    spectral_search_results : dictionary of MS2SearchResults objects
        A dictionary containing spectral search results for the dataset.
        Key is scan number : precursor mz. Initialized as an empty dictionary.

    Methods
    --------
    * get_parameters_json().
        Returns the parameters used for the LC-MS analysis in JSON format.
    * remove_unprocessed_data(ms_level=None)
        Drops the unprocessed data for one or all MS levels.
    * add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key="ms2", scan_filter=None)
        Adds which MS2 scans are associated with each mass feature to the
        mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
    * add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None)
        Adds the MS1 spectra associated with each mass feature to the
        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
    * mass_features_to_df()
        Returns a pandas dataframe summarizing the mass features in the dataset.
    * mass_features_ms1_annot_to_df()
        Returns a pandas dataframe of MS1 annotations for the mass features.
    * mass_features_ms2_annot_to_df(molecular_metadata=None)
        Returns a pandas dataframe of MS2 annotations for the mass features.
    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
        Generates plot of M/Z features comparing scan time vs M/Z value
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        super().__init__(
            file_location, analyzer, instrument_label, sample_name, spectra_parser
        )
        self.polarity = ""
        self._parameters = LCMSParameters()
        self._retention_time_list = []
        self._scans_number_list = []
        self._tic_list = []
        self.eics = {}
        self.mass_features = {}
        self.spectral_search_results = {}

    def get_parameters_json(self):
        """Returns the parameters stored for the LC-MS object in JSON format.

        Returns
        --------
        str
            The parameters used for the LC-MS analysis in JSON format.
        """
        return self.parameters.to_json()

    def remove_unprocessed_data(self, ms_level=None):
        """Removes the unprocessed data from the LCMSBase object.

        Parameters
        -----------
        ms_level : int, optional
            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.

        Raises
        ------
        ValueError
            If ms_level is given and is not 1 or 2.

        Notes
        -----
        This method is useful for freeing up memory after the data has been processed.
        """
        if ms_level is None:
            # Clear every stored level and stop here: the 1/2 validation below only
            # applies to an explicitly requested level.
            for level in self._ms_unprocessed:
                self._ms_unprocessed[level] = None
            return
        if ms_level not in (1, 2):
            raise ValueError("ms_level must be 1 or 2")
        self._ms_unprocessed[ms_level] = None

    def add_associated_ms2_dda(
        self,
        auto_process=True,
        use_parser=True,
        spectrum_mode=None,
        ms_params_key="ms2",
        scan_filter=None,
    ):
        """Add MS2 spectra associated with mass features to the dataset.

        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS2 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the scan info to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
        ms_params_key : string, optional
            The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute.
            Defaults to 'ms2'.
        scan_filter : str
            A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None.
            "hcd" will pull out only HCD scans.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If no MS2 scans with a precursor m/z and a positive TIC are found in the dataset
            (no MS2 data, or not a DDA dataset).
        """
        # mass_features starts out as an empty dict, so test for emptiness (covers None too)
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )

        # reconfigure ms_params to get the correct mass spectrum parameters from the key
        ms_params = self.parameters.mass_spectrum[ms_params_key]

        mf_df = self.mass_features_to_df().copy()

        # Find ms2 scans that have a precursor m/z value and a non-zero tic
        scan_df = self.scan_df
        ms2_scans = scan_df[scan_df.ms_level == 2]
        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
        ms2_scans = ms2_scans[ms2_scans.tic > 0]
        # Filtering a DataFrame never yields None, so emptiness is what must be checked
        if ms2_scans.empty:
            raise ValueError("No DDA scans found in dataset")

        if scan_filter is not None:
            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]

        # set tolerance in rt space (in minutes) and mz space (in daltons)
        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance

        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
        dda_scans = []
        for mf_id, row in mf_df.iterrows():
            in_rt_window = ms2_scans[
                ms2_scans.scan_time.between(
                    row.scan_time - time_tol, row.scan_time + time_tol
                )
            ]
            matched = in_rt_window[
                in_rt_window.precursor_mz.between(row.mz - mz_tol, row.mz + mz_tol)
            ]
            matched_scans = matched.scan.tolist()
            dda_scans.extend(matched_scans)
            # Newly matched scans are placed in front of any previously stored ones
            self.mass_features[mf_id].ms2_scan_numbers = (
                matched_scans + self.mass_features[mf_id].ms2_scan_numbers
            )

        # add to _ms attribute; ms_level=2 so that, when the parser is bypassed, the
        # MS2 (not the MS1) table of unprocessed data is used
        self.add_mass_spectra(
            scan_list=list(set(dda_scans)),
            auto_process=auto_process,
            spectrum_mode=spectrum_mode,
            ms_level=2,
            use_parser=use_parser,
            ms_params=ms_params,
        )

        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
        for mass_feature in self.mass_features.values():
            if mass_feature.ms2_scan_numbers is None:
                continue
            for dda_scan in mass_feature.ms2_scan_numbers:
                if dda_scan in self._ms:
                    mass_feature.ms2_mass_spectra[dda_scan] = self._ms[dda_scan]

    def add_associated_ms1(
        self, auto_process=True, use_parser=True, spectrum_mode=None
    ):
        """Add MS1 spectra associated with mass features to the dataset.

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS1 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the scan info to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
            Only used when a single scan is used per mass feature; averaging always uses profile mode.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
            If number of scans to average is not 1 or a positive odd integer (i.e. 3, 5, 7, 9).
            If polarity is not 'positive' or 'negative' when averaging.
            If use_parser is False and scans to average are missing from the unprocessed data.
        """
        # mass_features starts out as an empty dict, so test for emptiness (covers None too)
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )
        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average
        ms1_params = self.parameters.mass_spectrum["ms1"]

        if scans_to_average == 1:
            # Single apex scan per mass feature: add straight to the LCMS object
            self.add_mass_spectra(
                scan_list=[
                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
                ],
                auto_process=auto_process,
                use_parser=use_parser,
                spectrum_mode=spectrum_mode,
                ms_params=ms1_params,
            )

        elif scans_to_average > 1 and (scans_to_average - 1) % 2 == 0:
            # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
            half_window = int((scans_to_average - 1) // 2)
            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))

            # Check if all apex scans are profile mode, raise error if not
            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
                raise ValueError("All apex scans must be profile mode for averaging")

            def get_scans_from_apex(ms1_scans, apex_scan):
                """Return the MS1 scans in the window centred on apex_scan, clipped to the data."""
                apex_idx = ms1_scans.index(apex_scan)
                idx_start = max(apex_idx - half_window, 0)
                # The slice end is exclusive, so the upper clamp is len(), not len() - 1;
                # clamping to len() - 1 would drop the last scan of the run.
                idx_end = min(apex_idx + half_window + 1, len(ms1_scans))
                return ms1_scans[idx_start:idx_end]

            ms1_scans = self.ms1_scans
            scans_lists = [
                get_scans_from_apex(ms1_scans, apex_scan) for apex_scan in apex_scans
            ]

            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
            if self.polarity == "negative":
                polarity = -1
            elif self.polarity == "positive":
                polarity = 1
            else:
                raise ValueError(
                    "Polarity not set for dataset, must be a either 'positive' or 'negative'"
                )

            ms1_unprocessed = None
            if not use_parser:
                # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once)
                ms1_unprocessed = self._ms_unprocessed[1].copy()
                # Index by scan number, keeping the scan column
                ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)

                # Check that all the scans to average are present in the unprocessed data
                scans_lists_flat = list(
                    {scan for sublist in scans_lists for scan in sublist}
                )
                missing_scans = np.setdiff1d(
                    np.sort(scans_lists_flat),
                    np.sort(ms1_unprocessed.index.values),
                )
                if len(missing_scans) > 0:
                    raise ValueError(
                        "Not all scans to average are present in the unprocessed data"
                    )
                # Expose the scan-indexed table while the spectra are being averaged
                self._ms_unprocessed[1] = ms1_unprocessed

            try:
                for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
                    # Get unprocessed mass spectrum from scans
                    ms = self.get_average_mass_spectrum(
                        scan_list=scan_list_average,
                        apex_scan=apex_scan,
                        spectrum_mode="profile",
                        ms_level=1,
                        auto_process=auto_process,
                        use_parser=use_parser,
                        perform_checks=False,
                        polarity=polarity,
                        ms_params=ms1_params,
                    )
                    # Add mass spectrum to LCMS object
                    self.add_mass_spectrum(ms)
            finally:
                if not use_parser:
                    # Reset the index on _ms_unprocessed[1] to not be scan number,
                    # even if averaging failed part-way through
                    self._ms_unprocessed[1] = ms1_unprocessed.reset_index(drop=True)
        else:
            raise ValueError(
                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
            )

        # Associate the ms1 spectra with the mass features
        for mass_feature in self.mass_features.values():
            mass_feature.mass_spectrum = self._ms[mass_feature.apex_scan]
            mass_feature.update_mz()

        # Re-process clustering if persistent homology is selected to remove duplicate mass features after adding and processing MS1 spectra
        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
            self.cluster_mass_features(drop_children=True, sort_by="persistence")

    def mass_features_to_df(self):
        """Returns a pandas dataframe summarizing the mass features.

        The dataframe is indexed by mf_id (mass feature ID). Depending on which attributes
        are available on the mass features, it contains (in this order) the columns:
        scan_time, mz, apex_scan, start_scan, final_scan, intensity, persistence, area,
        half_height_width, tailing_factor, dispersity_index, monoisotopic_mf_id,
        isotopologue_type, mass_spectrum_deconvoluted_parent, associated_mass_features,
        and ms2_spectrum.

        Returns
        --------
        pandas.DataFrame
            A pandas dataframe of mass features, indexed by mf_id.

        Raises
        ------
        ValueError
            Raised by pandas.concat if there are no mass features in the dataset.
        """

        def mass_spectrum_to_string(
            mass_spec, normalize=True, min_normalized_abun=0.01
        ):
            """Converts a mass spectrum to a string of m/z:abundance pairs.

            Parameters
            -----------
            mass_spec : MassSpectrum
                A MassSpectrum object to be converted to a string.
            normalize : bool, optional
                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
            min_normalized_abun : float, optional
                The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01.

            Returns
            --------
            str
                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
            """
            # Build the dataframe once and read both columns from it
            ms_df = mass_spec.to_dataframe()
            mz_np = ms_df["m/z"].values
            abun_np = ms_df["Peak Height"].values
            if normalize:
                abun_np = abun_np / abun_np.max()
            mz_abun = np.column_stack((mz_np, abun_np))
            if normalize:
                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
            mz_abun_str = [
                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
                for mz, abun in mz_abun
            ]
            return "; ".join(mz_abun_str)

        # Attributes copied verbatim from each mass feature, when the feature has them
        cols_in_df = [
            "id",
            "_apex_scan",
            "start_scan",
            "final_scan",
            "_retention_time",
            "_intensity",
            "_persistence",
            "_area",
            "_dispersity_index",
            "_tailing_factor",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
        ]
        df_mf_list = []
        for mf_id, mass_feature in self.mass_features.items():
            # Keep only the attributes this mass feature actually exposes
            available = set(mass_feature.__dir__())
            dict_mf = {
                key: getattr(mass_feature, key)
                for key in cols_in_df
                if key in available
            }
            if len(mass_feature.ms2_scan_numbers) > 0:
                # Add MS2 spectra info
                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(
                    mass_feature.best_ms2
                )
            if len(mass_feature.associated_mass_features_deconvoluted) > 0:
                dict_mf["associated_mass_features"] = ", ".join(
                    map(str, mass_feature.associated_mass_features_deconvoluted)
                )
            if mass_feature._half_height_width is not None:
                dict_mf["half_height_width"] = mass_feature.half_height_width
            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
            df_mf_single["mz"] = mass_feature.mz
            df_mf_list.append(df_mf_single)
        df_mf = pd.concat(df_mf_list)

        # Strip the leading underscores / rename to the public column names
        df_mf = df_mf.rename(
            columns={
                "_area": "area",
                "id": "mf_id",
                "_apex_scan": "apex_scan",
                "_retention_time": "scan_time",
                "_intensity": "intensity",
                "_persistence": "persistence",
                "_dispersity_index": "dispersity_index",
                "_tailing_factor": "tailing_factor",
            }
        )

        # reorder columns
        col_order = [
            "mf_id",
            "scan_time",
            "mz",
            "apex_scan",
            "start_scan",
            "final_scan",
            "intensity",
            "persistence",
            "area",
            "half_height_width",
            "tailing_factor",
            "dispersity_index",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
            "associated_mass_features",
            "ms2_spectrum",
        ]
        # drop columns that are not in col_order
        cols_to_order = [col for col in col_order if col in df_mf.columns]
        df_mf = df_mf[cols_to_order]

        # reset index to mf_id
        df_mf = df_mf.set_index("mf_id")
        df_mf.index.name = "mf_id"

        return df_mf

    def mass_features_ms1_annot_to_df(self):
        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.

        Only mass features that have a mass spectrum, and whose ms1_peak m/z is within
        0.01 of the mass feature m/z, contribute rows.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS1 annotations for the mass features in the dataset.
            The index is set to mf_id (mass feature ID). None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS1 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms1 = []
        for mf_id, mass_feature in self.mass_features.items():
            if mass_feature.mass_spectrum is None:
                continue
            ms1_peak = mass_feature.ms1_peak
            if np.abs(ms1_peak.mz_exp - mass_feature.mz) < 0.01:
                # Get the molecular formula table from the mass spectrum
                annot_df = mass_feature.mass_spectrum.to_dataframe()
                # Subset to pull out only the peak associated with the mass feature
                annot_df = annot_df[annot_df["Index"] == ms1_peak.index].copy()

                # Remove the index column and add column for mf_id
                annot_df = annot_df.drop(columns=["Index"])
                annot_df["mf_id"] = mf_id
                annot_df_list_ms1.append(annot_df)

        if len(annot_df_list_ms1) > 0:
            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
            annot_ms1_df_full.index.name = "mf_id"

        else:
            annot_ms1_df_full = None
            # Warn that no ms1 annotations were found
            warnings.warn(
                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
                UserWarning,
            )

        return annot_ms1_df_full

    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.

        Parameters
        -----------
        molecular_metadata : dict of MolecularMetadata objects
            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS2 annotations for the mass features in the dataset,
            and optionally molecular metadata. The index is set to mf_id (mass feature ID).
            None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS2 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms2 = []
        for mf_id, mass_feature in self.mass_features.items():
            if len(mass_feature.ms2_similarity_results) > 0:
                # Add ms2 annotations to ms2 annotation list
                for result in mass_feature.ms2_similarity_results:
                    annot_df_ms2 = result.to_dataframe()
                    annot_df_ms2["mf_id"] = mf_id
                    annot_df_list_ms2.append(annot_df_ms2)

        if len(annot_df_list_ms2) > 0:
            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
            if molecular_metadata is not None:
                # One row per metadata object, built from its attribute dictionary
                molecular_metadata_df = pd.concat(
                    [
                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
                        for v in molecular_metadata.values()
                    ],
                    ignore_index=True,
                )
                molecular_metadata_df = molecular_metadata_df.rename(
                    columns={"id": "ref_mol_id"}
                )
                annot_ms2_df_full = annot_ms2_df_full.merge(
                    molecular_metadata_df, on="ref_mol_id", how="left"
                )
            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
            ).copy()
            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
            annot_ms2_df_full.index.name = "mf_id"
        else:
            annot_ms2_df_full = None
            # Warn that no ms2 annotations were found
            warnings.warn(
                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
                UserWarning,
            )

        return annot_ms2_df_full

    def plot_composite_mz_features(
        self,
        binsize=1e-4,
        ph_int_min_thresh=0.001,
        mf_plot=True,
        ms2_plot=True,
        return_fig=False,
    ):
        """Returns a figure displaying
            (1) thresholded, unprocessed data
            (2) the m/z features
            (3) which m/z features are associated with MS2 spectra

        Parameters
        -----------
        binsize : float
            Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
        ph_int_min_thresh : float
            Fraction of the maximum intensity of the unprocessed MS1 data; only points with an
            intensity above this fraction of the maximum are plotted. Defaults to 0.001.
        mf_plot : boolean
            Indicates whether to plot the m/z features. Defaults to True.
        ms2_plot : boolean
            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
        return_fig : boolean
            Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.

        Returns
        --------
        matplotlib.pyplot.Figure or None
            If return_fig is True, a (closed) figure with the thresholded, unprocessed data on an
            axis of m/z value with respect to scan time. Unprocessed data is drawn with the
            'Greys_r' colormap, with colour and opacity scaled by relative binned intensity.
            If m/z features are plotted, they are displayed in cyan. If m/z features
            associated with MS2 spectra are plotted, they are displayed in red.
            If return_fig is False the figure is shown and None is returned.

        Raises
        ------
        ValueError
            If m/z features are set to be plotted but aren't in the dataset.
            If m/z features with associated MS2 data are set to be plotted but no MS2 spectra
            were associated with the m/z features in the dataset.
        """
        mf_df = None
        if mf_plot or ms2_plot:
            # Both overlays need the mass feature table; mass_features starts as an empty dict
            if not self.mass_features:
                raise ValueError(
                    "mass_features not set, must run find_mass_features() first"
                )
            mf_df = self.mass_features_to_df()

        # The ms2_spectrum column only exists once MS2 spectra have been associated
        if ms2_plot and "ms2_spectrum" not in mf_df.columns:
            raise ValueError(
                "ms2_spectrum not set, must run add_associated_ms2_dda() first"
            )

        ## threshold and grid unprocessed data
        df = self._ms_unprocessed[1].copy()
        df = df.dropna(subset=["intensity"]).reset_index(drop=True)
        threshold = ph_int_min_thresh * df.intensity.max()
        df_thres = df[df["intensity"] > threshold].reset_index(drop=True).copy()
        df = self.grid_data(df_thres)

        ## format unprocessed data for plotting
        df = df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
        mz_grid = np.arange(0, np.max(df.mz), binsize)
        mz_data = np.array(df.mz)
        df["mz_bin"] = find_closest(mz_grid, mz_data)
        # "sum" (string alias) instead of the builtin sum, which pandas deprecates here
        df["ab_bin"] = df.groupby(["mz_bin", "scan_time"]).intensity.transform("sum")
        unproc_df = df[["scan_time", "mz_bin", "ab_bin"]].drop_duplicates(
            ignore_index=True
        )

        ## generate figure
        fig = plt.figure()
        rel_abundance = unproc_df.ab_bin / np.max(unproc_df.ab_bin)
        plt.scatter(
            unproc_df.scan_time,
            unproc_df.mz_bin * binsize,
            c=rel_abundance,
            alpha=rel_abundance,
            cmap="Greys_r",
            s=1,
        )

        if mf_plot:
            if ms2_plot:
                # Features with MS2 are drawn separately below, so only plot the rest here
                without_ms2 = mf_df[mf_df.ms2_spectrum.isna()]
                plt.scatter(
                    without_ms2.scan_time,
                    without_ms2.mz,
                    c="c",
                    s=4,
                    label="M/Z features without MS2",
                )
            else:
                plt.scatter(
                    mf_df.scan_time,
                    mf_df.mz,
                    c="c",
                    s=4,
                    label="M/Z features",
                )

        if ms2_plot:
            with_ms2 = mf_df[~mf_df.ms2_spectrum.isna()]
            plt.scatter(
                with_ms2.scan_time,
                with_ms2.mz,
                c="r",
                s=2,
                label="M/Z features with MS2",
            )

        if mf_plot or ms2_plot:
            plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.25), ncol=2)
        plt.xlabel("Scan time")
        plt.ylabel("m/z")
        plt.ylim(0, np.ceil(np.max(df.mz)))
        plt.xlim(0, np.ceil(np.max(df.scan_time)))
        plt.title("Composite Feature Map")

        if return_fig:
            # Close so the figure is not also displayed by an interactive backend
            plt.close(fig)
            return fig

        else:
            plt.show()

    def __len__(self):
        """
        Returns the number of mass spectra in the dataset.

        Returns
        --------
        int
            The number of mass spectra in the dataset.
        """
        # NOTE(review): the visible source is cut off inside this docstring; the body
        # below is reconstructed from the documented contract - confirm against the
        # full file before applying.
        return len(self._ms)
1060 """ 1061 return len(self._ms) 1062 1063 def __getitem__(self, scan_number): 1064 """ 1065 Returns the mass spectrum corresponding to the specified scan number. 1066 1067 Parameters 1068 ----------- 1069 scan_number : int 1070 The scan number of the desired mass spectrum. 1071 1072 Returns 1073 -------- 1074 MassSpectrum 1075 The mass spectrum corresponding to the specified scan number. 1076 """ 1077 return self._ms.get(scan_number) 1078 1079 def __iter__(self): 1080 """Returns an iterator over the mass spectra in the dataset. 1081 1082 Returns 1083 -------- 1084 iterator 1085 An iterator over the mass spectra in the dataset. 1086 """ 1087 return iter(self._ms.values()) 1088 1089 def set_tic_list_from_data(self, overwrite=False): 1090 """Sets the TIC list from the mass spectrum objects within the _ms dictionary. 1091 1092 Parameters 1093 ----------- 1094 overwrite : bool, optional 1095 If True, overwrites the TIC list if it is already set. Defaults to False. 1096 1097 Notes 1098 ----- 1099 If the _ms dictionary is incomplete, sets the TIC list to an empty list. 1100 1101 Raises 1102 ------ 1103 ValueError 1104 If no mass spectra are found in the dataset. 1105 If the TIC list is already set and overwrite is False. 1106 """ 1107 # Check if _ms is empty and raise error if so 1108 if len(self._ms) == 0: 1109 raise ValueError("No mass spectra found in dataset") 1110 1111 # Check if tic_list is already set and raise error if so 1112 if len(self.tic) > 0 and not overwrite: 1113 raise ValueError("TIC list already set, use overwrite=True to overwrite") 1114 1115 self.tic = [self._ms.get(i).tic for i in self.scans_number] 1116 1117 def set_retention_time_from_data(self, overwrite=False): 1118 """Sets the retention time list from the data in the _ms dictionary. 1119 1120 Parameters 1121 ----------- 1122 overwrite : bool, optional 1123 If True, overwrites the retention time list if it is already set. Defaults to False. 
1124 1125 Notes 1126 ----- 1127 If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list. 1128 1129 Raises 1130 ------ 1131 ValueError 1132 If no mass spectra are found in the dataset. 1133 If the retention time list is already set and overwrite is False. 1134 """ 1135 # Check if _ms is empty and raise error if so 1136 if len(self._ms) == 0: 1137 raise ValueError("No mass spectra found in dataset") 1138 1139 # Check if retention_time_list is already set and raise error if so 1140 if len(self.retention_time) > 0 and not overwrite: 1141 raise ValueError( 1142 "Retention time list already set, use overwrite=True to overwrite" 1143 ) 1144 1145 retention_time_list = [] 1146 for key_ms in sorted(self._ms.keys()): 1147 retention_time_list.append(self._ms.get(key_ms).retention_time) 1148 self.retention_time = retention_time_list 1149 1150 def set_scans_number_from_data(self, overwrite=False): 1151 """Sets the scan number list from the data in the _ms dictionary. 1152 1153 Notes 1154 ----- 1155 If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list. 1156 1157 Raises 1158 ------ 1159 ValueError 1160 If no mass spectra are found in the dataset. 1161 If the scan number list is already set and overwrite is False. 1162 """ 1163 # Check if _ms is empty and raise error if so 1164 if len(self._ms) == 0: 1165 raise ValueError("No mass spectra found in dataset") 1166 1167 # Check if scans_number_list is already set and raise error if so 1168 if len(self.scans_number) > 0 and not overwrite: 1169 raise ValueError( 1170 "Scan number list already set, use overwrite=True to overwrite" 1171 ) 1172 1173 self.scans_number = sorted(self._ms.keys()) 1174 1175 @property 1176 def ms1_scans(self): 1177 """ 1178 list : A list of MS1 scan numbers for the dataset. 
1179 """ 1180 return self.scan_df[self.scan_df.ms_level == 1].index.tolist() 1181 1182 @property 1183 def parameters(self): 1184 """ 1185 LCMSParameters : The parameters used for the LC-MS analysis. 1186 """ 1187 return self._parameters 1188 1189 @parameters.setter 1190 def parameters(self, paramsinstance): 1191 """ 1192 Sets the parameters used for the LC-MS analysis. 1193 1194 Parameters 1195 ----------- 1196 paramsinstance : LCMSParameters 1197 The parameters used for the LC-MS analysis. 1198 """ 1199 self._parameters = paramsinstance 1200 1201 @property 1202 def scans_number(self): 1203 """ 1204 list : A list of scan numbers for the dataset. 1205 """ 1206 return self._scans_number_list 1207 1208 @scans_number.setter 1209 def scans_number(self, scan_numbers_list): 1210 """ 1211 Sets the scan numbers for the dataset. 1212 1213 Parameters 1214 ----------- 1215 scan_numbers_list : list 1216 A list of scan numbers for the dataset. 1217 """ 1218 self._scans_number_list = scan_numbers_list 1219 1220 @property 1221 def retention_time(self): 1222 """ 1223 numpy.ndarray : An array of retention times for the dataset. 1224 """ 1225 return self._retention_time_list 1226 1227 @retention_time.setter 1228 def retention_time(self, rt_list): 1229 """ 1230 Sets the retention times for the dataset. 1231 1232 Parameters 1233 ----------- 1234 rt_list : list 1235 A list of retention times for the dataset. 1236 """ 1237 self._retention_time_list = np.array(rt_list) 1238 1239 @property 1240 def tic(self): 1241 """ 1242 numpy.ndarray : An array of TIC values for the dataset. 1243 """ 1244 return self._tic_list 1245 1246 @tic.setter 1247 def tic(self, tic_list): 1248 """ 1249 Sets the TIC values for the dataset. 1250 1251 Parameters 1252 ----------- 1253 tic_list : list 1254 A list of TIC values for the dataset. 1255 """ 1256 self._tic_list = np.array(tic_list)
class MassSpectraBase:
    """Base class for mass spectra objects.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the stem of file_location when not provided. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    spectra_parser_class : class
        The class of the spectra parser. Only set when a spectra parser is supplied.
    spectra_parser : object
        The spectra parser itself. Only set when a spectra parser is supplied.
    file_location : str or Path
        The location of the file containing the mass spectra data.
    sample_name : str
        The name of the sample; the stem of file_location when no name was provided.
    analyzer : str
        The type of analyzer used to generate the mass spectra data, as passed to the constructor.
    instrument_label : str
        The type of instrument used to generate the mass spectra data, as passed to the constructor.
    _scan_info : dict
        A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z,
        scan text, and scan window (lower and upper).
        Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
    _ms : dict
        A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
    _ms_unprocessed : dictionary of pandas.DataFrames or None
        A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking.
        Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity.
        Initialized as an empty dictionary.

    Methods
    --------
    * add_mass_spectra(scan_list, spectrum_mode=None, ms_level=1, use_parser=True, auto_process=True, ms_params=None).
        Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans
    * get_time_of_scan_id(scan).
        Returns the scan time for the specified scan number.
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        # Only plain strings are converted; any other path-like object is used as given
        if isinstance(file_location, str):
            file_location = Path(file_location)
        if not file_location.exists():
            # NOTE: FileNotFoundError would describe this better, but the exception
            # type is kept so that existing `except FileExistsError` handlers still work.
            raise FileExistsError("File does not exist: " + str(file_location))

        # Fall back to the file stem when no (or an empty) sample name is given
        self.sample_name = sample_name if sample_name else file_location.stem

        self.file_location = file_location
        self.analyzer = analyzer
        self.instrument_label = instrument_label

        # Add the spectra parser (and its class) to the object if one was supplied,
        # and warn about any metadata that disagrees with the parser's own metadata
        if spectra_parser is not None:
            self.spectra_parser_class = spectra_parser.__class__
            self.spectra_parser = spectra_parser
            if self.sample_name != self.spectra_parser.sample_name:
                warnings.warn(
                    "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
                    UserWarning,
                )
            if self.analyzer != self.spectra_parser.analyzer:
                warnings.warn(
                    "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
                    UserWarning,
                )
            if self.instrument_label != self.spectra_parser.instrument_label:
                warnings.warn(
                    "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
                    UserWarning,
                )
            if self.file_location != self.spectra_parser.file_location:
                warnings.warn(
                    "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
                    UserWarning,
                )

        # Instantiate empty dictionaries for scan information and mass spectra
        self._scan_info = {}
        self._ms = {}
        self._ms_unprocessed = {}

    def add_mass_spectrum(self, mass_spec):
        """Adds a mass spectrum to the dataset.

        Parameters
        -----------
        mass_spec : MassSpectrum
            The corems MassSpectrum object to be added to the dataset.
            It is stored in _ms under its scan_number.

        Raises
        ------
        ValueError
            If mass_spec has no scan_number attribute.

        Notes
        -----
        This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
        """
        # check if mass_spec has a scan_number attribute
        if not hasattr(mass_spec, "scan_number"):
            raise ValueError(
                "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
            )
        self._ms[mass_spec.scan_number] = mass_spec

    def add_mass_spectra(
        self,
        scan_list,
        spectrum_mode=None,
        ms_level=1,
        use_parser=True,
        auto_process=True,
        ms_params=None,
    ):
        """Add mass spectra to _ms dictionary, from a list of scans or single scan

        Notes
        -----
        Duplicate scan numbers are removed and the scans are handled in ascending order.
        ms_params, when given, is assigned to each spectrum before optional processing.

        Parameters
        -----------
        scan_list : list of ints or int
            List of scans (or a single scan) to use to populate _ms slot
        spectrum_mode : str or None
            The spectrum mode to use for the mass spectra.
            If None, the mode of each scan is read from the "ms_format" column of scan_df
            (this allows for mixed types). Defaults to None.
        ms_level : int, optional
            Selects which entry of _ms_unprocessed is read when use_parser is False.
            Not otherwise used by this method. Defaults to 1.
        use_parser : bool
            Whether to use the mass spectra parser to get the mass spectra. If False, spectra are
            built from the unprocessed data (profile mode only). Defaults to True.
        auto_process : bool
            Whether to auto-process the mass spectra. Defaults to True.
        ms_params : MSParameters or None
            The mass spectrum parameters to assign to each mass spectrum.
            If None, the parameters of the created spectra are left untouched.

        Raises
        ------
        TypeError
            If scan_list is not an int or a list of ints
        ValueError
            If polarity is not 'positive' or 'negative'
            If use_parser is False and ms_level has no (or empty) unprocessed data,
            a requested scan is missing from it, or the spectrum mode is not 'profile'
        """
        # A single scan number is shorthand for a one-element list
        if isinstance(scan_list, int):
            scan_list = [scan_list]
        if not isinstance(scan_list, list) or not all(
            isinstance(scan, int) for scan in scan_list
        ):
            raise TypeError("scan_list must be a list of integers")

        # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
        if self.polarity == "negative":
            polarity = -1
        elif self.polarity == "positive":
            polarity = 1
        else:
            raise ValueError(
                "Polarity not set for dataset, must be a either 'positive' or 'negative'"
            )

        # De-duplicate and order the requested scans
        scan_list = sorted(set(scan_list))

        ms_df = None
        if not use_parser:
            # Without the parser, everything must come from the unprocessed data
            if ms_level not in self._ms_unprocessed:
                raise ValueError(
                    "ms_level {} not found in _ms_unprocessed dictionary".format(
                        ms_level
                    )
                )
            unprocessed = self._ms_unprocessed[ms_level]
            if unprocessed is None:
                raise ValueError(
                    "No unprocessed data found for ms_level {}".format(ms_level)
                )
            if len(np.setdiff1d(scan_list, unprocessed.scan.tolist())) > 0:
                raise ValueError(
                    "Not all scans in scan_list are present in the unprocessed data"
                )
            # set_index already returns a new frame, so no explicit copy is needed
            ms_df = unprocessed.set_index("scan", drop=False)

        # scan_df is rebuilt from _scan_info on every access, so fetch it once
        # (and only when it is actually needed) instead of once per scan
        scan_df = self.scan_df if (spectrum_mode is None and scan_list) else None

        for scan in scan_list:
            if spectrum_mode is None:
                spectrum_mode_scan = scan_df.loc[scan, "ms_format"]
            else:
                spectrum_mode_scan = spectrum_mode

            # Instantiate the mass spectrum object using the parser or the unprocessed data
            if use_parser:
                ms = self.spectra_parser.get_mass_spectrum_from_scan(
                    scan_number=scan,
                    spectrum_mode=spectrum_mode_scan,
                    auto_process=False,
                )
            else:
                if spectrum_mode_scan != "profile":
                    raise ValueError(
                        "Only profile mode is supported for unprocessed data"
                    )
                # List-based selection always yields a DataFrame, even when the
                # scan has a single data point (a scalar label would give a Series)
                my_ms_df = ms_df.loc[[scan]]
                ms = ms_from_array_profile(
                    my_ms_df.mz,
                    my_ms_df.intensity,
                    self.file_location,
                    polarity=polarity,
                    auto_process=False,
                )

            # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
            if ms is not None:
                if ms_params is not None:
                    ms.parameters = ms_params
                ms.scan_number = scan
                if auto_process:
                    ms.process_mass_spec()
                self.add_mass_spectrum(ms)

    def get_time_of_scan_id(self, scan):
        """Returns the scan time for the specified scan number.

        Parameters
        -----------
        scan : int
            The scan number of the desired scan time.

        Returns
        --------
        float
            The entry of the retention time list at the position of the scan in the scan number list.

        Raises
        ------
        ValueError
            If the retention time list is empty.
            If no scan time is found for the specified scan number.
        """
        # Check if _retention_time_list is empty and raise error if so
        if len(self._retention_time_list) == 0:
            raise ValueError("No retention times found in dataset")

        scan_numbers = self._scans_number_list
        # Arrays and other sequences have no .index(); normalise to a list
        if not isinstance(scan_numbers, list):
            scan_numbers = list(scan_numbers)
        try:
            position = scan_numbers.index(scan)
        except ValueError:
            raise ValueError(
                "No scan time found for scan number {}".format(scan)
            ) from None
        return self._retention_time_list[position]

    @property
    def scan_df(self):
        """
        pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper).
        Rebuilt from the _scan_info dictionary on every access.
        """
        return pd.DataFrame.from_dict(self._scan_info)

    @scan_df.setter
    def scan_df(self, df):
        """
        Sets the scan data for the dataset.

        Parameters
        -----------
        df : pandas.DataFrame
            A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level,
            precursor m/z, scan text, and scan window (lower and upper).
        """
        self._scan_info = df.to_dict()

    @property
    def ms(self):
        """
        dictionary : the mass spectra added to the dataset, keyed by scan number
        """
        return self._ms

    def __getitem__(self, scan_number):
        """Returns the mass spectrum stored for scan_number, or None if there is none."""
        return self._ms.get(scan_number)
Base class for mass spectra objects.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
- file_location (str or Path): The location of the file containing the mass spectra data.
- sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
- analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
- instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
- _scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
- _ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
- _ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocessed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
Methods
- add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True). Add mass spectra (or a single mass spectrum) to the _ms slot, from a list of scans
- get_time_of_scan_id(scan). Returns the scan time for the specified scan number.
def __init__(
    self,
    file_location,
    analyzer="Unknown",
    instrument_label="Unknown",
    sample_name=None,
    spectra_parser=None,
):
    """Record the dataset metadata and create the empty data containers.

    Parameters
    -----------
    file_location : str or Path
        Location of the data file; a str is converted to a Path. Must exist.
    analyzer : str, optional
        Analyzer type. Defaults to 'Unknown'.
    instrument_label : str, optional
        Instrument type. Defaults to 'Unknown'.
    sample_name : str, optional
        Sample name; the stem of file_location is used when this is empty or None.
    spectra_parser : object, optional
        Parser that produced the data. When given, it is stored together with its class and
        a UserWarning is issued for each metadata field that differs from the parser's.

    Raises
    ------
    FileExistsError
        If file_location does not exist.
    """
    location = Path(file_location) if isinstance(file_location, str) else file_location
    if not location.exists():
        raise FileExistsError("File does not exist: " + str(location))

    # An empty or missing sample name falls back to the file stem
    self.sample_name = sample_name or location.stem
    self.file_location = location
    self.analyzer = analyzer
    self.instrument_label = instrument_label

    if spectra_parser is not None:
        self.spectra_parser_class = spectra_parser.__class__
        self.spectra_parser = spectra_parser

        # (attribute compared on both objects, warning raised on mismatch);
        # checked in this order, one attribute at a time
        consistency_checks = (
            (
                "sample_name",
                "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object",
            ),
            (
                "analyzer",
                "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object",
            ),
            (
                "instrument_label",
                "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object",
            ),
            (
                "file_location",
                "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object",
            ),
        )
        for attribute, message in consistency_checks:
            if getattr(self, attribute) != getattr(spectra_parser, attribute):
                warnings.warn(message, UserWarning)

    # Containers start out empty and are filled later
    self._scan_info = {}
    self._ms = {}
    self._ms_unprocessed = {}
def add_mass_spectrum(self, mass_spec):
    """Store a single mass spectrum in the dataset, keyed by its scan number.

    Parameters
    -----------
    mass_spec : MassSpectrum
        The corems MassSpectrum object to store. It must expose a scan_number attribute.

    Raises
    ------
    ValueError
        If mass_spec has no scan_number attribute.

    Notes
    -----
    Helper for add_mass_spectra(); not intended to be called directly.
    """
    # Sentinel distinguishes "attribute absent" from any real scan number value
    absent = object()
    scan_key = getattr(mass_spec, "scan_number", absent)
    if scan_key is absent:
        raise ValueError(
            "Mass spectrum must have a scan_number attribute to be added to the dataset correctly"
        )
    self._ms[scan_key] = mass_spec
Adds a mass spectrum to the dataset.
Parameters
- mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.
Notes
This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
def add_mass_spectra(
    self,
    scan_list,
    spectrum_mode=None,
    ms_level=1,
    use_parser=True,
    auto_process=True,
    ms_params=None,
):
    """Add mass spectra to _ms dictionary, from a list of scans or single scan

    Notes
    -----
    Duplicate scan numbers are removed and the scans are handled in ascending order.
    ms_params, when given, is assigned to each spectrum before optional processing.

    Parameters
    -----------
    scan_list : list of ints or int
        List of scans (or a single scan) to use to populate _ms slot
    spectrum_mode : str or None
        The spectrum mode to use for the mass spectra.
        If None, the mode of each scan is read from the "ms_format" column of scan_df
        (this allows for mixed types). Defaults to None.
    ms_level : int, optional
        Selects which entry of _ms_unprocessed is read when use_parser is False.
        Not otherwise used by this method. Defaults to 1.
    use_parser : bool
        Whether to use the mass spectra parser to get the mass spectra. If False, spectra are
        built from the unprocessed data (profile mode only). Defaults to True.
    auto_process : bool
        Whether to auto-process the mass spectra. Defaults to True.
    ms_params : MSParameters or None
        The mass spectrum parameters to assign to each mass spectrum.
        If None, the parameters of the created spectra are left untouched.

    Raises
    ------
    TypeError
        If scan_list is not an int or a list of ints
    ValueError
        If polarity is not 'positive' or 'negative'
        If use_parser is False and ms_level has no (or empty) unprocessed data,
        a requested scan is missing from it, or the spectrum mode is not 'profile'
    """
    # A single scan number is shorthand for a one-element list
    if isinstance(scan_list, int):
        scan_list = [scan_list]
    if not isinstance(scan_list, list) or not all(
        isinstance(scan, int) for scan in scan_list
    ):
        raise TypeError("scan_list must be a list of integers")

    # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
    if self.polarity == "negative":
        polarity = -1
    elif self.polarity == "positive":
        polarity = 1
    else:
        raise ValueError(
            "Polarity not set for dataset, must be a either 'positive' or 'negative'"
        )

    # De-duplicate and order the requested scans
    scan_list = sorted(set(scan_list))

    ms_df = None
    if not use_parser:
        # Without the parser, everything must come from the unprocessed data
        if ms_level not in self._ms_unprocessed:
            raise ValueError(
                "ms_level {} not found in _ms_unprocessed dictionary".format(ms_level)
            )
        unprocessed = self._ms_unprocessed[ms_level]
        if unprocessed is None:
            raise ValueError(
                "No unprocessed data found for ms_level {}".format(ms_level)
            )
        if len(np.setdiff1d(scan_list, unprocessed.scan.tolist())) > 0:
            raise ValueError(
                "Not all scans in scan_list are present in the unprocessed data"
            )
        # set_index already returns a new frame, so no explicit copy is needed
        ms_df = unprocessed.set_index("scan", drop=False)

    # scan_df may be rebuilt on every access, so fetch it once
    # (and only when it is actually needed) instead of once per scan
    scan_df = self.scan_df if (spectrum_mode is None and scan_list) else None

    for scan in scan_list:
        if spectrum_mode is None:
            spectrum_mode_scan = scan_df.loc[scan, "ms_format"]
        else:
            spectrum_mode_scan = spectrum_mode

        # Instantiate the mass spectrum object using the parser or the unprocessed data
        if use_parser:
            ms = self.spectra_parser.get_mass_spectrum_from_scan(
                scan_number=scan,
                spectrum_mode=spectrum_mode_scan,
                auto_process=False,
            )
        else:
            if spectrum_mode_scan != "profile":
                raise ValueError(
                    "Only profile mode is supported for unprocessed data"
                )
            # List-based selection always yields a DataFrame, even when the
            # scan has a single data point (a scalar label would give a Series)
            my_ms_df = ms_df.loc[[scan]]
            ms = ms_from_array_profile(
                my_ms_df.mz,
                my_ms_df.intensity,
                self.file_location,
                polarity=polarity,
                auto_process=False,
            )

        # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset
        if ms is not None:
            if ms_params is not None:
                ms.parameters = ms_params
            ms.scan_number = scan
            if auto_process:
                ms.process_mass_spec()
            self.add_mass_spectrum(ms)
Add mass spectra to _ms dictionary, from a list of scans or single scan
Notes
The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
Parameters
- scan_list (list of ints): List of scans to use to populate _ms slot
- spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
- ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
- use_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
- auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
- ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.
Raises
- TypeError: If scan_list is not a list of ints
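- ValueError: If polarity is not 'positive' or 'negative'; or, when use_parser is False, if ms_level has no unprocessed data available, if a requested scan is missing from the unprocessed data, or if the spectrum mode is not 'profile'.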
def get_time_of_scan_id(self, scan):
    """Returns the scan time for the specified scan number.

    Parameters
    -----------
    scan : int
        The scan number of the desired scan time.

    Returns
    --------
    float
        The entry of the retention time list at the position of the scan in the scan number list.

    Raises
    ------
    ValueError
        If the retention time list is empty.
        If no scan time is found for the specified scan number.
    """
    # Check if _retention_time_list is empty and raise error if so
    if len(self._retention_time_list) == 0:
        raise ValueError("No retention times found in dataset")

    scan_numbers = self._scans_number_list
    # Arrays and other sequences have no .index(); normalise to a list
    if not isinstance(scan_numbers, list):
        scan_numbers = list(scan_numbers)
    try:
        position = scan_numbers.index(scan)
    except ValueError:
        # Replace the generic "x is not in list" with a message naming the scan
        raise ValueError(
            "No scan time found for scan number {}".format(scan)
        ) from None
    return self._retention_time_list[position]
Returns the scan time for the specified scan number.
Parameters
- scan (int): The scan number of the desired scan time.
Returns
- float: The scan time for the specified scan number (in minutes).
Raises
- ValueError: If no scan time is found for the specified scan number.
class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch):
    """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.

    This class is not intended to be instantiated directly, but rather to be instantiated
    by an appropriate mass spectra parser using the get_lcms_obj() method.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the mass spectra object. Defaults to None.

    Attributes
    -----------
    polarity : str
        The polarity of the ionization mode used for the dataset. Initialized as an empty string.
    _parameters : LCMSParameters
        The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
    _retention_time_list : list or numpy.ndarray
        Retention times for the dataset. Initialized as an empty list; the ``retention_time``
        setter stores a numpy.ndarray.
    _scans_number_list : list
        A list of scan numbers for the dataset.
    _tic_list : list or numpy.ndarray
        Total ion current (TIC) values for the dataset. Initialized as an empty list; the ``tic``
        setter stores a numpy.ndarray.
    eics : dict
        A dictionary containing extracted ion chromatograms (EICs) for the dataset.
        Key is the mz of the EIC. Initialized as an empty dictionary.
    mass_features : dictionary of LCMSMassFeature objects
        A dictionary containing mass features for the dataset.
        Key is mass feature ID. Initialized as an empty dictionary.
    spectral_search_results : dictionary of MS2SearchResults objects
        A dictionary containing spectral search results for the dataset.
        Key is scan number : precursor mz. Initialized as an empty dictionary.

    Methods
    --------
    * get_parameters_json().
        Returns the parameters used for the LC-MS analysis in JSON format.
    * remove_unprocessed_data(ms_level=None)
        Drops the unprocessed data held for one or all MS levels.
    * add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key="ms2", scan_filter=None)
        Records which MS2 scans are associated with each mass feature in the
        mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
    * add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None)
        Adds the MS1 spectra associated with each mass feature to the
        mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
    * mass_features_to_df()
        Returns a pandas dataframe summarizing the mass features in the dataset.
    * mass_features_ms1_annot_to_df()
        Returns a pandas dataframe summarizing the MS1 annotations of the mass features.
    * mass_features_ms2_annot_to_df(molecular_metadata=None)
        Returns a pandas dataframe summarizing the MS2 annotations of the mass features.
    * set_tic_list_from_data(overwrite=False)
        Sets the TIC list from the mass spectrum objects within the _ms dictionary.
    * set_retention_time_from_data(overwrite=False)
        Sets the retention time list from the data in the _ms dictionary.
    * set_scans_number_from_data(overwrite=False)
        Sets the scan number list from the data in the _ms dictionary.
    * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False)
        Generates plot of M/Z features comparing scan time vs M/Z value
    """

    def __init__(
        self,
        file_location,
        analyzer="Unknown",
        instrument_label="Unknown",
        sample_name=None,
        spectra_parser=None,
    ):
        super().__init__(
            file_location, analyzer, instrument_label, sample_name, spectra_parser
        )
        self.polarity = ""
        self._parameters = LCMSParameters()
        self._retention_time_list = []
        self._scans_number_list = []
        self._tic_list = []
        self.eics = {}
        self.mass_features = {}
        self.spectral_search_results = {}

    def get_parameters_json(self):
        """Returns the parameters stored for the LC-MS object in JSON format.

        Returns
        --------
        str
            The parameters used for the LC-MS analysis in JSON format.
        """
        return self.parameters.to_json()

    def remove_unprocessed_data(self, ms_level=None):
        """Removes the unprocessed data from the LCMSBase object.

        Parameters
        -----------
        ms_level : int, optional
            The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.

        Raises
        ------
        ValueError
            If ms_level is given and is not 1 or 2.

        Notes
        -----
        This method is useful for freeing up memory after the data has been processed.
        """
        if ms_level is None:
            # Clear every stored level and stop here: the 1-or-2 validation
            # below only applies to an explicitly requested level.
            for level in list(self._ms_unprocessed):
                self._ms_unprocessed[level] = None
            return
        if ms_level not in [1, 2]:
            raise ValueError("ms_level must be 1 or 2")
        self._ms_unprocessed[ms_level] = None

    def add_associated_ms2_dda(
        self,
        auto_process=True,
        use_parser=True,
        spectrum_mode=None,
        ms_params_key="ms2",
        scan_filter=None,
    ):
        """Add MS2 spectra associated with mass features to the dataset.

        Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS2 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
        ms_params_key : string, optional
            The key of the mass spectrum parameters to use for the mass spectra, accessed from the
            LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
        scan_filter : str
            A string to filter the scans to add to the _ms dictionary. If None, all scans are added.
            Defaults to None. "hcd" will pull out only HCD scans.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If no MS2 scans with a precursor m/z and a positive TIC are found in the dataset
            (no MS2 data, or not a DDA dataset).
        """
        # mass_features is initialised to an empty dict, so test for emptiness
        # rather than None
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )

        # reconfigure ms_params to get the correct mass spectrum parameters from the key
        ms_params = self.parameters.mass_spectrum[ms_params_key]

        mf_df = self.mass_features_to_df().copy()
        # Find ms2 scans that have a precursor m/z value
        ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
        ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
        # drop ms2 scans that have no tic
        ms2_scans = ms2_scans[ms2_scans.tic > 0]
        # Boolean filtering never yields None, so check for an empty frame
        if ms2_scans.empty:
            raise ValueError("No DDA scans found in dataset")

        if scan_filter is not None:
            ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]
        # set tolerance in rt space (in minutes) and mz space (in daltons)
        time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
        mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance

        # for each mass feature, find the ms2 scans that are within the roi scan time and mz range
        dda_scans = set()
        for mf_id, row in mf_df.iterrows():
            in_rt_window = ms2_scans[
                ms2_scans.scan_time.between(
                    row.scan_time - time_tol, row.scan_time + time_tol
                )
            ]
            matched = in_rt_window[
                in_rt_window.precursor_mz.between(row.mz - mz_tol, row.mz + mz_tol)
            ]
            matched_scans = matched.scan.tolist()
            dda_scans.update(matched_scans)
            # newly matched scans are placed ahead of any already recorded
            self.mass_features[mf_id].ms2_scan_numbers = (
                matched_scans + self.mass_features[mf_id].ms2_scan_numbers
            )

        # add to _ms attribute
        self.add_mass_spectra(
            scan_list=list(dda_scans),
            auto_process=auto_process,
            spectrum_mode=spectrum_mode,
            use_parser=use_parser,
            ms_params=ms_params,
        )
        # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute
        for mass_feature in self.mass_features.values():
            if mass_feature.ms2_scan_numbers is not None:
                for dda_scan in mass_feature.ms2_scan_numbers:
                    if dda_scan in self._ms:
                        mass_feature.ms2_mass_spectra[dda_scan] = self._ms[dda_scan]

    def add_associated_ms1(
        self, auto_process=True, use_parser=True, spectrum_mode=None
    ):
        """Add MS1 spectra associated with mass features to the dataset.

        Parameters
        -----------
        auto_process : bool, optional
            If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
        use_parser : bool, optional
            If True, invoke the spectra parser to get the MS1 spectra. Default is True.
        spectrum_mode : str or None, optional
            The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode
            from the spectra parser to ascertain the spectrum mode (this allows for mixed types).
            Defaults to None. (faster if defined, otherwise will check each scan)
            Only used when a single scan is taken per mass feature; averaged spectra are always
            built in profile mode.

        Raises
        ------
        ValueError
            If mass_features is not set, must run find_mass_features() first.
            If apex scans are not profile mode, all apex scans must be profile mode for averaging.
            If number of scans to average is not 1 or an odd integer greater than 1 (i.e. 3, 5, 7, 9).
            If scans are to be averaged and polarity is neither 'positive' nor 'negative'.
            If use_parser is False and scans to average are missing from the unprocessed data.
        """
        # mass_features is initialised to an empty dict, so test for emptiness
        # rather than None
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )
        scans_to_average = self.parameters.lc_ms.ms1_scans_to_average

        if scans_to_average == 1:
            # Add to LCMSobj
            self.add_mass_spectra(
                scan_list=[
                    int(x) for x in self.mass_features_to_df().apex_scan.tolist()
                ],
                auto_process=auto_process,
                use_parser=use_parser,
                spectrum_mode=spectrum_mode,
                ms_params=self.parameters.mass_spectrum["ms1"],
            )

        elif (
            scans_to_average > 1 and ((scans_to_average - 1) % 2) == 0
        ):  # scans_to_average = 3, 5, 7 etc, mirror l/r around apex
            apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
            # Check if all apex scans are profile mode, raise error if not
            if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
                raise ValueError("All apex scans must be profile mode for averaging")

            # First get sets of scans to average
            def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average):
                half_window = int((scans_to_average - 1) / 2)
                apex_idx = ms1_scans.index(apex_scan)
                idx_start = max(apex_idx - half_window, 0)
                # The slice end is exclusive, so the upper clamp is
                # len(ms1_scans); clamping to len - 1 would drop the final scan
                idx_end = min(apex_idx + half_window + 1, len(ms1_scans))
                return ms1_scans[idx_start:idx_end]

            ms1_scans = self.ms1_scans
            scans_lists = [
                get_scans_from_apex(ms1_scans, apex_scan, scans_to_average)
                for apex_scan in apex_scans
            ]

            # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation)
            if self.polarity == "negative":
                polarity = -1
            elif self.polarity == "positive":
                polarity = 1
            else:
                raise ValueError(
                    "polarity must be 'negative' or 'positive' to average MS1 scans"
                )

            ms1_unprocessed = None
            if not use_parser:
                # Index the unprocessed MS1 data by scan number once, up front
                # (saves time compared with doing it per averaged spectrum)
                ms1_unprocessed = (
                    self._ms_unprocessed[1].copy().set_index("scan", drop=False)
                )

                # Check that all the scans in scans_lists are indexes in the unprocessed data
                scans_needed = list(
                    {scan for sublist in scans_lists for scan in sublist}
                )
                missing = np.setdiff1d(scans_needed, ms1_unprocessed.index.values)
                if len(missing) > 0:
                    raise ValueError(
                        "Not all scans to average are present in the unprocessed data"
                    )
                self._ms_unprocessed[1] = ms1_unprocessed

            try:
                for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
                    # Get unprocessed mass spectrum from scans
                    ms = self.get_average_mass_spectrum(
                        scan_list=scan_list_average,
                        apex_scan=apex_scan,
                        spectrum_mode="profile",
                        ms_level=1,
                        auto_process=auto_process,
                        use_parser=use_parser,
                        perform_checks=False,
                        polarity=polarity,
                        ms_params=self.parameters.mass_spectrum["ms1"],
                    )
                    # Add mass spectrum to LCMS object and associated with mass feature
                    self.add_mass_spectrum(ms)
            finally:
                if not use_parser:
                    # Reset the index on _ms_unprocessed[1] to not be scan number,
                    # even when averaging fails part-way through
                    self._ms_unprocessed[1] = ms1_unprocessed.reset_index(drop=True)
        else:
            raise ValueError(
                "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
            )

        # Associate the ms1 spectra with the mass features
        for mass_feature in self.mass_features.values():
            mass_feature.mass_spectrum = self._ms[mass_feature.apex_scan]
            mass_feature.update_mz()

        # Re-process clustering if persistent homology is selected to remove duplicate
        # mass features after adding and processing MS1 spectra
        if self.parameters.lc_ms.peak_picking_method == "persistent homology":
            self.cluster_mass_features(drop_children=True, sort_by="persistence")

    def mass_features_to_df(self):
        """Returns a pandas dataframe summarizing the mass features.

        The dataframe is indexed by mf_id (mass feature ID). Columns are included only when
        the corresponding attribute exists on the mass features, in this order: scan_time, mz,
        apex_scan, start_scan, final_scan, intensity, persistence, area, half_height_width,
        tailing_factor, dispersity_index, monoisotopic_mf_id, isotopologue_type,
        mass_spectrum_deconvoluted_parent, associated_mass_features, ms2_spectrum.

        Returns
        --------
        pandas.DataFrame
            A pandas dataframe of mass features, one row per mass feature.

        Raises
        ------
        ValueError
            Raised by pandas.concat if there are no mass features to summarize.
        """

        def mass_spectrum_to_string(
            mass_spec, normalize=True, min_normalized_abun=0.01
        ):
            """Converts a mass spectrum to a string of m/z:abundance pairs.

            Parameters
            -----------
            mass_spec : MassSpectrum
                A MassSpectrum object to be converted to a string.
            normalize : bool, optional
                If True, normalizes the abundance values to a maximum of 1. Defaults to True.
            min_normalized_abun : float, optional
                The minimum normalized abundance value to include in the string, only used if
                normalize is True. Defaults to 0.01.

            Returns
            --------
            str
                A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon.
            """
            # Build the peak table once and read both columns from it
            peaks_df = mass_spec.to_dataframe()
            mz_np = peaks_df["m/z"].values
            abun_np = peaks_df["Peak Height"].values
            if normalize:
                abun_np = abun_np / abun_np.max()
            mz_abun = np.column_stack((mz_np, abun_np))
            if normalize:
                mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
            mz_abun_str = [
                str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
                for mz, abun in mz_abun
            ]
            return "; ".join(mz_abun_str)

        cols_in_df = [
            "id",
            "_apex_scan",
            "start_scan",
            "final_scan",
            "_retention_time",
            "_intensity",
            "_persistence",
            "_area",
            "_dispersity_index",
            "_tailing_factor",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
        ]
        df_mf_list = []
        for mf_id, mass_feature in self.mass_features.items():
            # Keep only the requested attributes that this mass feature exposes
            df_keys = set(cols_in_df).intersection(dir(mass_feature))
            dict_mf = {key: getattr(mass_feature, key) for key in df_keys}
            if len(mass_feature.ms2_scan_numbers) > 0:
                # Add MS2 spectra info
                # NOTE(review): guarded on the assumption that best_ms2 can be None
                # when scan numbers are recorded but no spectra are loaded - confirm.
                best_ms2_spectrum = mass_feature.best_ms2
                if best_ms2_spectrum is not None:
                    dict_mf["ms2_spectrum"] = mass_spectrum_to_string(
                        best_ms2_spectrum
                    )
            if len(mass_feature.associated_mass_features_deconvoluted) > 0:
                dict_mf["associated_mass_features"] = ", ".join(
                    map(str, mass_feature.associated_mass_features_deconvoluted)
                )
            if mass_feature._half_height_width is not None:
                dict_mf["half_height_width"] = mass_feature.half_height_width
            df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
            df_mf_single["mz"] = mass_feature.mz
            df_mf_list.append(df_mf_single)
        df_mf = pd.concat(df_mf_list)

        # rename private attribute names to their public column names
        df_mf = df_mf.rename(
            columns={
                "_area": "area",
                "id": "mf_id",
                "_apex_scan": "apex_scan",
                "_retention_time": "scan_time",
                "_intensity": "intensity",
                "_persistence": "persistence",
                "_dispersity_index": "dispersity_index",
                "_tailing_factor": "tailing_factor",
            }
        )

        # reorder columns
        col_order = [
            "mf_id",
            "scan_time",
            "mz",
            "apex_scan",
            "start_scan",
            "final_scan",
            "intensity",
            "persistence",
            "area",
            "half_height_width",
            "tailing_factor",
            "dispersity_index",
            "monoisotopic_mf_id",
            "isotopologue_type",
            "mass_spectrum_deconvoluted_parent",
            "associated_mass_features",
            "ms2_spectrum",
        ]
        # drop columns that are not in col_order
        cols_to_order = [col for col in col_order if col in df_mf.columns]
        df_mf = df_mf[cols_to_order]

        # reset index to mf_id
        df_mf = df_mf.set_index("mf_id")
        df_mf.index.name = "mf_id"

        return df_mf

    def mass_features_ms1_annot_to_df(self):
        """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.

        A mass feature contributes a row only when it has a mass spectrum and its MS1 peak lies
        within 0.01 of the mass feature's m/z.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS1 annotations for the mass features in the dataset.
            The index is set to mf_id (mass feature ID). None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS1 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms1 = []
        for mf_id, mass_feature in self.mass_features.items():
            if mass_feature.mass_spectrum is None:
                continue
            ms1_peak = mass_feature.ms1_peak
            # Skip features whose assigned MS1 peak is too far from the feature m/z
            if np.abs(ms1_peak.mz_exp - mass_feature.mz) >= 0.01:
                continue
            # Get the molecular formula from the mass spectrum
            annot_df = mass_feature.mass_spectrum.to_dataframe()
            # Subset to pull out only the peak associated with the mass feature
            annot_df = annot_df[annot_df["Index"] == ms1_peak.index].copy()

            # Remove the index column and add column for mf_id
            annot_df = annot_df.drop(columns=["Index"])
            annot_df["mf_id"] = mf_id
            annot_df_list_ms1.append(annot_df)

        if len(annot_df_list_ms1) > 0:
            annot_ms1_df_full = pd.concat(annot_df_list_ms1)
            annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id")
            annot_ms1_df_full.index.name = "mf_id"

        else:
            annot_ms1_df_full = None
            # Warn that no ms1 annotations were found
            warnings.warn(
                "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
                UserWarning,
            )

        return annot_ms1_df_full

    def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
        """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.

        Parameters
        -----------
        molecular_metadata : dict of MolecularMetadata objects
            A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.

        Returns
        --------
        pandas.DataFrame or None
            A pandas dataframe of MS2 annotations for the mass features in the dataset,
            and optionally molecular metadata. The index is set to mf_id (mass feature ID).
            None if no annotations were found.

        Warns
        -----
        UserWarning
            If no MS2 annotations were found for the mass features in the dataset.
        """
        annot_df_list_ms2 = []
        for mf_id, mass_feature in self.mass_features.items():
            if len(mass_feature.ms2_similarity_results) > 0:
                # Add ms2 annotations to ms2 annotation list
                for result in mass_feature.ms2_similarity_results:
                    annot_df_ms2 = result.to_dataframe()
                    annot_df_ms2["mf_id"] = mf_id
                    annot_df_list_ms2.append(annot_df_ms2)

        if len(annot_df_list_ms2) > 0:
            annot_ms2_df_full = pd.concat(annot_df_list_ms2)
            if molecular_metadata is not None:
                # One row per metadata object, built from its instance attributes
                molecular_metadata_df = pd.concat(
                    [
                        pd.DataFrame.from_dict(v.__dict__, orient="index").transpose()
                        for v in molecular_metadata.values()
                    ],
                    ignore_index=True,
                )
                molecular_metadata_df = molecular_metadata_df.rename(
                    columns={"id": "ref_mol_id"}
                )
                annot_ms2_df_full = annot_ms2_df_full.merge(
                    molecular_metadata_df, on="ref_mol_id", how="left"
                )
            annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
                subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
            ).copy()
            annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
            annot_ms2_df_full.index.name = "mf_id"
        else:
            annot_ms2_df_full = None
            # Warn that no ms2 annotations were found
            warnings.warn(
                "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
                UserWarning,
            )

        return annot_ms2_df_full

    def plot_composite_mz_features(
        self,
        binsize=1e-4,
        ph_int_min_thresh=0.001,
        mf_plot=True,
        ms2_plot=True,
        return_fig=False,
    ):
        """Returns a figure displaying
            (1) thresholded, unprocessed data
            (2) the m/z features
            (3) which m/z features are associated with MS2 spectra

        Parameters
        -----------
        binsize : float
            Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
        ph_int_min_thresh : float
            Fraction of the maximum unprocessed MS1 intensity used as the threshold; only points
            with intensity above it are plotted. Defaults to 0.001.
        mf_plot : boolean
            Indicates whether to plot the m/z features. Defaults to True.
        ms2_plot : boolean
            Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
        return_fig : boolean
            Indicates whether to plot composite feature map (False) or return figure object (True).
            Defaults to False.

        Returns
        --------
        matplotlib.pyplot.Figure
            A figure with the thresholded, unprocessed data on an axis of m/z value with respect to
            scan time. Unprocessed data is displayed in gray scale with darker colors indicating
            higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z
            features with associated with MS2 spectra are plotted, they are displayed in red.
            Only returned when return_fig is True; otherwise the figure is shown and None is returned.

        Raises
        ------
        ValueError
            If m/z features are set to be plot but aren't in the dataset.
            If m/z features with associated MS2 data are set to be plot but no MS2 spectra
            were found for the m/z features in the dataset.
        """
        if mf_plot or ms2_plot:
            # mass_features is initialised to an empty dict, so test for emptiness
            # rather than None
            if not self.mass_features:
                raise ValueError(
                    "mass_features not set, must run find_mass_features() first"
                )
            ## call mass feature data
            mf_df = self.mass_features_to_df()

        # Check if ms2_spectrum is set, raise error if not
        if ms2_plot and "ms2_spectrum" not in mf_df.columns:
            raise ValueError(
                "ms2_spectrum not set, must run add_associated_ms2_dda() first"
            )

        ## threshold and grid unprocessed data
        df = self._ms_unprocessed[1].copy()
        df = df.dropna(subset=["intensity"]).reset_index(drop=True)
        threshold = ph_int_min_thresh * df.intensity.max()
        df_thres = df[df["intensity"] > threshold].reset_index(drop=True).copy()
        df = self.grid_data(df_thres)

        ## format unprocessed data for plotting
        df = df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
        mz_grid = np.arange(0, np.max(df.mz), binsize)
        mz_data = np.array(df.mz)
        df["mz_bin"] = find_closest(mz_grid, mz_data)
        # Use the string alias: passing the builtin ``sum`` is deprecated by pandas
        df["ab_bin"] = df.groupby(["mz_bin", "scan_time"]).intensity.transform("sum")
        unproc_df = df[["scan_time", "mz_bin", "ab_bin"]].drop_duplicates(
            ignore_index=True
        )

        ## generate figure
        fig = plt.figure()
        relative_abundance = unproc_df.ab_bin / np.max(unproc_df.ab_bin)
        plt.scatter(
            unproc_df.scan_time,
            unproc_df.mz_bin * binsize,
            c=relative_abundance,
            alpha=relative_abundance,
            cmap="Greys_r",
            s=1,
        )

        if mf_plot:
            if ms2_plot:
                without_ms2 = mf_df[mf_df.ms2_spectrum.isna()]
                plt.scatter(
                    without_ms2.scan_time,
                    without_ms2.mz,
                    c="c",
                    s=4,
                    label="M/Z features without MS2",
                )
            else:
                plt.scatter(
                    mf_df.scan_time, mf_df.mz, c="c", s=4, label="M/Z features"
                )

        if ms2_plot:
            with_ms2 = mf_df[~mf_df.ms2_spectrum.isna()]
            plt.scatter(
                with_ms2.scan_time,
                with_ms2.mz,
                c="r",
                s=2,
                label="M/Z features with MS2",
            )

        if mf_plot or ms2_plot:
            plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.25), ncol=2)
        plt.xlabel("Scan time")
        plt.ylabel("m/z")
        plt.ylim(0, np.ceil(np.max(df.mz)))
        plt.xlim(0, np.ceil(np.max(df.scan_time)))
        plt.title("Composite Feature Map")

        if return_fig:
            plt.close(fig)
            return fig

        else:
            plt.show()

    def __len__(self):
        """
        Returns the number of mass spectra in the dataset.

        Returns
        --------
        int
            The number of mass spectra in the dataset.
        """
        return len(self._ms)

    def __getitem__(self, scan_number):
        """
        Returns the mass spectrum corresponding to the specified scan number.

        Parameters
        -----------
        scan_number : int
            The scan number of the desired mass spectrum.

        Returns
        --------
        MassSpectrum or None
            The mass spectrum corresponding to the specified scan number,
            or None if that scan is not in the _ms dictionary.
        """
        return self._ms.get(scan_number)

    def __iter__(self):
        """Returns an iterator over the mass spectra in the dataset.

        Returns
        --------
        iterator
            An iterator over the mass spectra in the dataset.
        """
        return iter(self._ms.values())

    def set_tic_list_from_data(self, overwrite=False):
        """Sets the TIC list from the mass spectrum objects within the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the TIC list if it is already set. Defaults to False.

        Notes
        -----
        TIC values are read for the scans listed in ``scans_number``, so the scan number
        list must be set first and every listed scan must be present in the _ms dictionary.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the TIC list is already set and overwrite is False.
        """
        # Check if _ms is empty and raise error if so
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        # Check if tic_list is already set and raise error if so
        if len(self.tic) > 0 and not overwrite:
            raise ValueError("TIC list already set, use overwrite=True to overwrite")

        self.tic = [self._ms.get(i).tic for i in self.scans_number]

    def set_retention_time_from_data(self, overwrite=False):
        """Sets the retention time list from the data in the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the retention time list if it is already set. Defaults to False.

        Notes
        -----
        Retention times are collected in ascending scan number order.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the retention time list is already set and overwrite is False.
        """
        # Check if _ms is empty and raise error if so
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        # Check if retention_time_list is already set and raise error if so
        if len(self.retention_time) > 0 and not overwrite:
            raise ValueError(
                "Retention time list already set, use overwrite=True to overwrite"
            )

        self.retention_time = [
            self._ms.get(key_ms).retention_time for key_ms in sorted(self._ms.keys())
        ]

    def set_scans_number_from_data(self, overwrite=False):
        """Sets the scan number list from the data in the _ms dictionary.

        Parameters
        -----------
        overwrite : bool, optional
            If True, overwrites the scan number list if it is already set. Defaults to False.

        Notes
        -----
        The scan number list is set to the sorted keys of the _ms dictionary.

        Raises
        ------
        ValueError
            If no mass spectra are found in the dataset.
            If the scan number list is already set and overwrite is False.
        """
        # Check if _ms is empty and raise error if so
        if len(self._ms) == 0:
            raise ValueError("No mass spectra found in dataset")

        # Check if scans_number_list is already set and raise error if so
        if len(self.scans_number) > 0 and not overwrite:
            raise ValueError(
                "Scan number list already set, use overwrite=True to overwrite"
            )

        self.scans_number = sorted(self._ms.keys())

    @property
    def ms1_scans(self):
        """
        list : A list of MS1 scan numbers for the dataset (index values of the MS1 rows of scan_df).
        """
        return self.scan_df[self.scan_df.ms_level == 1].index.tolist()

    @property
    def parameters(self):
        """
        LCMSParameters : The parameters used for the LC-MS analysis.
        """
        return self._parameters

    @parameters.setter
    def parameters(self, paramsinstance):
        """
        Sets the parameters used for the LC-MS analysis.

        Parameters
        -----------
        paramsinstance : LCMSParameters
            The parameters used for the LC-MS analysis.
        """
        self._parameters = paramsinstance

    @property
    def scans_number(self):
        """
        list : A list of scan numbers for the dataset.
        """
        return self._scans_number_list

    @scans_number.setter
    def scans_number(self, scan_numbers_list):
        """
        Sets the scan numbers for the dataset.

        Parameters
        -----------
        scan_numbers_list : list
            A list of scan numbers for the dataset.
        """
        self._scans_number_list = scan_numbers_list

    @property
    def retention_time(self):
        """
        numpy.ndarray : An array of retention times for the dataset.
        """
        return self._retention_time_list

    @retention_time.setter
    def retention_time(self, rt_list):
        """
        Sets the retention times for the dataset.

        Parameters
        -----------
        rt_list : list
            A list of retention times for the dataset. Stored as a numpy.ndarray.
        """
        self._retention_time_list = np.array(rt_list)

    @property
    def tic(self):
        """
        numpy.ndarray : An array of TIC values for the dataset.
        """
        return self._tic_list

    @tic.setter
    def tic(self, tic_list):
        """
        Sets the TIC values for the dataset.

        Parameters
        -----------
        tic_list : list
            A list of TIC values for the dataset. Stored as a numpy.ndarray.
        """
        self._tic_list = np.array(tic_list)
A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- polarity (str): The polarity of the ionization mode used for the dataset.
- _parameters (LCMSParameters): The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
- _retention_time_list (numpy.ndarray): An array of retention times for the dataset.
- _scans_number_list (list): A list of scan numbers for the dataset.
- _tic_list (numpy.ndarray): An array of total ion current (TIC) values for the dataset.
- eics (dict): A dictionary containing extracted ion chromatograms (EICs) for the dataset. Key is the mz of the EIC. Initialized as an empty dictionary.
- mass_features (dictionary of LCMSMassFeature objects): A dictionary containing mass features for the dataset. Key is mass feature ID. Initialized as an empty dictionary.
- spectral_search_results (dictionary of MS2SearchResults objects): A dictionary containing spectral search results for the dataset. Key is scan number : precursor mz. Initialized as an empty dictionary.
Methods
- get_parameters_json(). Returns the parameters used for the LC-MS analysis in JSON format.
- add_associated_ms2_dda(auto_process=True, use_parser=True, spectrum_mode=None, ms_params_key="ms2", scan_filter=None) Records which MS2 scans are associated with each mass feature in the mass_features dictionary and adds the MS2 spectra to the _ms dictionary.
- add_associated_ms1(auto_process=True, use_parser=True, spectrum_mode=None) Adds the MS1 spectra associated with each mass feature to the mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
- mass_features_to_df() Returns a pandas dataframe summarizing the mass features in the dataset.
- set_tic_list_from_data(overwrite=False) Sets the TIC list from the mass spectrum objects within the _ms dictionary.
- set_retention_time_from_data(overwrite=False) Sets the retention time list from the data in the _ms dictionary.
- set_scans_number_from_data(overwrite=False) Sets the scan number list from the data in the _ms dictionary.
- plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) Generates plot of M/Z features comparing scan time vs M/Z value
def __init__(
    self,
    file_location,
    analyzer="Unknown",
    instrument_label="Unknown",
    sample_name=None,
    spectra_parser=None,
):
    """Set up the LC-MS object and its empty per-dataset containers.

    All arguments are forwarded unchanged, positionally, to the parent
    initializer.

    Parameters
    -----------
    file_location : str or Path
        The location of the file containing the mass spectra data.
    analyzer : str, optional
        The type of analyzer used to generate the data. Defaults to 'Unknown'.
    instrument_label : str, optional
        The type of instrument used to generate the data. Defaults to 'Unknown'.
    sample_name : str, optional
        The name of the sample. Defaults to None.
    spectra_parser : object, optional
        The spectra parser object used to create the object. Defaults to None.
    """
    super().__init__(
        file_location, analyzer, instrument_label, sample_name, spectra_parser
    )

    # Acquisition-level state; polarity stays an empty string until it is set.
    self.polarity = ""
    self._parameters = LCMSParameters()

    # Chromatogram-level lists, filled in later (e.g. by the set_*_from_data methods).
    self._retention_time_list = []
    self._scans_number_list = []
    self._tic_list = []

    # Result containers, each starting as its own empty dictionary.
    self.eics = {}
    self.mass_features = {}
    self.spectral_search_results = {}
def get_parameters_json(self):
    """Serialize the parameters held by this LC-MS object to JSON.

    Returns
    --------
    str
        The value returned by ``self.parameters.to_json()``.
    """
    lcms_parameters = self.parameters
    return lcms_parameters.to_json()
Returns the parameters stored for the LC-MS object in JSON format.
Returns
- str: The parameters used for the LC-MS analysis in JSON format.
def remove_unprocessed_data(self, ms_level=None):
    """Drop the unprocessed data held in ``self._ms_unprocessed``.

    Entries are set to None rather than deleted, so the keys remain in place.

    Parameters
    -----------
    ms_level : int, optional
        The MS level to clear. If None, every level currently present in
        ``_ms_unprocessed`` is cleared. Defaults to None.

    Raises
    ------
    ValueError
        If ms_level is given and is not 1 or 2.

    Notes
    -----
    This method is useful for freeing up memory after the data has been processed.
    """
    if ms_level is None:
        # Only values are replaced, so iterating the dict while assigning is safe.
        for level in self._ms_unprocessed:
            self._ms_unprocessed[level] = None
        # Return here: falling through would validate the leftover loop
        # variable (or None for an empty dict) and raise spuriously.
        return

    if ms_level not in (1, 2):
        raise ValueError("ms_level must be 1 or 2")
    self._ms_unprocessed[ms_level] = None
Removes the unprocessed data from the LCMSBase object.
Parameters
- ms_level (int, optional): The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
Raises
- ValueError: If ms_level is not 1 or 2.
Notes
This method is useful for freeing up memory after the data has been processed.
def add_associated_ms2_dda(
    self,
    auto_process=True,
    use_parser=True,
    spectrum_mode=None,
    ms_params_key="ms2",
    scan_filter=None,
):
    """Add MS2 spectra associated with mass features to the dataset.

    For every mass feature, the MS2 scans whose scan time and precursor m/z
    fall within the configured tolerances of the feature are prepended to the
    feature's ``ms2_scan_numbers``. All matched scans are then loaded through
    ``add_mass_spectra`` and linked into each feature's ``ms2_mass_spectra``.

    Parameters
    -----------
    auto_process : bool, optional
        Forwarded to ``add_mass_spectra``. Default is True.
    use_parser : bool, optional
        Forwarded to ``add_mass_spectra``. Default is True.
    spectrum_mode : str or None, optional
        Forwarded to ``add_mass_spectra``. Defaults to None.
    ms_params_key : string, optional
        Key into ``self.parameters.mass_spectrum`` selecting the mass spectrum
        parameters used for the MS2 spectra. Defaults to 'ms2'.
    scan_filter : str, optional
        If given, only MS2 scans whose ``scan_text`` contains this pattern
        (``Series.str.contains``) are considered, e.g. "hcd". Defaults to None.

    Raises
    ------
    ValueError
        If mass_features is empty or None; run find_mass_features() first.
        If no MS2 scan with a precursor m/z and a positive TIC exists in the
        dataset (i.e. the dataset holds no usable DDA scans).
    """
    # mass_features is initialised to an empty dict, so test for emptiness too.
    if not self.mass_features:
        raise ValueError(
            "mass_features not set, must run find_mass_features() first"
        )

    # Mass spectrum parameters to apply to the spectra added below.
    ms_params = self.parameters.mass_spectrum[ms_params_key]

    mf_df = self.mass_features_to_df().copy()

    # Keep MS2 scans that have a precursor m/z and a non-zero TIC.
    ms2_scans = self.scan_df[self.scan_df.ms_level == 2]
    ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()]
    ms2_scans = ms2_scans[ms2_scans.tic > 0]
    # A filtered DataFrame is never None; emptiness is the real "no DDA" signal.
    if ms2_scans.empty:
        raise ValueError("No DDA scans found in dataset")

    if scan_filter is not None:
        ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)]

    # Tolerances come from the LC-MS parameters (retention time and m/z space).
    time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance
    mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance

    # Collect, per mass feature, the MS2 scans inside its time and m/z window.
    dda_scans = []
    for mf_id, row in mf_df.iterrows():
        in_time_window = ms2_scans.scan_time.between(
            row.scan_time - time_tol, row.scan_time + time_tol
        )
        in_mz_window = ms2_scans.precursor_mz.between(
            row.mz - mz_tol, row.mz + mz_tol
        )
        matched_scans = ms2_scans[in_time_window & in_mz_window].scan.tolist()
        dda_scans.extend(matched_scans)
        self.mass_features[mf_id].ms2_scan_numbers = (
            matched_scans + self.mass_features[mf_id].ms2_scan_numbers
        )

    # Load each matched scan once into the _ms dictionary.
    self.add_mass_spectra(
        scan_list=list(set(dda_scans)),
        auto_process=auto_process,
        spectrum_mode=spectrum_mode,
        use_parser=use_parser,
        ms_params=ms_params,
    )

    # Link the loaded spectra to the mass features that reference them.
    for mf_id in self.mass_features:
        if self.mass_features[mf_id].ms2_scan_numbers is not None:
            for dda_scan in self.mass_features[mf_id].ms2_scan_numbers:
                if dda_scan in self._ms.keys():
                    self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[
                        dda_scan
                    ]
Add MS2 spectra associated with mass features to the dataset.
Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
Parameters
- auto_process (bool, optional): If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, invoke the spectra parser to get the MS2 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
- ms_params_key (string, optional): The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
- scan_filter (str): A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. "hcd" will pull out only HCD scans.
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If no MS2 scans are found in the dataset. If no precursor m/z values are found in MS2 scans, not a DDA dataset.
def add_associated_ms1(
    self, auto_process=True, use_parser=True, spectrum_mode=None
):
    """Add MS1 spectra associated with mass features to the dataset.

    With ``ms1_scans_to_average == 1`` the apex scan of every mass feature is
    loaded through ``add_mass_spectra``. With an odd value greater than 1, an
    averaged spectrum is built around each apex scan from a window of MS1
    scans mirrored left/right of the apex (truncated at either end of the
    MS1 scan list). Afterwards each mass feature's ``mass_spectrum`` is set
    from ``self._ms`` at its apex scan and ``update_mz()`` is called.

    Parameters
    -----------
    auto_process : bool, optional
        Forwarded to the spectrum-creation call. Default is True.
    use_parser : bool, optional
        If True, the spectra parser is used; if False, the averaged spectra
        are built from ``self._ms_unprocessed[1]``. Default is True.
    spectrum_mode : str or None, optional
        Forwarded to ``add_mass_spectra`` when a single scan is used; the
        averaging path always uses "profile". Defaults to None.

    Raises
    ------
    ValueError
        If mass_features is empty or None; run find_mass_features() first.
        If apex scans are not all profile mode (required for averaging).
        If polarity is neither "negative" nor "positive" when averaging.
        If use_parser is False and some scans to average are missing from the
        unprocessed data.
        If number of scans to average is not 1 or an odd integer (3, 5, 7, 9).
    """
    # mass_features is initialised to an empty dict, so test for emptiness too.
    if not self.mass_features:
        raise ValueError(
            "mass_features not set, must run find_mass_features() first"
        )
    scans_to_average = self.parameters.lc_ms.ms1_scans_to_average

    if scans_to_average == 1:
        self.add_mass_spectra(
            scan_list=[
                int(x) for x in self.mass_features_to_df().apex_scan.tolist()
            ],
            auto_process=auto_process,
            use_parser=use_parser,
            spectrum_mode=spectrum_mode,
            ms_params=self.parameters.mass_spectrum["ms1"],
        )

    elif scans_to_average > 1 and (scans_to_average - 1) % 2 == 0:
        # scans_to_average = 3, 5, 7 etc: mirror left/right around the apex.
        apex_scans = list(set(self.mass_features_to_df().apex_scan.tolist()))
        if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"):
            raise ValueError("All apex scans must be profile mode for averaging")

        half_window = int((scans_to_average - 1) / 2)

        def get_scans_from_apex(ms1_scans, apex_scan):
            """Return the MS1 scans in the window centred on apex_scan."""
            apex_idx = ms1_scans.index(apex_scan)
            window_start = max(apex_idx - half_window, 0)
            # The slice end is exclusive and Python clamps it to len(), so no
            # manual clamp is needed (clamping to len - 1 dropped the last scan).
            return ms1_scans[window_start : apex_idx + half_window + 1]

        ms1_scans = self.ms1_scans
        scans_lists = [
            get_scans_from_apex(ms1_scans, apex_scan) for apex_scan in apex_scans
        ]

        # Numeric polarity passed to get_average_mass_spectrum.
        if self.polarity == "negative":
            polarity = -1
        elif self.polarity == "positive":
            polarity = 1
        else:
            raise ValueError(
                "polarity must be 'negative' or 'positive' to average MS1 scans"
            )

        if not use_parser:
            # Index the unprocessed MS1 data by scan number once, up front.
            ms1_unprocessed = self._ms_unprocessed[1].copy()
            ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False)

            # Every scan to be averaged must exist in the unprocessed data.
            scans_lists_flat = list(
                {scan for sublist in scans_lists for scan in sublist}
            )
            missing_scans = np.setdiff1d(
                np.sort(scans_lists_flat),
                np.sort(ms1_unprocessed.index.values),
            )
            if len(missing_scans) > 0:
                raise ValueError(
                    "Not all scans to average are present in the unprocessed data"
                )
            self._ms_unprocessed[1] = ms1_unprocessed

        try:
            for scan_list_average, apex_scan in zip(scans_lists, apex_scans):
                ms = self.get_average_mass_spectrum(
                    scan_list=scan_list_average,
                    apex_scan=apex_scan,
                    spectrum_mode="profile",
                    ms_level=1,
                    auto_process=auto_process,
                    use_parser=use_parser,
                    perform_checks=False,
                    polarity=polarity,
                    ms_params=self.parameters.mass_spectrum["ms1"],
                )
                self.add_mass_spectrum(ms)
        finally:
            if not use_parser:
                # Restore the positional index even if averaging failed midway.
                self._ms_unprocessed[1] = ms1_unprocessed.reset_index(drop=True)
    else:
        raise ValueError(
            "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)"
        )

    # Associate the MS1 spectra with the mass features.
    for mf_id in self.mass_features:
        self.mass_features[mf_id].mass_spectrum = self._ms[
            self.mass_features[mf_id].apex_scan
        ]
        self.mass_features[mf_id].update_mz()

    # Re-cluster when persistent homology picking is selected.
    if self.parameters.lc_ms.peak_picking_method == "persistent homology":
        self.cluster_mass_features(drop_children=True, sort_by="persistence")
Add MS1 spectra associated with mass features to the dataset.
Parameters
- auto_process (bool, optional): If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, invoke the spectra parser to get the MS1 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If apex scans are not profile mode, all apex scans must be profile mode for averaging. If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
def mass_features_to_df(self):
    """Returns a pandas dataframe summarizing the mass features.

    One row is built per entry of ``self.mass_features``. Only attributes
    that exist on a given mass feature are included, so the set of columns
    can vary between datasets.

    Returns
    --------
    pandas.DataFrame
        A dataframe indexed by mf_id (mass feature ID). Possible columns, in
        order: scan_time, mz, apex_scan, start_scan, final_scan, intensity,
        persistence, area, half_height_width, tailing_factor,
        dispersity_index, monoisotopic_mf_id, isotopologue_type,
        mass_spectrum_deconvoluted_parent, associated_mass_features,
        ms2_spectrum.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when there are no mass features.
    """

    def mass_spectrum_to_string(
        mass_spec, normalize=True, min_normalized_abun=0.01
    ):
        """Converts a mass spectrum to a string of m/z:abundance pairs.

        Parameters
        -----------
        mass_spec : MassSpectrum
            Object exposing ``to_dataframe()`` with "m/z" and "Peak Height" columns.
        normalize : bool, optional
            If True, scales abundances to a maximum of 1. Defaults to True.
        min_normalized_abun : float, optional
            Pairs at or below this normalized abundance are dropped; only
            used if normalize is True. Defaults to 0.01.

        Returns
        --------
        str
            "mz:abundance" pairs (m/z rounded to 4 digits, abundance to 2),
            joined by "; ".
        """
        # Build the dataframe once; it backs both arrays below.
        spectrum_df = mass_spec.to_dataframe()
        mz_np = spectrum_df["m/z"].values
        abun_np = spectrum_df["Peak Height"].values
        if normalize:
            abun_np = abun_np / abun_np.max()
        mz_abun = np.column_stack((mz_np, abun_np))
        if normalize:
            mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun]
        return "; ".join(
            str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2))
            for mz, abun in mz_abun
        )

    cols_in_df = [
        "id",
        "_apex_scan",
        "start_scan",
        "final_scan",
        "_retention_time",
        "_intensity",
        "_persistence",
        "_area",
        "_dispersity_index",
        "_tailing_factor",
        "monoisotopic_mf_id",
        "isotopologue_type",
        "mass_spectrum_deconvoluted_parent",
    ]
    df_mf_list = []
    for mf_id, mass_feature in self.mass_features.items():
        # Keep only the candidate attributes this mass feature actually has.
        available_attrs = set(mass_feature.__dir__())
        dict_mf = {
            key: getattr(mass_feature, key)
            for key in cols_in_df
            if key in available_attrs
        }
        if len(mass_feature.ms2_scan_numbers) > 0:
            best_ms2_spectrum = mass_feature.best_ms2
            # Scan numbers may be recorded before any MS2 spectrum is
            # available; leave the column empty instead of crashing on None.
            if best_ms2_spectrum is not None:
                dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum)
        if len(mass_feature.associated_mass_features_deconvoluted) > 0:
            dict_mf["associated_mass_features"] = ", ".join(
                map(str, mass_feature.associated_mass_features_deconvoluted)
            )
        if mass_feature._half_height_width is not None:
            dict_mf["half_height_width"] = mass_feature.half_height_width
        df_mf_single = pd.DataFrame(dict_mf, index=[mf_id])
        df_mf_single["mz"] = mass_feature.mz
        df_mf_list.append(df_mf_single)
    df_mf = pd.concat(df_mf_list)

    # Expose the private attribute names under their public column names.
    df_mf = df_mf.rename(
        columns={
            "_area": "area",
            "id": "mf_id",
            "_apex_scan": "apex_scan",
            "_retention_time": "scan_time",
            "_intensity": "intensity",
            "_persistence": "persistence",
            "_dispersity_index": "dispersity_index",
            "_tailing_factor": "tailing_factor",
        }
    )

    col_order = [
        "mf_id",
        "scan_time",
        "mz",
        "apex_scan",
        "start_scan",
        "final_scan",
        "intensity",
        "persistence",
        "area",
        "half_height_width",
        "tailing_factor",
        "dispersity_index",
        "monoisotopic_mf_id",
        "isotopologue_type",
        "mass_spectrum_deconvoluted_parent",
        "associated_mass_features",
        "ms2_spectrum",
    ]
    # Keep only the known columns that are present, in the canonical order.
    cols_to_order = [col for col in col_order if col in df_mf.columns]
    df_mf = df_mf[cols_to_order]

    df_mf = df_mf.set_index("mf_id")
    df_mf.index.name = "mf_id"

    return df_mf
Returns a pandas dataframe summarizing the mass features.
The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID).
Returns
- pandas.DataFrame: A pandas dataframe of mass features with the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
def mass_features_ms1_annot_to_df(self):
    """Collect the MS1 annotation row of every mass feature into one dataframe.

    A mass feature contributes only when it has a mass spectrum and the
    experimental m/z of its ``ms1_peak`` lies within 0.01 of the feature's
    m/z. The contributed rows are those of the spectrum dataframe whose
    "Index" equals the peak's index; the "Index" column is then dropped.

    Returns
    --------
    pandas.DataFrame or None
        The MS1 annotations indexed by mf_id (mass feature ID), or None when
        no mass feature contributed a row.

    Warns
    -----
    UserWarning
        If no MS1 annotations were found for the mass features in the dataset.
    """
    ms1_annotation_frames = []
    for mf_id, mass_feature in self.mass_features.items():
        # Features without an MS1 spectrum cannot carry an annotation.
        if mass_feature.mass_spectrum is None:
            continue

        mz_offset = np.abs(mass_feature.ms1_peak.mz_exp - mass_feature.mz)
        if not (mz_offset < 0.01):
            continue

        # Pull the single peak row belonging to this mass feature.
        spectrum_df = mass_feature.mass_spectrum.to_dataframe()
        is_feature_peak = spectrum_df["Index"] == mass_feature.ms1_peak.index
        peak_rows = spectrum_df[is_feature_peak].copy()

        # Swap the peak index column for the mass feature ID.
        peak_rows = peak_rows.drop(columns=["Index"])
        peak_rows["mf_id"] = mf_id
        ms1_annotation_frames.append(peak_rows)

    if len(ms1_annotation_frames) == 0:
        warnings.warn(
            "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?",
            UserWarning,
        )
        return None

    annot_ms1_df_full = pd.concat(ms1_annotation_frames).set_index("mf_id")
    annot_ms1_df_full.index.name = "mf_id"
    return annot_ms1_df_full
Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
Returns
- pandas.DataFrame: A pandas dataframe of MS1 annotations for the mass features in the dataset. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS1 annotations were found for the mass features in the dataset.
def mass_features_ms2_annot_to_df(self, molecular_metadata=None):
    """Collect the MS2 similarity results of every mass feature into one dataframe.

    Each entry of a mass feature's ``ms2_similarity_results`` is converted
    with ``to_dataframe()`` and tagged with the mass feature ID. Duplicate
    (mf_id, query_spectrum_id, ref_ms_id) rows are dropped.

    Parameters
    -----------
    molecular_metadata : dict of MolecularMetadata objects
        If given, the attributes of each value are turned into one metadata
        row (its ``id`` renamed to ``ref_mol_id``) and left-merged onto the
        annotations on ``ref_mol_id``. Defaults to None.

    Returns
    --------
    pandas.DataFrame or None
        The MS2 annotations indexed by mf_id (mass feature ID), optionally
        with molecular metadata, or None when there are no annotations.

    Warns
    -----
    UserWarning
        If no MS2 annotations were found for the mass features in the dataset.
    """
    ms2_annotation_frames = []
    for mf_id, mass_feature in self.mass_features.items():
        if len(mass_feature.ms2_similarity_results) == 0:
            continue
        for result in mass_feature.ms2_similarity_results:
            result_df = result.to_dataframe()
            result_df["mf_id"] = mf_id
            ms2_annotation_frames.append(result_df)

    if len(ms2_annotation_frames) == 0:
        warnings.warn(
            "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?",
            UserWarning,
        )
        return None

    annot_ms2_df_full = pd.concat(ms2_annotation_frames)

    if molecular_metadata is not None:
        # One single-row frame per metadata object, built from its attributes.
        metadata_rows = [
            pd.DataFrame.from_dict(entry.__dict__, orient="index").transpose()
            for entry in molecular_metadata.values()
        ]
        molecular_metadata_df = pd.concat(metadata_rows, ignore_index=True)
        molecular_metadata_df = molecular_metadata_df.rename(
            columns={"id": "ref_mol_id"}
        )
        annot_ms2_df_full = annot_ms2_df_full.merge(
            molecular_metadata_df, on="ref_mol_id", how="left"
        )

    annot_ms2_df_full = annot_ms2_df_full.drop_duplicates(
        subset=["mf_id", "query_spectrum_id", "ref_ms_id"]
    ).copy()
    annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id")
    annot_ms2_df_full.index.name = "mf_id"
    return annot_ms2_df_full
Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
Parameters
- molecular_metadata (dict of MolecularMetadata objects): A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.
Returns
- pandas.DataFrame: A pandas dataframe of MS2 annotations for the mass features in the dataset, and optionally molecular metadata. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS2 annotations were found for the mass features in the dataset.
def plot_composite_mz_features(
    self,
    binsize=1e-4,
    ph_int_min_thresh=0.001,
    mf_plot=True,
    ms2_plot=True,
    return_fig=False,
):
    """Returns a figure displaying
        (1) thresholded, unprocessed data
        (2) the m/z features
        (3) which m/z features are associated with MS2 spectra

    Parameters
    -----------
    binsize : float
        Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
    ph_int_min_thresh : float
        Fraction of the maximum intensity in ``self._ms_unprocessed[1]`` used
        as the threshold; only points above it are drawn. Defaults to 0.001.
    mf_plot : boolean
        Indicates whether to plot the m/z features. Defaults to True.
    ms2_plot : boolean
        Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
    return_fig : boolean
        Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.

    Returns
    --------
    matplotlib.pyplot.Figure or None
        When return_fig is True, the (closed) figure; otherwise the figure is
        shown and None is returned. The unprocessed data are drawn with the
        'Greys_r' colormap, with colour and opacity both scaled by relative
        binned abundance. m/z features are drawn in cyan, and those with an
        MS2 spectrum in red.

    Raises
    ------
    ValueError
        If m/z features are requested (mf_plot or ms2_plot) but mass_features
        is empty or None.
        If ms2_plot is True but the mass feature table has no ms2_spectrum
        column; run add_associated_ms2_dda() first.
    """
    if mf_plot or ms2_plot:
        # mass_features is initialised to an empty dict, so test for emptiness too.
        if not self.mass_features:
            raise ValueError(
                "mass_features not set, must run find_mass_features() first"
            )
        mf_df = self.mass_features_to_df()

    if ms2_plot and "ms2_spectrum" not in mf_df.columns:
        raise ValueError(
            "ms2_spectrum not set, must run add_associated_ms2_dda() first"
        )

    # Threshold and grid the unprocessed data.
    df = self._ms_unprocessed[1].copy()
    df = df.dropna(subset=["intensity"]).reset_index(drop=True)
    threshold = ph_int_min_thresh * df.intensity.max()
    df_thres = df[df["intensity"] > threshold].reset_index(drop=True).copy()
    df = self.grid_data(df_thres)

    # Attach scan times and bin the m/z axis for plotting.
    df = df.merge(self.scan_df[["scan", "scan_time"]], on="scan")
    mz_grid = np.arange(0, np.max(df.mz), binsize)
    mz_data = np.array(df.mz)
    df["mz_bin"] = find_closest(mz_grid, mz_data)
    # The "sum" alias avoids the pandas deprecation of the builtin ``sum``
    # callable in groupby.transform while computing the same grouped sum.
    df["ab_bin"] = df.groupby(["mz_bin", "scan_time"]).intensity.transform("sum")
    unproc_df = df[["scan_time", "mz_bin", "ab_bin"]].drop_duplicates(
        ignore_index=True
    )

    # NOTE(review): the original docstring said darker means more intense, but
    # 'Greys_r' maps high values to light colours. Confirm the intended look.
    relative_abundance = unproc_df.ab_bin / np.max(unproc_df.ab_bin)
    fig = plt.figure()
    plt.scatter(
        unproc_df.scan_time,
        unproc_df.mz_bin * binsize,
        c=relative_abundance,
        alpha=relative_abundance,
        cmap="Greys_r",
        s=1,
    )

    if mf_plot:
        if ms2_plot:
            without_ms2 = mf_df[mf_df.ms2_spectrum.isna()]
            plt.scatter(
                without_ms2.scan_time,
                without_ms2.mz,
                c="c",
                s=4,
                label="M/Z features without MS2",
            )
        else:
            plt.scatter(
                mf_df.scan_time,
                mf_df.mz,
                c="c",
                s=4,
                label="M/Z features",
            )

    if ms2_plot:
        with_ms2 = mf_df[~mf_df.ms2_spectrum.isna()]
        plt.scatter(
            with_ms2.scan_time,
            with_ms2.mz,
            c="r",
            s=2,
            label="M/Z features with MS2",
        )

    if mf_plot or ms2_plot:
        plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.25), ncol=2)
    plt.xlabel("Scan time")
    plt.ylabel("m/z")
    plt.ylim(0, np.ceil(np.max(df.mz)))
    plt.xlim(0, np.ceil(np.max(df.scan_time)))
    plt.title("Composite Feature Map")

    if return_fig:
        # Close so the figure is not also displayed by the pyplot state machine.
        plt.close(fig)
        return fig

    plt.show()
Returns a figure displaying (1) thresholded, unprocessed data (2) the m/z features (3) which m/z features are associated with MS2 spectra
Parameters
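- binsize (float): Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
- ph_int_min_thresh (float): Fraction of the maximum unprocessed intensity used as the plotting threshold; only points with intensity above this fraction of the maximum are drawn. Defaults to 0.001.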
- mf_plot (boolean): Indicates whether to plot the m/z features. Defaults to True.
- ms2_plot (boolean): Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
- return_fig (boolean): Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
Returns
- matplotlib.pyplot.Figure: A figure with the thresholded, unprocessed data on an axis of m/z value with respect to scan time. Unprocessed data is displayed in gray scale with darker colors indicating higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z features associated with MS2 spectra are plotted, they are displayed in red.
Raises
- Warning: If m/z features are set to be plotted but aren't in the dataset. If m/z features with associated MS2 data are set to be plotted but no MS2 annotations were found for the m/z features in the dataset.
def set_tic_list_from_data(self, overwrite=False):
    """Rebuild ``self.tic`` from the mass spectra stored in ``self._ms``.

    The TIC of each spectrum is read in the order given by
    ``self.scans_number``.

    Parameters
    -----------
    overwrite : bool, optional
        If True, replaces a TIC list that is already populated. Defaults to False.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the TIC list is already set and overwrite is False.
    """
    spectra_by_scan = self._ms
    if len(spectra_by_scan) == 0:
        raise ValueError("No mass spectra found in dataset")

    tic_already_set = len(self.tic) > 0
    if tic_already_set and not overwrite:
        raise ValueError("TIC list already set, use overwrite=True to overwrite")

    tic_values = []
    for scan_number in self.scans_number:
        tic_values.append(self._ms.get(scan_number).tic)
    self.tic = tic_values
Sets the TIC list from the mass spectrum objects within the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the TIC list if it is already set. Defaults to False.
Notes
If the _ms dictionary is incomplete, sets the TIC list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the TIC list is already set and overwrite is False.
def set_retention_time_from_data(self, overwrite=False):
    """Rebuild ``self.retention_time`` from the mass spectra in ``self._ms``.

    Retention times are collected in ascending order of the ``_ms`` keys.

    Parameters
    -----------
    overwrite : bool, optional
        If True, replaces a retention time list that is already populated.
        Defaults to False.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the retention time list is already set and overwrite is False.
    """
    if len(self._ms) == 0:
        raise ValueError("No mass spectra found in dataset")

    already_set = len(self.retention_time) > 0
    if already_set and not overwrite:
        raise ValueError(
            "Retention time list already set, use overwrite=True to overwrite"
        )

    self.retention_time = [
        self._ms.get(scan_key).retention_time for scan_key in sorted(self._ms.keys())
    ]
Sets the retention time list from the data in the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the retention time list if it is already set. Defaults to False.
Notes
If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the retention time list is already set and overwrite is False.
def set_scans_number_from_data(self, overwrite=False):
    """Rebuild ``self.scans_number`` as the sorted keys of ``self._ms``.

    Parameters
    -----------
    overwrite : bool, optional
        If True, replaces a scan number list that is already populated.
        Defaults to False.

    Raises
    ------
    ValueError
        If no mass spectra are found in the dataset.
        If the scan number list is already set and overwrite is False.
    """
    if len(self._ms) == 0:
        raise ValueError("No mass spectra found in dataset")

    already_set = len(self.scans_number) > 0
    if already_set and not overwrite:
        raise ValueError(
            "Scan number list already set, use overwrite=True to overwrite"
        )

    scan_numbers = list(self._ms.keys())
    scan_numbers.sort()
    self.scans_number = scan_numbers
Sets the scan number list from the data in the _ms dictionary.
Notes
If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the scan number list is already set and overwrite is False.
Inherited Members
- MassSpectraBase
- file_location
- analyzer
- instrument_label
- add_mass_spectrum
- add_mass_spectra
- get_time_of_scan_id
- scan_df
- ms
- corems.mass_spectra.calc.lc_calc.LCCalculations
- get_max_eic
- smooth_tic
- eic_centroid_detector
- find_nearest_scan
- add_peak_metrics
- get_average_mass_spectrum
- find_mass_features
- integrate_mass_features
- find_c13_mass_features
- deconvolute_ms1_mass_features