corems.mass_spectra.factory.lc_class
1from pathlib import Path 2 3import numpy as np 4import pandas as pd 5import warnings 6import matplotlib.pyplot as plt 7 8from corems.encapsulation.factory.parameters import LCMSParameters 9from corems.mass_spectra.calc.lc_calc import LCCalculations, PHCalculations 10from corems.molecular_id.search.lcms_spectral_search import LCMSSpectralSearch 11from corems.mass_spectrum.input.numpyArray import ms_from_array_profile, ms_from_array_centroid 12from corems.mass_spectra.calc.lc_calc import find_closest 13 14 15class MassSpectraBase: 16 """Base class for mass spectra objects. 17 18 Parameters 19 ----------- 20 file_location : str or Path 21 The location of the file containing the mass spectra data. 22 analyzer : str, optional 23 The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'. 24 instrument_label : str, optional 25 The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'. 26 sample_name : str, optional 27 The name of the sample; defaults to the file name if not provided to the parser. Defaults to None. 28 spectra_parser : object, optional 29 The spectra parser object used to create the mass spectra object. Defaults to None. 30 31 Attributes 32 ----------- 33 spectra_parser_class : class 34 The class of the spectra parser used to create the mass spectra object. 35 file_location : str or Path 36 The location of the file containing the mass spectra data. 37 sample_name : str 38 The name of the sample; defaults to the file name if not provided to the parser. 39 analyzer : str 40 The type of analyzer used to generate the mass spectra data. Derived from the spectra parser. 41 instrument_label : str 42 The type of instrument used to generate the mass spectra data. Derived from the spectra parser. 43 _scan_info : dict 44 A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, 45 scan text, and scan window (lower and upper). 46 Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame. 47 _ms : dict 48 A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary. 49 _ms_unprocessed: dictionary of pandas.DataFrames or None 50 A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking. 51 Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None. 52 53 Methods 54 -------- 55 * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True). 56 Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans 57 * get_time_of_scan_id(scan). 58 Returns the scan time for the specified scan number. 59 """ 60 61 def __init__( 62 self, 63 file_location, 64 analyzer="Unknown", 65 instrument_label="Unknown", 66 sample_name=None, 67 spectra_parser=None, 68 ): 69 if isinstance(file_location, str): 70 file_location = Path(file_location) 71 else: 72 file_location = file_location 73 if not file_location.exists(): 74 raise FileExistsError("File does not exist: " + str(file_location)) 75 76 if sample_name: 77 self.sample_name = sample_name 78 else: 79 self.sample_name = file_location.stem 80 81 self.file_location = file_location 82 self.analyzer = analyzer 83 self.instrument_label = instrument_label 84 85 # Add the spectra parser class to the object if it is not None 86 if spectra_parser is not None: 87 self.spectra_parser_class = spectra_parser.__class__ 88 # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not 89 if ( 90 self.sample_name is not None 91 and self.sample_name != self.spectra_parser.sample_name 92 ): 93 warnings.warn( 94 "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object", 95 UserWarning, 96 ) 97 if self.analyzer != self.spectra_parser.analyzer: 98 warnings.warn( 99 "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object", 100 UserWarning, 101 ) 102 if self.instrument_label != self.spectra_parser.instrument_label: 103 warnings.warn( 104 "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object", 105 UserWarning, 106 ) 107 if self.file_location != self.spectra_parser.file_location: 108 warnings.warn( 109 "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object", 110 UserWarning, 111 ) 112 113 # Instantiate empty dictionaries for scan information and mass spectra 114 self._scan_info = {} 115 self._ms = {} 116 self._ms_unprocessed = {} 117 118 @property 119 def spectra_parser(self): 120 """Returns an instance of the spectra parser class.""" 121 return self.spectra_parser_class(self.file_location) 122 123 def add_mass_spectrum(self, mass_spec): 124 """Adds a mass spectrum to the dataset. 125 126 Parameters 127 ----------- 128 mass_spec : MassSpectrum 129 The corems MassSpectrum object to be added to the dataset. 130 131 Notes 132 ----- 133 This is a helper function for the add_mass_spectra() method, and is not intended to be called directly. 134 """ 135 # check if mass_spec has a scan_number attribute 136 if not hasattr(mass_spec, "scan_number"): 137 raise ValueError( 138 "Mass spectrum must have a scan_number attribute to be added to the dataset correctly" 139 ) 140 self._ms[mass_spec.scan_number] = mass_spec 141 142 def add_mass_spectra( 143 self, 144 scan_list, 145 spectrum_mode=None, 146 ms_level=1, 147 use_parser=True, 148 auto_process=True, 149 ms_params=None, 150 ): 151 """Add mass spectra to _ms dictionary, from a list of scans or single scan 152 153 Notes 154 ----- 155 The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object. 156 157 158 Parameters 159 ----------- 160 scan_list : list of ints 161 List of scans to use to populate _ms slot 162 spectrum_mode : str or None 163 The spectrum mode to use for the mass spectra. 164 If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 165 Defaults to None. 166 ms_level : int, optional 167 The MS level to use for the mass spectra. 168 This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. 169 Defaults to 1. 170 using_parser : bool 171 Whether to use the mass spectra parser to get the mass spectra. Defaults to True. 172 auto_process : bool 173 Whether to auto-process the mass spectra. Defaults to True. 174 ms_params : MSParameters or None 175 The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters. 176 177 Raises 178 ------ 179 TypeError 180 If scan_list is not a list of ints 181 ValueError 182 If polarity is not 'positive' or 'negative' 183 If ms_level is not 1 or 2 184 """ 185 186 # check if scan_list is a list or a single int; if single int, convert to list 187 if isinstance(scan_list, int): 188 scan_list = [scan_list] 189 if not isinstance(scan_list, list): 190 raise TypeError("scan_list must be a list of integers") 191 for scan in scan_list: 192 if not isinstance(scan, int): 193 raise TypeError("scan_list must be a list of integers") 194 195 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 196 if self.polarity == "negative": 197 polarity = -1 198 elif self.polarity == "positive": 199 polarity = 1 200 else: 201 raise ValueError( 202 "Polarity not set for dataset, must be a either 'positive' or 'negative'" 203 ) 204 205 # is not using_parser, check that ms1 and ms2 are not None 206 if not use_parser: 207 if ms_level not in self._ms_unprocessed.keys(): 208 raise ValueError( 209 "ms_level {} not found in _ms_unprocessed dictionary".format( 210 ms_level 211 ) 212 ) 213 214 scan_list = list(set(scan_list)) 215 scan_list.sort() 216 if not use_parser: 217 if self._ms_unprocessed[ms_level] is None: 218 raise ValueError( 219 "No unprocessed data found for ms_level {}".format(ms_level) 220 ) 221 if ( 222 len( 223 np.setdiff1d( 224 scan_list, self._ms_unprocessed[ms_level].scan.tolist() 225 ) 226 ) 227 > 0 228 ): 229 raise ValueError( 230 "Not all scans in scan_list are present in the unprocessed data" 231 ) 232 # Prepare the ms_df for parsing 233 ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False) 234 235 if use_parser: 236 # Use batch function to get all mass spectra at once 237 if spectrum_mode is None: 238 # get spectrum mode from _scan_info for each scan 239 spectrum_modes = [self.scan_df.loc[scan, "ms_format"] for scan in scan_list] 240 spectrum_mode_batch = spectrum_modes[0] if len(set(spectrum_modes)) == 1 else None 241 else: 242 spectrum_mode_batch = spectrum_mode 243 244 ms_list = self.spectra_parser.get_mass_spectra_from_scan_list( 245 scan_list=scan_list, 246 spectrum_mode=spectrum_mode_batch, 247 auto_process=False, 248 ) 249 250 # Process each mass spectrum 251 for i, scan in enumerate(scan_list): 252 ms = ms_list[i] if i < len(ms_list) else None 253 if ms is not None: 254 if ms_params is not None: 255 ms.parameters = ms_params 256 ms.scan_number = scan 257 if auto_process: 258 ms.process_mass_spec() 259 self.add_mass_spectrum(ms) 260 else: 261 # Original non-parser logic remains unchanged 262 for scan in scan_list: 263 ms = None 264 if spectrum_mode is None: 265 # get spectrum mode from _scan_info 266 spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"] 267 else: 268 spectrum_mode_scan = spectrum_mode 269 270 my_ms_df = ms_df.loc[scan] 271 if spectrum_mode_scan == "profile": 272 # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum 273 ms = ms_from_array_profile( 274 my_ms_df.mz, 275 my_ms_df.intensity, 276 self.file_location, 277 polarity=polarity, 278 auto_process=False, 279 ) 280 else: 281 ms = ms_from_array_centroid( 282 mz = my_ms_df.mz, 283 abundance = my_ms_df.intensity, 284 rp = [np.nan] * len(my_ms_df.mz), 285 s2n = [np.nan] * len(my_ms_df.mz), 286 dataname = self.file_location, 287 polarity=polarity, 288 auto_process=False, 289 ) 290 291 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 292 if ms is not None: 293 if ms_params is not None: 294 ms.parameters = ms_params 295 ms.scan_number = scan 296 if auto_process: 297 ms.process_mass_spec() 298 self.add_mass_spectrum(ms) 299 300 def get_time_of_scan_id(self, scan): 301 """Returns the scan time for the specified scan number. 302 303 Parameters 304 ----------- 305 scan : int 306 The scan number of the desired scan time. 307 308 Returns 309 -------- 310 float 311 The scan time for the specified scan number (in minutes). 312 313 Raises 314 ------ 315 ValueError 316 If no scan time is found for the specified scan number. 317 """ 318 # Check if _retenion_time_list is empty and raise error if so 319 if len(self._retention_time_list) == 0: 320 raise ValueError("No retention times found in dataset") 321 rt = self._retention_time_list[self._scans_number_list.index(scan)] 322 return rt 323 324 @property 325 def scan_df(self): 326 """ 327 pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). 328 """ 329 scan_df = pd.DataFrame.from_dict(self._scan_info) 330 return scan_df 331 332 @property 333 def ms(self): 334 """ 335 dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles 336 """ 337 return self._ms 338 339 340 @scan_df.setter 341 def scan_df(self, df): 342 """ 343 Sets the scan data for the dataset. 344 345 Parameters 346 ----------- 347 df : pandas.DataFrame 348 A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level, 349 precursor m/z, scan text, and scan window (lower and upper). 350 """ 351 self._scan_info = df.to_dict() 352 353 def __getitem__(self, scan_number): 354 return self._ms.get(scan_number) 355 356 357class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch): 358 """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object. 359 360 This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method. 361 362 Parameters 363 ----------- 364 file_location : str or Path 365 The location of the file containing the mass spectra data. 366 analyzer : str, optional 367 The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'. 368 instrument_label : str, optional 369 The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'. 370 sample_name : str, optional 371 The name of the sample; defaults to the file name if not provided to the parser. Defaults to None. 372 spectra_parser : object, optional 373 The spectra parser object used to create the mass spectra object. Defaults to None. 374 375 Attributes 376 ----------- 377 polarity : str 378 The polarity of the ionization mode used for the dataset. 379 _parameters : LCMSParameters 380 The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters. 381 _retention_time_list : numpy.ndarray 382 An array of retention times for the dataset. 383 _scans_number_list : list 384 A list of scan numbers for the dataset. 385 _tic_list : numpy.ndarray 386 An array of total ion current (TIC) values for the dataset. 387 eics : dict 388 A dictionary containing extracted ion chromatograms (EICs) for the dataset. 389 Key is the mz of the EIC. Initialized as an empty dictionary. 390 mass_features : dictionary of LCMSMassFeature objects 391 A dictionary containing mass features for the dataset. 392 Key is mass feature ID. Initialized as an empty dictionary. 393 spectral_search_results : dictionary of MS2SearchResults objects 394 A dictionary containing spectral search results for the dataset. 395 Key is scan number : precursor mz. Initialized as an empty dictionary. 396 397 Methods 398 -------- 399 * get_parameters_json(). 400 Returns the parameters used for the LC-MS analysis in JSON format. 401 * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True) 402 Adds which MS2 scans are associated with each mass feature to the 403 mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary. 404 * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) 405 Adds the MS1 spectra associated with each mass feature to the 406 mass_features dictionary and adds the MS1 spectra to the _ms dictionary. 407 * mass_features_to_df() 408 Returns a pandas dataframe summarizing the mass features in the dataset. 409 * set_tic_list_from_data(overwrite=False) 410 Sets the TIC list from the mass spectrum objects within the _ms dictionary. 411 * set_retention_time_from_data(overwrite=False) 412 Sets the retention time list from the data in the _ms dictionary. 413 * set_scans_number_from_data(overwrite=False) 414 Sets the scan number list from the data in the _ms dictionary. 415 * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) 416 Generates plot of M/Z features comparing scan time vs M/Z value 417 """ 418 419 def __init__( 420 self, 421 file_location, 422 analyzer="Unknown", 423 instrument_label="Unknown", 424 sample_name=None, 425 spectra_parser=None, 426 ): 427 super().__init__( 428 file_location, analyzer, instrument_label, sample_name, spectra_parser 429 ) 430 self.polarity = "" 431 self._parameters = LCMSParameters() 432 self._retention_time_list = [] 433 self._scans_number_list = [] 434 self._tic_list = [] 435 self.eics = {} 436 self.mass_features = {} 437 self.spectral_search_results = {} 438 439 def get_parameters_json(self): 440 """Returns the parameters stored for the LC-MS object in JSON format. 441 442 Returns 443 -------- 444 str 445 The parameters used for the LC-MS analysis in JSON format. 446 """ 447 return self.parameters.to_json() 448 449 def remove_unprocessed_data(self, ms_level=None): 450 """Removes the unprocessed data from the LCMSBase object. 451 452 Parameters 453 ----------- 454 ms_level : int, optional 455 The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels. 456 457 Raises 458 ------ 459 ValueError 460 If ms_level is not 1 or 2. 461 462 Notes 463 ----- 464 This method is useful for freeing up memory after the data has been processed. 465 """ 466 if ms_level is None: 467 for ms_level in self._ms_unprocessed.keys(): 468 self._ms_unprocessed[ms_level] = None 469 if ms_level not in [1, 2]: 470 raise ValueError("ms_level must be 1 or 2") 471 self._ms_unprocessed[ms_level] = None 472 473 def add_associated_ms2_dda( 474 self, 475 auto_process=True, 476 use_parser=True, 477 spectrum_mode=None, 478 ms_params_key="ms2", 479 scan_filter=None, 480 ): 481 """Add MS2 spectra associated with mass features to the dataset. 482 483 Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject) 484 485 Parameters 486 ----------- 487 auto_process : bool, optional 488 If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True. 489 use_parser : bool, optional 490 If True, envoke the spectra parser to get the MS2 spectra. Default is True. 491 spectrum_mode : str or None, optional 492 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 493 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 494 Defaults to None. (faster if defined, otherwise will check each scan) 495 ms_params_key : string, optional 496 The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. 497 Defaults to 'ms2'. 498 scan_filter : str 499 A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. 500 "hcd" will pull out only HCD scans. 501 502 Raises 503 ------ 504 ValueError 505 If mass_features is not set, must run find_mass_features() first. 506 If no MS2 scans are found in the dataset. 507 If no precursor m/z values are found in MS2 scans, not a DDA dataset. 508 """ 509 # Check if mass_features is set, raise error if not 510 if self.mass_features is None: 511 raise ValueError( 512 "mass_features not set, must run find_mass_features() first" 513 ) 514 515 # reconfigure ms_params to get the correct mass spectrum parameters from the key 516 ms_params = self.parameters.mass_spectrum[ms_params_key] 517 518 mf_df = self.mass_features_to_df().copy() 519 # Find ms2 scans that have a precursor m/z value 520 ms2_scans = self.scan_df[self.scan_df.ms_level == 2] 521 ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()] 522 # drop ms2 scans that have no tic 523 ms2_scans = ms2_scans[ms2_scans.tic > 0] 524 if ms2_scans is None: 525 raise ValueError("No DDA scans found in dataset") 526 527 if scan_filter is not None: 528 ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)] 529 # set tolerance in rt space (in minutes) and mz space (in daltons) 530 time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance 531 mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance 532 533 # for each mass feature, find the ms2 scans that are within the roi scan time and mz range 534 dda_scans = [] 535 for i, row in mf_df.iterrows(): 536 ms2_scans_filtered = ms2_scans[ 537 ms2_scans.scan_time.between( 538 row.scan_time - time_tol, row.scan_time + time_tol 539 ) 540 ] 541 ms2_scans_filtered = ms2_scans_filtered[ 542 ms2_scans_filtered.precursor_mz.between( 543 row.mz - mz_tol, row.mz + mz_tol 544 ) 545 ] 546 dda_scans = dda_scans + ms2_scans_filtered.scan.tolist() 547 self.mass_features[i].ms2_scan_numbers = ( 548 ms2_scans_filtered.scan.tolist() 549 + self.mass_features[i].ms2_scan_numbers 550 ) 551 # add to _ms attribute 552 self.add_mass_spectra( 553 scan_list=list(set(dda_scans)), 554 auto_process=auto_process, 555 spectrum_mode=spectrum_mode, 556 use_parser=use_parser, 557 ms_params=ms_params, 558 ) 559 # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute 560 for mf_id in self.mass_features: 561 if self.mass_features[mf_id].ms2_scan_numbers is not None: 562 for dda_scan in self.mass_features[mf_id].ms2_scan_numbers: 563 if dda_scan in self._ms.keys(): 564 self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[ 565 dda_scan 566 ] 567 568 def add_associated_ms1( 569 self, auto_process=True, use_parser=True, spectrum_mode=None 570 ): 571 """Add MS1 spectra associated with mass features to the dataset. 572 573 Parameters 574 ----------- 575 auto_process : bool, optional 576 If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True. 577 use_parser : bool, optional 578 If True, envoke the spectra parser to get the MS1 spectra. Default is True. 579 spectrum_mode : str or None, optional 580 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 581 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 582 Defaults to None. (faster if defined, otherwise will check each scan) 583 584 Raises 585 ------ 586 ValueError 587 If mass_features is not set, must run find_mass_features() first. 588 If apex scans are not profile mode, all apex scans must be profile mode for averaging. 589 If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). 590 If deconvolute is True and no EICs are found, did you run integrate_mass_features() first? 591 """ 592 # Check if mass_features is set, raise error if not 593 if self.mass_features is None: 594 raise ValueError( 595 "mass_features not set, must run find_mass_features() first" 596 ) 597 scans_to_average = self.parameters.lc_ms.ms1_scans_to_average 598 599 if scans_to_average == 1: 600 # Add to LCMSobj 601 self.add_mass_spectra( 602 scan_list=[ 603 int(mf.apex_scan) for mf in self.mass_features.values() 604 ], 605 auto_process=auto_process, 606 use_parser=use_parser, 607 spectrum_mode=spectrum_mode, 608 ms_params=self.parameters.mass_spectrum["ms1"], 609 ) 610 611 elif ( 612 (scans_to_average - 1) % 2 613 ) == 0: # scans_to_average = 3, 5, 7 etc, mirror l/r around apex 614 apex_scans = list(set([int(mf.apex_scan) for mf in self.mass_features.values()])) 615 # Check if all apex scans are profile mode, raise error if not 616 if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"): 617 raise ValueError("All apex scans must be profile mode for averaging") 618 619 # First get sets of scans to average 620 def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average): 621 ms1_idx_start = ms1_scans.index(apex_scan) - int( 622 (scans_to_average - 1) / 2 623 ) 624 if ms1_idx_start < 0: 625 ms1_idx_start = 0 626 ms1_idx_end = ( 627 ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1 628 ) 629 if ms1_idx_end > (len(ms1_scans) - 1): 630 ms1_idx_end = len(ms1_scans) - 1 631 scan_list = ms1_scans[ms1_idx_start:ms1_idx_end] 632 return scan_list 633 634 ms1_scans = self.ms1_scans 635 scans_lists = [ 636 get_scans_from_apex(ms1_scans, apex_scan, scans_to_average) 637 for apex_scan in apex_scans 638 ] 639 640 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 641 if self.polarity == "negative": 642 polarity = -1 643 elif self.polarity == "positive": 644 polarity = 1 645 646 if not use_parser: 647 # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once) 648 ms1_unprocessed = self._ms_unprocessed[1].copy() 649 # Set the index on _ms_unprocessed[1] to scan number 650 ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False) 651 self._ms_unprocessed[1] = ms1_unprocessed 652 653 # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1] 654 scans_lists_flat = list( 655 set([scan for sublist in scans_lists for scan in sublist]) 656 ) 657 if ( 658 len( 659 np.setdiff1d( 660 np.sort(scans_lists_flat), 661 np.sort(ms1_unprocessed.index.values), 662 ) 663 ) 664 > 0 665 ): 666 raise ValueError( 667 "Not all scans to average are present in the unprocessed data" 668 ) 669 670 for scan_list_average, apex_scan in zip(scans_lists, apex_scans): 671 # Get unprocessed mass spectrum from scans 672 ms = self.get_average_mass_spectrum( 673 scan_list=scan_list_average, 674 apex_scan=apex_scan, 675 spectrum_mode="profile", 676 ms_level=1, 677 auto_process=auto_process, 678 use_parser=use_parser, 679 perform_checks=False, 680 polarity=polarity, 681 ms_params=self.parameters.mass_spectrum["ms1"], 682 ) 683 # Add mass spectrum to LCMS object and associated with mass feature 684 self.add_mass_spectrum(ms) 685 686 if not use_parser: 687 # Reset the index on _ms_unprocessed[1] to not be scan number 688 ms1_unprocessed = ms1_unprocessed.reset_index(drop=True) 689 self._ms_unprocessed[1] = ms1_unprocessed 690 else: 691 raise ValueError( 692 "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)" 693 ) 694 695 # Associate the ms1 spectra with the mass features 696 for mf_id in self.mass_features: 697 self.mass_features[mf_id].mass_spectrum = self._ms[ 698 self.mass_features[mf_id].apex_scan 699 ] 700 self.mass_features[mf_id].update_mz() 701 702 def mass_features_to_df(self): 703 """Returns a pandas dataframe summarizing the mass features. 704 705 The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, 706 persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID). 707 708 Returns 709 -------- 710 pandas.DataFrame 711 A pandas dataframe of mass features with the following columns: 712 mf_id, mz, apex_scan, scan_time, intensity, persistence, area. 713 """ 714 715 def mass_spectrum_to_string( 716 mass_spec, normalize=True, min_normalized_abun=0.01 717 ): 718 """Converts a mass spectrum to a string of m/z:abundance pairs. 719 720 Parameters 721 ----------- 722 mass_spec : MassSpectrum 723 A MassSpectrum object to be converted to a string. 724 normalize : bool, optional 725 If True, normalizes the abundance values to a maximum of 1. Defaults to True. 726 min_normalized_abun : float, optional 727 The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01. 728 729 Returns 730 -------- 731 str 732 A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon. 733 """ 734 mz_np = mass_spec.to_dataframe()["m/z"].values 735 abun_np = mass_spec.to_dataframe()["Peak Height"].values 736 if normalize: 737 abun_np = abun_np / abun_np.max() 738 mz_abun = np.column_stack((mz_np, abun_np)) 739 if normalize: 740 mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun] 741 mz_abun_str = [ 742 str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2)) 743 for mz, abun in mz_abun 744 ] 745 return "; ".join(mz_abun_str) 746 747 cols_in_df = [ 748 "id", 749 "apex_scan", 750 "start_scan", 751 "final_scan", 752 "retention_time", 753 "intensity", 754 "persistence", 755 "area", 756 "dispersity_index", 757 "normalized_dispersity_index", 758 "tailing_factor", 759 "gaussian_similarity", 760 "noise_score", 761 "noise_score_min", 762 "noise_score_max", 763 "monoisotopic_mf_id", 764 "isotopologue_type", 765 "mass_spectrum_deconvoluted_parent", 766 ] 767 df_mf_list = [] 768 for mf_id in self.mass_features.keys(): 769 # Find cols_in_df that are in single_mf 770 df_keys = list( 771 set(cols_in_df).intersection(self.mass_features[mf_id].__dir__()) 772 ) 773 dict_mf = {} 774 # Get the values for each key in df_keys from the mass feature object 775 for key in df_keys: 776 dict_mf[key] = getattr(self.mass_features[mf_id], key) 777 # Special handling for mass_spectrum and associated_mass_features_deconvoluted, since they are not single values 778 if len(self.mass_features[mf_id].ms2_scan_numbers) > 0: 779 # Add MS2 spectra info 780 best_ms2_spectrum = self.mass_features[mf_id].best_ms2 781 dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum) 782 if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0: 783 dict_mf["associated_mass_features"] = ", ".join( 784 map( 785 str, 786 self.mass_features[mf_id].associated_mass_features_deconvoluted, 787 ) 788 ) 789 # Check if EIC for mass feature is set 790 df_mf_single = pd.DataFrame(dict_mf, index=[mf_id]) 791 df_mf_single["mz"] = self.mass_features[mf_id].mz 792 df_mf_list.append(df_mf_single) 793 df_mf = pd.concat(df_mf_list) 794 795 # rename _area to area and id to mf_id 796 df_mf = df_mf.rename( 797 columns={ 798 "id": "mf_id", 799 "retention_time": "scan_time", 800 } 801 ) 802 803 # reorder columns 804 col_order = [ 805 "mf_id", 806 "scan_time", 807 "mz", 808 "apex_scan", 809 "start_scan", 810 "final_scan", 811 "intensity", 812 "persistence", 813 "area", 814 "half_height_width", 815 "tailing_factor", 816 "dispersity_index", 817 "normalized_dispersity_index", 818 "gaussian_similarity", 819 "noise_score", 820 "noise_score_min", 821 "noise_score_max", 822 "monoisotopic_mf_id", 823 "isotopologue_type", 824 "mass_spectrum_deconvoluted_parent", 825 "associated_mass_features", 826 "ms2_spectrum", 827 ] 828 # drop columns that are not in col_order 829 cols_to_order = [col for col in col_order if col in df_mf.columns] 830 df_mf = df_mf[cols_to_order] 831 832 # reset index to mf_id 833 df_mf = df_mf.set_index("mf_id") 834 df_mf.index.name = "mf_id" 835 836 return df_mf 837 838 def mass_features_ms1_annot_to_df(self): 839 """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset. 840 841 Returns 842 -------- 843 pandas.DataFrame 844 A pandas dataframe of MS1 annotations for the mass features in the dataset. 845 The index is set to mf_id (mass feature ID) 846 847 Raises 848 ------ 849 Warning 850 If no MS1 annotations were found for the mass features in the dataset. 851 """ 852 annot_df_list_ms1 = [] 853 for mf_id in self.mass_features.keys(): 854 if self.mass_features[mf_id].mass_spectrum is None: 855 pass 856 else: 857 # Add ms1 annotations to ms1 annotation list 858 if ( 859 np.abs( 860 ( 861 self.mass_features[mf_id].ms1_peak.mz_exp 862 - self.mass_features[mf_id].mz 863 ) 864 ) 865 < 0.01 866 ): 867 # Get the molecular formula from the mass spectrum 868 annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe() 869 # Subset to pull out only the peak associated with the mass feature 870 annot_df = annot_df[ 871 annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index 872 ].copy() 873 874 # If there are more than 1 row, remove any rows without a molecular formula 875 if len(annot_df) > 1: 876 annot_df = annot_df[~annot_df["Molecular Formula"].isna()] 877 878 # Remove the index column and add column for mf_id 879 annot_df = annot_df.drop(columns=["Index"]) 880 annot_df["mf_id"] = mf_id 881 annot_df_list_ms1.append(annot_df) 882 883 if len(annot_df_list_ms1) > 0: 884 annot_ms1_df_full = pd.concat(annot_df_list_ms1) 885 annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id") 886 annot_ms1_df_full.index.name = "mf_id" 887 888 else: 889 annot_ms1_df_full = None 890 # Warn that no ms1 annotations were found 891 warnings.warn( 892 "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?", 893 UserWarning, 894 ) 895 896 return annot_ms1_df_full 897 898 def mass_features_ms2_annot_to_df(self, molecular_metadata=None): 899 """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset. 900 901 Parameters 902 ----------- 903 molecular_metadata : dict of MolecularMetadata objects 904 A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None. 905 906 Returns 907 -------- 908 pandas.DataFrame 909 A pandas dataframe of MS2 annotations for the mass features in the dataset, 910 and optionally molecular metadata. The index is set to mf_id (mass feature ID) 911 912 Raises 913 ------ 914 Warning 915 If no MS2 annotations were found for the mass features in the dataset. 916 """ 917 annot_df_list_ms2 = [] 918 for mf_id in self.mass_features.keys(): 919 if len(self.mass_features[mf_id].ms2_similarity_results) > 0: 920 # Add ms2 annotations to ms2 annotation list 921 for result in self.mass_features[mf_id].ms2_similarity_results: 922 annot_df_ms2 = result.to_dataframe() 923 annot_df_ms2["mf_id"] = mf_id 924 annot_df_list_ms2.append(annot_df_ms2) 925 926 if len(annot_df_list_ms2) > 0: 927 annot_ms2_df_full = pd.concat(annot_df_list_ms2) 928 if molecular_metadata is not None: 929 molecular_metadata_df = pd.concat( 930 [ 931 pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() 932 for k, v in molecular_metadata.items() 933 ], 934 ignore_index=True, 935 ) 936 molecular_metadata_df = molecular_metadata_df.rename( 937 columns={"id": "ref_mol_id"} 938 ) 939 annot_ms2_df_full = annot_ms2_df_full.merge( 940 molecular_metadata_df, on="ref_mol_id", how="left" 941 ) 942 annot_ms2_df_full = annot_ms2_df_full.drop_duplicates( 943 subset=["mf_id", "query_spectrum_id", "ref_ms_id"] 944 ).copy() 945 annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id") 946 annot_ms2_df_full.index.name = "mf_id" 947 else: 948 annot_ms2_df_full = None 949 # Warn that no ms2 annotations were found 950 warnings.warn( 951 "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?", 952 UserWarning, 953 ) 954 955 return annot_ms2_df_full 956 957 def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False): 958 """Returns a figure displaying 959 (1) thresholded, unprocessed data 960 (2) the m/z features 961 (3) which m/z features are associated with MS2 spectra 962 963 Parameters 964 ----------- 965 binsize : float 966 Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4. 967 mf_plot : boolean 968 Indicates whether to plot the m/z features. Defaults to True. 969 ms2_plot : boolean 970 Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True. 971 return_fig : boolean 972 Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False. 973 974 Returns 975 -------- 976 matplotlib.pyplot.Figure 977 A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 978 scan time. Unprocessed data is displayed in gray scale with darker colors indicating 979 higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z 980 features with associated with MS2 spectra are plotted, they are displayed in red. 981 982 Raises 983 ------ 984 Warning 985 If m/z features are set to be plot but aren't in the dataset. 986 If m/z features with associated MS2 data are set to be plot but no MS2 annotations 987 were found for the m/z features in the dataset. 988 """ 989 if mf_plot: 990 # Check if mass_features is set, raise error if not 991 if self.mass_features is None: 992 raise ValueError( 993 "mass_features not set, must run find_mass_features() first" 994 ) 995 ## call mass feature data 996 mf_df = self.mass_features_to_df() 997 998 if ms2_plot: 999 if not mf_plot: 1000 # Check if mass_features is set, raise error if not 1001 if self.mass_features is None: 1002 raise ValueError( 1003 "mass_features not set, must run find_mass_features() first" 1004 ) 1005 1006 ## call m/z feature data 1007 mf_df = self.mass_features_to_df() 1008 1009 # Check if ms2_spectrum is set, raise error if not 1010 if 'ms2_spectrum' not in mf_df.columns: 1011 raise ValueError( 1012 "ms2_spectrum not set, must run add_associated_ms2_dda() first" 1013 ) 1014 1015 ## threshold and grid unprocessed data 1016 df = self._ms_unprocessed[1].copy() 1017 df = df.dropna(subset=['intensity']).reset_index(drop = True) 1018 threshold = ph_int_min_thresh * df.intensity.max() 1019 df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy() 1020 df = self.grid_data(df_thres) 1021 1022 ## format unprocessed data for plotting 1023 df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan') 1024 mz_grid = np.arange(0, np.max(df.mz), binsize) 1025 mz_data = np.array(df.mz) 1026 df['mz_bin'] = find_closest(mz_grid, mz_data) 1027 df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum) 1028 unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True) 1029 1030 ## generate figure 1031 fig = plt.figure() 1032 plt.scatter( 1033 unproc_df.scan_time, 1034 unproc_df.mz_bin*binsize, 1035 c = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1036 alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1037 cmap = 'Greys_r', 1038 s = 1 1039 ) 1040 1041 if mf_plot: 1042 if ms2_plot: 1043 plt.scatter( 1044 mf_df[mf_df.ms2_spectrum.isna()].scan_time, 1045 mf_df[mf_df.ms2_spectrum.isna()].mz, 1046 c = 'c', 1047 s = 4, 1048 label = 'M/Z features without MS2' 1049 ) 1050 else: 1051 plt.scatter( 1052 mf_df.scan_time, 1053 mf_df.mz, 1054 c = 'c', 1055 s = 4, 1056 label = 'M/Z features' 1057 ) 1058 1059 if ms2_plot: 1060 plt.scatter( 1061 mf_df[~mf_df.ms2_spectrum.isna()].scan_time, 1062 mf_df[~mf_df.ms2_spectrum.isna()].mz, 1063 c = 'r', 1064 s = 2, 1065 label = 'M/Z features with MS2' 1066 ) 1067 1068 if mf_plot == True or ms2_plot == True: 1069 plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2) 1070 plt.xlabel('Scan time') 1071 plt.ylabel('m/z') 1072 plt.ylim(0, np.ceil(np.max(df.mz))) 1073 plt.xlim(0, np.ceil(np.max(df.scan_time))) 1074 plt.title('Composite Feature Map') 1075 1076 if return_fig: 1077 plt.close(fig) 1078 return fig 1079 1080 else: 1081 plt.show() 1082 1083 def __len__(self): 1084 """ 1085 Returns the number of mass spectra in the dataset. 1086 1087 Returns 1088 -------- 1089 int 1090 The number of mass spectra in the dataset. 1091 """ 1092 return len(self._ms) 1093 1094 def __getitem__(self, scan_number): 1095 """ 1096 Returns the mass spectrum corresponding to the specified scan number. 1097 1098 Parameters 1099 ----------- 1100 scan_number : int 1101 The scan number of the desired mass spectrum. 1102 1103 Returns 1104 -------- 1105 MassSpectrum 1106 The mass spectrum corresponding to the specified scan number. 1107 """ 1108 return self._ms.get(scan_number) 1109 1110 def __iter__(self): 1111 """Returns an iterator over the mass spectra in the dataset. 1112 1113 Returns 1114 -------- 1115 iterator 1116 An iterator over the mass spectra in the dataset. 1117 """ 1118 return iter(self._ms.values()) 1119 1120 def set_tic_list_from_data(self, overwrite=False): 1121 """Sets the TIC list from the mass spectrum objects within the _ms dictionary. 1122 1123 Parameters 1124 ----------- 1125 overwrite : bool, optional 1126 If True, overwrites the TIC list if it is already set. Defaults to False. 1127 1128 Notes 1129 ----- 1130 If the _ms dictionary is incomplete, sets the TIC list to an empty list. 1131 1132 Raises 1133 ------ 1134 ValueError 1135 If no mass spectra are found in the dataset. 1136 If the TIC list is already set and overwrite is False. 1137 """ 1138 # Check if _ms is empty and raise error if so 1139 if len(self._ms) == 0: 1140 raise ValueError("No mass spectra found in dataset") 1141 1142 # Check if tic_list is already set and raise error if so 1143 if len(self.tic) > 0 and not overwrite: 1144 raise ValueError("TIC list already set, use overwrite=True to overwrite") 1145 1146 self.tic = [self._ms.get(i).tic for i in self.scans_number] 1147 1148 def set_retention_time_from_data(self, overwrite=False): 1149 """Sets the retention time list from the data in the _ms dictionary. 1150 1151 Parameters 1152 ----------- 1153 overwrite : bool, optional 1154 If True, overwrites the retention time list if it is already set. Defaults to False. 1155 1156 Notes 1157 ----- 1158 If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list. 1159 1160 Raises 1161 ------ 1162 ValueError 1163 If no mass spectra are found in the dataset. 1164 If the retention time list is already set and overwrite is False. 1165 """ 1166 # Check if _ms is empty and raise error if so 1167 if len(self._ms) == 0: 1168 raise ValueError("No mass spectra found in dataset") 1169 1170 # Check if retention_time_list is already set and raise error if so 1171 if len(self.retention_time) > 0 and not overwrite: 1172 raise ValueError( 1173 "Retention time list already set, use overwrite=True to overwrite" 1174 ) 1175 1176 retention_time_list = [] 1177 for key_ms in sorted(self._ms.keys()): 1178 retention_time_list.append(self._ms.get(key_ms).retention_time) 1179 self.retention_time = retention_time_list 1180 1181 def set_scans_number_from_data(self, overwrite=False): 1182 """Sets the scan number list from the data in the _ms dictionary. 1183 1184 Notes 1185 ----- 1186 If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list. 1187 1188 Raises 1189 ------ 1190 ValueError 1191 If no mass spectra are found in the dataset. 1192 If the scan number list is already set and overwrite is False. 1193 """ 1194 # Check if _ms is empty and raise error if so 1195 if len(self._ms) == 0: 1196 raise ValueError("No mass spectra found in dataset") 1197 1198 # Check if scans_number_list is already set and raise error if so 1199 if len(self.scans_number) > 0 and not overwrite: 1200 raise ValueError( 1201 "Scan number list already set, use overwrite=True to overwrite" 1202 ) 1203 1204 self.scans_number = sorted(self._ms.keys()) 1205 1206 @property 1207 def ms1_scans(self): 1208 """ 1209 list : A list of MS1 scan numbers for the dataset. 1210 """ 1211 return self.scan_df[self.scan_df.ms_level == 1].index.tolist() 1212 1213 @property 1214 def parameters(self): 1215 """ 1216 LCMSParameters : The parameters used for the LC-MS analysis. 1217 """ 1218 return self._parameters 1219 1220 @parameters.setter 1221 def parameters(self, paramsinstance): 1222 """ 1223 Sets the parameters used for the LC-MS analysis. 1224 1225 Parameters 1226 ----------- 1227 paramsinstance : LCMSParameters 1228 The parameters used for the LC-MS analysis. 1229 """ 1230 self._parameters = paramsinstance 1231 1232 @property 1233 def scans_number(self): 1234 """ 1235 list : A list of scan numbers for the dataset. 1236 """ 1237 return self._scans_number_list 1238 1239 @scans_number.setter 1240 def scans_number(self, scan_numbers_list): 1241 """ 1242 Sets the scan numbers for the dataset. 1243 1244 Parameters 1245 ----------- 1246 scan_numbers_list : list 1247 A list of scan numbers for the dataset. 1248 """ 1249 self._scans_number_list = scan_numbers_list 1250 1251 @property 1252 def retention_time(self): 1253 """ 1254 numpy.ndarray : An array of retention times for the dataset. 1255 """ 1256 return self._retention_time_list 1257 1258 @retention_time.setter 1259 def retention_time(self, rt_list): 1260 """ 1261 Sets the retention times for the dataset. 1262 1263 Parameters 1264 ----------- 1265 rt_list : list 1266 A list of retention times for the dataset. 1267 """ 1268 self._retention_time_list = np.array(rt_list) 1269 1270 @property 1271 def tic(self): 1272 """ 1273 numpy.ndarray : An array of TIC values for the dataset. 1274 """ 1275 return self._tic_list 1276 1277 @tic.setter 1278 def tic(self, tic_list): 1279 """ 1280 Sets the TIC values for the dataset. 1281 1282 Parameters 1283 ----------- 1284 tic_list : list 1285 A list of TIC values for the dataset. 1286 """ 1287 self._tic_list = np.array(tic_list)
16class MassSpectraBase: 17 """Base class for mass spectra objects. 18 19 Parameters 20 ----------- 21 file_location : str or Path 22 The location of the file containing the mass spectra data. 23 analyzer : str, optional 24 The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'. 25 instrument_label : str, optional 26 The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'. 27 sample_name : str, optional 28 The name of the sample; defaults to the file name if not provided to the parser. Defaults to None. 29 spectra_parser : object, optional 30 The spectra parser object used to create the mass spectra object. Defaults to None. 31 32 Attributes 33 ----------- 34 spectra_parser_class : class 35 The class of the spectra parser used to create the mass spectra object. 36 file_location : str or Path 37 The location of the file containing the mass spectra data. 38 sample_name : str 39 The name of the sample; defaults to the file name if not provided to the parser. 40 analyzer : str 41 The type of analyzer used to generate the mass spectra data. Derived from the spectra parser. 42 instrument_label : str 43 The type of instrument used to generate the mass spectra data. Derived from the spectra parser. 44 _scan_info : dict 45 A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, 46 scan text, and scan window (lower and upper). 47 Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame. 48 _ms : dict 49 A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary. 50 _ms_unprocessed: dictionary of pandas.DataFrames or None 51 A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking. 52 Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None. 53 54 Methods 55 -------- 56 * add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True). 57 Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans 58 * get_time_of_scan_id(scan). 59 Returns the scan time for the specified scan number. 60 """ 61 62 def __init__( 63 self, 64 file_location, 65 analyzer="Unknown", 66 instrument_label="Unknown", 67 sample_name=None, 68 spectra_parser=None, 69 ): 70 if isinstance(file_location, str): 71 file_location = Path(file_location) 72 else: 73 file_location = file_location 74 if not file_location.exists(): 75 raise FileExistsError("File does not exist: " + str(file_location)) 76 77 if sample_name: 78 self.sample_name = sample_name 79 else: 80 self.sample_name = file_location.stem 81 82 self.file_location = file_location 83 self.analyzer = analyzer 84 self.instrument_label = instrument_label 85 86 # Add the spectra parser class to the object if it is not None 87 if spectra_parser is not None: 88 self.spectra_parser_class = spectra_parser.__class__ 89 # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not 90 if ( 91 self.sample_name is not None 92 and self.sample_name != self.spectra_parser.sample_name 93 ): 94 warnings.warn( 95 "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object", 96 UserWarning, 97 ) 98 if self.analyzer != self.spectra_parser.analyzer: 99 warnings.warn( 100 "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object", 101 UserWarning, 102 ) 103 if self.instrument_label != self.spectra_parser.instrument_label: 104 warnings.warn( 105 "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object", 106 UserWarning, 107 ) 108 if self.file_location != self.spectra_parser.file_location: 109 warnings.warn( 110 "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object", 111 UserWarning, 112 ) 113 114 # Instantiate empty dictionaries for scan information and mass spectra 115 self._scan_info = {} 116 self._ms = {} 117 self._ms_unprocessed = {} 118 119 @property 120 def spectra_parser(self): 121 """Returns an instance of the spectra parser class.""" 122 return self.spectra_parser_class(self.file_location) 123 124 def add_mass_spectrum(self, mass_spec): 125 """Adds a mass spectrum to the dataset. 126 127 Parameters 128 ----------- 129 mass_spec : MassSpectrum 130 The corems MassSpectrum object to be added to the dataset. 131 132 Notes 133 ----- 134 This is a helper function for the add_mass_spectra() method, and is not intended to be called directly. 135 """ 136 # check if mass_spec has a scan_number attribute 137 if not hasattr(mass_spec, "scan_number"): 138 raise ValueError( 139 "Mass spectrum must have a scan_number attribute to be added to the dataset correctly" 140 ) 141 self._ms[mass_spec.scan_number] = mass_spec 142 143 def add_mass_spectra( 144 self, 145 scan_list, 146 spectrum_mode=None, 147 ms_level=1, 148 use_parser=True, 149 auto_process=True, 150 ms_params=None, 151 ): 152 """Add mass spectra to _ms dictionary, from a list of scans or single scan 153 154 Notes 155 ----- 156 The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object. 157 158 159 Parameters 160 ----------- 161 scan_list : list of ints 162 List of scans to use to populate _ms slot 163 spectrum_mode : str or None 164 The spectrum mode to use for the mass spectra. 165 If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 166 Defaults to None. 167 ms_level : int, optional 168 The MS level to use for the mass spectra. 169 This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. 170 Defaults to 1. 171 using_parser : bool 172 Whether to use the mass spectra parser to get the mass spectra. Defaults to True. 173 auto_process : bool 174 Whether to auto-process the mass spectra. Defaults to True. 175 ms_params : MSParameters or None 176 The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters. 177 178 Raises 179 ------ 180 TypeError 181 If scan_list is not a list of ints 182 ValueError 183 If polarity is not 'positive' or 'negative' 184 If ms_level is not 1 or 2 185 """ 186 187 # check if scan_list is a list or a single int; if single int, convert to list 188 if isinstance(scan_list, int): 189 scan_list = [scan_list] 190 if not isinstance(scan_list, list): 191 raise TypeError("scan_list must be a list of integers") 192 for scan in scan_list: 193 if not isinstance(scan, int): 194 raise TypeError("scan_list must be a list of integers") 195 196 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 197 if self.polarity == "negative": 198 polarity = -1 199 elif self.polarity == "positive": 200 polarity = 1 201 else: 202 raise ValueError( 203 "Polarity not set for dataset, must be a either 'positive' or 'negative'" 204 ) 205 206 # is not using_parser, check that ms1 and ms2 are not None 207 if not use_parser: 208 if ms_level not in self._ms_unprocessed.keys(): 209 raise ValueError( 210 "ms_level {} not found in _ms_unprocessed dictionary".format( 211 ms_level 212 ) 213 ) 214 215 scan_list = list(set(scan_list)) 216 scan_list.sort() 217 if not use_parser: 218 if self._ms_unprocessed[ms_level] is None: 219 raise ValueError( 220 "No unprocessed data found for ms_level {}".format(ms_level) 221 ) 222 if ( 223 len( 224 np.setdiff1d( 225 scan_list, self._ms_unprocessed[ms_level].scan.tolist() 226 ) 227 ) 228 > 0 229 ): 230 raise ValueError( 231 "Not all scans in scan_list are present in the unprocessed data" 232 ) 233 # Prepare the ms_df for parsing 234 ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False) 235 236 if use_parser: 237 # Use batch function to get all mass spectra at once 238 if spectrum_mode is None: 239 # get spectrum mode from _scan_info for each scan 240 spectrum_modes = [self.scan_df.loc[scan, "ms_format"] for scan in scan_list] 241 spectrum_mode_batch = spectrum_modes[0] if len(set(spectrum_modes)) == 1 else None 242 else: 243 spectrum_mode_batch = spectrum_mode 244 245 ms_list = self.spectra_parser.get_mass_spectra_from_scan_list( 246 scan_list=scan_list, 247 spectrum_mode=spectrum_mode_batch, 248 auto_process=False, 249 ) 250 251 # Process each mass spectrum 252 for i, scan in enumerate(scan_list): 253 ms = ms_list[i] if i < len(ms_list) else None 254 if ms is not None: 255 if ms_params is not None: 256 ms.parameters = ms_params 257 ms.scan_number = scan 258 if auto_process: 259 ms.process_mass_spec() 260 self.add_mass_spectrum(ms) 261 else: 262 # Original non-parser logic remains unchanged 263 for scan in scan_list: 264 ms = None 265 if spectrum_mode is None: 266 # get spectrum mode from _scan_info 267 spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"] 268 else: 269 spectrum_mode_scan = spectrum_mode 270 271 my_ms_df = ms_df.loc[scan] 272 if spectrum_mode_scan == "profile": 273 # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum 274 ms = ms_from_array_profile( 275 my_ms_df.mz, 276 my_ms_df.intensity, 277 self.file_location, 278 polarity=polarity, 279 auto_process=False, 280 ) 281 else: 282 ms = ms_from_array_centroid( 283 mz = my_ms_df.mz, 284 abundance = my_ms_df.intensity, 285 rp = [np.nan] * len(my_ms_df.mz), 286 s2n = [np.nan] * len(my_ms_df.mz), 287 dataname = self.file_location, 288 polarity=polarity, 289 auto_process=False, 290 ) 291 292 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 293 if ms is not None: 294 if ms_params is not None: 295 ms.parameters = ms_params 296 ms.scan_number = scan 297 if auto_process: 298 ms.process_mass_spec() 299 self.add_mass_spectrum(ms) 300 301 def get_time_of_scan_id(self, scan): 302 """Returns the scan time for the specified scan number. 303 304 Parameters 305 ----------- 306 scan : int 307 The scan number of the desired scan time. 308 309 Returns 310 -------- 311 float 312 The scan time for the specified scan number (in minutes). 313 314 Raises 315 ------ 316 ValueError 317 If no scan time is found for the specified scan number. 318 """ 319 # Check if _retenion_time_list is empty and raise error if so 320 if len(self._retention_time_list) == 0: 321 raise ValueError("No retention times found in dataset") 322 rt = self._retention_time_list[self._scans_number_list.index(scan)] 323 return rt 324 325 @property 326 def scan_df(self): 327 """ 328 pandas.DataFrame : A pandas DataFrame containing the scan info data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). 329 """ 330 scan_df = pd.DataFrame.from_dict(self._scan_info) 331 return scan_df 332 333 @property 334 def ms(self): 335 """ 336 dictionary : contains the key associated with mass spectra and values are the associated MassSpecProfiles 337 """ 338 return self._ms 339 340 341 @scan_df.setter 342 def scan_df(self, df): 343 """ 344 Sets the scan data for the dataset. 345 346 Parameters 347 ----------- 348 df : pandas.DataFrame 349 A pandas DataFrame containing the scan data with columns for scan number, scan time, ms level, 350 precursor m/z, scan text, and scan window (lower and upper). 351 """ 352 self._scan_info = df.to_dict() 353 354 def __getitem__(self, scan_number): 355 return self._ms.get(scan_number)
Base class for mass spectra objects.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- spectra_parser_class (class): The class of the spectra parser used to create the mass spectra object.
- file_location (str or Path): The location of the file containing the mass spectra data.
- sample_name (str): The name of the sample; defaults to the file name if not provided to the parser.
- analyzer (str): The type of analyzer used to generate the mass spectra data. Derived from the spectra parser.
- instrument_label (str): The type of instrument used to generate the mass spectra data. Derived from the spectra parser.
- _scan_info (dict): A dictionary containing the scan data with columns for scan number, scan time, ms level, precursor m/z, scan text, and scan window (lower and upper). Associated with the property scan_df, which returns a pandas DataFrame or can set this attribute from a pandas DataFrame.
- _ms (dict): A dictionary containing mass spectra for the dataset, keys of dictionary are scan numbers. Initialized as an empty dictionary.
- _ms_unprocessed (dictionary of pandas.DataFrames or None): A dictionary of unprocssed mass spectra data, as an (optional) intermediate data product for peak picking. Key is ms_level, and value is dataframe with columns for scan number, m/z, and intensity. Default is None.
Methods
- add_mass_spectra(scan_list, spectrum_mode: str = 'profile', use_parser = True, auto_process=True). Add mass spectra (or singlel mass spectrum) to _ms slot, from a list of scans
- get_time_of_scan_id(scan). Returns the scan time for the specified scan number.
62 def __init__( 63 self, 64 file_location, 65 analyzer="Unknown", 66 instrument_label="Unknown", 67 sample_name=None, 68 spectra_parser=None, 69 ): 70 if isinstance(file_location, str): 71 file_location = Path(file_location) 72 else: 73 file_location = file_location 74 if not file_location.exists(): 75 raise FileExistsError("File does not exist: " + str(file_location)) 76 77 if sample_name: 78 self.sample_name = sample_name 79 else: 80 self.sample_name = file_location.stem 81 82 self.file_location = file_location 83 self.analyzer = analyzer 84 self.instrument_label = instrument_label 85 86 # Add the spectra parser class to the object if it is not None 87 if spectra_parser is not None: 88 self.spectra_parser_class = spectra_parser.__class__ 89 # Check that spectra_pasrser.sample_name is same as sample_name etc, raise warning if not 90 if ( 91 self.sample_name is not None 92 and self.sample_name != self.spectra_parser.sample_name 93 ): 94 warnings.warn( 95 "sample_name provided to MassSpectraBase object does not match sample_name provided to spectra parser object", 96 UserWarning, 97 ) 98 if self.analyzer != self.spectra_parser.analyzer: 99 warnings.warn( 100 "analyzer provided to MassSpectraBase object does not match analyzer provided to spectra parser object", 101 UserWarning, 102 ) 103 if self.instrument_label != self.spectra_parser.instrument_label: 104 warnings.warn( 105 "instrument provided to MassSpectraBase object does not match instrument provided to spectra parser object", 106 UserWarning, 107 ) 108 if self.file_location != self.spectra_parser.file_location: 109 warnings.warn( 110 "file_location provided to MassSpectraBase object does not match file_location provided to spectra parser object", 111 UserWarning, 112 ) 113 114 # Instantiate empty dictionaries for scan information and mass spectra 115 self._scan_info = {} 116 self._ms = {} 117 self._ms_unprocessed = {}
124 def add_mass_spectrum(self, mass_spec): 125 """Adds a mass spectrum to the dataset. 126 127 Parameters 128 ----------- 129 mass_spec : MassSpectrum 130 The corems MassSpectrum object to be added to the dataset. 131 132 Notes 133 ----- 134 This is a helper function for the add_mass_spectra() method, and is not intended to be called directly. 135 """ 136 # check if mass_spec has a scan_number attribute 137 if not hasattr(mass_spec, "scan_number"): 138 raise ValueError( 139 "Mass spectrum must have a scan_number attribute to be added to the dataset correctly" 140 ) 141 self._ms[mass_spec.scan_number] = mass_spec
Adds a mass spectrum to the dataset.
Parameters
- mass_spec (MassSpectrum): The corems MassSpectrum object to be added to the dataset.
Notes
This is a helper function for the add_mass_spectra() method, and is not intended to be called directly.
143 def add_mass_spectra( 144 self, 145 scan_list, 146 spectrum_mode=None, 147 ms_level=1, 148 use_parser=True, 149 auto_process=True, 150 ms_params=None, 151 ): 152 """Add mass spectra to _ms dictionary, from a list of scans or single scan 153 154 Notes 155 ----- 156 The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object. 157 158 159 Parameters 160 ----------- 161 scan_list : list of ints 162 List of scans to use to populate _ms slot 163 spectrum_mode : str or None 164 The spectrum mode to use for the mass spectra. 165 If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 166 Defaults to None. 167 ms_level : int, optional 168 The MS level to use for the mass spectra. 169 This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. 170 Defaults to 1. 171 using_parser : bool 172 Whether to use the mass spectra parser to get the mass spectra. Defaults to True. 173 auto_process : bool 174 Whether to auto-process the mass spectra. Defaults to True. 175 ms_params : MSParameters or None 176 The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters. 177 178 Raises 179 ------ 180 TypeError 181 If scan_list is not a list of ints 182 ValueError 183 If polarity is not 'positive' or 'negative' 184 If ms_level is not 1 or 2 185 """ 186 187 # check if scan_list is a list or a single int; if single int, convert to list 188 if isinstance(scan_list, int): 189 scan_list = [scan_list] 190 if not isinstance(scan_list, list): 191 raise TypeError("scan_list must be a list of integers") 192 for scan in scan_list: 193 if not isinstance(scan, int): 194 raise TypeError("scan_list must be a list of integers") 195 196 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 197 if self.polarity == "negative": 198 polarity = -1 199 elif self.polarity == "positive": 200 polarity = 1 201 else: 202 raise ValueError( 203 "Polarity not set for dataset, must be a either 'positive' or 'negative'" 204 ) 205 206 # is not using_parser, check that ms1 and ms2 are not None 207 if not use_parser: 208 if ms_level not in self._ms_unprocessed.keys(): 209 raise ValueError( 210 "ms_level {} not found in _ms_unprocessed dictionary".format( 211 ms_level 212 ) 213 ) 214 215 scan_list = list(set(scan_list)) 216 scan_list.sort() 217 if not use_parser: 218 if self._ms_unprocessed[ms_level] is None: 219 raise ValueError( 220 "No unprocessed data found for ms_level {}".format(ms_level) 221 ) 222 if ( 223 len( 224 np.setdiff1d( 225 scan_list, self._ms_unprocessed[ms_level].scan.tolist() 226 ) 227 ) 228 > 0 229 ): 230 raise ValueError( 231 "Not all scans in scan_list are present in the unprocessed data" 232 ) 233 # Prepare the ms_df for parsing 234 ms_df = self._ms_unprocessed[ms_level].copy().set_index("scan", drop=False) 235 236 if use_parser: 237 # Use batch function to get all mass spectra at once 238 if spectrum_mode is None: 239 # get spectrum mode from _scan_info for each scan 240 spectrum_modes = [self.scan_df.loc[scan, "ms_format"] for scan in scan_list] 241 spectrum_mode_batch = spectrum_modes[0] if len(set(spectrum_modes)) == 1 else None 242 else: 243 spectrum_mode_batch = spectrum_mode 244 245 ms_list = self.spectra_parser.get_mass_spectra_from_scan_list( 246 scan_list=scan_list, 247 spectrum_mode=spectrum_mode_batch, 248 auto_process=False, 249 ) 250 251 # Process each mass spectrum 252 for i, scan in enumerate(scan_list): 253 ms = ms_list[i] if i < len(ms_list) else None 254 if ms is not None: 255 if ms_params is not None: 256 ms.parameters = ms_params 257 ms.scan_number = scan 258 if auto_process: 259 ms.process_mass_spec() 260 self.add_mass_spectrum(ms) 261 else: 262 # Original non-parser logic remains unchanged 263 for scan in scan_list: 264 ms = None 265 if spectrum_mode is None: 266 # get spectrum mode from _scan_info 267 spectrum_mode_scan = self.scan_df.loc[scan, "ms_format"] 268 else: 269 spectrum_mode_scan = spectrum_mode 270 271 my_ms_df = ms_df.loc[scan] 272 if spectrum_mode_scan == "profile": 273 # Check this - it might be better to use the MassSpectrumProfile class to instantiate the mass spectrum 274 ms = ms_from_array_profile( 275 my_ms_df.mz, 276 my_ms_df.intensity, 277 self.file_location, 278 polarity=polarity, 279 auto_process=False, 280 ) 281 else: 282 ms = ms_from_array_centroid( 283 mz = my_ms_df.mz, 284 abundance = my_ms_df.intensity, 285 rp = [np.nan] * len(my_ms_df.mz), 286 s2n = [np.nan] * len(my_ms_df.mz), 287 dataname = self.file_location, 288 polarity=polarity, 289 auto_process=False, 290 ) 291 292 # Set the mass spectrum parameters, auto-process if auto_process is True, and add to the dataset 293 if ms is not None: 294 if ms_params is not None: 295 ms.parameters = ms_params 296 ms.scan_number = scan 297 if auto_process: 298 ms.process_mass_spec() 299 self.add_mass_spectrum(ms)
Add mass spectra to _ms dictionary, from a list of scans or single scan
Notes
The mass spectra will inherit the mass_spectrum, ms_peak, and molecular_search parameters from the LCMSBase object.
Parameters
- scan_list (list of ints): List of scans to use to populate _ms slot
- spectrum_mode (str or None): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None.
- ms_level (int, optional): The MS level to use for the mass spectra. This is used to pass the molecular_search parameters from the LCMS object to the individual MassSpectrum objects. Defaults to 1.
- using_parser (bool): Whether to use the mass spectra parser to get the mass spectra. Defaults to True.
- auto_process (bool): Whether to auto-process the mass spectra. Defaults to True.
- ms_params (MSParameters or None): The mass spectrum parameters to use for the mass spectra. If None, uses the globally set MSParameters.
Raises
- TypeError: If scan_list is not a list of ints
- ValueError: If polarity is not 'positive' or 'negative' If ms_level is not 1 or 2
301 def get_time_of_scan_id(self, scan): 302 """Returns the scan time for the specified scan number. 303 304 Parameters 305 ----------- 306 scan : int 307 The scan number of the desired scan time. 308 309 Returns 310 -------- 311 float 312 The scan time for the specified scan number (in minutes). 313 314 Raises 315 ------ 316 ValueError 317 If no scan time is found for the specified scan number. 318 """ 319 # Check if _retenion_time_list is empty and raise error if so 320 if len(self._retention_time_list) == 0: 321 raise ValueError("No retention times found in dataset") 322 rt = self._retention_time_list[self._scans_number_list.index(scan)] 323 return rt
Returns the scan time for the specified scan number.
Parameters
- scan (int): The scan number of the desired scan time.
Returns
- float: The scan time for the specified scan number (in minutes).
Raises
- ValueError: If no scan time is found for the specified scan number.
358class LCMSBase(MassSpectraBase, LCCalculations, PHCalculations, LCMSSpectralSearch): 359 """A class representing a liquid chromatography-mass spectrometry (LC-MS) data object. 360 361 This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method. 362 363 Parameters 364 ----------- 365 file_location : str or Path 366 The location of the file containing the mass spectra data. 367 analyzer : str, optional 368 The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'. 369 instrument_label : str, optional 370 The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'. 371 sample_name : str, optional 372 The name of the sample; defaults to the file name if not provided to the parser. Defaults to None. 373 spectra_parser : object, optional 374 The spectra parser object used to create the mass spectra object. Defaults to None. 375 376 Attributes 377 ----------- 378 polarity : str 379 The polarity of the ionization mode used for the dataset. 380 _parameters : LCMSParameters 381 The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters. 382 _retention_time_list : numpy.ndarray 383 An array of retention times for the dataset. 384 _scans_number_list : list 385 A list of scan numbers for the dataset. 386 _tic_list : numpy.ndarray 387 An array of total ion current (TIC) values for the dataset. 388 eics : dict 389 A dictionary containing extracted ion chromatograms (EICs) for the dataset. 390 Key is the mz of the EIC. Initialized as an empty dictionary. 391 mass_features : dictionary of LCMSMassFeature objects 392 A dictionary containing mass features for the dataset. 393 Key is mass feature ID. Initialized as an empty dictionary. 394 spectral_search_results : dictionary of MS2SearchResults objects 395 A dictionary containing spectral search results for the dataset. 396 Key is scan number : precursor mz. Initialized as an empty dictionary. 397 398 Methods 399 -------- 400 * get_parameters_json(). 401 Returns the parameters used for the LC-MS analysis in JSON format. 402 * add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True) 403 Adds which MS2 scans are associated with each mass feature to the 404 mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary. 405 * add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) 406 Adds the MS1 spectra associated with each mass feature to the 407 mass_features dictionary and adds the MS1 spectra to the _ms dictionary. 408 * mass_features_to_df() 409 Returns a pandas dataframe summarizing the mass features in the dataset. 410 * set_tic_list_from_data(overwrite=False) 411 Sets the TIC list from the mass spectrum objects within the _ms dictionary. 412 * set_retention_time_from_data(overwrite=False) 413 Sets the retention time list from the data in the _ms dictionary. 414 * set_scans_number_from_data(overwrite=False) 415 Sets the scan number list from the data in the _ms dictionary. 416 * plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) 417 Generates plot of M/Z features comparing scan time vs M/Z value 418 """ 419 420 def __init__( 421 self, 422 file_location, 423 analyzer="Unknown", 424 instrument_label="Unknown", 425 sample_name=None, 426 spectra_parser=None, 427 ): 428 super().__init__( 429 file_location, analyzer, instrument_label, sample_name, spectra_parser 430 ) 431 self.polarity = "" 432 self._parameters = LCMSParameters() 433 self._retention_time_list = [] 434 self._scans_number_list = [] 435 self._tic_list = [] 436 self.eics = {} 437 self.mass_features = {} 438 self.spectral_search_results = {} 439 440 def get_parameters_json(self): 441 """Returns the parameters stored for the LC-MS object in JSON format. 442 443 Returns 444 -------- 445 str 446 The parameters used for the LC-MS analysis in JSON format. 447 """ 448 return self.parameters.to_json() 449 450 def remove_unprocessed_data(self, ms_level=None): 451 """Removes the unprocessed data from the LCMSBase object. 452 453 Parameters 454 ----------- 455 ms_level : int, optional 456 The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels. 457 458 Raises 459 ------ 460 ValueError 461 If ms_level is not 1 or 2. 462 463 Notes 464 ----- 465 This method is useful for freeing up memory after the data has been processed. 466 """ 467 if ms_level is None: 468 for ms_level in self._ms_unprocessed.keys(): 469 self._ms_unprocessed[ms_level] = None 470 if ms_level not in [1, 2]: 471 raise ValueError("ms_level must be 1 or 2") 472 self._ms_unprocessed[ms_level] = None 473 474 def add_associated_ms2_dda( 475 self, 476 auto_process=True, 477 use_parser=True, 478 spectrum_mode=None, 479 ms_params_key="ms2", 480 scan_filter=None, 481 ): 482 """Add MS2 spectra associated with mass features to the dataset. 483 484 Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject) 485 486 Parameters 487 ----------- 488 auto_process : bool, optional 489 If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True. 490 use_parser : bool, optional 491 If True, envoke the spectra parser to get the MS2 spectra. Default is True. 492 spectrum_mode : str or None, optional 493 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 494 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 495 Defaults to None. (faster if defined, otherwise will check each scan) 496 ms_params_key : string, optional 497 The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. 498 Defaults to 'ms2'. 499 scan_filter : str 500 A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. 501 "hcd" will pull out only HCD scans. 502 503 Raises 504 ------ 505 ValueError 506 If mass_features is not set, must run find_mass_features() first. 507 If no MS2 scans are found in the dataset. 508 If no precursor m/z values are found in MS2 scans, not a DDA dataset. 509 """ 510 # Check if mass_features is set, raise error if not 511 if self.mass_features is None: 512 raise ValueError( 513 "mass_features not set, must run find_mass_features() first" 514 ) 515 516 # reconfigure ms_params to get the correct mass spectrum parameters from the key 517 ms_params = self.parameters.mass_spectrum[ms_params_key] 518 519 mf_df = self.mass_features_to_df().copy() 520 # Find ms2 scans that have a precursor m/z value 521 ms2_scans = self.scan_df[self.scan_df.ms_level == 2] 522 ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()] 523 # drop ms2 scans that have no tic 524 ms2_scans = ms2_scans[ms2_scans.tic > 0] 525 if ms2_scans is None: 526 raise ValueError("No DDA scans found in dataset") 527 528 if scan_filter is not None: 529 ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)] 530 # set tolerance in rt space (in minutes) and mz space (in daltons) 531 time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance 532 mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance 533 534 # for each mass feature, find the ms2 scans that are within the roi scan time and mz range 535 dda_scans = [] 536 for i, row in mf_df.iterrows(): 537 ms2_scans_filtered = ms2_scans[ 538 ms2_scans.scan_time.between( 539 row.scan_time - time_tol, row.scan_time + time_tol 540 ) 541 ] 542 ms2_scans_filtered = ms2_scans_filtered[ 543 ms2_scans_filtered.precursor_mz.between( 544 row.mz - mz_tol, row.mz + mz_tol 545 ) 546 ] 547 dda_scans = dda_scans + ms2_scans_filtered.scan.tolist() 548 self.mass_features[i].ms2_scan_numbers = ( 549 ms2_scans_filtered.scan.tolist() 550 + self.mass_features[i].ms2_scan_numbers 551 ) 552 # add to _ms attribute 553 self.add_mass_spectra( 554 scan_list=list(set(dda_scans)), 555 auto_process=auto_process, 556 spectrum_mode=spectrum_mode, 557 use_parser=use_parser, 558 ms_params=ms_params, 559 ) 560 # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute 561 for mf_id in self.mass_features: 562 if self.mass_features[mf_id].ms2_scan_numbers is not None: 563 for dda_scan in self.mass_features[mf_id].ms2_scan_numbers: 564 if dda_scan in self._ms.keys(): 565 self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[ 566 dda_scan 567 ] 568 569 def add_associated_ms1( 570 self, auto_process=True, use_parser=True, spectrum_mode=None 571 ): 572 """Add MS1 spectra associated with mass features to the dataset. 573 574 Parameters 575 ----------- 576 auto_process : bool, optional 577 If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True. 578 use_parser : bool, optional 579 If True, envoke the spectra parser to get the MS1 spectra. Default is True. 580 spectrum_mode : str or None, optional 581 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 582 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 583 Defaults to None. (faster if defined, otherwise will check each scan) 584 585 Raises 586 ------ 587 ValueError 588 If mass_features is not set, must run find_mass_features() first. 589 If apex scans are not profile mode, all apex scans must be profile mode for averaging. 590 If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). 591 If deconvolute is True and no EICs are found, did you run integrate_mass_features() first? 592 """ 593 # Check if mass_features is set, raise error if not 594 if self.mass_features is None: 595 raise ValueError( 596 "mass_features not set, must run find_mass_features() first" 597 ) 598 scans_to_average = self.parameters.lc_ms.ms1_scans_to_average 599 600 if scans_to_average == 1: 601 # Add to LCMSobj 602 self.add_mass_spectra( 603 scan_list=[ 604 int(mf.apex_scan) for mf in self.mass_features.values() 605 ], 606 auto_process=auto_process, 607 use_parser=use_parser, 608 spectrum_mode=spectrum_mode, 609 ms_params=self.parameters.mass_spectrum["ms1"], 610 ) 611 612 elif ( 613 (scans_to_average - 1) % 2 614 ) == 0: # scans_to_average = 3, 5, 7 etc, mirror l/r around apex 615 apex_scans = list(set([int(mf.apex_scan) for mf in self.mass_features.values()])) 616 # Check if all apex scans are profile mode, raise error if not 617 if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"): 618 raise ValueError("All apex scans must be profile mode for averaging") 619 620 # First get sets of scans to average 621 def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average): 622 ms1_idx_start = ms1_scans.index(apex_scan) - int( 623 (scans_to_average - 1) / 2 624 ) 625 if ms1_idx_start < 0: 626 ms1_idx_start = 0 627 ms1_idx_end = ( 628 ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1 629 ) 630 if ms1_idx_end > (len(ms1_scans) - 1): 631 ms1_idx_end = len(ms1_scans) - 1 632 scan_list = ms1_scans[ms1_idx_start:ms1_idx_end] 633 return scan_list 634 635 ms1_scans = self.ms1_scans 636 scans_lists = [ 637 get_scans_from_apex(ms1_scans, apex_scan, scans_to_average) 638 for apex_scan in apex_scans 639 ] 640 641 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 642 if self.polarity == "negative": 643 polarity = -1 644 elif self.polarity == "positive": 645 polarity = 1 646 647 if not use_parser: 648 # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once) 649 ms1_unprocessed = self._ms_unprocessed[1].copy() 650 # Set the index on _ms_unprocessed[1] to scan number 651 ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False) 652 self._ms_unprocessed[1] = ms1_unprocessed 653 654 # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1] 655 scans_lists_flat = list( 656 set([scan for sublist in scans_lists for scan in sublist]) 657 ) 658 if ( 659 len( 660 np.setdiff1d( 661 np.sort(scans_lists_flat), 662 np.sort(ms1_unprocessed.index.values), 663 ) 664 ) 665 > 0 666 ): 667 raise ValueError( 668 "Not all scans to average are present in the unprocessed data" 669 ) 670 671 for scan_list_average, apex_scan in zip(scans_lists, apex_scans): 672 # Get unprocessed mass spectrum from scans 673 ms = self.get_average_mass_spectrum( 674 scan_list=scan_list_average, 675 apex_scan=apex_scan, 676 spectrum_mode="profile", 677 ms_level=1, 678 auto_process=auto_process, 679 use_parser=use_parser, 680 perform_checks=False, 681 polarity=polarity, 682 ms_params=self.parameters.mass_spectrum["ms1"], 683 ) 684 # Add mass spectrum to LCMS object and associated with mass feature 685 self.add_mass_spectrum(ms) 686 687 if not use_parser: 688 # Reset the index on _ms_unprocessed[1] to not be scan number 689 ms1_unprocessed = ms1_unprocessed.reset_index(drop=True) 690 self._ms_unprocessed[1] = ms1_unprocessed 691 else: 692 raise ValueError( 693 "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)" 694 ) 695 696 # Associate the ms1 spectra with the mass features 697 for mf_id in self.mass_features: 698 self.mass_features[mf_id].mass_spectrum = self._ms[ 699 self.mass_features[mf_id].apex_scan 700 ] 701 self.mass_features[mf_id].update_mz() 702 703 def mass_features_to_df(self): 704 """Returns a pandas dataframe summarizing the mass features. 705 706 The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, 707 persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID). 708 709 Returns 710 -------- 711 pandas.DataFrame 712 A pandas dataframe of mass features with the following columns: 713 mf_id, mz, apex_scan, scan_time, intensity, persistence, area. 714 """ 715 716 def mass_spectrum_to_string( 717 mass_spec, normalize=True, min_normalized_abun=0.01 718 ): 719 """Converts a mass spectrum to a string of m/z:abundance pairs. 720 721 Parameters 722 ----------- 723 mass_spec : MassSpectrum 724 A MassSpectrum object to be converted to a string. 725 normalize : bool, optional 726 If True, normalizes the abundance values to a maximum of 1. Defaults to True. 727 min_normalized_abun : float, optional 728 The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01. 729 730 Returns 731 -------- 732 str 733 A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon. 734 """ 735 mz_np = mass_spec.to_dataframe()["m/z"].values 736 abun_np = mass_spec.to_dataframe()["Peak Height"].values 737 if normalize: 738 abun_np = abun_np / abun_np.max() 739 mz_abun = np.column_stack((mz_np, abun_np)) 740 if normalize: 741 mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun] 742 mz_abun_str = [ 743 str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2)) 744 for mz, abun in mz_abun 745 ] 746 return "; ".join(mz_abun_str) 747 748 cols_in_df = [ 749 "id", 750 "apex_scan", 751 "start_scan", 752 "final_scan", 753 "retention_time", 754 "intensity", 755 "persistence", 756 "area", 757 "dispersity_index", 758 "normalized_dispersity_index", 759 "tailing_factor", 760 "gaussian_similarity", 761 "noise_score", 762 "noise_score_min", 763 "noise_score_max", 764 "monoisotopic_mf_id", 765 "isotopologue_type", 766 "mass_spectrum_deconvoluted_parent", 767 ] 768 df_mf_list = [] 769 for mf_id in self.mass_features.keys(): 770 # Find cols_in_df that are in single_mf 771 df_keys = list( 772 set(cols_in_df).intersection(self.mass_features[mf_id].__dir__()) 773 ) 774 dict_mf = {} 775 # Get the values for each key in df_keys from the mass feature object 776 for key in df_keys: 777 dict_mf[key] = getattr(self.mass_features[mf_id], key) 778 # Special handling for mass_spectrum and associated_mass_features_deconvoluted, since they are not single values 779 if len(self.mass_features[mf_id].ms2_scan_numbers) > 0: 780 # Add MS2 spectra info 781 best_ms2_spectrum = self.mass_features[mf_id].best_ms2 782 dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum) 783 if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0: 784 dict_mf["associated_mass_features"] = ", ".join( 785 map( 786 str, 787 self.mass_features[mf_id].associated_mass_features_deconvoluted, 788 ) 789 ) 790 # Check if EIC for mass feature is set 791 df_mf_single = pd.DataFrame(dict_mf, index=[mf_id]) 792 df_mf_single["mz"] = self.mass_features[mf_id].mz 793 df_mf_list.append(df_mf_single) 794 df_mf = pd.concat(df_mf_list) 795 796 # rename _area to area and id to mf_id 797 df_mf = df_mf.rename( 798 columns={ 799 "id": "mf_id", 800 "retention_time": "scan_time", 801 } 802 ) 803 804 # reorder columns 805 col_order = [ 806 "mf_id", 807 "scan_time", 808 "mz", 809 "apex_scan", 810 "start_scan", 811 "final_scan", 812 "intensity", 813 "persistence", 814 "area", 815 "half_height_width", 816 "tailing_factor", 817 "dispersity_index", 818 "normalized_dispersity_index", 819 "gaussian_similarity", 820 "noise_score", 821 "noise_score_min", 822 "noise_score_max", 823 "monoisotopic_mf_id", 824 "isotopologue_type", 825 "mass_spectrum_deconvoluted_parent", 826 "associated_mass_features", 827 "ms2_spectrum", 828 ] 829 # drop columns that are not in col_order 830 cols_to_order = [col for col in col_order if col in df_mf.columns] 831 df_mf = df_mf[cols_to_order] 832 833 # reset index to mf_id 834 df_mf = df_mf.set_index("mf_id") 835 df_mf.index.name = "mf_id" 836 837 return df_mf 838 839 def mass_features_ms1_annot_to_df(self): 840 """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset. 841 842 Returns 843 -------- 844 pandas.DataFrame 845 A pandas dataframe of MS1 annotations for the mass features in the dataset. 846 The index is set to mf_id (mass feature ID) 847 848 Raises 849 ------ 850 Warning 851 If no MS1 annotations were found for the mass features in the dataset. 852 """ 853 annot_df_list_ms1 = [] 854 for mf_id in self.mass_features.keys(): 855 if self.mass_features[mf_id].mass_spectrum is None: 856 pass 857 else: 858 # Add ms1 annotations to ms1 annotation list 859 if ( 860 np.abs( 861 ( 862 self.mass_features[mf_id].ms1_peak.mz_exp 863 - self.mass_features[mf_id].mz 864 ) 865 ) 866 < 0.01 867 ): 868 # Get the molecular formula from the mass spectrum 869 annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe() 870 # Subset to pull out only the peak associated with the mass feature 871 annot_df = annot_df[ 872 annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index 873 ].copy() 874 875 # If there are more than 1 row, remove any rows without a molecular formula 876 if len(annot_df) > 1: 877 annot_df = annot_df[~annot_df["Molecular Formula"].isna()] 878 879 # Remove the index column and add column for mf_id 880 annot_df = annot_df.drop(columns=["Index"]) 881 annot_df["mf_id"] = mf_id 882 annot_df_list_ms1.append(annot_df) 883 884 if len(annot_df_list_ms1) > 0: 885 annot_ms1_df_full = pd.concat(annot_df_list_ms1) 886 annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id") 887 annot_ms1_df_full.index.name = "mf_id" 888 889 else: 890 annot_ms1_df_full = None 891 # Warn that no ms1 annotations were found 892 warnings.warn( 893 "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?", 894 UserWarning, 895 ) 896 897 return annot_ms1_df_full 898 899 def mass_features_ms2_annot_to_df(self, molecular_metadata=None): 900 """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset. 901 902 Parameters 903 ----------- 904 molecular_metadata : dict of MolecularMetadata objects 905 A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None. 906 907 Returns 908 -------- 909 pandas.DataFrame 910 A pandas dataframe of MS2 annotations for the mass features in the dataset, 911 and optionally molecular metadata. The index is set to mf_id (mass feature ID) 912 913 Raises 914 ------ 915 Warning 916 If no MS2 annotations were found for the mass features in the dataset. 917 """ 918 annot_df_list_ms2 = [] 919 for mf_id in self.mass_features.keys(): 920 if len(self.mass_features[mf_id].ms2_similarity_results) > 0: 921 # Add ms2 annotations to ms2 annotation list 922 for result in self.mass_features[mf_id].ms2_similarity_results: 923 annot_df_ms2 = result.to_dataframe() 924 annot_df_ms2["mf_id"] = mf_id 925 annot_df_list_ms2.append(annot_df_ms2) 926 927 if len(annot_df_list_ms2) > 0: 928 annot_ms2_df_full = pd.concat(annot_df_list_ms2) 929 if molecular_metadata is not None: 930 molecular_metadata_df = pd.concat( 931 [ 932 pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() 933 for k, v in molecular_metadata.items() 934 ], 935 ignore_index=True, 936 ) 937 molecular_metadata_df = molecular_metadata_df.rename( 938 columns={"id": "ref_mol_id"} 939 ) 940 annot_ms2_df_full = annot_ms2_df_full.merge( 941 molecular_metadata_df, on="ref_mol_id", how="left" 942 ) 943 annot_ms2_df_full = annot_ms2_df_full.drop_duplicates( 944 subset=["mf_id", "query_spectrum_id", "ref_ms_id"] 945 ).copy() 946 annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id") 947 annot_ms2_df_full.index.name = "mf_id" 948 else: 949 annot_ms2_df_full = None 950 # Warn that no ms2 annotations were found 951 warnings.warn( 952 "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?", 953 UserWarning, 954 ) 955 956 return annot_ms2_df_full 957 958 def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False): 959 """Returns a figure displaying 960 (1) thresholded, unprocessed data 961 (2) the m/z features 962 (3) which m/z features are associated with MS2 spectra 963 964 Parameters 965 ----------- 966 binsize : float 967 Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4. 968 mf_plot : boolean 969 Indicates whether to plot the m/z features. Defaults to True. 970 ms2_plot : boolean 971 Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True. 972 return_fig : boolean 973 Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False. 974 975 Returns 976 -------- 977 matplotlib.pyplot.Figure 978 A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 979 scan time. Unprocessed data is displayed in gray scale with darker colors indicating 980 higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z 981 features with associated with MS2 spectra are plotted, they are displayed in red. 982 983 Raises 984 ------ 985 Warning 986 If m/z features are set to be plot but aren't in the dataset. 987 If m/z features with associated MS2 data are set to be plot but no MS2 annotations 988 were found for the m/z features in the dataset. 989 """ 990 if mf_plot: 991 # Check if mass_features is set, raise error if not 992 if self.mass_features is None: 993 raise ValueError( 994 "mass_features not set, must run find_mass_features() first" 995 ) 996 ## call mass feature data 997 mf_df = self.mass_features_to_df() 998 999 if ms2_plot: 1000 if not mf_plot: 1001 # Check if mass_features is set, raise error if not 1002 if self.mass_features is None: 1003 raise ValueError( 1004 "mass_features not set, must run find_mass_features() first" 1005 ) 1006 1007 ## call m/z feature data 1008 mf_df = self.mass_features_to_df() 1009 1010 # Check if ms2_spectrum is set, raise error if not 1011 if 'ms2_spectrum' not in mf_df.columns: 1012 raise ValueError( 1013 "ms2_spectrum not set, must run add_associated_ms2_dda() first" 1014 ) 1015 1016 ## threshold and grid unprocessed data 1017 df = self._ms_unprocessed[1].copy() 1018 df = df.dropna(subset=['intensity']).reset_index(drop = True) 1019 threshold = ph_int_min_thresh * df.intensity.max() 1020 df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy() 1021 df = self.grid_data(df_thres) 1022 1023 ## format unprocessed data for plotting 1024 df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan') 1025 mz_grid = np.arange(0, np.max(df.mz), binsize) 1026 mz_data = np.array(df.mz) 1027 df['mz_bin'] = find_closest(mz_grid, mz_data) 1028 df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum) 1029 unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True) 1030 1031 ## generate figure 1032 fig = plt.figure() 1033 plt.scatter( 1034 unproc_df.scan_time, 1035 unproc_df.mz_bin*binsize, 1036 c = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1037 alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1038 cmap = 'Greys_r', 1039 s = 1 1040 ) 1041 1042 if mf_plot: 1043 if ms2_plot: 1044 plt.scatter( 1045 mf_df[mf_df.ms2_spectrum.isna()].scan_time, 1046 mf_df[mf_df.ms2_spectrum.isna()].mz, 1047 c = 'c', 1048 s = 4, 1049 label = 'M/Z features without MS2' 1050 ) 1051 else: 1052 plt.scatter( 1053 mf_df.scan_time, 1054 mf_df.mz, 1055 c = 'c', 1056 s = 4, 1057 label = 'M/Z features' 1058 ) 1059 1060 if ms2_plot: 1061 plt.scatter( 1062 mf_df[~mf_df.ms2_spectrum.isna()].scan_time, 1063 mf_df[~mf_df.ms2_spectrum.isna()].mz, 1064 c = 'r', 1065 s = 2, 1066 label = 'M/Z features with MS2' 1067 ) 1068 1069 if mf_plot == True or ms2_plot == True: 1070 plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2) 1071 plt.xlabel('Scan time') 1072 plt.ylabel('m/z') 1073 plt.ylim(0, np.ceil(np.max(df.mz))) 1074 plt.xlim(0, np.ceil(np.max(df.scan_time))) 1075 plt.title('Composite Feature Map') 1076 1077 if return_fig: 1078 plt.close(fig) 1079 return fig 1080 1081 else: 1082 plt.show() 1083 1084 def __len__(self): 1085 """ 1086 Returns the number of mass spectra in the dataset. 1087 1088 Returns 1089 -------- 1090 int 1091 The number of mass spectra in the dataset. 1092 """ 1093 return len(self._ms) 1094 1095 def __getitem__(self, scan_number): 1096 """ 1097 Returns the mass spectrum corresponding to the specified scan number. 1098 1099 Parameters 1100 ----------- 1101 scan_number : int 1102 The scan number of the desired mass spectrum. 1103 1104 Returns 1105 -------- 1106 MassSpectrum 1107 The mass spectrum corresponding to the specified scan number. 1108 """ 1109 return self._ms.get(scan_number) 1110 1111 def __iter__(self): 1112 """Returns an iterator over the mass spectra in the dataset. 1113 1114 Returns 1115 -------- 1116 iterator 1117 An iterator over the mass spectra in the dataset. 1118 """ 1119 return iter(self._ms.values()) 1120 1121 def set_tic_list_from_data(self, overwrite=False): 1122 """Sets the TIC list from the mass spectrum objects within the _ms dictionary. 1123 1124 Parameters 1125 ----------- 1126 overwrite : bool, optional 1127 If True, overwrites the TIC list if it is already set. Defaults to False. 1128 1129 Notes 1130 ----- 1131 If the _ms dictionary is incomplete, sets the TIC list to an empty list. 1132 1133 Raises 1134 ------ 1135 ValueError 1136 If no mass spectra are found in the dataset. 1137 If the TIC list is already set and overwrite is False. 1138 """ 1139 # Check if _ms is empty and raise error if so 1140 if len(self._ms) == 0: 1141 raise ValueError("No mass spectra found in dataset") 1142 1143 # Check if tic_list is already set and raise error if so 1144 if len(self.tic) > 0 and not overwrite: 1145 raise ValueError("TIC list already set, use overwrite=True to overwrite") 1146 1147 self.tic = [self._ms.get(i).tic for i in self.scans_number] 1148 1149 def set_retention_time_from_data(self, overwrite=False): 1150 """Sets the retention time list from the data in the _ms dictionary. 1151 1152 Parameters 1153 ----------- 1154 overwrite : bool, optional 1155 If True, overwrites the retention time list if it is already set. Defaults to False. 1156 1157 Notes 1158 ----- 1159 If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list. 1160 1161 Raises 1162 ------ 1163 ValueError 1164 If no mass spectra are found in the dataset. 1165 If the retention time list is already set and overwrite is False. 1166 """ 1167 # Check if _ms is empty and raise error if so 1168 if len(self._ms) == 0: 1169 raise ValueError("No mass spectra found in dataset") 1170 1171 # Check if retention_time_list is already set and raise error if so 1172 if len(self.retention_time) > 0 and not overwrite: 1173 raise ValueError( 1174 "Retention time list already set, use overwrite=True to overwrite" 1175 ) 1176 1177 retention_time_list = [] 1178 for key_ms in sorted(self._ms.keys()): 1179 retention_time_list.append(self._ms.get(key_ms).retention_time) 1180 self.retention_time = retention_time_list 1181 1182 def set_scans_number_from_data(self, overwrite=False): 1183 """Sets the scan number list from the data in the _ms dictionary. 1184 1185 Notes 1186 ----- 1187 If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list. 1188 1189 Raises 1190 ------ 1191 ValueError 1192 If no mass spectra are found in the dataset. 1193 If the scan number list is already set and overwrite is False. 1194 """ 1195 # Check if _ms is empty and raise error if so 1196 if len(self._ms) == 0: 1197 raise ValueError("No mass spectra found in dataset") 1198 1199 # Check if scans_number_list is already set and raise error if so 1200 if len(self.scans_number) > 0 and not overwrite: 1201 raise ValueError( 1202 "Scan number list already set, use overwrite=True to overwrite" 1203 ) 1204 1205 self.scans_number = sorted(self._ms.keys()) 1206 1207 @property 1208 def ms1_scans(self): 1209 """ 1210 list : A list of MS1 scan numbers for the dataset. 1211 """ 1212 return self.scan_df[self.scan_df.ms_level == 1].index.tolist() 1213 1214 @property 1215 def parameters(self): 1216 """ 1217 LCMSParameters : The parameters used for the LC-MS analysis. 1218 """ 1219 return self._parameters 1220 1221 @parameters.setter 1222 def parameters(self, paramsinstance): 1223 """ 1224 Sets the parameters used for the LC-MS analysis. 1225 1226 Parameters 1227 ----------- 1228 paramsinstance : LCMSParameters 1229 The parameters used for the LC-MS analysis. 1230 """ 1231 self._parameters = paramsinstance 1232 1233 @property 1234 def scans_number(self): 1235 """ 1236 list : A list of scan numbers for the dataset. 1237 """ 1238 return self._scans_number_list 1239 1240 @scans_number.setter 1241 def scans_number(self, scan_numbers_list): 1242 """ 1243 Sets the scan numbers for the dataset. 1244 1245 Parameters 1246 ----------- 1247 scan_numbers_list : list 1248 A list of scan numbers for the dataset. 1249 """ 1250 self._scans_number_list = scan_numbers_list 1251 1252 @property 1253 def retention_time(self): 1254 """ 1255 numpy.ndarray : An array of retention times for the dataset. 1256 """ 1257 return self._retention_time_list 1258 1259 @retention_time.setter 1260 def retention_time(self, rt_list): 1261 """ 1262 Sets the retention times for the dataset. 1263 1264 Parameters 1265 ----------- 1266 rt_list : list 1267 A list of retention times for the dataset. 1268 """ 1269 self._retention_time_list = np.array(rt_list) 1270 1271 @property 1272 def tic(self): 1273 """ 1274 numpy.ndarray : An array of TIC values for the dataset. 1275 """ 1276 return self._tic_list 1277 1278 @tic.setter 1279 def tic(self, tic_list): 1280 """ 1281 Sets the TIC values for the dataset. 1282 1283 Parameters 1284 ----------- 1285 tic_list : list 1286 A list of TIC values for the dataset. 1287 """ 1288 self._tic_list = np.array(tic_list)
A class representing a liquid chromatography-mass spectrometry (LC-MS) data object.
This class is not intended to be instantiated directly, but rather to be instantiated by an appropriate mass spectra parser using the get_lcms_obj() method.
Parameters
- file_location (str or Path): The location of the file containing the mass spectra data.
- analyzer (str, optional): The type of analyzer used to generate the mass spectra data. Defaults to 'Unknown'.
- instrument_label (str, optional): The type of instrument used to generate the mass spectra data. Defaults to 'Unknown'.
- sample_name (str, optional): The name of the sample; defaults to the file name if not provided to the parser. Defaults to None.
- spectra_parser (object, optional): The spectra parser object used to create the mass spectra object. Defaults to None.
Attributes
- polarity (str): The polarity of the ionization mode used for the dataset.
- _parameters (LCMSParameters): The parameters used for all methods called on the LCMSBase object. Set upon instantiation from LCMSParameters.
- _retention_time_list (numpy.ndarray): An array of retention times for the dataset.
- _scans_number_list (list): A list of scan numbers for the dataset.
- _tic_list (numpy.ndarray): An array of total ion current (TIC) values for the dataset.
- eics (dict): A dictionary containing extracted ion chromatograms (EICs) for the dataset. Key is the mz of the EIC. Initialized as an empty dictionary.
- mass_features (dictionary of LCMSMassFeature objects): A dictionary containing mass features for the dataset. Key is mass feature ID. Initialized as an empty dictionary.
- spectral_search_results (dictionary of MS2SearchResults objects): A dictionary containing spectral search results for the dataset. Key is scan number : precursor mz. Initialized as an empty dictionary.
Methods
- get_parameters_json(). Returns the parameters used for the LC-MS analysis in JSON format.
- add_associated_ms2_dda(add_to_lcmsobj=True, auto_process=True, use_parser=True) Adds which MS2 scans are associated with each mass feature to the mass_features dictionary and optionally adds the MS2 spectra to the _ms dictionary.
- add_associated_ms1(add_to_lcmsobj=True, auto_process=True, use_parser=True) Adds the MS1 spectra associated with each mass feature to the mass_features dictionary and adds the MS1 spectra to the _ms dictionary.
- mass_features_to_df() Returns a pandas dataframe summarizing the mass features in the dataset.
- set_tic_list_from_data(overwrite=False) Sets the TIC list from the mass spectrum objects within the _ms dictionary.
- set_retention_time_from_data(overwrite=False) Sets the retention time list from the data in the _ms dictionary.
- set_scans_number_from_data(overwrite=False) Sets the scan number list from the data in the _ms dictionary.
- plot_composite_mz_features(binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False) Generates plot of M/Z features comparing scan time vs M/Z value
420 def __init__( 421 self, 422 file_location, 423 analyzer="Unknown", 424 instrument_label="Unknown", 425 sample_name=None, 426 spectra_parser=None, 427 ): 428 super().__init__( 429 file_location, analyzer, instrument_label, sample_name, spectra_parser 430 ) 431 self.polarity = "" 432 self._parameters = LCMSParameters() 433 self._retention_time_list = [] 434 self._scans_number_list = [] 435 self._tic_list = [] 436 self.eics = {} 437 self.mass_features = {} 438 self.spectral_search_results = {}
440 def get_parameters_json(self): 441 """Returns the parameters stored for the LC-MS object in JSON format. 442 443 Returns 444 -------- 445 str 446 The parameters used for the LC-MS analysis in JSON format. 447 """ 448 return self.parameters.to_json()
Returns the parameters stored for the LC-MS object in JSON format.
Returns
- str: The parameters used for the LC-MS analysis in JSON format.
450 def remove_unprocessed_data(self, ms_level=None): 451 """Removes the unprocessed data from the LCMSBase object. 452 453 Parameters 454 ----------- 455 ms_level : int, optional 456 The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels. 457 458 Raises 459 ------ 460 ValueError 461 If ms_level is not 1 or 2. 462 463 Notes 464 ----- 465 This method is useful for freeing up memory after the data has been processed. 466 """ 467 if ms_level is None: 468 for ms_level in self._ms_unprocessed.keys(): 469 self._ms_unprocessed[ms_level] = None 470 if ms_level not in [1, 2]: 471 raise ValueError("ms_level must be 1 or 2") 472 self._ms_unprocessed[ms_level] = None
Removes the unprocessed data from the LCMSBase object.
Parameters
- ms_level (int, optional): The MS level to remove the unprocessed data for. If None, removes unprocessed data for all MS levels.
Raises
- ValueError: If ms_level is not 1 or 2.
Notes
This method is useful for freeing up memory after the data has been processed.
474 def add_associated_ms2_dda( 475 self, 476 auto_process=True, 477 use_parser=True, 478 spectrum_mode=None, 479 ms_params_key="ms2", 480 scan_filter=None, 481 ): 482 """Add MS2 spectra associated with mass features to the dataset. 483 484 Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject) 485 486 Parameters 487 ----------- 488 auto_process : bool, optional 489 If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True. 490 use_parser : bool, optional 491 If True, envoke the spectra parser to get the MS2 spectra. Default is True. 492 spectrum_mode : str or None, optional 493 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 494 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 495 Defaults to None. (faster if defined, otherwise will check each scan) 496 ms_params_key : string, optional 497 The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. 498 Defaults to 'ms2'. 499 scan_filter : str 500 A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. 501 "hcd" will pull out only HCD scans. 502 503 Raises 504 ------ 505 ValueError 506 If mass_features is not set, must run find_mass_features() first. 507 If no MS2 scans are found in the dataset. 508 If no precursor m/z values are found in MS2 scans, not a DDA dataset. 509 """ 510 # Check if mass_features is set, raise error if not 511 if self.mass_features is None: 512 raise ValueError( 513 "mass_features not set, must run find_mass_features() first" 514 ) 515 516 # reconfigure ms_params to get the correct mass spectrum parameters from the key 517 ms_params = self.parameters.mass_spectrum[ms_params_key] 518 519 mf_df = self.mass_features_to_df().copy() 520 # Find ms2 scans that have a precursor m/z value 521 ms2_scans = self.scan_df[self.scan_df.ms_level == 2] 522 ms2_scans = ms2_scans[~ms2_scans.precursor_mz.isna()] 523 # drop ms2 scans that have no tic 524 ms2_scans = ms2_scans[ms2_scans.tic > 0] 525 if ms2_scans is None: 526 raise ValueError("No DDA scans found in dataset") 527 528 if scan_filter is not None: 529 ms2_scans = ms2_scans[ms2_scans.scan_text.str.contains(scan_filter)] 530 # set tolerance in rt space (in minutes) and mz space (in daltons) 531 time_tol = self.parameters.lc_ms.ms2_dda_rt_tolerance 532 mz_tol = self.parameters.lc_ms.ms2_dda_mz_tolerance 533 534 # for each mass feature, find the ms2 scans that are within the roi scan time and mz range 535 dda_scans = [] 536 for i, row in mf_df.iterrows(): 537 ms2_scans_filtered = ms2_scans[ 538 ms2_scans.scan_time.between( 539 row.scan_time - time_tol, row.scan_time + time_tol 540 ) 541 ] 542 ms2_scans_filtered = ms2_scans_filtered[ 543 ms2_scans_filtered.precursor_mz.between( 544 row.mz - mz_tol, row.mz + mz_tol 545 ) 546 ] 547 dda_scans = dda_scans + ms2_scans_filtered.scan.tolist() 548 self.mass_features[i].ms2_scan_numbers = ( 549 ms2_scans_filtered.scan.tolist() 550 + self.mass_features[i].ms2_scan_numbers 551 ) 552 # add to _ms attribute 553 self.add_mass_spectra( 554 scan_list=list(set(dda_scans)), 555 auto_process=auto_process, 556 spectrum_mode=spectrum_mode, 557 use_parser=use_parser, 558 ms_params=ms_params, 559 ) 560 # associate appropriate _ms attribute to appropriate mass feature's ms2_mass_spectra attribute 561 for mf_id in self.mass_features: 562 if self.mass_features[mf_id].ms2_scan_numbers is not None: 563 for dda_scan in self.mass_features[mf_id].ms2_scan_numbers: 564 if dda_scan in self._ms.keys(): 565 self.mass_features[mf_id].ms2_mass_spectra[dda_scan] = self._ms[ 566 dda_scan 567 ]
Add MS2 spectra associated with mass features to the dataset.
Populates the mass_features ms2_scan_numbers attribute (on mass_features dictionary on LCMSObject)
Parameters
- auto_process (bool, optional): If True, auto-processes the MS2 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, envoke the spectra parser to get the MS2 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
- ms_params_key (string, optional): The key of the mass spectrum parameters to use for the mass spectra, accessed from the LCMSObject.parameters.mass_spectrum attribute. Defaults to 'ms2'.
- scan_filter (str): A string to filter the scans to add to the _ms dictionary. If None, all scans are added. Defaults to None. "hcd" will pull out only HCD scans.
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If no MS2 scans are found in the dataset. If no precursor m/z values are found in MS2 scans, not a DDA dataset.
569 def add_associated_ms1( 570 self, auto_process=True, use_parser=True, spectrum_mode=None 571 ): 572 """Add MS1 spectra associated with mass features to the dataset. 573 574 Parameters 575 ----------- 576 auto_process : bool, optional 577 If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True. 578 use_parser : bool, optional 579 If True, envoke the spectra parser to get the MS1 spectra. Default is True. 580 spectrum_mode : str or None, optional 581 The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode 582 from the spectra parser to ascertain the spectrum mode (this allows for mixed types). 583 Defaults to None. (faster if defined, otherwise will check each scan) 584 585 Raises 586 ------ 587 ValueError 588 If mass_features is not set, must run find_mass_features() first. 589 If apex scans are not profile mode, all apex scans must be profile mode for averaging. 590 If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). 591 If deconvolute is True and no EICs are found, did you run integrate_mass_features() first? 592 """ 593 # Check if mass_features is set, raise error if not 594 if self.mass_features is None: 595 raise ValueError( 596 "mass_features not set, must run find_mass_features() first" 597 ) 598 scans_to_average = self.parameters.lc_ms.ms1_scans_to_average 599 600 if scans_to_average == 1: 601 # Add to LCMSobj 602 self.add_mass_spectra( 603 scan_list=[ 604 int(mf.apex_scan) for mf in self.mass_features.values() 605 ], 606 auto_process=auto_process, 607 use_parser=use_parser, 608 spectrum_mode=spectrum_mode, 609 ms_params=self.parameters.mass_spectrum["ms1"], 610 ) 611 612 elif ( 613 (scans_to_average - 1) % 2 614 ) == 0: # scans_to_average = 3, 5, 7 etc, mirror l/r around apex 615 apex_scans = list(set([int(mf.apex_scan) for mf in self.mass_features.values()])) 616 # Check if all apex scans are profile mode, raise error if not 617 if not all(self.scan_df.loc[apex_scans, "ms_format"] == "profile"): 618 raise ValueError("All apex scans must be profile mode for averaging") 619 620 # First get sets of scans to average 621 def get_scans_from_apex(ms1_scans, apex_scan, scans_to_average): 622 ms1_idx_start = ms1_scans.index(apex_scan) - int( 623 (scans_to_average - 1) / 2 624 ) 625 if ms1_idx_start < 0: 626 ms1_idx_start = 0 627 ms1_idx_end = ( 628 ms1_scans.index(apex_scan) + int((scans_to_average - 1) / 2) + 1 629 ) 630 if ms1_idx_end > (len(ms1_scans) - 1): 631 ms1_idx_end = len(ms1_scans) - 1 632 scan_list = ms1_scans[ms1_idx_start:ms1_idx_end] 633 return scan_list 634 635 ms1_scans = self.ms1_scans 636 scans_lists = [ 637 get_scans_from_apex(ms1_scans, apex_scan, scans_to_average) 638 for apex_scan in apex_scans 639 ] 640 641 # set polarity to -1 if negative mode, 1 if positive mode (for mass spectrum creation) 642 if self.polarity == "negative": 643 polarity = -1 644 elif self.polarity == "positive": 645 polarity = 1 646 647 if not use_parser: 648 # Perform checks and prepare _ms_unprocessed dictionary if use_parser is False (saves time to do this once) 649 ms1_unprocessed = self._ms_unprocessed[1].copy() 650 # Set the index on _ms_unprocessed[1] to scan number 651 ms1_unprocessed = ms1_unprocessed.set_index("scan", drop=False) 652 self._ms_unprocessed[1] = ms1_unprocessed 653 654 # Check that all the scans in scan_lists are indexs in self._ms_unprocessed[1] 655 scans_lists_flat = list( 656 set([scan for sublist in scans_lists for scan in sublist]) 657 ) 658 if ( 659 len( 660 np.setdiff1d( 661 np.sort(scans_lists_flat), 662 np.sort(ms1_unprocessed.index.values), 663 ) 664 ) 665 > 0 666 ): 667 raise ValueError( 668 "Not all scans to average are present in the unprocessed data" 669 ) 670 671 for scan_list_average, apex_scan in zip(scans_lists, apex_scans): 672 # Get unprocessed mass spectrum from scans 673 ms = self.get_average_mass_spectrum( 674 scan_list=scan_list_average, 675 apex_scan=apex_scan, 676 spectrum_mode="profile", 677 ms_level=1, 678 auto_process=auto_process, 679 use_parser=use_parser, 680 perform_checks=False, 681 polarity=polarity, 682 ms_params=self.parameters.mass_spectrum["ms1"], 683 ) 684 # Add mass spectrum to LCMS object and associated with mass feature 685 self.add_mass_spectrum(ms) 686 687 if not use_parser: 688 # Reset the index on _ms_unprocessed[1] to not be scan number 689 ms1_unprocessed = ms1_unprocessed.reset_index(drop=True) 690 self._ms_unprocessed[1] = ms1_unprocessed 691 else: 692 raise ValueError( 693 "Number of scans to average must be 1 or an integer with an integer median (i.e. 3, 5, 7, 9)" 694 ) 695 696 # Associate the ms1 spectra with the mass features 697 for mf_id in self.mass_features: 698 self.mass_features[mf_id].mass_spectrum = self._ms[ 699 self.mass_features[mf_id].apex_scan 700 ] 701 self.mass_features[mf_id].update_mz()
Add MS1 spectra associated with mass features to the dataset.
Parameters
- auto_process (bool, optional): If True, auto-processes the MS1 spectra before adding it to the object's _ms dictionary. Default is True.
- use_parser (bool, optional): If True, envoke the spectra parser to get the MS1 spectra. Default is True.
- spectrum_mode (str or None, optional): The spectrum mode to use for the mass spectra. If None, method will use the spectrum mode from the spectra parser to ascertain the spectrum mode (this allows for mixed types). Defaults to None. (faster if defined, otherwise will check each scan)
Raises
- ValueError: If mass_features is not set, must run find_mass_features() first. If apex scans are not profile mode, all apex scans must be profile mode for averaging. If number of scans to average is not 1 or an integer with an integer median (i.e. 3, 5, 7, 9). If deconvolute is True and no EICs are found, did you run integrate_mass_features() first?
703 def mass_features_to_df(self): 704 """Returns a pandas dataframe summarizing the mass features. 705 706 The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, 707 persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID). 708 709 Returns 710 -------- 711 pandas.DataFrame 712 A pandas dataframe of mass features with the following columns: 713 mf_id, mz, apex_scan, scan_time, intensity, persistence, area. 714 """ 715 716 def mass_spectrum_to_string( 717 mass_spec, normalize=True, min_normalized_abun=0.01 718 ): 719 """Converts a mass spectrum to a string of m/z:abundance pairs. 720 721 Parameters 722 ----------- 723 mass_spec : MassSpectrum 724 A MassSpectrum object to be converted to a string. 725 normalize : bool, optional 726 If True, normalizes the abundance values to a maximum of 1. Defaults to True. 727 min_normalized_abun : float, optional 728 The minimum normalized abundance value to include in the string, only used if normalize is True. Defaults to 0.01. 729 730 Returns 731 -------- 732 str 733 A string of m/z:abundance pairs from the mass spectrum, separated by a semicolon. 734 """ 735 mz_np = mass_spec.to_dataframe()["m/z"].values 736 abun_np = mass_spec.to_dataframe()["Peak Height"].values 737 if normalize: 738 abun_np = abun_np / abun_np.max() 739 mz_abun = np.column_stack((mz_np, abun_np)) 740 if normalize: 741 mz_abun = mz_abun[mz_abun[:, 1] > min_normalized_abun] 742 mz_abun_str = [ 743 str(round(mz, ndigits=4)) + ":" + str(round(abun, ndigits=2)) 744 for mz, abun in mz_abun 745 ] 746 return "; ".join(mz_abun_str) 747 748 cols_in_df = [ 749 "id", 750 "apex_scan", 751 "start_scan", 752 "final_scan", 753 "retention_time", 754 "intensity", 755 "persistence", 756 "area", 757 "dispersity_index", 758 "normalized_dispersity_index", 759 "tailing_factor", 760 "gaussian_similarity", 761 "noise_score", 762 "noise_score_min", 763 "noise_score_max", 764 "monoisotopic_mf_id", 765 "isotopologue_type", 766 "mass_spectrum_deconvoluted_parent", 767 ] 768 df_mf_list = [] 769 for mf_id in self.mass_features.keys(): 770 # Find cols_in_df that are in single_mf 771 df_keys = list( 772 set(cols_in_df).intersection(self.mass_features[mf_id].__dir__()) 773 ) 774 dict_mf = {} 775 # Get the values for each key in df_keys from the mass feature object 776 for key in df_keys: 777 dict_mf[key] = getattr(self.mass_features[mf_id], key) 778 # Special handling for mass_spectrum and associated_mass_features_deconvoluted, since they are not single values 779 if len(self.mass_features[mf_id].ms2_scan_numbers) > 0: 780 # Add MS2 spectra info 781 best_ms2_spectrum = self.mass_features[mf_id].best_ms2 782 dict_mf["ms2_spectrum"] = mass_spectrum_to_string(best_ms2_spectrum) 783 if len(self.mass_features[mf_id].associated_mass_features_deconvoluted) > 0: 784 dict_mf["associated_mass_features"] = ", ".join( 785 map( 786 str, 787 self.mass_features[mf_id].associated_mass_features_deconvoluted, 788 ) 789 ) 790 # Check if EIC for mass feature is set 791 df_mf_single = pd.DataFrame(dict_mf, index=[mf_id]) 792 df_mf_single["mz"] = self.mass_features[mf_id].mz 793 df_mf_list.append(df_mf_single) 794 df_mf = pd.concat(df_mf_list) 795 796 # rename _area to area and id to mf_id 797 df_mf = df_mf.rename( 798 columns={ 799 "id": "mf_id", 800 "retention_time": "scan_time", 801 } 802 ) 803 804 # reorder columns 805 col_order = [ 806 "mf_id", 807 "scan_time", 808 "mz", 809 "apex_scan", 810 "start_scan", 811 "final_scan", 812 "intensity", 813 "persistence", 814 "area", 815 "half_height_width", 816 "tailing_factor", 817 "dispersity_index", 818 "normalized_dispersity_index", 819 "gaussian_similarity", 820 "noise_score", 821 "noise_score_min", 822 "noise_score_max", 823 "monoisotopic_mf_id", 824 "isotopologue_type", 825 "mass_spectrum_deconvoluted_parent", 826 "associated_mass_features", 827 "ms2_spectrum", 828 ] 829 # drop columns that are not in col_order 830 cols_to_order = [col for col in col_order if col in df_mf.columns] 831 df_mf = df_mf[cols_to_order] 832 833 # reset index to mf_id 834 df_mf = df_mf.set_index("mf_id") 835 df_mf.index.name = "mf_id" 836 837 return df_mf
Returns a pandas dataframe summarizing the mass features.
The dataframe contains the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area, monoisotopic_mf_id, and isotopologue_type. The index is set to mf_id (mass feature ID).
Returns
- pandas.DataFrame: A pandas dataframe of mass features with the following columns: mf_id, mz, apex_scan, scan_time, intensity, persistence, area.
839 def mass_features_ms1_annot_to_df(self): 840 """Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset. 841 842 Returns 843 -------- 844 pandas.DataFrame 845 A pandas dataframe of MS1 annotations for the mass features in the dataset. 846 The index is set to mf_id (mass feature ID) 847 848 Raises 849 ------ 850 Warning 851 If no MS1 annotations were found for the mass features in the dataset. 852 """ 853 annot_df_list_ms1 = [] 854 for mf_id in self.mass_features.keys(): 855 if self.mass_features[mf_id].mass_spectrum is None: 856 pass 857 else: 858 # Add ms1 annotations to ms1 annotation list 859 if ( 860 np.abs( 861 ( 862 self.mass_features[mf_id].ms1_peak.mz_exp 863 - self.mass_features[mf_id].mz 864 ) 865 ) 866 < 0.01 867 ): 868 # Get the molecular formula from the mass spectrum 869 annot_df = self.mass_features[mf_id].mass_spectrum.to_dataframe() 870 # Subset to pull out only the peak associated with the mass feature 871 annot_df = annot_df[ 872 annot_df["Index"] == self.mass_features[mf_id].ms1_peak.index 873 ].copy() 874 875 # If there are more than 1 row, remove any rows without a molecular formula 876 if len(annot_df) > 1: 877 annot_df = annot_df[~annot_df["Molecular Formula"].isna()] 878 879 # Remove the index column and add column for mf_id 880 annot_df = annot_df.drop(columns=["Index"]) 881 annot_df["mf_id"] = mf_id 882 annot_df_list_ms1.append(annot_df) 883 884 if len(annot_df_list_ms1) > 0: 885 annot_ms1_df_full = pd.concat(annot_df_list_ms1) 886 annot_ms1_df_full = annot_ms1_df_full.set_index("mf_id") 887 annot_ms1_df_full.index.name = "mf_id" 888 889 else: 890 annot_ms1_df_full = None 891 # Warn that no ms1 annotations were found 892 warnings.warn( 893 "No MS1 annotations found for mass features in dataset, were MS1 spectra added and processed within the dataset?", 894 UserWarning, 895 ) 896 897 return annot_ms1_df_full
Returns a pandas dataframe summarizing the MS1 annotations for the mass features in the dataset.
Returns
- pandas.DataFrame: A pandas dataframe of MS1 annotations for the mass features in the dataset. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS1 annotations were found for the mass features in the dataset.
899 def mass_features_ms2_annot_to_df(self, molecular_metadata=None): 900 """Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset. 901 902 Parameters 903 ----------- 904 molecular_metadata : dict of MolecularMetadata objects 905 A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None. 906 907 Returns 908 -------- 909 pandas.DataFrame 910 A pandas dataframe of MS2 annotations for the mass features in the dataset, 911 and optionally molecular metadata. The index is set to mf_id (mass feature ID) 912 913 Raises 914 ------ 915 Warning 916 If no MS2 annotations were found for the mass features in the dataset. 917 """ 918 annot_df_list_ms2 = [] 919 for mf_id in self.mass_features.keys(): 920 if len(self.mass_features[mf_id].ms2_similarity_results) > 0: 921 # Add ms2 annotations to ms2 annotation list 922 for result in self.mass_features[mf_id].ms2_similarity_results: 923 annot_df_ms2 = result.to_dataframe() 924 annot_df_ms2["mf_id"] = mf_id 925 annot_df_list_ms2.append(annot_df_ms2) 926 927 if len(annot_df_list_ms2) > 0: 928 annot_ms2_df_full = pd.concat(annot_df_list_ms2) 929 if molecular_metadata is not None: 930 molecular_metadata_df = pd.concat( 931 [ 932 pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() 933 for k, v in molecular_metadata.items() 934 ], 935 ignore_index=True, 936 ) 937 molecular_metadata_df = molecular_metadata_df.rename( 938 columns={"id": "ref_mol_id"} 939 ) 940 annot_ms2_df_full = annot_ms2_df_full.merge( 941 molecular_metadata_df, on="ref_mol_id", how="left" 942 ) 943 annot_ms2_df_full = annot_ms2_df_full.drop_duplicates( 944 subset=["mf_id", "query_spectrum_id", "ref_ms_id"] 945 ).copy() 946 annot_ms2_df_full = annot_ms2_df_full.set_index("mf_id") 947 annot_ms2_df_full.index.name = "mf_id" 948 else: 949 annot_ms2_df_full = None 950 # Warn that no ms2 annotations were found 951 warnings.warn( 952 "No MS2 annotations found for mass features in dataset, were MS2 spectra added and searched against a database?", 953 UserWarning, 954 ) 955 956 return annot_ms2_df_full
Returns a pandas dataframe summarizing the MS2 annotations for the mass features in the dataset.
Parameters
- molecular_metadata (dict of MolecularMetadata objects): A dictionary of MolecularMetadata objects, keyed by metabref_mol_id. Defaults to None.
Returns
- pandas.DataFrame: A pandas dataframe of MS2 annotations for the mass features in the dataset, and optionally molecular metadata. The index is set to mf_id (mass feature ID)
Raises
- Warning: If no MS2 annotations were found for the mass features in the dataset.
958 def plot_composite_mz_features(self, binsize = 1e-4, ph_int_min_thresh = 0.001, mf_plot = True, ms2_plot = True, return_fig = False): 959 """Returns a figure displaying 960 (1) thresholded, unprocessed data 961 (2) the m/z features 962 (3) which m/z features are associated with MS2 spectra 963 964 Parameters 965 ----------- 966 binsize : float 967 Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4. 968 mf_plot : boolean 969 Indicates whether to plot the m/z features. Defaults to True. 970 ms2_plot : boolean 971 Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True. 972 return_fig : boolean 973 Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False. 974 975 Returns 976 -------- 977 matplotlib.pyplot.Figure 978 A figure with the thresholded, unprocessed data on an axis of m/z value with respect to 979 scan time. Unprocessed data is displayed in gray scale with darker colors indicating 980 higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z 981 features with associated with MS2 spectra are plotted, they are displayed in red. 982 983 Raises 984 ------ 985 Warning 986 If m/z features are set to be plot but aren't in the dataset. 987 If m/z features with associated MS2 data are set to be plot but no MS2 annotations 988 were found for the m/z features in the dataset. 989 """ 990 if mf_plot: 991 # Check if mass_features is set, raise error if not 992 if self.mass_features is None: 993 raise ValueError( 994 "mass_features not set, must run find_mass_features() first" 995 ) 996 ## call mass feature data 997 mf_df = self.mass_features_to_df() 998 999 if ms2_plot: 1000 if not mf_plot: 1001 # Check if mass_features is set, raise error if not 1002 if self.mass_features is None: 1003 raise ValueError( 1004 "mass_features not set, must run find_mass_features() first" 1005 ) 1006 1007 ## call m/z feature data 1008 mf_df = self.mass_features_to_df() 1009 1010 # Check if ms2_spectrum is set, raise error if not 1011 if 'ms2_spectrum' not in mf_df.columns: 1012 raise ValueError( 1013 "ms2_spectrum not set, must run add_associated_ms2_dda() first" 1014 ) 1015 1016 ## threshold and grid unprocessed data 1017 df = self._ms_unprocessed[1].copy() 1018 df = df.dropna(subset=['intensity']).reset_index(drop = True) 1019 threshold = ph_int_min_thresh * df.intensity.max() 1020 df_thres = df[df["intensity"] > threshold].reset_index(drop = True).copy() 1021 df = self.grid_data(df_thres) 1022 1023 ## format unprocessed data for plotting 1024 df = df.merge(self.scan_df[['scan', 'scan_time']], on = 'scan') 1025 mz_grid = np.arange(0, np.max(df.mz), binsize) 1026 mz_data = np.array(df.mz) 1027 df['mz_bin'] = find_closest(mz_grid, mz_data) 1028 df['ab_bin'] = df.groupby(['mz_bin', 'scan_time']).intensity.transform(sum) 1029 unproc_df = df[['scan_time', 'mz_bin', 'ab_bin']].drop_duplicates(ignore_index = True) 1030 1031 ## generate figure 1032 fig = plt.figure() 1033 plt.scatter( 1034 unproc_df.scan_time, 1035 unproc_df.mz_bin*binsize, 1036 c = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1037 alpha = unproc_df.ab_bin/np.max(unproc_df.ab_bin), 1038 cmap = 'Greys_r', 1039 s = 1 1040 ) 1041 1042 if mf_plot: 1043 if ms2_plot: 1044 plt.scatter( 1045 mf_df[mf_df.ms2_spectrum.isna()].scan_time, 1046 mf_df[mf_df.ms2_spectrum.isna()].mz, 1047 c = 'c', 1048 s = 4, 1049 label = 'M/Z features without MS2' 1050 ) 1051 else: 1052 plt.scatter( 1053 mf_df.scan_time, 1054 mf_df.mz, 1055 c = 'c', 1056 s = 4, 1057 label = 'M/Z features' 1058 ) 1059 1060 if ms2_plot: 1061 plt.scatter( 1062 mf_df[~mf_df.ms2_spectrum.isna()].scan_time, 1063 mf_df[~mf_df.ms2_spectrum.isna()].mz, 1064 c = 'r', 1065 s = 2, 1066 label = 'M/Z features with MS2' 1067 ) 1068 1069 if mf_plot == True or ms2_plot == True: 1070 plt.legend(loc = 'lower center', bbox_to_anchor = (0.5, -0.25), ncol = 2) 1071 plt.xlabel('Scan time') 1072 plt.ylabel('m/z') 1073 plt.ylim(0, np.ceil(np.max(df.mz))) 1074 plt.xlim(0, np.ceil(np.max(df.scan_time))) 1075 plt.title('Composite Feature Map') 1076 1077 if return_fig: 1078 plt.close(fig) 1079 return fig 1080 1081 else: 1082 plt.show()
Returns a figure displaying (1) thresholded, unprocessed data (2) the m/z features (3) which m/z features are associated with MS2 spectra
Parameters
- binsize (float): Desired binsize for the m/z axis of the composite feature map. Defaults to 1e-4.
- mf_plot (boolean): Indicates whether to plot the m/z features. Defaults to True.
- ms2_plot (boolean): Indicates whether to identify m/z features with associated MS2 spectra. Defaults to True.
- return_fig (boolean): Indicates whether to plot composite feature map (False) or return figure object (True). Defaults to False.
Returns
- matplotlib.pyplot.Figure: A figure with the thresholded, unprocessed data on an axis of m/z value with respect to scan time. Unprocessed data is displayed in gray scale with darker colors indicating higher intensities. If m/z features are plotted, they are displayed in cyan. If m/z features with associated with MS2 spectra are plotted, they are displayed in red.
Raises
- Warning: If m/z features are set to be plot but aren't in the dataset. If m/z features with associated MS2 data are set to be plot but no MS2 annotations were found for the m/z features in the dataset.
1121 def set_tic_list_from_data(self, overwrite=False): 1122 """Sets the TIC list from the mass spectrum objects within the _ms dictionary. 1123 1124 Parameters 1125 ----------- 1126 overwrite : bool, optional 1127 If True, overwrites the TIC list if it is already set. Defaults to False. 1128 1129 Notes 1130 ----- 1131 If the _ms dictionary is incomplete, sets the TIC list to an empty list. 1132 1133 Raises 1134 ------ 1135 ValueError 1136 If no mass spectra are found in the dataset. 1137 If the TIC list is already set and overwrite is False. 1138 """ 1139 # Check if _ms is empty and raise error if so 1140 if len(self._ms) == 0: 1141 raise ValueError("No mass spectra found in dataset") 1142 1143 # Check if tic_list is already set and raise error if so 1144 if len(self.tic) > 0 and not overwrite: 1145 raise ValueError("TIC list already set, use overwrite=True to overwrite") 1146 1147 self.tic = [self._ms.get(i).tic for i in self.scans_number]
Sets the TIC list from the mass spectrum objects within the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the TIC list if it is already set. Defaults to False.
Notes
If the _ms dictionary is incomplete, sets the TIC list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the TIC list is already set and overwrite is False.
1149 def set_retention_time_from_data(self, overwrite=False): 1150 """Sets the retention time list from the data in the _ms dictionary. 1151 1152 Parameters 1153 ----------- 1154 overwrite : bool, optional 1155 If True, overwrites the retention time list if it is already set. Defaults to False. 1156 1157 Notes 1158 ----- 1159 If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list. 1160 1161 Raises 1162 ------ 1163 ValueError 1164 If no mass spectra are found in the dataset. 1165 If the retention time list is already set and overwrite is False. 1166 """ 1167 # Check if _ms is empty and raise error if so 1168 if len(self._ms) == 0: 1169 raise ValueError("No mass spectra found in dataset") 1170 1171 # Check if retention_time_list is already set and raise error if so 1172 if len(self.retention_time) > 0 and not overwrite: 1173 raise ValueError( 1174 "Retention time list already set, use overwrite=True to overwrite" 1175 ) 1176 1177 retention_time_list = [] 1178 for key_ms in sorted(self._ms.keys()): 1179 retention_time_list.append(self._ms.get(key_ms).retention_time) 1180 self.retention_time = retention_time_list
Sets the retention time list from the data in the _ms dictionary.
Parameters
- overwrite (bool, optional): If True, overwrites the retention time list if it is already set. Defaults to False.
Notes
If the _ms dictionary is empty or incomplete, sets the retention time list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the retention time list is already set and overwrite is False.
1182 def set_scans_number_from_data(self, overwrite=False): 1183 """Sets the scan number list from the data in the _ms dictionary. 1184 1185 Notes 1186 ----- 1187 If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list. 1188 1189 Raises 1190 ------ 1191 ValueError 1192 If no mass spectra are found in the dataset. 1193 If the scan number list is already set and overwrite is False. 1194 """ 1195 # Check if _ms is empty and raise error if so 1196 if len(self._ms) == 0: 1197 raise ValueError("No mass spectra found in dataset") 1198 1199 # Check if scans_number_list is already set and raise error if so 1200 if len(self.scans_number) > 0 and not overwrite: 1201 raise ValueError( 1202 "Scan number list already set, use overwrite=True to overwrite" 1203 ) 1204 1205 self.scans_number = sorted(self._ms.keys())
Sets the scan number list from the data in the _ms dictionary.
Notes
If the _ms dictionary is empty or incomplete, sets the scan number list to an empty list.
Raises
- ValueError: If no mass spectra are found in the dataset. If the scan number list is already set and overwrite is False.
Inherited Members
- MassSpectraBase
- file_location
- analyzer
- instrument_label
- spectra_parser
- add_mass_spectrum
- add_mass_spectra
- get_time_of_scan_id
- scan_df
- ms
- corems.mass_spectra.calc.lc_calc.LCCalculations
- get_max_eic
- smooth_tic
- eic_centroid_detector
- find_nearest_scan
- add_peak_metrics
- get_average_mass_spectrum
- find_mass_features
- integrate_mass_features
- find_c13_mass_features
- deconvolute_ms1_mass_features