corems.mass_spectra.input.corems_hdf5
__author__ = "Yuri E. Corilo"
__date__ = "Oct 29, 2019"


import warnings
from threading import Thread
from pathlib import Path

import pandas as pd

from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature
from corems.encapsulation.input.parameter_from_json import (
    load_and_set_json_parameters_lcms,
    load_and_set_toml_parameters_lcms,
)
from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase
from corems.mass_spectra.factory.chromat_data import EIC_Data
from corems.mass_spectra.input.parserbase import SpectraParserInterface
from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum
from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults
from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader
from corems.mass_spectra.input.mzml import MZMLSpectraParser


class ReadCoreMSHDFMassSpectra(
    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
):
    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.

    Parameters
    ----------
    file_location : str
        The location of the HDF5 file to read, including the suffix.

    Attributes
    ----------
    file_location : str
        The location of the HDF5 file to read.
    h5pydata : h5py.File
        The HDF5 file object.
    scans : list
        A list of the location of individual mass spectra within the HDF5 file.
    scan_number_list : list
        A list of the scan numbers of the mass spectra within the HDF5 file.
    parameters_location : str
        The location of the parameters file (json or toml), or None if no
        parameters file was found next to the HDF5 file.

    Methods
    -------
    * import_mass_spectra(mass_spectra).
        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
    * get_mass_spectrum_from_scan(scan_number).
        Return mass spectrum data object from scan number.
    * load().
        Placeholder method to meet the requirements of the SpectraParserInterface.
    * get_ms_raw(spectra, scan_df).
        Return the saved unprocessed mass spectra as a dictionary of DataFrames.
    * get_scan_df().
        Return the saved scan info as a pandas DataFrame.
    * run(mass_spectra).
        Runs the importer functions to populate a LCMS or MassSpectraBase object.
    * import_scan_info(mass_spectra).
        Imports the scan info from the HDF5 file to populate the scan_df attribute
        on the LCMS or MassSpectraBase object
    * import_ms_unprocessed(mass_spectra).
        Imports the unprocessed mass spectra from the HDF5 file to populate the
        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
    * import_parameters(mass_spectra).
        Imports the parameters from the parameters file to populate the parameters
        attribute on the LCMS or MassSpectraBase object
    * import_mass_features(mass_spectra).
        Imports the mass features from the HDF5 file to populate the mass_features
        attribute on the LCMS or MassSpectraBase object
    * import_eics(mass_spectra).
        Imports the extracted ion chromatograms from the HDF5 file to populate the
        eics attribute on the LCMS or MassSpectraBase object
    * import_spectral_search_results(mass_spectra).
        Imports the spectral search results from the HDF5 file to populate the
        spectral_search_results attribute on the LCMS or MassSpectraBase object
    * get_mass_spectra_obj().
        Return mass spectra data object, populating the _ms list on the LCMS or
        MassSpectraBase object from the HDF5 file
    * get_lcms_obj().
        Return LCMSBase object, populating the majority of the attributes on the
        LCMS object from the HDF5 file

    """

    def __init__(self, file_location: str):
        Thread.__init__(self)
        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)

        # Override the scans attribute set by ReadCoreMSHDF_MassSpectrum: here the
        # spectra live in a nested "mass_spectra" group within the HDF5 file.
        scan_keys = list(self.h5pydata["mass_spectra"].keys())
        self.scans = ["mass_spectra/" + x for x in scan_keys]
        self.scan_number_list = sorted(int(float(i)) for i in scan_keys)

        # Look for a parameters file stored next to the HDF5 file under the same
        # name but a different suffix; JSON takes precedence over TOML.
        add_files = [
            x
            for x in self.file_location.parent.glob(
                self.file_location.name.replace(".hdf5", ".*")
            )
            if x.suffix != ".hdf5"
        ]
        json_files = [x for x in add_files if x.suffix == ".json"]
        toml_files = [x for x in add_files if x.suffix == ".toml"]
        if json_files:
            self.parameters_location = json_files[0]
        elif toml_files:
            self.parameters_location = toml_files[0]
        else:
            self.parameters_location = None

    def get_mass_spectrum_from_scan(self, scan_number):
        """Return mass spectrum data object from scan number.

        Parameters
        ----------
        scan_number : int
            The scan number to retrieve; must be present in scan_number_list.

        Raises
        ------
        Exception
            If the scan number is not found in the HDF5 file.
        """
        if scan_number not in self.scan_number_list:
            raise Exception("Scan number not found in HDF5 file.")
        return self.get_mass_spectrum(scan_number)

    def load(self) -> None:
        """Placeholder method to meet the requirements of the SpectraParserInterface."""
        pass

    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
        """Return the unprocessed mass spectra saved in the HDF5 file.

        Parameters
        ----------
        spectra : optional
            Not used for CoreMS HDF5 files; a warning is issued if provided.
        scan_df : optional
            Not used for CoreMS HDF5 files; a warning is issued if provided.

        Returns
        -------
        dict
            One pandas DataFrame (columns 'scan', 'mz', 'intensity') per dataset in
            the 'ms_unprocessed' group, keyed by the dataset name converted to int.
        """
        # Only the saved data can be returned, so any selection arguments are ignored;
        # tell the caller rather than silently dropping them.
        if spectra is not None or scan_df is not None:
            warnings.warn(
                "get_ms_raw method for CoreMS HDF5 files can only access saved data, "
                "consider rerunning after instantiation.",
                SyntaxWarning,
                stacklevel=2,
            )
        dict_group_load = self.h5pydata["ms_unprocessed"]
        return {
            int(k): pd.DataFrame(
                dict_group_load[k][:], columns=["scan", "mz", "intensity"]
            )
            for k in dict_group_load.keys()
        }

    def get_scan_df(self) -> pd.DataFrame:
        """Return the scan info saved in the HDF5 file as a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            One column per dataset in the 'scan_info' group, indexed by the 'scan'
            column (which is also kept as a regular column). Object-dtype columns
            are decoded as UTF-8.
        """
        dict_group_load = self.h5pydata["scan_info"]
        scan_info = {k: dict_group_load[k][:] for k in dict_group_load.keys()}
        scan_df = pd.DataFrame(scan_info)
        scan_df.set_index("scan", inplace=True, drop=False)

        # Decode the object columns in a single pass; skipped when there is nothing
        # to decode.
        str_df = scan_df.select_dtypes([object])
        if not str_df.empty:
            decoded = str_df.stack().str.decode("utf-8").unstack()
            for col in decoded:
                scan_df[col] = decoded[col]
        return scan_df

    def run(self, mass_spectra, load_raw=True) -> None:
        """Runs the importer functions to populate a LCMS or MassSpectraBase object.

        Notes
        -----
        The following functions are run in order, if the HDF5 file contains the necessary data:
        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
        3. import_scan_info(), which populates the scan_df on the LCMS or MassSpectraBase object.
        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.

        Parameters
        ----------
        mass_spectra : LCMSBase or MassSpectraBase
            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

        Returns
        -------
        None, but populates several attributes on the LCMS or MassSpectraBase object.

        """
        if self.parameters_location is not None:
            # Populate the parameters attribute on the LCMS object
            self.import_parameters(mass_spectra)

        if "mass_spectra" in self.h5pydata:
            # Populate the _ms list on the LCMS object
            self.import_mass_spectra(mass_spectra, load_raw=load_raw)

        if "scan_info" in self.h5pydata:
            # Populate the scan_df attribute on the LCMS object
            self.import_scan_info(mass_spectra)

        if "ms_unprocessed" in self.h5pydata and load_raw:
            # Populate the _ms_unprocessed attribute on the LCMS object
            self.import_ms_unprocessed(mass_spectra)

        if "mass_features" in self.h5pydata:
            # Populate the mass_features attribute on the LCMS object
            self.import_mass_features(mass_spectra)

        if "eics" in self.h5pydata:
            # Populate the eics attribute on the LCMS object
            self.import_eics(mass_spectra)

        if "spectral_search_results" in self.h5pydata:
            # Populate the spectral_search_results attribute on the LCMS object
            self.import_spectral_search_results(mass_spectra)

    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
        """Imports all mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass spectra.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra.
            Default is True.

        Returns
        -------
        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
        object with mass spectra from the HDF5 file.
        """
        for scan_number in self.scan_number_list:
            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
            mass_spec.scan_number = scan_number
            mass_spectra.add_mass_spectrum(mass_spec)

    def import_scan_info(self, mass_spectra) -> None:
        """Imports the scan info from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with scan info.

        Returns
        -------
        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.

        """
        mass_spectra.scan_df = self.get_scan_df()

    def import_ms_unprocessed(self, mass_spectra) -> None:
        """Imports the unprocessed mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with unprocessed spectra.

        Returns
        -------
        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.

        """
        mass_spectra._ms_unprocessed = self.get_ms_raw()

    def import_parameters(self, mass_spectra) -> None:
        """Imports the parameters from the parameters file found next to the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with parameters.

        Returns
        -------
        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
        object from the JSON or TOML file at 'parameters_location'.

        Raises
        ------
        Exception
            If the parameters file is neither a JSON nor a TOML file.

        """
        suffix = self.parameters_location.suffix
        if suffix == ".json":
            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
        elif suffix == ".toml":
            # Must be `elif`: with a bare `if`, a JSON file would be loaded and then
            # fall through to the `else` below and raise.
            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
        else:
            raise Exception("Parameters file must be in JSON or TOML format.")

    def import_mass_features(self, mass_spectra) -> None:
        """Imports the mass features from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass features.

        Returns
        -------
        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'mass_features' from the HDF5 file.

        """
        # HDF5 attributes that are not copied onto the mass feature with setattr;
        # all but '_mz_cal' are passed to the constructor instead.
        skipped_attrs = {
            "_mz_exp",
            "_mz_cal",
            "_retention_time",
            "_intensity",
            "_apex_scan",
            "_persistence",
        }

        dict_group_load = self.h5pydata["mass_features"]
        for k in dict_group_load.keys():
            mf_group = dict_group_load[k]
            mf_attrs = mf_group.attrs

            # Instantiate the MassFeature object
            mass_feature = LCMSMassFeature(
                mass_spectra,
                mz=mf_attrs["_mz_exp"],
                retention_time=mf_attrs["_retention_time"],
                intensity=mf_attrs["_intensity"],
                apex_scan=mf_attrs["_apex_scan"],
                persistence=mf_attrs["_persistence"],
                id=int(k),
            )

            # Populate additional (scalar) attributes on the MassFeature object
            for key in mf_attrs.keys() - skipped_attrs:
                setattr(mass_feature, key, mf_attrs[key])

            # Populate attributes on MassFeature object that are stored as datasets
            for key in mf_group.keys():
                setattr(mass_feature, key, mf_group[key][:])

            mass_spectra.mass_features[int(k)] = mass_feature

        # Associate mass features with ms1 and ms2 spectra, if available
        ms_by_scan = mass_spectra._ms
        for mass_feature in mass_spectra.mass_features.values():
            if mass_feature.apex_scan in ms_by_scan:
                mass_feature.mass_spectrum = ms_by_scan[mass_feature.apex_scan]
            if mass_feature.ms2_scan_numbers is not None:
                for ms2_scan in mass_feature.ms2_scan_numbers:
                    if ms2_scan in ms_by_scan:
                        mass_feature.ms2_mass_spectra[ms2_scan] = ms_by_scan[ms2_scan]

    def import_eics(self, mass_spectra) -> None:
        """Imports the extracted ion chromatograms from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.

        Returns
        -------
        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'eics' from the HDF5 file.

        """
        constructor_keys = ("scans", "time", "eic")

        dict_group_load = self.h5pydata["eics"]
        for k in dict_group_load.keys():
            eic_group = dict_group_load[k]
            my_eic = EIC_Data(
                scans=eic_group["scans"][:],
                time=eic_group["time"][:],
                eic=eic_group["eic"][:],
            )
            for key in eic_group.keys():
                if key in constructor_keys:
                    continue
                setattr(my_eic, key, eic_group[key][:])
                # if key is apexes, convert to a list of tuples
                if key == "apexes" and len(my_eic.apexes) > 0:
                    my_eic.apexes = [tuple(x) for x in my_eic.apexes]
            # Add to mass_spectra object, keyed by the m/z stored on the group
            mass_spectra.eics[eic_group.attrs["mz"]] = my_eic

        # Add to mass features whose m/z exactly matches an EIC key
        for mass_feature in mass_spectra.mass_features.values():
            mz = mass_feature.mz
            if mz in mass_spectra.eics:
                mass_feature._eic_data = mass_spectra.eics[mz]

    def import_spectral_search_results(self, mass_spectra) -> None:
        """Imports the spectral search results from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with spectral search results.

        Returns
        -------
        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'spectral_search_results' from the HDF5 file.

        """
        overall_results_dict = {}
        ms2_results_load = self.h5pydata["spectral_search_results"]
        for k in ms2_results_load.keys():
            scan_number = int(k)
            scan_group = ms2_results_load[k]
            overall_results_dict[scan_number] = {}
            for k2 in scan_group.keys():
                result_group = scan_group[k2]
                precursor_mz = result_group.attrs["precursor_mz"]
                ms2_search_res = SpectrumSearchResults(
                    query_spectrum=mass_spectra._ms[scan_number],
                    precursor_mz=precursor_mz,
                    spectral_similarity_search_results={},
                )

                for key in result_group.keys() - {"precursor_mz"}:
                    setattr(ms2_search_res, key, list(result_group[key][:]))
                overall_results_dict[scan_number][precursor_mz] = ms2_search_res

        # add to mass_spectra
        mass_spectra.spectral_search_results.update(overall_results_dict)

        # If there are mass features, associate the results with each mass feature
        for mass_feature in mass_spectra.mass_features.values():
            scan_ids = mass_feature.ms2_scan_numbers
            # Same guard as import_mass_features: a feature may have no ms2 scans
            if scan_ids is None:
                continue
            precursor_mz = mass_feature.mz
            for ms2_scan_id in scan_ids:
                try:
                    result = mass_spectra.spectral_search_results[ms2_scan_id][
                        precursor_mz
                    ]
                except KeyError:
                    # No saved search result for this scan / precursor pair
                    continue
                mass_feature.ms2_similarity_results.append(result)

    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
        """
        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.

        Parameters
        ----------
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.

        Returns
        -------
        MassSpectraBase
            The object populated by run().

        """
        # Instantiate the MassSpectraBase object
        spectra_obj = MassSpectraBase(
            file_location=self.file_location,
            analyzer=self.analyzer,
            instrument_label=self.instrument_label,
            sample_name=self.sample_name,
        )

        # This will populate the _ms list on the MassSpectraBase object
        self.run(spectra_obj, load_raw=load_raw)

        return spectra_obj
471 """ 472 # Instantiate the LCMS object 473 lcms_obj = LCMSBase( 474 file_location=self.file_location, 475 analyzer=self.analyzer, 476 instrument_label=self.instrument_label, 477 sample_name=self.sample_name, 478 ) 479 480 # This will populate the majority of the attributes on the LCMS object 481 self.run(lcms_obj, load_raw=load_raw) 482 483 # Set final attributes of the LCMS object 484 lcms_obj.polarity = self.h5pydata.attrs["polarity"] 485 lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan) 486 lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time) 487 lcms_obj._tic_list = list(lcms_obj.scan_df.tic) 488 489 # If use_original_parser is True, instantiate the original parser and populate the LCMS object 490 if use_original_parser: 491 lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path) 492 493 return lcms_obj 494 495 def add_original_parser(self, mass_spectra, raw_file_path=None): 496 """ 497 Add the original parser to the mass spectra object. 498 499 Parameters 500 ---------- 501 mass_spectra : MassSpectraBase | LCMSBase 502 The MassSpectraBase or LCMSBase object to add the original parser to. 503 raw_file_path : str 504 The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file. 505 """ 506 # Try to get the raw file path from the HDF5 file 507 if raw_file_path is None: 508 raw_file_path = self.h5pydata.attrs["original_file_location"] 509 # Check if og_file_location exists, if not raise an error 510 raw_file_path = self.h5pydata.attrs["original_file_location"] 511 512 raw_file_path = Path(raw_file_path) 513 if not raw_file_path.exists(): 514 raise FileExistsError( 515 "File does not exist: " + str(raw_file_path), 516 ". 
Cannot use original parser for instatiating the lcms_obj.", 517 ) 518 519 # Get the original parser type 520 og_parser_type = self.h5pydata.attrs["parser_type"] 521 522 if og_parser_type == "ImportMassSpectraThermoMSFileReader": 523 parser = ImportMassSpectraThermoMSFileReader(raw_file_path) 524 elif og_parser_type == "MZMLSpectraParser": 525 parser = MZMLSpectraParser(raw_file_path) 526 527 mass_spectra.spectra_parser_class = parser.__class__ 528 mass_spectra.spectra_parser = parser 529 530 return mass_spectra
class ReadCoreMSHDFMassSpectra(
    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
):
    """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.

    Parameters
    ----------
    file_location : str
        The location of the HDF5 file to read, including the suffix.

    Attributes
    ----------
    file_location : str
        The location of the HDF5 file to read.
    h5pydata : h5py.File
        The HDF5 file object.
    scans : list
        A list of the location of individual mass spectra within the HDF5 file.
    scan_number_list : list
        A list of the scan numbers of the mass spectra within the HDF5 file.
    parameters_location : str
        The location of the parameters file (json or toml), or None if no
        sidecar parameters file was found next to the HDF5 file.

    Methods
    -------
    * import_mass_spectra(mass_spectra).
        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
    * get_mass_spectrum_from_scan(scan_number).
        Return mass spectrum data object from scan number.
    * load().
        Placeholder method to meet the requirements of the SpectraParserInterface.
    * run(mass_spectra).
        Runs the importer functions to populate a LCMS or MassSpectraBase object.
    * import_scan_info(mass_spectra).
        Imports the scan info from the HDF5 file to populate the scan_df attribute
        on the LCMS or MassSpectraBase object.
    * import_ms_unprocessed(mass_spectra).
        Imports the unprocessed mass spectra from the HDF5 file to populate the
        _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
    * import_parameters(mass_spectra).
        Imports the parameters from the sidecar json/toml file onto the LCMS or
        MassSpectraBase object.
    * import_mass_features(mass_spectra).
        Imports the mass features from the HDF5 file to populate the mass_features
        attribute on the LCMS or MassSpectraBase object.
    * import_eics(mass_spectra).
        Imports the extracted ion chromatograms from the HDF5 file to populate the
        eics attribute on the LCMS or MassSpectraBase object.
    * import_spectral_search_results(mass_spectra).
        Imports the spectral search results from the HDF5 file to populate the
        spectral_search_results attribute on the LCMS or MassSpectraBase object.
    * get_mass_spectra_obj().
        Return a MassSpectraBase object populated from the HDF5 file.
    * get_lcms_obj().
        Return a LCMSBase object populated from the HDF5 file.
    * add_original_parser(mass_spectra).
        Attach the parser for the original raw file to the mass spectra object.

    """

    def __init__(self, file_location: str):
        Thread.__init__(self)
        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)

        # Override the scans attribute of ReadCoreMSHDF_MassSpectrum: here the
        # spectra live in the nested "mass_spectra" group of the HDF5 file.
        spectra_keys = list(self.h5pydata["mass_spectra"].keys())
        self.scans = ["mass_spectra/" + key for key in spectra_keys]
        self.scan_number_list = sorted(int(float(key)) for key in spectra_keys)

        # Look for a sidecar parameters file sharing the HDF5 file's stem;
        # a json file takes precedence over a toml file.
        pattern = self.file_location.name.replace(".hdf5", ".*")
        sidecars = [
            path
            for path in self.file_location.parent.glob(pattern)
            if path.suffix != ".hdf5"
        ]
        self.parameters_location = None
        for wanted_suffix in (".json", ".toml"):
            matches = [path for path in sidecars if path.suffix == wanted_suffix]
            if matches:
                self.parameters_location = matches[0]
                break

    def get_mass_spectrum_from_scan(self, scan_number):
        """Return mass spectrum data object from scan number.

        Raises
        ------
        Exception
            If scan_number is not in scan_number_list.
        """
        if scan_number not in self.scan_number_list:
            raise Exception("Scan number not found in HDF5 file.")
        return self.get_mass_spectrum(scan_number)

    def load(self) -> None:
        """Placeholder method to meet the requirements of the SpectraParserInterface."""
        pass

    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
        """Return the unprocessed mass spectra saved in the HDF5 file.

        Parameters
        ----------
        spectra : optional
            Ignored; only the data saved in the HDF5 file can be returned.
            A warning is issued when a value is supplied.
        scan_df : optional
            Ignored; see spectra.

        Returns
        -------
        dict
            Maps each key of the 'ms_unprocessed' group (converted to int) to a
            pandas DataFrame with columns 'scan', 'mz' and 'intensity'.
        """
        import warnings

        if spectra is not None or scan_df is not None:
            # The original code built the warning object without emitting it.
            warnings.warn(
                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation.",
                SyntaxWarning,
                stacklevel=2,
            )
        raw_group = self.h5pydata["ms_unprocessed"]
        return {
            int(key): pd.DataFrame(
                raw_group[key][:], columns=["scan", "mz", "intensity"]
            )
            for key in raw_group.keys()
        }

    def get_scan_df(self) -> pd.DataFrame:
        """Return the 'scan_info' group of the HDF5 file as a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            One column per dataset in 'scan_info', indexed by the 'scan' column
            (which is also kept as a regular column). Object-dtype columns are
            decoded from bytes to str using UTF-8.
        """
        scan_group = self.h5pydata["scan_info"]
        scan_df = pd.DataFrame(
            {name: scan_group[name][:] for name in scan_group.keys()}
        )
        scan_df.set_index("scan", inplace=True, drop=False)
        # Decode column by column: this works when there are no object columns
        # at all and does not require the scan index to be unique.
        for col in scan_df.select_dtypes([object]).columns:
            scan_df[col] = scan_df[col].str.decode("utf-8")
        return scan_df

    def run(self, mass_spectra, load_raw=True) -> None:
        """Runs the importer functions to populate a LCMS or MassSpectraBase object.

        Notes
        -----
        The following functions are run in order, if the HDF5 file contains the necessary data:
        1. import_parameters(), only if a sidecar parameters file was found.
        2. import_mass_spectra(), which populates the _ms attribute.
        3. import_scan_info(), which populates the scan_df attribute.
        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute (only if load_raw).
        5. import_mass_features(), which populates the mass_features attribute.
        6. import_eics(), which populates the eics attribute.
        7. import_spectral_search_results(), which populates the spectral_search_results attribute.

        Parameters
        ----------
        mass_spectra : LCMSBase or MassSpectraBase
            The LCMS or MassSpectraBase object to populate.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

        Returns
        -------
        None, but populates several attributes on the LCMS or MassSpectraBase object.

        """
        if self.parameters_location is not None:
            self.import_parameters(mass_spectra)

        if "mass_spectra" in self.h5pydata:
            self.import_mass_spectra(mass_spectra, load_raw=load_raw)

        if "scan_info" in self.h5pydata:
            self.import_scan_info(mass_spectra)

        if "ms_unprocessed" in self.h5pydata and load_raw:
            self.import_ms_unprocessed(mass_spectra)

        if "mass_features" in self.h5pydata:
            self.import_mass_features(mass_spectra)

        if "eics" in self.h5pydata:
            self.import_eics(mass_spectra)

        if "spectral_search_results" in self.h5pydata:
            self.import_spectral_search_results(mass_spectra)

    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
        """Imports all mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass spectra.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

        Returns
        -------
        None, but adds every mass spectrum from the HDF5 file to the LCMSBase or
            MassSpectraBase object via add_mass_spectrum().
        """
        for scan_number in self.scan_number_list:
            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
            mass_spec.scan_number = scan_number
            mass_spectra.add_mass_spectrum(mass_spec)

    def import_scan_info(self, mass_spectra) -> None:
        """Imports the scan info from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate.

        Returns
        -------
        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
            object with a pandas DataFrame of the 'scan_info' from the HDF5 file.

        """
        mass_spectra.scan_df = self.get_scan_df()

    def import_ms_unprocessed(self, mass_spectra) -> None:
        """Imports the unprocessed mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate.

        Returns
        -------
        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
            object with a dictionary of the 'ms_unprocessed' from the HDF5 file.

        """
        mass_spectra._ms_unprocessed = self.get_ms_raw()

    def import_parameters(self, mass_spectra) -> None:
        """Imports the parameters from the sidecar parameters file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with parameters.

        Returns
        -------
        None, but sets the parameters on the LCMS or MassSpectraBase object
            from the json or toml file at parameters_location.

        Raises
        ------
        Exception
            If the parameters file is neither a .json nor a .toml file.

        """
        suffix = self.parameters_location.suffix
        # The branches must be mutually exclusive: with two independent `if`
        # statements a JSON file was loaded and then rejected by the `else`.
        if suffix == ".json":
            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
        elif suffix == ".toml":
            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
        else:
            raise Exception("Parameters file must be in JSON or TOML format.")

    def import_mass_features(self, mass_spectra) -> None:
        """Imports the mass features from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass features.

        Returns
        -------
        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
            object with a dictionary of the 'mass_features' from the HDF5 file.

        """
        # Attributes consumed by the constructor (or deliberately not copied).
        constructor_attrs = {
            "_mz_exp",
            "_mz_cal",
            "_retention_time",
            "_intensity",
            "_apex_scan",
            "_persistence",
        }
        features_group = self.h5pydata["mass_features"]
        for k in features_group.keys():
            feature_group = features_group[k]
            attrs = feature_group.attrs

            # Instantiate the MassFeature object
            mass_feature = LCMSMassFeature(
                mass_spectra,
                mz=attrs["_mz_exp"],
                retention_time=attrs["_retention_time"],
                intensity=attrs["_intensity"],
                apex_scan=attrs["_apex_scan"],
                persistence=attrs["_persistence"],
                id=int(k),
            )

            # Populate additional scalar attributes on the MassFeature object
            for key in attrs.keys() - constructor_attrs:
                setattr(mass_feature, key, attrs[key])

            # Populate attributes on the MassFeature object that are stored as datasets
            for key in feature_group.keys():
                setattr(mass_feature, key, feature_group[key][:])

            mass_spectra.mass_features[int(k)] = mass_feature

        # Associate mass features with ms1 and ms2 spectra, if available
        for mass_feature in mass_spectra.mass_features.values():
            if mass_feature.apex_scan in mass_spectra._ms.keys():
                mass_feature.mass_spectrum = mass_spectra._ms[mass_feature.apex_scan]
            if mass_feature.ms2_scan_numbers is not None:
                for ms2_scan in mass_feature.ms2_scan_numbers:
                    if ms2_scan in mass_spectra._ms.keys():
                        mass_feature.ms2_mass_spectra[ms2_scan] = mass_spectra._ms[
                            ms2_scan
                        ]

    def import_eics(self, mass_spectra):
        """Imports the extracted ion chromatograms from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.

        Returns
        -------
        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
            object with a dictionary of the 'eics' from the HDF5 file.

        """
        eics_group = self.h5pydata["eics"]
        for k in eics_group.keys():
            eic_group = eics_group[k]
            my_eic = EIC_Data(
                scans=eic_group["scans"][:],
                time=eic_group["time"][:],
                eic=eic_group["eic"][:],
            )
            for key in eic_group.keys():
                if key not in ["scans", "time", "eic"]:
                    setattr(my_eic, key, eic_group[key][:])
                    # Stored apexes come back as array rows; convert each row to a tuple
                    if key == "apexes" and len(my_eic.apexes) > 0:
                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
            # Add to mass_spectra object, keyed by the m/z stored on the group
            mass_spectra.eics[eic_group.attrs["mz"]] = my_eic

        # Attach each EIC to the mass feature with exactly the same m/z
        for mass_feature in mass_spectra.mass_features.values():
            mz = mass_feature.mz
            if mz in mass_spectra.eics.keys():
                mass_feature._eic_data = mass_spectra.eics[mz]

    def import_spectral_search_results(self, mass_spectra):
        """Imports the spectral search results from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with spectral search results.

        Returns
        -------
        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
            object with a dictionary of the 'spectral_search_results' from the HDF5 file.

        """
        overall_results_dict = {}
        ms2_results_load = self.h5pydata["spectral_search_results"]
        for k in ms2_results_load.keys():
            scan_results = {}
            for k2 in ms2_results_load[k].keys():
                result_group = ms2_results_load[k][k2]
                precursor_mz = result_group.attrs["precursor_mz"]
                ms2_search_res = SpectrumSearchResults(
                    query_spectrum=mass_spectra._ms[int(k)],
                    precursor_mz=precursor_mz,
                    spectral_similarity_search_results={},
                )
                for key in result_group.keys() - {"precursor_mz"}:
                    setattr(ms2_search_res, key, list(result_group[key][:]))
                scan_results[precursor_mz] = ms2_search_res
            overall_results_dict[int(k)] = scan_results

        # add to mass_spectra
        mass_spectra.spectral_search_results.update(overall_results_dict)

        # If there are mass features, associate the results with each mass feature
        for mass_feature in mass_spectra.mass_features.values():
            scan_ids = mass_feature.ms2_scan_numbers
            # Same guard as import_mass_features: a feature may have no MS2 scans
            if scan_ids is None:
                continue
            for ms2_scan_id in scan_ids:
                try:
                    result = mass_spectra.spectral_search_results[ms2_scan_id][
                        mass_feature.mz
                    ]
                except KeyError:
                    # No search result stored for this scan / precursor pair
                    continue
                mass_feature.ms2_similarity_results.append(result)

    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
        """
        Return mass spectra data object, populated from the HDF5 file.

        Parameters
        ----------
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.

        """
        # Instantiate the MassSpectraBase object
        spectra_obj = MassSpectraBase(
            file_location=self.file_location,
            analyzer=self.analyzer,
            instrument_label=self.instrument_label,
            sample_name=self.sample_name,
        )

        # Populate it with everything the HDF5 file provides
        self.run(spectra_obj, load_raw=load_raw)

        return spectra_obj

    def get_lcms_obj(
        self, load_raw=True, use_original_parser=True, raw_file_path=None
    ) -> LCMSBase:
        """
        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.

        Parameters
        ----------
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
        use_original_parser : bool
            If True, use the original parser to populate the LCMS object. Default is True.
        raw_file_path : str
            The location of the raw file to parse if attempting to use original parser.
            Default is None, which attempts to get the raw file path from the HDF5 file.
            If the original file path has moved, this parameter can be used to specify the new location.
        """
        # Instantiate the LCMS object
        lcms_obj = LCMSBase(
            file_location=self.file_location,
            analyzer=self.analyzer,
            instrument_label=self.instrument_label,
            sample_name=self.sample_name,
        )

        # This will populate the majority of the attributes on the LCMS object
        self.run(lcms_obj, load_raw=load_raw)

        # Set final attributes of the LCMS object
        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)

        # If use_original_parser is True, instantiate the original parser and attach it
        if use_original_parser:
            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)

        return lcms_obj

    def add_original_parser(self, mass_spectra, raw_file_path=None):
        """
        Add the original parser to the mass spectra object.

        Parameters
        ----------
        mass_spectra : MassSpectraBase | LCMSBase
            The MassSpectraBase or LCMSBase object to add the original parser to.
        raw_file_path : str
            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.

        Returns
        -------
        The same mass_spectra object, with 'spectra_parser_class' and
            'spectra_parser' set.

        Raises
        ------
        FileExistsError
            If the raw file does not exist at the resolved location.
        ValueError
            If the parser type stored in the HDF5 file is not supported.
        """
        # Only fall back to the location stored in the HDF5 file when the
        # caller did not supply one (it used to be overwritten unconditionally).
        if raw_file_path is None:
            raw_file_path = self.h5pydata.attrs["original_file_location"]

        raw_file_path = Path(raw_file_path)
        if not raw_file_path.exists():
            # Exception type kept for backward compatibility with callers.
            raise FileExistsError(
                "File does not exist: "
                + str(raw_file_path)
                + ". Cannot use original parser for instantiating the lcms_obj."
            )

        # Get the original parser type
        og_parser_type = self.h5pydata.attrs["parser_type"]

        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
        elif og_parser_type == "MZMLSpectraParser":
            parser = MZMLSpectraParser(raw_file_path)
        else:
            raise ValueError(
                "Unsupported original parser type: " + str(og_parser_type)
            )

        mass_spectra.spectra_parser_class = parser.__class__
        mass_spectra.spectra_parser = parser

        return mass_spectra
Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object.
Parameters
- file_location (str): The location of the HDF5 file to read, including the suffix.
Attributes
- file_location (str): The location of the HDF5 file to read.
- h5pydata (h5py.File): The HDF5 file object.
- scans (list): A list of the location of individual mass spectra within the HDF5 file.
- scan_number_list (list): A list of the scan numbers of the mass spectra within the HDF5 file.
- parameters_location (str): The location of the parameters file (json or toml).
Methods
- import_mass_spectra(mass_spectra). Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
- get_mass_spectrum_from_scan(scan_number). Return mass spectrum data object from scan number.
- load(). Placeholder method to meet the requirements of the SpectraParserInterface.
- run(mass_spectra). Runs the importer functions to populate a LCMS or MassSpectraBase object.
- import_scan_info(mass_spectra). Imports the scan info from the HDF5 file to populate the scan_df attribute on the LCMS or MassSpectraBase object.
- import_ms_unprocessed(mass_spectra). Imports the unprocessed mass spectra from the HDF5 file to populate the _ms_unprocessed attribute on the LCMS or MassSpectraBase object
- import_parameters(mass_spectra). Imports the parameters from the HDF5 file to populate the parameters attribute on the LCMS or MassSpectraBase object
- import_mass_features(mass_spectra). Imports the mass features from the HDF5 file to populate the mass_features attribute on the LCMS or MassSpectraBase object
- import_eics(mass_spectra). Imports the extracted ion chromatograms from the HDF5 file to populate the eics attribute on the LCMS or MassSpectraBase object
- import_spectral_search_results(mass_spectra). Imports the spectral search results from the HDF5 file to populate the spectral_search_results attribute on the LCMS or MassSpectraBase object
- get_mass_spectra_obj(). Return mass spectra data object, populating the _ms list on the LCMS or MassSpectraBase object from the HDF5 file
- get_lcms_obj(). Return LCMSBase object, populating the majority of the attributes on the LCMS object from the HDF5 file
def __init__(self, file_location: str):
    """Open the HDF5 file and index its spectra and sidecar parameters file.

    Parameters
    ----------
    file_location : str
        The location of the HDF5 file to read, including the suffix.
    """
    Thread.__init__(self)
    ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)

    # Spectra are stored in the nested "mass_spectra" group, so the scans
    # attribute set by ReadCoreMSHDF_MassSpectrum is replaced with nested paths.
    spectra_keys = list(self.h5pydata["mass_spectra"].keys())
    self.scans = ["mass_spectra/" + key for key in spectra_keys]
    self.scan_number_list = sorted(int(float(key)) for key in spectra_keys)

    # Find files next to the HDF5 file that share its name but not its suffix;
    # a json parameters file takes precedence over a toml one.
    pattern = self.file_location.name.replace(".hdf5", ".*")
    sidecars = [
        path
        for path in self.file_location.parent.glob(pattern)
        if path.suffix != ".hdf5"
    ]
    self.parameters_location = None
    for wanted_suffix in (".json", ".toml"):
        matches = [path for path in sidecars if path.suffix == wanted_suffix]
        if matches:
            self.parameters_location = matches[0]
            break
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
def get_mass_spectrum_from_scan(self, scan_number):
    """Return mass spectrum data object from scan number.

    Raises
    ------
    Exception
        If scan_number is not present in scan_number_list.
    """
    # Guard clause: reject unknown scans before touching the file
    if scan_number not in self.scan_number_list:
        raise Exception("Scan number not found in HDF5 file.")
    return self.get_mass_spectrum(scan_number)
Return mass spectrum data object from scan number.
def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
    """Return the unprocessed mass spectra saved in the HDF5 file.

    Parameters
    ----------
    spectra : optional
        Ignored; only the data saved in the HDF5 file can be returned.
        A warning is issued when a value is supplied.
    scan_df : optional
        Ignored; see spectra.

    Returns
    -------
    dict
        Maps each key of the 'ms_unprocessed' group (converted to int) to a
        pandas DataFrame with columns 'scan', 'mz' and 'intensity'.
    """
    import warnings

    if spectra is not None or scan_df is not None:
        # The warning object used to be constructed and discarded; emit it.
        warnings.warn(
            "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation.",
            SyntaxWarning,
            stacklevel=2,
        )
    raw_group = self.h5pydata["ms_unprocessed"]
    return {
        int(key): pd.DataFrame(
            raw_group[key][:], columns=["scan", "mz", "intensity"]
        )
        for key in raw_group.keys()
    }
def get_scan_df(self) -> pd.DataFrame:
    """Return the 'scan_info' group of the HDF5 file as a pandas DataFrame.

    Returns
    -------
    pd.DataFrame
        One column per dataset in 'scan_info', indexed by the 'scan' column
        (which is also kept as a regular column). Object-dtype columns are
        decoded from bytes to str using UTF-8.
    """
    scan_group = self.h5pydata["scan_info"]
    scan_df = pd.DataFrame(
        {name: scan_group[name][:] for name in scan_group.keys()}
    )
    scan_df.set_index("scan", inplace=True, drop=False)
    # Decode column by column instead of a stack/unstack round trip: this
    # works when there are no object columns at all and does not require
    # the scan index to be unique.
    for col in scan_df.select_dtypes([object]).columns:
        scan_df[col] = scan_df[col].str.decode("utf-8")
    return scan_df
Return scan data as a pandas DataFrame.
def run(self, mass_spectra, load_raw=True) -> None:
    """Populate a LCMS or MassSpectraBase object from the HDF5 file.

    Notes
    -----
    Importers are invoked in this fixed order, each only when its data is available:
    1. import_parameters(), when a parameters file location is set.
    2. import_mass_spectra(), when the file has a 'mass_spectra' group.
    3. import_scan_info(), when the file has a 'scan_info' group.
    4. import_ms_unprocessed(), when the file has an 'ms_unprocessed' group and load_raw is True.
    5. import_mass_features(), when the file has a 'mass_features' group.
    6. import_eics(), when the file has an 'eics' group.
    7. import_spectral_search_results(), when the file has a 'spectral_search_results' group.

    Parameters
    ----------
    mass_spectra : LCMSBase or MassSpectraBase
        The LCMS or MassSpectraBase object to populate.
    load_raw : bool
        If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

    Returns
    -------
    None, but populates several attributes on the LCMS or MassSpectraBase object.

    """
    h5_file = self.h5pydata

    if self.parameters_location is not None:
        self.import_parameters(mass_spectra)

    if "mass_spectra" in h5_file:
        self.import_mass_spectra(mass_spectra, load_raw=load_raw)

    if "scan_info" in h5_file:
        self.import_scan_info(mass_spectra)

    if "ms_unprocessed" in h5_file and load_raw:
        self.import_ms_unprocessed(mass_spectra)

    # The remaining importers share one call shape; the tuple order is the
    # required execution order.
    trailing_importers = (
        ("mass_features", "import_mass_features"),
        ("eics", "import_eics"),
        ("spectral_search_results", "import_spectral_search_results"),
    )
    for group_name, importer_name in trailing_importers:
        if group_name in h5_file:
            getattr(self, importer_name)(mass_spectra)
Runs the importer functions to populate a LCMS or MassSpectraBase object.
Notes
The following functions are run in order, if the HDF5 file contains the necessary data:
- import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
- import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
- import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
- import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
- import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
- import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
- import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
Parameters
- mass_spectra (LCMSBase or MassSpectraBase): The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
- None, but populates several attributes on the LCMS or MassSpectraBase object.
def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
    """Import all mass spectra from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with mass spectra.
    load_raw : bool
        If True, load raw data (unprocessed) from HDF5 files for the overall
        lcms object and individual mass spectra. Default is True.

    Returns
    -------
    None, but adds one mass spectrum per entry of ``scan_number_list`` to
    the LCMSBase or MassSpectraBase object.
    """
    for scan in self.scan_number_list:
        spectrum = self.get_mass_spectrum(scan, load_raw=load_raw)
        # Tag the spectrum with its scan number before handing it over.
        setattr(spectrum, "scan_number", scan)
        mass_spectra.add_mass_spectrum(spectrum)
Imports all mass spectra from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass spectra.
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
- None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
- object with mass spectra from the HDF5 file.
def import_scan_info(self, mass_spectra) -> None:
    """Import the scan info from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with scan info.

    Returns
    -------
    None, but sets the 'scan_df' attribute on the LCMSBase or MassSpectraBase
    object to the pandas DataFrame returned by get_scan_df().
    """
    mass_spectra.scan_df = self.get_scan_df()
Imports the scan info from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with scan info.
Returns
- None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
- object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
def import_ms_unprocessed(self, mass_spectra) -> None:
    """Import the unprocessed mass spectra from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with the
        unprocessed mass spectra.

    Returns
    -------
    None, but sets the '_ms_unprocessed' attribute on the LCMSBase or
    MassSpectraBase object to the value returned by get_ms_raw().
    """
    mass_spectra._ms_unprocessed = self.get_ms_raw()
Imports the unprocessed mass spectra from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with unprocessed mass spectra.
Returns
- None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
def import_parameters(self, mass_spectra) -> None:
    """Import the parameters from the parameters file.

    The loader is chosen from the suffix of ``self.parameters_location``:
    '.json' uses load_and_set_json_parameters_lcms and '.toml' uses
    load_and_set_toml_parameters_lcms.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with parameters.

    Returns
    -------
    None, but the selected loader sets the parameters on the LCMS or
    MassSpectraBase object.

    Raises
    ------
    ValueError
        If the parameters file is neither a '.json' nor a '.toml' file.
    """
    suffix = self.parameters_location.suffix
    # The branches must be mutually exclusive: with two independent ``if``
    # statements a JSON file was loaded and then rejected by the ``else``.
    if suffix == ".json":
        load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
    elif suffix == ".toml":
        load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
    else:
        raise ValueError("Parameters file must be in JSON or TOML format.")
Imports the parameters from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with parameters.
Returns
- None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
- object with a dictionary of the 'parameters' from the HDF5 file.
def import_mass_features(self, mass_spectra) -> None:
    """Import the mass features from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with mass features.

    Returns
    -------
    None, but fills the 'mass_features' dictionary on the LCMSBase or
    MassSpectraBase object, keyed by integer feature id, and links each
    feature to the already-loaded spectra in '_ms' where scan numbers match.
    """
    # Attributes that are either passed to the constructor or skipped.
    handled_attrs = {
        "_mz_exp",
        "_mz_cal",
        "_retention_time",
        "_intensity",
        "_apex_scan",
        "_persistence",
    }

    features_group = self.h5pydata["mass_features"]
    for feature_key in features_group.keys():
        stored = features_group[feature_key]

        feature = LCMSMassFeature(
            mass_spectra,
            mz=stored.attrs["_mz_exp"],
            retention_time=stored.attrs["_retention_time"],
            intensity=stored.attrs["_intensity"],
            apex_scan=stored.attrs["_apex_scan"],
            persistence=stored.attrs["_persistence"],
            id=int(feature_key),
        )

        # Remaining scalar attributes are copied over under their own names.
        for attr_name in stored.attrs.keys() - handled_attrs:
            setattr(feature, attr_name, stored.attrs[attr_name])

        # Datasets inside the feature group hold the list-like attributes.
        for dataset_name in stored.keys():
            setattr(feature, dataset_name, stored[dataset_name][:])

        mass_spectra.mass_features[int(feature_key)] = feature

    # Link every mass feature to its ms1 and ms2 spectra, if available.
    for feature in mass_spectra.mass_features.values():
        if feature.apex_scan in mass_spectra._ms.keys():
            feature.mass_spectrum = mass_spectra._ms[feature.apex_scan]
        if feature.ms2_scan_numbers is None:
            continue
        for ms2_scan in feature.ms2_scan_numbers:
            if ms2_scan in mass_spectra._ms.keys():
                feature.ms2_mass_spectra[ms2_scan] = mass_spectra._ms[ms2_scan]
Imports the mass features from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass features.
Returns
- None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'mass_features' from the HDF5 file.
def import_eics(self, mass_spectra):
    """Import the extracted ion chromatograms from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with extracted
        ion chromatograms.

    Returns
    -------
    None, but fills the 'eics' dictionary on the LCMSBase or MassSpectraBase
    object, keyed by the 'mz' attribute of each stored EIC, and attaches the
    matching EIC to every mass feature whose mz is one of those keys.
    """
    required_fields = ("scans", "time", "eic")

    eic_groups = self.h5pydata["eics"]
    for group_name in eic_groups.keys():
        stored = eic_groups[group_name]
        chromatogram = EIC_Data(
            scans=stored["scans"][:],
            time=stored["time"][:],
            eic=stored["eic"][:],
        )

        for field in stored.keys():
            if field in required_fields:
                continue
            setattr(chromatogram, field, stored[field][:])
            # Stored apexes come back as rows; turn each row into a tuple.
            if field == "apexes" and len(chromatogram.apexes) > 0:
                chromatogram.apexes = [tuple(row) for row in chromatogram.apexes]

        mass_spectra.eics[stored.attrs["mz"]] = chromatogram

    # Attach each EIC to the mass features that share its mz.
    for feature in mass_spectra.mass_features.values():
        feature_mz = feature.mz
        if feature_mz in mass_spectra.eics.keys():
            feature._eic_data = mass_spectra.eics[feature_mz]
Imports the extracted ion chromatograms from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
Returns
- None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'eics' from the HDF5 file.
def import_spectral_search_results(self, mass_spectra):
    """Import the spectral search results from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The MassSpectraBase or LCMSBase object to populate with spectral
        search results.

    Returns
    -------
    None, but updates the 'spectral_search_results' dictionary on the
    LCMSBase or MassSpectraBase object (scan number -> precursor mz ->
    SpectrumSearchResults) and appends matching results to the
    'ms2_similarity_results' of each mass feature.

    Raises
    ------
    KeyError
        If a scan with stored results is not present in ``mass_spectra._ms``.
    """
    overall_results_dict = {}
    ms2_results_load = self.h5pydata["spectral_search_results"]
    for scan_key in ms2_results_load.keys():
        scan_number = int(scan_key)
        scan_group = ms2_results_load[scan_key]
        overall_results_dict[scan_number] = {}
        for result_key in scan_group.keys():
            stored = scan_group[result_key]
            precursor_mz = stored.attrs["precursor_mz"]
            ms2_search_res = SpectrumSearchResults(
                query_spectrum=mass_spectra._ms[scan_number],
                precursor_mz=precursor_mz,
                spectral_similarity_search_results={},
            )
            # Each dataset in the result group becomes a list attribute.
            for key in stored.keys() - {"precursor_mz"}:
                setattr(ms2_search_res, key, list(stored[key][:]))
            overall_results_dict[scan_number][precursor_mz] = ms2_search_res

    # Add to mass_spectra
    mass_spectra.spectral_search_results.update(overall_results_dict)

    # If there are mass features, associate the results with each mass feature
    for mass_feature in mass_spectra.mass_features.values():
        scan_ids = mass_feature.ms2_scan_numbers
        if scan_ids is None:
            # Same guard as import_mass_features: a feature may have no
            # ms2 scans recorded.
            continue
        precursor_mz = mass_feature.mz
        for ms2_scan_id in scan_ids:
            try:
                result = mass_spectra.spectral_search_results[ms2_scan_id][
                    precursor_mz
                ]
            except KeyError:
                # No search result stored for this scan / precursor pair.
                continue
            mass_feature.ms2_similarity_results.append(result)
Imports the spectral search results from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with spectral search results.
Returns
- None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'spectral_search_results' from the HDF5 file.
def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
    """
    Return a MassSpectraBase object populated from the HDF5 file.

    Parameters
    ----------
    load_raw : bool
        If True, load raw data (unprocessed) from HDF5 files for the overall
        spectra object and individual mass spectra. Default is True.

    Returns
    -------
    MassSpectraBase
        A new object built from this reader's file_location, analyzer,
        instrument_label and sample_name, then filled in by run().
    """
    init_kwargs = {
        "file_location": self.file_location,
        "analyzer": self.analyzer,
        "instrument_label": self.instrument_label,
        "sample_name": self.sample_name,
    }
    spectra_obj = MassSpectraBase(**init_kwargs)

    # run() fills the object in place with whatever the HDF5 file provides.
    self.run(spectra_obj, load_raw=load_raw)
    return spectra_obj
Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
Parameters
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
def get_lcms_obj(
    self, load_raw=True, use_original_parser=True, raw_file_path=None
) -> LCMSBase:
    """
    Return a LCMSBase object populated from the HDF5 file.

    Parameters
    ----------
    load_raw : bool
        If True, load raw data (unprocessed) from HDF5 files for the overall
        lcms object and individual mass spectra. Default is True.
    use_original_parser : bool
        If True, use the original parser to populate the LCMS object.
        Default is True.
    raw_file_path : str
        The location of the raw file to parse if attempting to use the
        original parser. Default is None, which attempts to get the raw file
        path from the HDF5 file. If the original file has moved, this
        parameter can be used to specify the new location.

    Returns
    -------
    LCMSBase
        The populated object, as returned by add_original_parser() when
        use_original_parser is True.
    """
    lcms_obj = LCMSBase(
        file_location=self.file_location,
        analyzer=self.analyzer,
        instrument_label=self.instrument_label,
        sample_name=self.sample_name,
    )

    # run() populates the majority of the attributes on the LCMS object.
    self.run(lcms_obj, load_raw=load_raw)

    lcms_obj.polarity = self.h5pydata.attrs["polarity"]

    # Mirror three scan_df columns onto the private list attributes.
    column_targets = (
        ("_scans_number_list", "scan"),
        ("_retention_time_list", "scan_time"),
        ("_tic_list", "tic"),
    )
    for attribute, column in column_targets:
        setattr(lcms_obj, attribute, list(getattr(lcms_obj.scan_df, column)))

    if not use_original_parser:
        return lcms_obj
    return self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
Parameters
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
- use_original_parser (bool): If True, use the original parser to populate the LCMS object. Default is True.
- raw_file_path (str): The location of the raw file to parse if attempting to use original parser. Default is None, which attempts to get the raw file path from the HDF5 file. If the original file path has moved, this parameter can be used to specify the new location.
def add_original_parser(self, mass_spectra, raw_file_path=None):
    """
    Add the original parser to the mass spectra object.

    Parameters
    ----------
    mass_spectra : MassSpectraBase | LCMSBase
        The MassSpectraBase or LCMSBase object to add the original parser to.
    raw_file_path : str
        The location of the raw file to parse. Default is None, which reads
        the 'original_file_location' attribute of the HDF5 file.

    Returns
    -------
    MassSpectraBase | LCMSBase
        The same object, with 'spectra_parser' and 'spectra_parser_class' set.

    Raises
    ------
    FileExistsError
        If the raw file cannot be found at the resolved path.
    ValueError
        If the 'parser_type' attribute of the HDF5 file names an
        unsupported parser.
    """
    # Only fall back to the stored location when the caller gave no path,
    # so an explicit raw_file_path is always honoured.
    if raw_file_path is None:
        raw_file_path = self.h5pydata.attrs["original_file_location"]

    raw_file_path = Path(raw_file_path)
    if not raw_file_path.exists():
        # FileExistsError is kept so existing handlers keep working. The
        # message is a single string because a two-argument OSError treats
        # its arguments as (errno, strerror).
        raise FileExistsError(
            "File does not exist: "
            + str(raw_file_path)
            + ". Cannot use original parser for instantiating the lcms_obj."
        )

    # Get the original parser type
    og_parser_type = self.h5pydata.attrs["parser_type"]

    if og_parser_type == "ImportMassSpectraThermoMSFileReader":
        parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
    elif og_parser_type == "MZMLSpectraParser":
        parser = MZMLSpectraParser(raw_file_path)
    else:
        raise ValueError(
            "Unsupported original parser type: " + repr(og_parser_type)
        )

    mass_spectra.spectra_parser_class = parser.__class__
    mass_spectra.spectra_parser = parser

    return mass_spectra
Add the original parser to the mass spectra object.
Parameters
- mass_spectra (MassSpectraBase | LCMSBase): The MassSpectraBase or LCMSBase object to add the original parser to.
- raw_file_path (str): The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
Inherited Members
- corems.mass_spectrum.input.coremsHDF5.ReadCoreMSHDF_MassSpectrum
- h5pydata
- load_raw_data
- get_mass_spectrum
- load_settings
- get_dataframe
- get_time_index_to_pull
- get_high_level_attr_data
- get_scan_group_attr_data
- get_raw_data_attr_data
- get_output_parameters
- corems.mass_spectrum.input.baseClass.MassListBaseClass
- file_location
- header_lines
- isCentroid
- isThermoProfile
- headerless
- analyzer
- instrument_label
- sample_name
- parameters
- set_parameter_from_toml
- set_parameter_from_json
- data_type
- delimiter
- encoding_detector
- set_data_type
- clean_data_frame
- check_columns
- read_xml_peaks
- get_xml_polarity
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id