corems.mass_spectra.input.corems_hdf5
1__author__ = "Yuri E. Corilo" 2__date__ = "Oct 29, 2019" 3 4 5from threading import Thread 6from pathlib import Path 7 8import pandas as pd 9import warnings 10 11from corems.chroma_peak.factory.chroma_peak_classes import LCMSMassFeature 12from corems.encapsulation.input.parameter_from_json import ( 13 load_and_set_json_parameters_lcms, 14 load_and_set_toml_parameters_lcms, 15) 16from corems.mass_spectra.factory.lc_class import LCMSBase, MassSpectraBase 17from corems.mass_spectra.factory.chromat_data import EIC_Data 18from corems.mass_spectra.input.parserbase import SpectraParserInterface 19from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum 20from corems.molecular_id.factory.spectrum_search_results import SpectrumSearchResults 21from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader 22from corems.mass_spectra.input.mzml import MZMLSpectraParser 23 24 25class ReadCoreMSHDFMassSpectra( 26 SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread 27): 28 """Class to read CoreMS HDF5 files and populate a LCMS or MassSpectraBase object. 29 30 Parameters 31 ---------- 32 file_location : str 33 The location of the HDF5 file to read, including the suffix. 34 35 Attributes 36 ---------- 37 file_location : str 38 The location of the HDF5 file to read. 39 h5pydata : h5py.File 40 The HDF5 file object. 41 scans : list 42 A list of the location of individual mass spectra within the HDF5 file. 43 scan_number_list : list 44 A list of the scan numbers of the mass spectra within the HDF5 file. 45 parameters_location : str 46 The location of the parameters file (json or toml). 47 48 Methods 49 ------- 50 * import_mass_spectra(mass_spectra). 51 Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object. 52 * get_mass_spectrum_from_scan(scan_number). 53 Return mass spectrum data object from scan number. 54 * load(). 55 Placeholder method to meet the requirements of the SpectraParserInterface. 
56 * run(mass_spectra). 57 Runs the importer functions to populate a LCMS or MassSpectraBase object. 58 * import_scan_info(mass_spectra). 59 Imports the scan info from the HDF5 file to populate the _scan_info attribute 60 on the LCMS or MassSpectraBase object 61 * import_ms_unprocessed(mass_spectra). 62 Imports the unprocessed mass spectra from the HDF5 file to populate the 63 _ms_unprocessed attribute on the LCMS or MassSpectraBase object 64 * import_parameters(mass_spectra). 65 Imports the parameters from the HDF5 file to populate the parameters 66 attribute on the LCMS or MassSpectraBase object 67 * import_mass_features(mass_spectra). 68 Imports the mass features from the HDF5 file to populate the mass_features 69 attribute on the LCMS or MassSpectraBase object 70 * import_eics(mass_spectra). 71 Imports the extracted ion chromatograms from the HDF5 file to populate the 72 eics attribute on the LCMS or MassSpectraBase object 73 * import_spectral_search_results(mass_spectra). 74 Imports the spectral search results from the HDF5 file to populate the 75 spectral_search_results attribute on the LCMS or MassSpectraBase object 76 * get_mass_spectra_obj(). 77 Return mass spectra data object, populating the _ms list on the LCMS or 78 MassSpectraBase object from the HDF5 file 79 * get_lcms_obj(). 
80 Return LCMSBase object, populating the majority of the attributes on the 81 LCMS object from the HDF5 file 82 83 """ 84 85 def __init__(self, file_location: str): 86 Thread.__init__(self) 87 ReadCoreMSHDF_MassSpectrum.__init__(self, file_location) 88 89 # override the scans attribute on ReadCoreMSHDF_MassSpectrum class to expect a nested location within the HDF5 file 90 self.scans = [ 91 "mass_spectra/" + x for x in list(self.h5pydata["mass_spectra"].keys()) 92 ] 93 self.scan_number_list = sorted( 94 [int(float(i)) for i in list(self.h5pydata["mass_spectra"].keys())] 95 ) 96 97 # set the location of the parameters file (json or toml) 98 add_files = [ 99 x 100 for x in self.file_location.parent.glob( 101 self.file_location.name.replace(".hdf5", ".*") 102 ) 103 if x.suffix != ".hdf5" 104 ] 105 if len([x for x in add_files if x.suffix == ".json"]) > 0: 106 self.parameters_location = [x for x in add_files if x.suffix == ".json"][0] 107 elif len([x for x in add_files if x.suffix == ".toml"]) > 0: 108 self.parameters_location = [x for x in add_files if x.suffix == ".toml"][0] 109 else: 110 self.parameters_location = None 111 112 def get_mass_spectrum_from_scan(self, scan_number): 113 """Return mass spectrum data object from scan number.""" 114 if scan_number in self.scan_number_list: 115 mass_spec = self.get_mass_spectrum(scan_number) 116 return mass_spec 117 else: 118 raise Exception("Scan number not found in HDF5 file.") 119 120 def load(self) -> None: 121 """ """ 122 pass 123 124 def get_ms_raw(self, spectra=None, scan_df=None) -> dict: 125 """ """ 126 # Warn if spectra or scan_df are not None that they are not used for CoreMS HDF5 files and should be rerun after instantiation 127 if spectra is not None or scan_df is not None: 128 SyntaxWarning( 129 "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation." 
130 ) 131 ms_unprocessed = {} 132 dict_group_load = self.h5pydata["ms_unprocessed"] 133 dict_group_keys = dict_group_load.keys() 134 for k in dict_group_keys: 135 ms_up_int = dict_group_load[k][:] 136 ms_unprocessed[int(k)] = pd.DataFrame( 137 ms_up_int, columns=["scan", "mz", "intensity"] 138 ) 139 return ms_unprocessed 140 141 def get_scan_df(self) -> pd.DataFrame: 142 scan_info = {} 143 dict_group_load = self.h5pydata["scan_info"] 144 dict_group_keys = dict_group_load.keys() 145 for k in dict_group_keys: 146 scan_info[k] = dict_group_load[k][:] 147 scan_df = pd.DataFrame(scan_info) 148 scan_df.set_index("scan", inplace=True, drop=False) 149 str_df = scan_df.select_dtypes([object]) 150 str_df = str_df.stack().str.decode("utf-8").unstack() 151 for col in str_df: 152 scan_df[col] = str_df[col] 153 return scan_df 154 155 def run(self, mass_spectra, load_raw=True) -> None: 156 """Runs the importer functions to populate a LCMS or MassSpectraBase object. 157 158 Notes 159 ----- 160 The following functions are run in order, if the HDF5 file contains the necessary data: 161 1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object. 162 2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object. 163 3. import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object. 164 4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object. 165 5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object. 166 6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object. 167 7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object. 
168 169 Parameters 170 ---------- 171 mass_spectra : LCMSBase or MassSpectraBase 172 The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes. 173 load_raw : bool 174 If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True. 175 Returns 176 ------- 177 None, but populates several attributes on the LCMS or MassSpectraBase object. 178 179 """ 180 if self.parameters_location is not None: 181 # Populate the parameters attribute on the LCMS object 182 self.import_parameters(mass_spectra) 183 184 if "mass_spectra" in self.h5pydata: 185 # Populate the _ms list on the LCMS object 186 self.import_mass_spectra(mass_spectra, load_raw=load_raw) 187 188 if "scan_info" in self.h5pydata: 189 # Populate the _scan_info attribute on the LCMS object 190 self.import_scan_info(mass_spectra) 191 192 if "ms_unprocessed" in self.h5pydata and load_raw: 193 # Populate the _ms_unprocessed attribute on the LCMS object 194 self.import_ms_unprocessed(mass_spectra) 195 196 if "mass_features" in self.h5pydata: 197 # Populate the mass_features attribute on the LCMS object 198 self.import_mass_features(mass_spectra) 199 200 if "eics" in self.h5pydata: 201 # Populate the eics attribute on the LCMS object 202 self.import_eics(mass_spectra) 203 204 if "spectral_search_results" in self.h5pydata: 205 # Populate the spectral_search_results attribute on the LCMS object 206 self.import_spectral_search_results(mass_spectra) 207 208 def import_mass_spectra(self, mass_spectra, load_raw=True) -> None: 209 """Imports all mass spectra from the HDF5 file. 210 211 Parameters 212 ---------- 213 mass_spectra : LCMSBase | MassSpectraBase 214 The MassSpectraBase or LCMSBase object to populate with mass spectra. 215 load_raw : bool 216 If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. 
Default 217 218 Returns 219 ------- 220 None, but populates the '_ms' list on the LCMSBase or MassSpectraBase 221 object with mass spectra from the HDF5 file. 222 """ 223 for scan_number in self.scan_number_list: 224 mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw) 225 mass_spec.scan_number = scan_number 226 mass_spectra.add_mass_spectrum(mass_spec) 227 228 def import_scan_info(self, mass_spectra) -> None: 229 """Imports the scan info from the HDF5 file. 230 231 Parameters 232 ---------- 233 lcms : LCMSBase | MassSpectraBase 234 The MassSpectraBase or LCMSBase objects 235 236 Returns 237 ------- 238 None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase 239 object with a pandas DataFrame of the 'scan_info' from the HDF5 file. 240 241 """ 242 scan_df = self.get_scan_df() 243 mass_spectra.scan_df = scan_df 244 245 def import_ms_unprocessed(self, mass_spectra) -> None: 246 """Imports the unprocessed mass spectra from the HDF5 file. 247 248 Parameters 249 ---------- 250 lcms : LCMSBase | MassSpectraBase 251 The MassSpectraBase or LCMSBase objects 252 253 Returns 254 ------- 255 None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase 256 object with a dictionary of the 'ms_unprocessed' from the HDF5 file. 257 258 """ 259 ms_unprocessed = self.get_ms_raw() 260 mass_spectra._ms_unprocessed = ms_unprocessed 261 262 def import_parameters(self, mass_spectra) -> None: 263 """Imports the parameters from the HDF5 file. 264 265 Parameters 266 ---------- 267 mass_spectra : LCMSBase | MassSpectraBase 268 The MassSpectraBase or LCMSBase object to populate with parameters. 269 270 Returns 271 ------- 272 None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase 273 object with a dictionary of the 'parameters' from the HDF5 file. 
274 275 """ 276 if ".json" == self.parameters_location.suffix: 277 load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location) 278 if ".toml" == self.parameters_location.suffix: 279 load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location) 280 else: 281 raise Exception( 282 "Parameters file must be in JSON format, TOML format is not yet supported." 283 ) 284 285 def import_mass_features(self, mass_spectra) -> None: 286 """Imports the mass features from the HDF5 file. 287 288 Parameters 289 ---------- 290 mass_spectra : LCMSBase | MassSpectraBase 291 The MassSpectraBase or LCMSBase object to populate with mass features. 292 293 Returns 294 ------- 295 None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase 296 object with a dictionary of the 'mass_features' from the HDF5 file. 297 298 """ 299 dict_group_load = self.h5pydata["mass_features"] 300 dict_group_keys = dict_group_load.keys() 301 for k in dict_group_keys: 302 # Instantiate the MassFeature object 303 mass_feature = LCMSMassFeature( 304 mass_spectra, 305 mz=dict_group_load[k].attrs["_mz_exp"], 306 retention_time=dict_group_load[k].attrs["_retention_time"], 307 intensity=dict_group_load[k].attrs["_intensity"], 308 apex_scan=dict_group_load[k].attrs["_apex_scan"], 309 persistence=dict_group_load[k].attrs["_persistence"], 310 id=int(k), 311 ) 312 313 # Populate additional attributes on the MassFeature object 314 for key in dict_group_load[k].attrs.keys() - { 315 "_mz_exp", 316 "_mz_cal", 317 "_retention_time", 318 "_intensity", 319 "_apex_scan", 320 "_persistence", 321 }: 322 setattr(mass_feature, key, dict_group_load[k].attrs[key]) 323 324 # Populate attributes on MassFeature object that are lists 325 for key in dict_group_load[k].keys(): 326 setattr(mass_feature, key, dict_group_load[k][key][:]) 327 328 mass_spectra.mass_features[int(k)] = mass_feature 329 330 # Associate mass features with ms1 and ms2 spectra, if available 331 for mf_id in 
mass_spectra.mass_features.keys(): 332 if mass_spectra.mass_features[mf_id].apex_scan in mass_spectra._ms.keys(): 333 mass_spectra.mass_features[mf_id].mass_spectrum = mass_spectra._ms[ 334 mass_spectra.mass_features[mf_id].apex_scan 335 ] 336 if mass_spectra.mass_features[mf_id].ms2_scan_numbers is not None: 337 for ms2_scan in mass_spectra.mass_features[mf_id].ms2_scan_numbers: 338 if ms2_scan in mass_spectra._ms.keys(): 339 mass_spectra.mass_features[mf_id].ms2_mass_spectra[ms2_scan] = ( 340 mass_spectra._ms[ms2_scan] 341 ) 342 343 def import_eics(self, mass_spectra): 344 """Imports the extracted ion chromatograms from the HDF5 file. 345 346 Parameters 347 ---------- 348 mass_spectra : LCMSBase | MassSpectraBase 349 The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms. 350 351 Returns 352 ------- 353 None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase 354 object with a dictionary of the 'eics' from the HDF5 file. 355 356 """ 357 dict_group_load = self.h5pydata["eics"] 358 dict_group_keys = dict_group_load.keys() 359 for k in dict_group_keys: 360 my_eic = EIC_Data( 361 scans=dict_group_load[k]["scans"][:], 362 time=dict_group_load[k]["time"][:], 363 eic=dict_group_load[k]["eic"][:], 364 ) 365 for key in dict_group_load[k].keys(): 366 if key not in ["scans", "time", "eic"]: 367 setattr(my_eic, key, dict_group_load[k][key][:]) 368 # if key is apexes, convert to a tuple of a list 369 if key == "apexes" and len(my_eic.apexes) > 0: 370 my_eic.apexes = [tuple(x) for x in my_eic.apexes] 371 # Add to mass_spectra object 372 mass_spectra.eics[dict_group_load[k].attrs["mz"]] = my_eic 373 374 # Add to mass features 375 for idx in mass_spectra.mass_features.keys(): 376 mz = mass_spectra.mass_features[idx].mz 377 if mz in mass_spectra.eics.keys(): 378 mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz] 379 380 def import_spectral_search_results(self, mass_spectra): 381 """Imports the spectral search results 
from the HDF5 file. 382 383 Parameters 384 ---------- 385 mass_spectra : LCMSBase | MassSpectraBase 386 The MassSpectraBase or LCMSBase object to populate with spectral search results. 387 388 Returns 389 ------- 390 None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase 391 object with a dictionary of the 'spectral_search_results' from the HDF5 file. 392 393 """ 394 overall_results_dict = {} 395 ms2_results_load = self.h5pydata["spectral_search_results"] 396 for k in ms2_results_load.keys(): 397 overall_results_dict[int(k)] = {} 398 for k2 in ms2_results_load[k].keys(): 399 ms2_search_res = SpectrumSearchResults( 400 query_spectrum=mass_spectra._ms[int(k)], 401 precursor_mz=ms2_results_load[k][k2].attrs["precursor_mz"], 402 spectral_similarity_search_results={}, 403 ) 404 405 for key in ms2_results_load[k][k2].keys() - {"precursor_mz"}: 406 setattr(ms2_search_res, key, list(ms2_results_load[k][k2][key][:])) 407 overall_results_dict[int(k)][ 408 ms2_results_load[k][k2].attrs["precursor_mz"] 409 ] = ms2_search_res 410 411 # add to mass_spectra 412 mass_spectra.spectral_search_results.update(overall_results_dict) 413 414 # If there are mass features, associate the results with each mass feature 415 if len(mass_spectra.mass_features) > 0: 416 for mass_feature_id, mass_feature in mass_spectra.mass_features.items(): 417 scan_ids = mass_feature.ms2_scan_numbers 418 for ms2_scan_id in scan_ids: 419 precursor_mz = mass_feature.mz 420 try: 421 mass_spectra.spectral_search_results[ms2_scan_id][precursor_mz] 422 except KeyError: 423 pass 424 else: 425 mass_spectra.mass_features[ 426 mass_feature_id 427 ].ms2_similarity_results.append( 428 mass_spectra.spectral_search_results[ms2_scan_id][ 429 precursor_mz 430 ] 431 ) 432 433 def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase: 434 """ 435 Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file. 
436 437 Parameters 438 ---------- 439 load_raw : bool 440 If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True. 441 442 """ 443 # Instantiate the LCMS object 444 spectra_obj = MassSpectraBase( 445 file_location=self.file_location, 446 analyzer=self.analyzer, 447 instrument_label=self.instrument_label, 448 sample_name=self.sample_name, 449 ) 450 451 # This will populate the _ms list on the LCMS or MassSpectraBase object 452 self.run(spectra_obj, load_raw=load_raw) 453 454 return spectra_obj 455 456 def get_lcms_obj( 457 self, load_raw=True, use_original_parser=True, raw_file_path=None 458 ) -> LCMSBase: 459 """ 460 Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file. 461 462 Parameters 463 ---------- 464 load_raw : bool 465 If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True. 466 use_original_parser : bool 467 If True, use the original parser to populate the LCMS object. Default is True. 468 raw_file_path : str 469 The location of the raw file to parse if attempting to use original parser. 470 Default is None, which attempts to get the raw file path from the HDF5 file. 471 If the original file path has moved, this parameter can be used to specify the new location. 
472 """ 473 # Instantiate the LCMS object 474 lcms_obj = LCMSBase( 475 file_location=self.file_location, 476 analyzer=self.analyzer, 477 instrument_label=self.instrument_label, 478 sample_name=self.sample_name, 479 ) 480 481 # This will populate the majority of the attributes on the LCMS object 482 self.run(lcms_obj, load_raw=load_raw) 483 484 # Set final attributes of the LCMS object 485 lcms_obj.polarity = self.h5pydata.attrs["polarity"] 486 lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan) 487 lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time) 488 lcms_obj._tic_list = list(lcms_obj.scan_df.tic) 489 490 # If use_original_parser is True, instantiate the original parser and populate the LCMS object 491 if use_original_parser: 492 lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path) 493 494 return lcms_obj 495 496 def add_original_parser(self, mass_spectra, raw_file_path=None): 497 """ 498 Add the original parser to the mass spectra object. 499 500 Parameters 501 ---------- 502 mass_spectra : MassSpectraBase | LCMSBase 503 The MassSpectraBase or LCMSBase object to add the original parser to. 504 raw_file_path : str 505 The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file. 506 """ 507 # Try to get the raw file path from the HDF5 file 508 if raw_file_path is None: 509 raw_file_path = self.h5pydata.attrs["original_file_location"] 510 # Check if og_file_location exists, if not raise an error 511 raw_file_path = self.h5pydata.attrs["original_file_location"] 512 513 raw_file_path = Path(raw_file_path) 514 if not raw_file_path.exists(): 515 raise FileExistsError( 516 "File does not exist: " + str(raw_file_path), 517 ". 
Cannot use original parser for instatiating the lcms_obj.", 518 ) 519 520 # Get the original parser type 521 og_parser_type = self.h5pydata.attrs["parser_type"] 522 523 if og_parser_type == "ImportMassSpectraThermoMSFileReader": 524 parser = ImportMassSpectraThermoMSFileReader(raw_file_path) 525 elif og_parser_type == "MZMLSpectraParser": 526 parser = MZMLSpectraParser(raw_file_path) 527 528 mass_spectra.spectra_parser_class = parser.__class__ 529 mass_spectra.spectra_parser = parser 530 531 return mass_spectra 532 533 def get_creation_time(self): 534 """ 535 Raise a NotImplemented Warning, as creation time is not available in CoreMS HDF5 files and returning None. 536 """ 537 warnings.warn( 538 "Creation time is not available in CoreMS HDF5 files, returning None." \ 539 "This should be accessed through the original parser.", 540 ) 541 return None 542 543 def get_instrument_info(self): 544 """ 545 Raise a NotImplemented Warning, as instrument info is not available in CoreMS HDF5 files and returning None. 546 """ 547 warnings.warn( 548 "Instrument info is not available in CoreMS HDF5 files, returning None." \ 549 "This should be accessed through the original parser.", 550 ) 551 return None
class ReadCoreMSHDFMassSpectra(
    SpectraParserInterface, ReadCoreMSHDF_MassSpectrum, Thread
):
    """Class to read CoreMS HDF5 files and populate an LCMS or MassSpectraBase object.

    Parameters
    ----------
    file_location : str
        The location of the HDF5 file to read, including the suffix.

    Attributes
    ----------
    file_location : str
        The location of the HDF5 file to read.
    h5pydata : h5py.File
        The HDF5 file object.
    scans : list
        A list of the location of individual mass spectra within the HDF5 file.
    scan_number_list : list
        A list of the scan numbers of the mass spectra within the HDF5 file.
    parameters_location : pathlib.Path or None
        The location of the parameters file (json or toml) found next to the
        HDF5 file, or None if no such file exists.

    Methods
    -------
    * import_mass_spectra(mass_spectra).
        Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
    * get_mass_spectrum_from_scan(scan_number).
        Return mass spectrum data object from scan number.
    * load().
        Placeholder method to meet the requirements of the SpectraParserInterface.
    * get_ms_raw().
        Return the unprocessed mass spectra stored in the HDF5 file.
    * get_scan_df().
        Return the scan information stored in the HDF5 file as a DataFrame.
    * run(mass_spectra).
        Runs the importer functions to populate a LCMS or MassSpectraBase object.
    * import_scan_info(mass_spectra).
        Imports the scan info from the HDF5 file to populate the scan_df attribute
        on the LCMS or MassSpectraBase object
    * import_ms_unprocessed(mass_spectra).
        Imports the unprocessed mass spectra from the HDF5 file to populate the
        _ms_unprocessed attribute on the LCMS or MassSpectraBase object
    * import_parameters(mass_spectra).
        Imports the parameters from the sidecar json/toml file to populate the
        parameters attribute on the LCMS or MassSpectraBase object
    * import_mass_features(mass_spectra).
        Imports the mass features from the HDF5 file to populate the mass_features
        attribute on the LCMS or MassSpectraBase object
    * import_eics(mass_spectra).
        Imports the extracted ion chromatograms from the HDF5 file to populate the
        eics attribute on the LCMS or MassSpectraBase object
    * import_spectral_search_results(mass_spectra).
        Imports the spectral search results from the HDF5 file to populate the
        spectral_search_results attribute on the LCMS or MassSpectraBase object
    * get_mass_spectra_obj().
        Return mass spectra data object, populating the _ms list on the LCMS or
        MassSpectraBase object from the HDF5 file
    * get_lcms_obj().
        Return LCMSBase object, populating the majority of the attributes on the
        LCMS object from the HDF5 file
    * add_original_parser(mass_spectra).
        Attach the parser of the original raw file to the mass spectra object.

    """

    def __init__(self, file_location: str):
        Thread.__init__(self)
        ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)

        # Override the scans attribute set by ReadCoreMSHDF_MassSpectrum: in this
        # file layout the spectra are nested under the "mass_spectra" group.
        scan_keys = list(self.h5pydata["mass_spectra"].keys())
        self.scans = ["mass_spectra/" + x for x in scan_keys]
        self.scan_number_list = sorted(int(float(i)) for i in scan_keys)

        # Locate the parameters file (json or toml) that shares the HDF5 file's
        # stem; a JSON file takes precedence over a TOML file.
        add_files = [
            x
            for x in self.file_location.parent.glob(
                self.file_location.name.replace(".hdf5", ".*")
            )
            if x.suffix != ".hdf5"
        ]
        json_files = [x for x in add_files if x.suffix == ".json"]
        toml_files = [x for x in add_files if x.suffix == ".toml"]
        if json_files:
            self.parameters_location = json_files[0]
        elif toml_files:
            self.parameters_location = toml_files[0]
        else:
            self.parameters_location = None

    def get_mass_spectrum_from_scan(self, scan_number):
        """Return mass spectrum data object from scan number.

        Raises
        ------
        Exception
            If scan_number is not in scan_number_list.
        """
        if scan_number in self.scan_number_list:
            mass_spec = self.get_mass_spectrum(scan_number)
            return mass_spec
        else:
            raise Exception("Scan number not found in HDF5 file.")

    def load(self) -> None:
        """Placeholder method to meet the requirements of the SpectraParserInterface."""
        pass

    def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
        """Return the unprocessed mass spectra saved in the HDF5 file.

        Parameters
        ----------
        spectra : optional
            Ignored; only the data saved in the HDF5 file can be returned.
            A warning is emitted if a value is passed.
        scan_df : optional
            Ignored; a warning is emitted if a value is passed.

        Returns
        -------
        dict
            Maps each integer key of the 'ms_unprocessed' group to a
            pandas DataFrame with the columns 'scan', 'mz' and 'intensity'.
        """
        # These arguments exist only for signature compatibility; tell the
        # caller that they have no effect instead of silently ignoring them.
        if spectra is not None or scan_df is not None:
            warnings.warn(
                "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation.",
                SyntaxWarning,
                stacklevel=2,
            )
        ms_unprocessed = {}
        dict_group_load = self.h5pydata["ms_unprocessed"]
        for k in dict_group_load.keys():
            ms_up_int = dict_group_load[k][:]
            ms_unprocessed[int(k)] = pd.DataFrame(
                ms_up_int, columns=["scan", "mz", "intensity"]
            )
        return ms_unprocessed

    def get_scan_df(self) -> pd.DataFrame:
        """Return the 'scan_info' group of the HDF5 file as a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            One column per dataset in 'scan_info', indexed by the 'scan'
            column (which is also kept as a regular column). Object-dtype
            columns are decoded from bytes as UTF-8.
        """
        dict_group_load = self.h5pydata["scan_info"]
        scan_info = {k: dict_group_load[k][:] for k in dict_group_load.keys()}
        scan_df = pd.DataFrame(scan_info)
        scan_df.set_index("scan", inplace=True, drop=False)
        # Decode column by column so that a frame without any object columns
        # (or with repeated scan numbers) does not break the conversion.
        for col in scan_df.select_dtypes([object]).columns:
            scan_df[col] = scan_df[col].str.decode("utf-8")
        return scan_df

    def run(self, mass_spectra, load_raw=True) -> None:
        """Runs the importer functions to populate a LCMS or MassSpectraBase object.

        Notes
        -----
        The following functions are run in order, if the HDF5 file contains the necessary data:
        1. import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
        2. import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
        3. import_scan_info(), which populates the scan_df attribute on the LCMS or MassSpectraBase object.
        4. import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
        5. import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
        6. import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
        7. import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.

        Parameters
        ----------
        mass_spectra : LCMSBase or MassSpectraBase
            The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

        Returns
        -------
        None, but populates several attributes on the LCMS or MassSpectraBase object.

        """
        if self.parameters_location is not None:
            # Populate the parameters attribute on the LCMS object
            self.import_parameters(mass_spectra)

        if "mass_spectra" in self.h5pydata:
            # Populate the _ms list on the LCMS object
            self.import_mass_spectra(mass_spectra, load_raw=load_raw)

        if "scan_info" in self.h5pydata:
            # Populate the scan_df attribute on the LCMS object
            self.import_scan_info(mass_spectra)

        if "ms_unprocessed" in self.h5pydata and load_raw:
            # Populate the _ms_unprocessed attribute on the LCMS object
            self.import_ms_unprocessed(mass_spectra)

        if "mass_features" in self.h5pydata:
            # Populate the mass_features attribute on the LCMS object
            self.import_mass_features(mass_spectra)

        if "eics" in self.h5pydata:
            # Populate the eics attribute on the LCMS object
            self.import_eics(mass_spectra)

        if "spectral_search_results" in self.h5pydata:
            # Populate the spectral_search_results attribute on the LCMS object
            self.import_spectral_search_results(mass_spectra)

    def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
        """Imports all mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass spectra.
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.

        Returns
        -------
        None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
        object with mass spectra from the HDF5 file.
        """
        for scan_number in self.scan_number_list:
            mass_spec = self.get_mass_spectrum(scan_number, load_raw=load_raw)
            mass_spec.scan_number = scan_number
            mass_spectra.add_mass_spectrum(mass_spec)

    def import_scan_info(self, mass_spectra) -> None:
        """Imports the scan info from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with scan info.

        Returns
        -------
        None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
        object with a pandas DataFrame of the 'scan_info' from the HDF5 file.

        """
        scan_df = self.get_scan_df()
        mass_spectra.scan_df = scan_df

    def import_ms_unprocessed(self, mass_spectra) -> None:
        """Imports the unprocessed mass spectra from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with unprocessed spectra.

        Returns
        -------
        None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'ms_unprocessed' from the HDF5 file.

        """
        ms_unprocessed = self.get_ms_raw()
        mass_spectra._ms_unprocessed = ms_unprocessed

    def import_parameters(self, mass_spectra) -> None:
        """Imports the parameters from the json or toml file found next to the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with parameters.

        Returns
        -------
        None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
        object from the parameters file.

        Raises
        ------
        Exception
            If the parameters file is neither a JSON nor a TOML file.

        """
        # The branches must be mutually exclusive: with two independent `if`
        # statements a JSON file was loaded and then rejected by the `else`.
        if ".json" == self.parameters_location.suffix:
            load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
        elif ".toml" == self.parameters_location.suffix:
            load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
        else:
            raise Exception("Parameters file must be in JSON or TOML format.")

    def import_mass_features(self, mass_spectra) -> None:
        """Imports the mass features from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with mass features.

        Returns
        -------
        None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'mass_features' from the HDF5 file.

        """
        # Attributes consumed by the constructor (or deliberately not copied)
        constructor_attrs = {
            "_mz_exp",
            "_mz_cal",
            "_retention_time",
            "_intensity",
            "_apex_scan",
            "_persistence",
        }
        dict_group_load = self.h5pydata["mass_features"]
        for k in dict_group_load.keys():
            feature_group = dict_group_load[k]
            feature_attrs = feature_group.attrs

            # Instantiate the MassFeature object
            mass_feature = LCMSMassFeature(
                mass_spectra,
                mz=feature_attrs["_mz_exp"],
                retention_time=feature_attrs["_retention_time"],
                intensity=feature_attrs["_intensity"],
                apex_scan=feature_attrs["_apex_scan"],
                persistence=feature_attrs["_persistence"],
                id=int(k),
            )

            # Populate additional attributes on the MassFeature object
            for key in feature_attrs.keys() - constructor_attrs:
                setattr(mass_feature, key, feature_attrs[key])

            # Populate attributes on MassFeature object that are lists
            for key in feature_group.keys():
                setattr(mass_feature, key, feature_group[key][:])

            mass_spectra.mass_features[int(k)] = mass_feature

        # Associate mass features with ms1 and ms2 spectra, if available
        for mf_id in mass_spectra.mass_features.keys():
            mass_feature = mass_spectra.mass_features[mf_id]
            if mass_feature.apex_scan in mass_spectra._ms.keys():
                mass_feature.mass_spectrum = mass_spectra._ms[mass_feature.apex_scan]
            if mass_feature.ms2_scan_numbers is not None:
                for ms2_scan in mass_feature.ms2_scan_numbers:
                    if ms2_scan in mass_spectra._ms.keys():
                        mass_feature.ms2_mass_spectra[ms2_scan] = mass_spectra._ms[
                            ms2_scan
                        ]

    def import_eics(self, mass_spectra):
        """Imports the extracted ion chromatograms from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.

        Returns
        -------
        None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'eics' from the HDF5 file.

        """
        dict_group_load = self.h5pydata["eics"]
        for k in dict_group_load.keys():
            eic_group = dict_group_load[k]
            my_eic = EIC_Data(
                scans=eic_group["scans"][:],
                time=eic_group["time"][:],
                eic=eic_group["eic"][:],
            )
            for key in eic_group.keys():
                if key not in ["scans", "time", "eic"]:
                    setattr(my_eic, key, eic_group[key][:])
                    # if key is apexes, convert each row to a tuple
                    if key == "apexes" and len(my_eic.apexes) > 0:
                        my_eic.apexes = [tuple(x) for x in my_eic.apexes]
            # Add to mass_spectra object, keyed by the EIC's m/z
            mass_spectra.eics[eic_group.attrs["mz"]] = my_eic

        # Add to mass features whose m/z matches an EIC key exactly
        for idx in mass_spectra.mass_features.keys():
            mz = mass_spectra.mass_features[idx].mz
            if mz in mass_spectra.eics.keys():
                mass_spectra.mass_features[idx]._eic_data = mass_spectra.eics[mz]

    def import_spectral_search_results(self, mass_spectra):
        """Imports the spectral search results from the HDF5 file.

        Parameters
        ----------
        mass_spectra : LCMSBase | MassSpectraBase
            The MassSpectraBase or LCMSBase object to populate with spectral search results.

        Returns
        -------
        None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
        object with a dictionary of the 'spectral_search_results' from the HDF5 file.

        """
        overall_results_dict = {}
        ms2_results_load = self.h5pydata["spectral_search_results"]
        for k in ms2_results_load.keys():
            scan_number = int(k)
            overall_results_dict[scan_number] = {}
            for k2 in ms2_results_load[k].keys():
                result_group = ms2_results_load[k][k2]
                precursor_mz = result_group.attrs["precursor_mz"]
                ms2_search_res = SpectrumSearchResults(
                    query_spectrum=mass_spectra._ms[scan_number],
                    precursor_mz=precursor_mz,
                    spectral_similarity_search_results={},
                )

                for key in result_group.keys() - {"precursor_mz"}:
                    setattr(ms2_search_res, key, list(result_group[key][:]))
                overall_results_dict[scan_number][precursor_mz] = ms2_search_res

        # add to mass_spectra
        mass_spectra.spectral_search_results.update(overall_results_dict)

        # If there are mass features, associate the results with each mass feature
        for mass_feature in mass_spectra.mass_features.values():
            scan_ids = mass_feature.ms2_scan_numbers
            # Mass features without MS2 scans carry no results to attach
            # (same guard as in import_mass_features).
            if scan_ids is None:
                continue
            precursor_mz = mass_feature.mz
            for ms2_scan_id in scan_ids:
                try:
                    search_result = mass_spectra.spectral_search_results[ms2_scan_id][
                        precursor_mz
                    ]
                except KeyError:
                    # No stored result for this scan / precursor combination
                    continue
                mass_feature.ms2_similarity_results.append(search_result)

    def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
        """
        Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.

        Parameters
        ----------
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.

        """
        # Instantiate the MassSpectraBase object
        spectra_obj = MassSpectraBase(
            file_location=self.file_location,
            analyzer=self.analyzer,
            instrument_label=self.instrument_label,
            sample_name=self.sample_name,
        )

        # This will populate the _ms list on the MassSpectraBase object
        self.run(spectra_obj, load_raw=load_raw)

        return spectra_obj

    def get_lcms_obj(
        self, load_raw=True, use_original_parser=True, raw_file_path=None
    ) -> LCMSBase:
        """
        Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.

        Parameters
        ----------
        load_raw : bool
            If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
        use_original_parser : bool
            If True, use the original parser to populate the LCMS object. Default is True.
        raw_file_path : str
            The location of the raw file to parse if attempting to use original parser.
            Default is None, which attempts to get the raw file path from the HDF5 file.
            If the original file path has moved, this parameter can be used to specify the new location.
        """
        # Instantiate the LCMS object
        lcms_obj = LCMSBase(
            file_location=self.file_location,
            analyzer=self.analyzer,
            instrument_label=self.instrument_label,
            sample_name=self.sample_name,
        )

        # This will populate the majority of the attributes on the LCMS object
        self.run(lcms_obj, load_raw=load_raw)

        # Set final attributes of the LCMS object
        lcms_obj.polarity = self.h5pydata.attrs["polarity"]
        lcms_obj._scans_number_list = list(lcms_obj.scan_df.scan)
        lcms_obj._retention_time_list = list(lcms_obj.scan_df.scan_time)
        lcms_obj._tic_list = list(lcms_obj.scan_df.tic)

        # If use_original_parser is True, instantiate the original parser and populate the LCMS object
        if use_original_parser:
            lcms_obj = self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)

        return lcms_obj

    def add_original_parser(self, mass_spectra, raw_file_path=None):
        """
        Add the original parser to the mass spectra object.

        Parameters
        ----------
        mass_spectra : MassSpectraBase | LCMSBase
            The MassSpectraBase or LCMSBase object to add the original parser to.
        raw_file_path : str
            The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.

        Returns
        -------
        The same mass_spectra object, with 'spectra_parser_class' and
        'spectra_parser' set.

        Raises
        ------
        FileExistsError
            If the raw file does not exist at the resolved location.
        ValueError
            If the parser type stored in the HDF5 file is not supported.
        """
        # Fall back to the path stored in the HDF5 file only when the caller
        # did not supply one, so a relocated raw file can be pointed to.
        if raw_file_path is None:
            raw_file_path = self.h5pydata.attrs["original_file_location"]

        raw_file_path = Path(raw_file_path)
        if not raw_file_path.exists():
            # NOTE: FileExistsError is kept for backward compatibility with
            # callers that already catch it. The message is a single string
            # because OSError treats two positional arguments as
            # (errno, strerror).
            raise FileExistsError(
                "File does not exist: "
                + str(raw_file_path)
                + ". Cannot use original parser for instantiating the lcms_obj."
            )

        # Get the original parser type
        og_parser_type = self.h5pydata.attrs["parser_type"]

        if og_parser_type == "ImportMassSpectraThermoMSFileReader":
            parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
        elif og_parser_type == "MZMLSpectraParser":
            parser = MZMLSpectraParser(raw_file_path)
        else:
            raise ValueError(
                "Unsupported original parser type: " + str(og_parser_type)
            )

        mass_spectra.spectra_parser_class = parser.__class__
        mass_spectra.spectra_parser = parser

        return mass_spectra

    def get_creation_time(self):
        """
        Emit a warning, as creation time is not available in CoreMS HDF5 files, and return None.
        """
        warnings.warn(
            "Creation time is not available in CoreMS HDF5 files, returning None. "
            "This should be accessed through the original parser.",
        )
        return None

    def get_instrument_info(self):
        """
        Emit a warning, as instrument info is not available in CoreMS HDF5 files, and return None.
        """
        warnings.warn(
            "Instrument info is not available in CoreMS HDF5 files, returning None. "
            "This should be accessed through the original parser.",
        )
        return None
Class to read CoreMS HDF5 files and populate an LCMS or MassSpectraBase object.
Parameters
- file_location (str): The location of the HDF5 file to read, including the suffix.
Attributes
- file_location (str): The location of the HDF5 file to read.
- h5pydata (h5py.File): The HDF5 file object.
- scans (list): A list of the location of individual mass spectra within the HDF5 file.
- scan_number_list (list): A list of the scan numbers of the mass spectra within the HDF5 file.
- parameters_location (str): The location of the parameters file (json or toml).
Methods
- import_mass_spectra(mass_spectra). Imports all mass spectra from the HDF5 file onto the LCMS or MassSpectraBase object.
- get_mass_spectrum_from_scan(scan_number). Return mass spectrum data object from scan number.
- load(). Placeholder method to meet the requirements of the SpectraParserInterface.
- run(mass_spectra). Runs the importer functions to populate a LCMS or MassSpectraBase object.
- import_scan_info(mass_spectra). Imports the scan info from the HDF5 file to populate the scan_df attribute on the LCMS or MassSpectraBase object.
- import_ms_unprocessed(mass_spectra). Imports the unprocessed mass spectra from the HDF5 file to populate the _ms_unprocessed attribute on the LCMS or MassSpectraBase object
- import_parameters(mass_spectra). Imports the parameters from the HDF5 file to populate the parameters attribute on the LCMS or MassSpectraBase object
- import_mass_features(mass_spectra). Imports the mass features from the HDF5 file to populate the mass_features attribute on the LCMS or MassSpectraBase object
- import_eics(mass_spectra). Imports the extracted ion chromatograms from the HDF5 file to populate the eics attribute on the LCMS or MassSpectraBase object
- import_spectral_search_results(mass_spectra). Imports the spectral search results from the HDF5 file to populate the spectral_search_results attribute on the LCMS or MassSpectraBase object
- get_mass_spectra_obj(). Return mass spectra data object, populating the _ms list on the LCMS or MassSpectraBase object from the HDF5 file
- get_lcms_obj(). Return LCMSBase object, populating the majority of the attributes on the LCMS object from the HDF5 file
def __init__(self, file_location: str):
    """Open the CoreMS HDF5 file and index its spectra and parameters file.

    Parameters
    ----------
    file_location : str
        The location of the HDF5 file to read, including the suffix.
    """
    Thread.__init__(self)
    ReadCoreMSHDF_MassSpectrum.__init__(self, file_location)

    # The parent class expects spectra at the top level of the file; here they
    # are nested under the "mass_spectra" group, so rebuild the scan locations.
    spectrum_keys = list(self.h5pydata["mass_spectra"].keys())
    self.scans = ["mass_spectra/" + key for key in spectrum_keys]
    self.scan_number_list = sorted([int(float(key)) for key in spectrum_keys])

    # Look for a parameters file sharing the HDF5 file's name; a JSON file is
    # preferred over a TOML file, and None is stored when neither is found.
    sibling_pattern = self.file_location.name.replace(".hdf5", ".*")
    siblings = [
        candidate
        for candidate in self.file_location.parent.glob(sibling_pattern)
        if candidate.suffix != ".hdf5"
    ]
    for wanted_suffix in (".json", ".toml"):
        matches = [path for path in siblings if path.suffix == wanted_suffix]
        if len(matches) > 0:
            self.parameters_location = matches[0]
            break
    else:
        self.parameters_location = None
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
def get_mass_spectrum_from_scan(self, scan_number):
    """Return the mass spectrum data object stored for ``scan_number``.

    Raises
    ------
    Exception
        If ``scan_number`` is not listed in ``scan_number_list``.
    """
    # Guard clause: reject unknown scans before touching the file
    if scan_number not in self.scan_number_list:
        raise Exception("Scan number not found in HDF5 file.")
    return self.get_mass_spectrum(scan_number)
Return mass spectrum data object from scan number.
def get_ms_raw(self, spectra=None, scan_df=None) -> dict:
    """Return the unprocessed mass spectra saved in the HDF5 file.

    Parameters
    ----------
    spectra : optional
        Ignored; only the data saved in the HDF5 file can be returned.
        A warning is emitted if a value is passed.
    scan_df : optional
        Ignored; a warning is emitted if a value is passed.

    Returns
    -------
    dict
        Maps each integer key of the 'ms_unprocessed' group to a
        pandas DataFrame with the columns 'scan', 'mz' and 'intensity'.
    """
    # The arguments exist only for signature compatibility. The warning must
    # go through warnings.warn: merely instantiating the warning class, as was
    # done before, never notifies the caller.
    if spectra is not None or scan_df is not None:
        warnings.warn(
            "get_ms_raw method for CoreMS HDF5 files can only access saved data, consider rerunning after instantiation.",
            SyntaxWarning,
            stacklevel=2,
        )
    ms_unprocessed = {}
    dict_group_load = self.h5pydata["ms_unprocessed"]
    for k in dict_group_load.keys():
        ms_up_int = dict_group_load[k][:]
        ms_unprocessed[int(k)] = pd.DataFrame(
            ms_up_int, columns=["scan", "mz", "intensity"]
        )
    return ms_unprocessed
def get_scan_df(self) -> pd.DataFrame:
    """Return the 'scan_info' group of the HDF5 file as a pandas DataFrame.

    Returns
    -------
    pd.DataFrame
        One column per dataset in 'scan_info', indexed by the 'scan'
        column (which is also kept as a regular column). Object-dtype
        columns are decoded from bytes as UTF-8.
    """
    dict_group_load = self.h5pydata["scan_info"]
    scan_info = {k: dict_group_load[k][:] for k in dict_group_load.keys()}
    scan_df = pd.DataFrame(scan_info)
    scan_df.set_index("scan", inplace=True, drop=False)
    # Decode column by column rather than via stack()/unstack(): the stacked
    # form fails when there are no object columns to decode and when scan
    # numbers repeat in the index.
    for col in scan_df.select_dtypes([object]).columns:
        scan_df[col] = scan_df[col].str.decode("utf-8")
    return scan_df
Return scan data as a pandas DataFrame.
def run(self, mass_spectra, load_raw=True) -> None:
    """Populate an LCMS or MassSpectraBase object from the HDF5 file.

    Notes
    -----
    Importers run in this order, each only when its precondition holds:
    1. import_parameters() when ``self.parameters_location`` is not None.
    2. import_mass_spectra() when the 'mass_spectra' group is present.
    3. import_scan_info() when the 'scan_info' group is present.
    4. import_ms_unprocessed() when 'ms_unprocessed' is present and ``load_raw`` is true.
    5. import_mass_features() when the 'mass_features' group is present.
    6. import_eics() when the 'eics' group is present.
    7. import_spectral_search_results() when 'spectral_search_results' is present.

    Parameters
    ----------
    mass_spectra : LCMSBase or MassSpectraBase
        The object to populate.
    load_raw : bool
        Forwarded to import_mass_spectra() and gates import_ms_unprocessed().
        Default is True.

    Returns
    -------
    None, but populates several attributes on ``mass_spectra``.
    """
    h5 = self.h5pydata

    if self.parameters_location is not None:
        self.import_parameters(mass_spectra)

    if "mass_spectra" in h5:
        self.import_mass_spectra(mass_spectra, load_raw=load_raw)

    if "scan_info" in h5:
        self.import_scan_info(mass_spectra)

    if load_raw and "ms_unprocessed" in h5:
        self.import_ms_unprocessed(mass_spectra)

    # The remaining importers share one shape: run if the group exists.
    trailing_importers = (
        ("mass_features", self.import_mass_features),
        ("eics", self.import_eics),
        ("spectral_search_results", self.import_spectral_search_results),
    )
    for group_name, importer in trailing_importers:
        if group_name in h5:
            importer(mass_spectra)
Runs the importer functions to populate a LCMS or MassSpectraBase object.
Notes
The following functions are run in order, if the HDF5 file contains the necessary data:
- import_parameters(), which populates the parameters attribute on the LCMS or MassSpectraBase object.
- import_mass_spectra(), which populates the _ms list on the LCMS or MassSpectraBase object.
- import_scan_info(), which populates the _scan_info on the LCMS or MassSpectraBase object.
- import_ms_unprocessed(), which populates the _ms_unprocessed attribute on the LCMS or MassSpectraBase object.
- import_mass_features(), which populates the mass_features attribute on the LCMS or MassSpectraBase object.
- import_eics(), which populates the eics attribute on the LCMS or MassSpectraBase object.
- import_spectral_search_results(), which populates the spectral_search_results attribute on the LCMS or MassSpectraBase object.
Parameters
- mass_spectra (LCMSBase or MassSpectraBase): The LCMS or MassSpectraBase object to populate with mass spectra, generally instantiated with only the file_location, analyzer, and instrument_label attributes.
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
- None, but populates several attributes on the LCMS or MassSpectraBase object.
def import_mass_spectra(self, mass_spectra, load_raw=True) -> None:
    """Add every mass spectrum in ``self.scan_number_list`` to ``mass_spectra``.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object that receives the spectra through ``add_mass_spectrum``.
    load_raw : bool
        Forwarded to ``self.get_mass_spectrum`` for each scan. Default is True.

    Returns
    -------
    None, but each spectrum is tagged with its scan number and added to
    ``mass_spectra``.
    """
    for scan in self.scan_number_list:
        spectrum = self.get_mass_spectrum(scan, load_raw=load_raw)
        # Tag the spectrum with its scan number before handing it over.
        spectrum.scan_number = scan
        mass_spectra.add_mass_spectrum(spectrum)
Imports all mass spectra from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass spectra.
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
Returns
- None, but populates the '_ms' list on the LCMSBase or MassSpectraBase
- object with mass spectra from the HDF5 file.
def import_scan_info(self, mass_spectra) -> None:
    """Attach the scan table from the HDF5 file to ``mass_spectra``.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object whose ``scan_df`` attribute is set.

    Returns
    -------
    None, but sets ``mass_spectra.scan_df`` to the result of
    ``self.get_scan_df()``.
    """
    mass_spectra.scan_df = self.get_scan_df()
Imports the scan info from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with scan info.
Returns
- None, but populates the 'scan_df' attribute on the LCMSBase or MassSpectraBase
- object with a pandas DataFrame of the 'scan_info' from the HDF5 file.
def import_ms_unprocessed(self, mass_spectra) -> None:
    """Attach the saved unprocessed spectra to ``mass_spectra``.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object whose ``_ms_unprocessed`` attribute is set.

    Returns
    -------
    None, but sets ``mass_spectra._ms_unprocessed`` to the dictionary returned
    by ``self.get_ms_raw()``.
    """
    mass_spectra._ms_unprocessed = self.get_ms_raw()
Imports the unprocessed mass spectra from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with unprocessed mass spectra.
Returns
- None, but populates the '_ms_unprocessed' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'ms_unprocessed' from the HDF5 file.
def import_parameters(self, mass_spectra) -> None:
    """Load parameters from the file at ``self.parameters_location``.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object to populate with parameters.

    Returns
    -------
    None, but passes ``mass_spectra`` and the parameters file to the JSON or
    TOML loader, chosen by the file suffix.

    Raises
    ------
    ValueError
        If the parameters file suffix is neither '.json' nor '.toml'.
    """
    suffix = self.parameters_location.suffix
    if suffix == ".json":
        load_and_set_json_parameters_lcms(mass_spectra, self.parameters_location)
    # Must be ``elif``: with two independent ``if`` statements a JSON file was
    # loaded successfully and then fell into the ``else`` branch and raised.
    elif suffix == ".toml":
        load_and_set_toml_parameters_lcms(mass_spectra, self.parameters_location)
    else:
        raise ValueError("Parameters file must be in JSON or TOML format.")
Imports the parameters from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with parameters.
Returns
- None, but populates the 'parameters' attribute on the LCMS or MassSpectraBase
- object with a dictionary of the 'parameters' from the HDF5 file.
def import_mass_features(self, mass_spectra) -> None:
    """Rebuild mass features from the 'mass_features' group of the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object to populate with mass features.

    Returns
    -------
    None, but fills ``mass_spectra.mass_features`` (keyed by integer feature id)
    and links each feature to spectra already present in ``mass_spectra._ms``.
    """
    features_group = self.h5pydata["mass_features"]
    # Attributes that are not copied by the generic setattr pass below.
    reserved_attrs = {
        "_mz_exp",
        "_mz_cal",
        "_retention_time",
        "_intensity",
        "_apex_scan",
        "_persistence",
    }

    for feature_key in features_group.keys():
        feature_group = features_group[feature_key]
        attrs = feature_group.attrs
        feature_id = int(feature_key)

        # Core values go through the constructor.
        feature = LCMSMassFeature(
            mass_spectra,
            mz=attrs["_mz_exp"],
            retention_time=attrs["_retention_time"],
            intensity=attrs["_intensity"],
            apex_scan=attrs["_apex_scan"],
            persistence=attrs["_persistence"],
            id=feature_id,
        )

        # Remaining scalar attributes are copied verbatim.
        for attr_name in attrs.keys() - reserved_attrs:
            setattr(feature, attr_name, attrs[attr_name])

        # Datasets stored under the feature become array-valued attributes.
        for dataset_name in feature_group.keys():
            setattr(feature, dataset_name, feature_group[dataset_name][:])

        mass_spectra.mass_features[feature_id] = feature

    # Associate mass features with ms1 and ms2 spectra, if available.
    spectra_by_scan = mass_spectra._ms
    for feature in mass_spectra.mass_features.values():
        if feature.apex_scan in spectra_by_scan.keys():
            feature.mass_spectrum = spectra_by_scan[feature.apex_scan]
        if feature.ms2_scan_numbers is not None:
            for ms2_scan in feature.ms2_scan_numbers:
                if ms2_scan in spectra_by_scan.keys():
                    feature.ms2_mass_spectra[ms2_scan] = spectra_by_scan[ms2_scan]
Imports the mass features from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with mass features.
Returns
- None, but populates the 'mass_features' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'mass_features' from the HDF5 file.
def import_eics(self, mass_spectra):
    """Rebuild extracted ion chromatograms from the 'eics' group of the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object to populate with extracted ion chromatograms.

    Returns
    -------
    None, but fills ``mass_spectra.eics`` (keyed by the group's 'mz' attribute)
    and sets ``_eic_data`` on every mass feature whose ``mz`` is a key of
    ``mass_spectra.eics``.
    """
    eics_group = self.h5pydata["eics"]
    constructor_fields = ("scans", "time", "eic")

    for eic_key in eics_group.keys():
        eic_group = eics_group[eic_key]
        eic_data = EIC_Data(
            scans=eic_group["scans"][:],
            time=eic_group["time"][:],
            eic=eic_group["eic"][:],
        )
        for field in eic_group.keys():
            if field in constructor_fields:
                continue
            setattr(eic_data, field, eic_group[field][:])
            # Stored apexes come back as an array; convert each row to a tuple.
            if field == "apexes" and len(eic_data.apexes) > 0:
                eic_data.apexes = [tuple(x) for x in eic_data.apexes]
        mass_spectra.eics[eic_group.attrs["mz"]] = eic_data

    # Link each mass feature to the EIC stored under exactly its m/z.
    for feature in mass_spectra.mass_features.values():
        mz = feature.mz
        if mz in mass_spectra.eics.keys():
            feature._eic_data = mass_spectra.eics[mz]
Imports the extracted ion chromatograms from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with extracted ion chromatograms.
Returns
- None, but populates the 'eics' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'eics' from the HDF5 file.
def import_spectral_search_results(self, mass_spectra):
    """Rebuild spectral search results from the HDF5 file.

    Parameters
    ----------
    mass_spectra : LCMSBase | MassSpectraBase
        The object to populate with spectral search results.

    Returns
    -------
    None, but updates ``mass_spectra.spectral_search_results`` (scan id ->
    precursor m/z -> result) and appends matching results to
    ``ms2_similarity_results`` on each mass feature.
    """
    results_group = self.h5pydata["spectral_search_results"]
    loaded_results = {}
    for scan_key in results_group.keys():
        scan_id = int(scan_key)
        scan_group = results_group[scan_key]
        loaded_results[scan_id] = {}
        for result_key in scan_group.keys():
            result_group = scan_group[result_key]
            precursor_mz = result_group.attrs["precursor_mz"]
            search_result = SpectrumSearchResults(
                query_spectrum=mass_spectra._ms[scan_id],
                precursor_mz=precursor_mz,
                spectral_similarity_search_results={},
            )
            # Every stored dataset becomes a list-valued attribute on the result.
            for key in result_group.keys() - {"precursor_mz"}:
                setattr(search_result, key, list(result_group[key][:]))
            loaded_results[scan_id][precursor_mz] = search_result

    mass_spectra.spectral_search_results.update(loaded_results)

    # Associate the results with each mass feature, if there are any.
    all_results = mass_spectra.spectral_search_results
    for mass_feature in mass_spectra.mass_features.values():
        scan_ids = mass_feature.ms2_scan_numbers
        # import_mass_features treats ms2_scan_numbers as possibly None; apply
        # the same check here so such features are skipped rather than crashing.
        if scan_ids is None:
            continue
        for ms2_scan_id in scan_ids:
            try:
                hit = all_results[ms2_scan_id][mass_feature.mz]
            except KeyError:
                # No result stored for this scan / precursor m/z pair.
                continue
            mass_feature.ms2_similarity_results.append(hit)
Imports the spectral search results from the HDF5 file.
Parameters
- mass_spectra (LCMSBase | MassSpectraBase): The MassSpectraBase or LCMSBase object to populate with spectral search results.
Returns
- None, but populates the 'spectral_search_results' attribute on the LCMSBase or MassSpectraBase
- object with a dictionary of the 'spectral_search_results' from the HDF5 file.
def get_mass_spectra_obj(self, load_raw=True) -> MassSpectraBase:
    """Create a MassSpectraBase object and populate it from the HDF5 file.

    Parameters
    ----------
    load_raw : bool
        Forwarded to ``self.run``. If True, load raw data (unprocessed) from the
        HDF5 file. Default is True.

    Returns
    -------
    MassSpectraBase
        The object after ``self.run`` has populated it.
    """
    init_kwargs = {
        "file_location": self.file_location,
        "analyzer": self.analyzer,
        "instrument_label": self.instrument_label,
        "sample_name": self.sample_name,
    }
    mass_spectra_obj = MassSpectraBase(**init_kwargs)

    # run() fills in whatever the HDF5 file provides.
    self.run(mass_spectra_obj, load_raw=load_raw)
    return mass_spectra_obj
Return mass spectra data object, populating the _ms list on MassSpectraBase object from the HDF5 file.
Parameters
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall spectra object and individual mass spectra. Default is True.
def get_lcms_obj(
    self, load_raw=True, use_original_parser=True, raw_file_path=None
) -> LCMSBase:
    """Create an LCMSBase object and populate it from the HDF5 file.

    Parameters
    ----------
    load_raw : bool
        Forwarded to ``self.run``. If True, load raw data (unprocessed) from the
        HDF5 file. Default is True.
    use_original_parser : bool
        If True, attach the original parser via ``add_original_parser``.
        Default is True.
    raw_file_path : str
        Location of the raw file, forwarded to ``add_original_parser``.
        Default is None, in which case the path stored in the HDF5 file is used.
        Pass it when the original file has moved.

    Returns
    -------
    LCMSBase
        The populated object.
    """
    init_kwargs = {
        "file_location": self.file_location,
        "analyzer": self.analyzer,
        "instrument_label": self.instrument_label,
        "sample_name": self.sample_name,
    }
    lcms_obj = LCMSBase(**init_kwargs)

    # run() populates the majority of the attributes.
    self.run(lcms_obj, load_raw=load_raw)

    # Final attributes are taken from the file attrs and the scan table.
    lcms_obj.polarity = self.h5pydata.attrs["polarity"]
    scan_table = lcms_obj.scan_df
    lcms_obj._scans_number_list = list(scan_table.scan)
    lcms_obj._retention_time_list = list(scan_table.scan_time)
    lcms_obj._tic_list = list(scan_table.tic)

    if not use_original_parser:
        return lcms_obj
    return self.add_original_parser(lcms_obj, raw_file_path=raw_file_path)
Return LCMSBase object, populating attributes on the LCMSBase object from the HDF5 file.
Parameters
- load_raw (bool): If True, load raw data (unprocessed) from HDF5 files for overall lcms object and individual mass spectra. Default is True.
- use_original_parser (bool): If True, use the original parser to populate the LCMS object. Default is True.
- raw_file_path (str): The location of the raw file to parse if attempting to use original parser. Default is None, which attempts to get the raw file path from the HDF5 file. If the original file path has moved, this parameter can be used to specify the new location.
def add_original_parser(self, mass_spectra, raw_file_path=None):
    """Attach the parser that originally read the raw file to ``mass_spectra``.

    Parameters
    ----------
    mass_spectra : MassSpectraBase | LCMSBase
        The object to add the original parser to.
    raw_file_path : str
        The location of the raw file to parse. Default is None, in which case
        the 'original_file_location' attribute of the HDF5 file is used.

    Returns
    -------
    The same ``mass_spectra`` object, with ``spectra_parser`` and
    ``spectra_parser_class`` set.

    Raises
    ------
    FileExistsError
        If the raw file does not exist at the resolved path.
    ValueError
        If the 'parser_type' attribute of the HDF5 file names an unknown parser.
    """
    # Fall back to the path recorded in the HDF5 file only when none was given.
    if raw_file_path is None:
        raw_file_path = self.h5pydata.attrs["original_file_location"]

    raw_file_path = Path(raw_file_path)
    if not raw_file_path.exists():
        # NOTE(review): FileNotFoundError would describe this better, but the
        # exception type is kept so existing ``except FileExistsError`` handlers
        # keep working. The message is now a single string; two positional
        # arguments made OSError render it as "[Errno ...] ...".
        raise FileExistsError(
            "File does not exist: "
            + str(raw_file_path)
            + ". Cannot use original parser for instantiating the lcms_obj."
        )

    og_parser_type = self.h5pydata.attrs["parser_type"]
    if og_parser_type == "ImportMassSpectraThermoMSFileReader":
        parser = ImportMassSpectraThermoMSFileReader(raw_file_path)
    elif og_parser_type == "MZMLSpectraParser":
        parser = MZMLSpectraParser(raw_file_path)
    else:
        # Previously an unknown type left ``parser`` unbound and failed below
        # with an UnboundLocalError.
        raise ValueError(
            "Unsupported original parser type: " + str(og_parser_type)
        )

    mass_spectra.spectra_parser_class = parser.__class__
    mass_spectra.spectra_parser = parser

    return mass_spectra
Add the original parser to the mass spectra object.
Parameters
- mass_spectra (MassSpectraBase | LCMSBase): The MassSpectraBase or LCMSBase object to add the original parser to.
- raw_file_path (str): The location of the raw file to parse. Default is None, which attempts to get the raw file path from the HDF5 file.
def get_creation_time(self):
    """Warn that creation time is unavailable in CoreMS HDF5 files and return None.

    Returns
    -------
    None
        Always; a warning is emitted through ``warnings.warn`` on every call.
    """
    # The trailing space keeps the two sentences from running together
    # ("None.This") when the literals are concatenated.
    warnings.warn(
        "Creation time is not available in CoreMS HDF5 files, returning None. "
        "This should be accessed through the original parser.",
    )
    return None
Emit a warning that creation time is not available in CoreMS HDF5 files, and return None.
def get_instrument_info(self):
    """Warn that instrument info is unavailable in CoreMS HDF5 files and return None.

    Returns
    -------
    None
        Always; a warning is emitted through ``warnings.warn`` on every call.
    """
    # The trailing space keeps the two sentences from running together
    # ("None.This") when the literals are concatenated.
    warnings.warn(
        "Instrument info is not available in CoreMS HDF5 files, returning None. "
        "This should be accessed through the original parser.",
    )
    return None
Emit a warning that instrument info is not available in CoreMS HDF5 files, and return None.
Inherited Members
- corems.mass_spectrum.input.coremsHDF5.ReadCoreMSHDF_MassSpectrum
- h5pydata
- load_raw_data
- get_mass_spectrum
- load_settings
- get_dataframe
- get_time_index_to_pull
- get_high_level_attr_data
- get_scan_group_attr_data
- get_raw_data_attr_data
- get_output_parameters
- corems.mass_spectrum.input.baseClass.MassListBaseClass
- file_location
- header_lines
- isCentroid
- isThermoProfile
- headerless
- analyzer
- instrument_label
- sample_name
- parameters
- set_parameter_from_toml
- set_parameter_from_json
- data_type
- delimiter
- encoding_detector
- set_data_type
- clean_data_frame
- check_columns
- read_xml_peaks
- get_xml_polarity
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id