corems.mass_spectrum.output.export
1__author__ = "Yuri E. Corilo" 2__date__ = "Nov 11, 2019" 3 4import json 5from datetime import datetime, timezone 6from pathlib import Path 7from threading import Thread 8 9import h5py 10import toml 11import numpy as np 12from numpy import NaN, empty 13from pandas import DataFrame 14 15from corems.encapsulation.constant import Atoms, Labels #Labels is accessed in the eval() function 16from corems.encapsulation.output import parameter_to_dict 17from corems.mass_spectrum.factory.MassSpectrumClasses import MassSpecfromFreq 18 19 20class HighResMassSpecExport(Thread): 21 """A class for exporting high-resolution mass spectra. 22 23 Parameters 24 ---------- 25 out_file_path : str 26 The output file path. 27 mass_spectrum : MassSpectrum 28 The mass spectrum to export. 29 output_type : str, optional 30 The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'. 31 32 Attributes 33 ---------- 34 output_file : Path 35 The output file path. 36 output_type : str 37 The type of output file. 38 mass_spectrum : MassSpectrum 39 The mass spectrum to export. 40 atoms_order_list : list 41 The list of assigned atoms in the order specified by Atoms.atoms_order list. 42 columns_label : list 43 The column labels in order. 44 45 Methods 46 ------- 47 * save(). 48 Save the mass spectrum data to the output file. 49 * run(). 50 Run the export process. 51 * get_pandas_df(). 52 Returns the mass spectrum data as a pandas DataFrame. 53 * write_settings(output_path, mass_spectrum). 54 Writes the settings of the mass spectrum to a JSON file. 55 * to_pandas(write_metadata=True). 56 Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file. 57 * to_excel(write_metadata=True). 58 Exports the mass spectrum data to an Excel file. 59 * to_csv(write_metadata=True). 60 Exports the mass spectrum data to a CSV file. 61 * to_json(). 62 Exports the mass spectrum data to a JSON string. 63 * to_hdf(). 64 Exports the mass spectrum data to an HDF5 file. 65 * parameters_to_toml(). 66 Converts the mass spectrum parameters to a TOML string. 67 * parameters_to_json(). 68 Converts the mass spectrum parameters to a JSON string. 69 * get_mass_spec_attrs(mass_spectrum). 70 Returns the mass spectrum attributes as a dictionary. 71 * get_all_used_atoms_in_order(mass_spectrum). 72 Returns the list of assigned atoms in the order specified by Atoms.atoms_order list. 73 * list_dict_to_list(mass_spectrum, is_hdf5=False). 74 Returns the mass spectrum data as a list of dictionaries. 75 * get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False). 76 Returns the mass spectrum data as a list of dictionaries. 77 78 """ 79 80 def __init__(self, out_file_path, mass_spectrum, output_type="excel"): 81 Thread.__init__(self) 82 83 self.output_file = Path(out_file_path) 84 85 # 'excel', 'csv' or 'pandas' 86 self.output_type = output_type 87 88 self.mass_spectrum = mass_spectrum 89 90 # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list 91 self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum) 92 93 self._init_columns() 94 95 def _init_columns(self): 96 """Initialize the columns for the mass spectrum output.""" 97 # column labels in order 98 self.columns_label = [ 99 "Index", 100 "m/z", 101 "Calibrated m/z", 102 "Calculated m/z", 103 "Peak Height", 104 "Peak Area", 105 "Resolving Power", 106 "S/N", 107 "Ion Charge", 108 "m/z Error (ppm)", 109 "m/z Error Score", 110 "Isotopologue Similarity", 111 "Confidence Score", 112 "DBE", 113 "O/C", 114 "H/C", 115 "Heteroatom Class", 116 "Ion Type", 117 "Adduct", 118 "Is Isotopologue", 119 "Mono Isotopic Index", 120 "Molecular Formula", 121 ] 122 123 @property 124 def output_type(self): 125 """Returns the output type of the mass spectrum.""" 126 return self._output_type 127 128 @output_type.setter 129 def output_type(self, output_type): 130 output_types = ["excel", "csv", "pandas", "hdf5"] 131 if output_type in output_types: 132 self._output_type = output_type 133 else: 134 raise TypeError( 135 'Supported types are "excel", "csv" or "pandas", %s entered' 136 % output_type 137 ) 138 139 def save(self): 140 """Save the mass spectrum data to the output file. 141 142 Raises 143 ------ 144 ValueError 145 If the output type is not supported. 146 """ 147 148 if self.output_type == "excel": 149 self.to_excel() 150 elif self.output_type == "csv": 151 self.to_csv() 152 elif self.output_type == "pandas": 153 self.to_pandas() 154 elif self.output_type == "hdf5": 155 self.to_hdf() 156 else: 157 raise ValueError( 158 "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'" 159 % self.output_type 160 ) 161 162 def run(self): 163 """Run the export process. 164 165 This method is called when the thread starts. 166 It calls the save method to perform the export.""" 167 self.save() 168 169 def get_pandas_df(self, additional_columns=None): 170 """Returns the mass spectrum data as a pandas DataFrame. 171 172 Parameters 173 ---------- 174 additional_columns : list, optional 175 Additional columns to include in the DataFrame. Defaults to None. 176 Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'. 177 178 Returns 179 ------- 180 DataFrame 181 The mass spectrum data as a pandas DataFrame. 182 """ 183 if additional_columns is not None: 184 possible_additional_columns = [ 185 "Aromaticity Index", 186 "NOSC", 187 "Aromaticity Index (modified)", 188 ] 189 if additional_columns: 190 for column in additional_columns: 191 if column not in possible_additional_columns: 192 raise ValueError("Invalid additional column: %s" % column) 193 columns = ( 194 self.columns_label 195 + additional_columns 196 + self.get_all_used_atoms_in_order(self.mass_spectrum) 197 ) 198 else: 199 columns = self.columns_label + self.get_all_used_atoms_in_order( 200 self.mass_spectrum 201 ) 202 dict_data_list = self.get_list_dict_data( 203 self.mass_spectrum, additional_columns=additional_columns 204 ) 205 df = DataFrame(dict_data_list, columns=columns) 206 df.name = self.output_file 207 return df 208 209 def write_settings(self, output_path, mass_spectrum): 210 """Writes the settings of the mass spectrum to a JSON file. 211 212 Parameters 213 ---------- 214 output_path : str 215 The output file path. 216 mass_spectrum : MassSpectrum 217 The mass spectrum to export. 218 """ 219 220 import json 221 222 dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum) 223 224 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum) 225 dict_setting["analyzer"] = mass_spectrum.analyzer 226 dict_setting["instrument_label"] = mass_spectrum.instrument_label 227 dict_setting["sample_name"] = mass_spectrum.sample_name 228 229 with open( 230 output_path.with_suffix(".json"), 231 "w", 232 encoding="utf8", 233 ) as outfile: 234 output = json.dumps( 235 dict_setting, sort_keys=True, indent=4, separators=(",", ": ") 236 ) 237 outfile.write(output) 238 239 def to_pandas(self, write_metadata=True): 240 """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file. 241 242 Parameters 243 ---------- 244 write_metadata : bool, optional 245 Whether to write the metadata to a JSON file. Defaults to True. 246 """ 247 248 columns = self.columns_label + self.get_all_used_atoms_in_order( 249 self.mass_spectrum 250 ) 251 252 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 253 254 df = DataFrame(dict_data_list, columns=columns) 255 256 df.to_pickle(self.output_file.with_suffix(".pkl")) 257 258 if write_metadata: 259 self.write_settings(self.output_file, self.mass_spectrum) 260 261 def to_excel(self, write_metadata=True): 262 """Exports the mass spectrum data to an Excel file. 263 264 Parameters 265 ---------- 266 write_metadata : bool, optional 267 Whether to write the metadata to a JSON file. Defaults to True. 268 """ 269 270 columns = self.columns_label + self.get_all_used_atoms_in_order( 271 self.mass_spectrum 272 ) 273 274 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 275 276 df = DataFrame(dict_data_list, columns=columns) 277 278 df.to_excel(self.output_file.with_suffix(".xlsx")) 279 280 if write_metadata: 281 self.write_settings(self.output_file, self.mass_spectrum) 282 283 def to_csv(self, write_metadata=True): 284 """Exports the mass spectrum data to a CSV file. 285 286 Parameters 287 ---------- 288 write_metadata : bool, optional 289 Whether to write the metadata to a JSON file. Defaults to True. 290 """ 291 292 columns = self.columns_label + self.get_all_used_atoms_in_order( 293 self.mass_spectrum 294 ) 295 296 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 297 298 import csv 299 300 try: 301 with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile: 302 writer = csv.DictWriter(csvfile, fieldnames=columns) 303 writer.writeheader() 304 for data in dict_data_list: 305 writer.writerow(data) 306 if write_metadata: 307 self.write_settings(self.output_file, self.mass_spectrum) 308 309 except IOError as ioerror: 310 print(ioerror) 311 312 def to_json(self): 313 """Exports the mass spectrum data to a JSON string.""" 314 315 columns = self.columns_label + self.get_all_used_atoms_in_order( 316 self.mass_spectrum 317 ) 318 319 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 320 321 df = DataFrame(dict_data_list, columns=columns) 322 323 # for key, values in dict_data.items(): 324 # if not values: dict_data[key] = NaN 325 326 # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': ')) 327 return df.to_json(orient="records") 328 329 def add_mass_spectrum_to_hdf5( 330 self, 331 hdf_handle, 332 mass_spectrum, 333 group_key, 334 mass_spectra_group=None, 335 export_raw=True, 336 ): 337 """Adds the mass spectrum data to an HDF5 file. 338 339 Parameters 340 ---------- 341 hdf_handle : h5py.File 342 The HDF5 file handle. 343 mass_spectrum : MassSpectrum 344 The mass spectrum to add to the HDF5 file. 345 group_key : str 346 The group key (where to add the mass spectrum data within the HDF5 file). 347 mass_spectra_group : h5py.Group, optional 348 The mass spectra group. Defaults to None (no group, mass spectrum is added to the root). 349 export_raw : bool, optional 350 Whether to export the raw data. Defaults to True. 351 If False, only the processed data (peaks) is exported (essentially centroided data). 352 """ 353 if mass_spectra_group is None: 354 # Check if the file has the necessary attributes and add them if not 355 # This assumes that if there is a mass_spectra_group, these attributes were already added to the file 356 if not hdf_handle.attrs.get("date_utc"): 357 timenow = str( 358 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 359 ) 360 hdf_handle.attrs["date_utc"] = timenow 361 hdf_handle.attrs["file_name"] = mass_spectrum.filename.name 362 hdf_handle.attrs["data_structure"] = "mass_spectrum" 363 hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer 364 hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label 365 hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name 366 367 list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True) 368 369 dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum) 370 371 setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum) 372 373 columns_labels = json.dumps( 374 self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum), 375 sort_keys=False, 376 indent=4, 377 separators=(",", ": "), 378 ) 379 380 group_key = group_key 381 382 if mass_spectra_group is not None: 383 hdf_handle = mass_spectra_group 384 385 if group_key not in hdf_handle.keys(): 386 scan_group = hdf_handle.create_group(group_key) 387 388 # If there is raw data (from profile data) save it 389 if not mass_spectrum.is_centroid and export_raw: 390 mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32) 391 392 mz_abun_array[0] = mass_spectrum.abundance_profile 393 mz_abun_array[1] = mass_spectrum.mz_exp_profile 394 395 raw_ms_dataset = scan_group.create_dataset( 396 "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 397 ) 398 399 # Add metadata to the raw_ms dataset when it exists 400 raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 401 402 if isinstance(mass_spectrum, MassSpecfromFreq): 403 raw_ms_dataset.attrs["TransientSetting"] = json.dumps( 404 setting_dicts.get("TransientSetting"), 405 sort_keys=False, 406 indent=4, 407 separators=(",", ": "), 408 ) 409 else: 410 # For centroided data, store metadata directly on the scan group 411 scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 412 413 if isinstance(mass_spectrum, MassSpecfromFreq): 414 scan_group.attrs["TransientSetting"] = json.dumps( 415 setting_dicts.get("TransientSetting"), 416 sort_keys=False, 417 indent=4, 418 separators=(",", ": "), 419 ) 420 421 else: 422 scan_group = hdf_handle.get(group_key) 423 424 # if there is not processed data len = 0, otherwise len() will return next index 425 index_processed_data = str(len(scan_group.keys())) 426 427 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 428 429 # Convert list_results to numpy array and optimize data types 430 array_results = np.array(list_results) 431 432 # Apply data type optimization for numeric columns 433 if array_results.dtype == np.float64: 434 array_results = array_results.astype(np.float32) 435 elif array_results.dtype == np.int64: 436 array_results = array_results.astype(np.int32) 437 438 processed_dset = scan_group.create_dataset( 439 index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 440 ) 441 442 processed_dset.attrs["date_utc"] = timenow 443 444 processed_dset.attrs["ColumnsLabels"] = columns_labels 445 446 processed_dset.attrs["MoleculaSearchSetting"] = json.dumps( 447 setting_dicts.get("MoleculaSearch"), 448 sort_keys=False, 449 indent=4, 450 separators=(",", ": "), 451 ) 452 453 processed_dset.attrs["MassSpecPeakSetting"] = json.dumps( 454 setting_dicts.get("MassSpecPeak"), 455 sort_keys=False, 456 indent=4, 457 separators=(",", ": "), 458 ) 459 460 processed_dset.attrs["MassSpectrumSetting"] = json.dumps( 461 setting_dicts.get("MassSpectrum"), 462 sort_keys=False, 463 indent=4, 464 separators=(",", ": "), 465 ) 466 467 def to_hdf(self): 468 """Exports the mass spectrum data to an HDF5 file.""" 469 470 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 471 self.add_mass_spectrum_to_hdf5( 472 hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number) 473 ) 474 475 def parameters_to_toml(self): 476 """Converts the mass spectrum parameters to a TOML string. 477 478 Returns 479 ------- 480 str 481 The TOML string of the mass spectrum parameters. 482 """ 483 484 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 485 486 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 487 dict_setting["analyzer"] = self.mass_spectrum.analyzer 488 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 489 dict_setting["sample_name"] = self.mass_spectrum.sample_name 490 491 output = toml.dumps(dict_setting) 492 493 return output 494 495 def parameters_to_json(self): 496 """Converts the mass spectrum parameters to a JSON string. 497 498 Returns 499 ------- 500 str 501 The JSON string of the mass spectrum parameters. 502 """ 503 504 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 505 506 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 507 dict_setting["analyzer"] = self.mass_spectrum.analyzer 508 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 509 dict_setting["sample_name"] = self.mass_spectrum.sample_name 510 511 output = json.dumps(dict_setting) 512 513 return output 514 515 def get_mass_spec_attrs(self, mass_spectrum): 516 """Returns the mass spectrum attributes as a dictionary. 517 518 Parameters 519 ---------- 520 mass_spectrum : MassSpectrum 521 The mass spectrum to export. 522 523 Returns 524 ------- 525 dict 526 The mass spectrum attributes. 527 """ 528 529 dict_ms_attrs = {} 530 dict_ms_attrs["polarity"] = mass_spectrum.polarity 531 dict_ms_attrs["rt"] = mass_spectrum.retention_time 532 dict_ms_attrs["tic"] = mass_spectrum.tic 533 dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan 534 dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt 535 dict_ms_attrs["Aterm"] = mass_spectrum.Aterm 536 dict_ms_attrs["Bterm"] = mass_spectrum.Bterm 537 dict_ms_attrs["Cterm"] = mass_spectrum.Cterm 538 dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise 539 dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std 540 541 return dict_ms_attrs 542 543 def get_all_used_atoms_in_order(self, mass_spectrum): 544 """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list. 545 546 Parameters 547 ---------- 548 mass_spectrum : MassSpectrum 549 The mass spectrum to export. 550 551 Returns 552 ------- 553 list 554 The list of assigned atoms in the order specified by Atoms.atoms_order list. 555 """ 556 557 atoms_in_order = Atoms.atoms_order 558 all_used_atoms = set() 559 if mass_spectrum: 560 for ms_peak in mass_spectrum: 561 if ms_peak: 562 for m_formula in ms_peak: 563 for atom in m_formula.atoms: 564 all_used_atoms.add(atom) 565 566 def sort_method(atom): 567 return [atoms_in_order.index(atom)] 568 569 return sorted(all_used_atoms, key=sort_method) 570 571 def list_dict_to_list(self, mass_spectrum, is_hdf5=False): 572 """Returns the mass spectrum data as a list of dictionaries. 573 574 Parameters 575 ---------- 576 mass_spectrum : MassSpectrum 577 The mass spectrum to export. 578 is_hdf5 : bool, optional 579 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 580 581 Returns 582 ------- 583 list 584 The mass spectrum data as a list of dictionaries. 585 """ 586 587 column_labels = self.columns_label + self.get_all_used_atoms_in_order( 588 mass_spectrum 589 ) 590 591 dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5) 592 593 all_lines = [] 594 for dict_res in dict_list: 595 result_line = [NaN] * len(column_labels) 596 597 for label, value in dict_res.items(): 598 label_index = column_labels.index(label) 599 result_line[label_index] = value 600 601 all_lines.append(result_line) 602 603 return all_lines 604 605 def get_list_dict_data( 606 self, 607 mass_spectrum, 608 include_no_match=True, 609 include_isotopologues=True, 610 isotopologue_inline=True, 611 no_match_inline=False, 612 is_hdf5=False, 613 additional_columns=None, 614 ): 615 """Returns the mass spectrum data as a list of dictionaries. 616 617 Parameters 618 ---------- 619 mass_spectrum : MassSpectrum 620 The mass spectrum to export. 621 include_no_match : bool, optional 622 Whether to include unassigned (no match) data. Defaults to True. 623 include_isotopologues : bool, optional 624 Whether to include isotopologues. Defaults to True. 625 isotopologue_inline : bool, optional 626 Whether to include isotopologues inline. Defaults to True. 627 no_match_inline : bool, optional 628 Whether to include unassigned (no match) data inline. Defaults to False. 629 is_hdf5 : bool, optional 630 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 631 632 Returns 633 ------- 634 list 635 The mass spectrum data as a list of dictionaries. 636 """ 637 638 dict_data_list = [] 639 640 if is_hdf5: 641 encode = ".encode('utf-8')" 642 else: 643 encode = "" 644 645 def add_no_match_dict_data(index, ms_peak): 646 """ 647 Export dictionary of mspeak info for unassigned (no match) data 648 """ 649 dict_result = { 650 "Index": index, 651 "m/z": ms_peak._mz_exp, 652 "Calibrated m/z": ms_peak.mz_exp, 653 "Peak Height": ms_peak.abundance, 654 "Peak Area": ms_peak.area, 655 "Resolving Power": ms_peak.resolving_power, 656 "S/N": ms_peak.signal_to_noise, 657 "Ion Charge": ms_peak.ion_charge, 658 "Heteroatom Class": eval("Labels.unassigned{}".format(encode)), 659 } 660 661 dict_data_list.append(dict_result) 662 663 def add_match_dict_data(index, ms_peak, mformula, additional_columns=None): 664 """ 665 Export dictionary of mspeak info for assigned (match) data 666 """ 667 formula_dict = mformula.to_dict() 668 669 dict_result = { 670 "Index": index, 671 "m/z": ms_peak._mz_exp, 672 "Calibrated m/z": ms_peak.mz_exp, 673 "Calculated m/z": mformula.mz_calc, 674 "Peak Height": ms_peak.abundance, 675 "Peak Area": ms_peak.area, 676 "Resolving Power": ms_peak.resolving_power, 677 "S/N": ms_peak.signal_to_noise, 678 "Ion Charge": ms_peak.ion_charge, 679 "m/z Error (ppm)": mformula.mz_error, 680 "Confidence Score": mformula.confidence_score, 681 "Isotopologue Similarity": mformula.isotopologue_similarity, 682 "m/z Error Score": mformula.average_mz_error_score, 683 "DBE": mformula.dbe, 684 "Heteroatom Class": eval("mformula.class_label{}".format(encode)), 685 "H/C": mformula.H_C, 686 "O/C": mformula.O_C, 687 "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)), 688 "Is Isotopologue": int(mformula.is_isotopologue), 689 "Molecular Formula": eval("mformula.string{}".format(encode)), 690 } 691 if additional_columns is not None: 692 possible_dict = { 693 "Aromaticity Index": mformula.A_I, 694 "NOSC": mformula.nosc, 695 "Aromaticity Index (modified)": mformula.A_I_mod, 696 } 697 for column in additional_columns: 698 dict_result[column] = possible_dict.get(column) 699 700 if mformula.adduct_atom: 701 dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode)) 702 703 if mformula.is_isotopologue: 704 dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic 705 706 if self.atoms_order_list is None: 707 atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum) 708 else: 709 atoms_order_list = self.atoms_order_list 710 711 for atom in atoms_order_list: 712 if atom in formula_dict.keys(): 713 dict_result[atom] = formula_dict.get(atom) 714 715 dict_data_list.append(dict_result) 716 717 score_methods = mass_spectrum.molecular_search_settings.score_methods 718 selected_score_method = ( 719 mass_spectrum.molecular_search_settings.output_score_method 720 ) 721 722 if selected_score_method in score_methods: 723 # temp set score method as the one chosen in the output 724 current_method = mass_spectrum.molecular_search_settings.score_method 725 mass_spectrum.molecular_search_settings.score_method = selected_score_method 726 727 for index, ms_peak in enumerate(mass_spectrum): 728 # print(ms_peak.mz_exp) 729 730 if ms_peak: 731 m_formula = ms_peak.best_molecular_formula_candidate 732 733 if m_formula: 734 if not m_formula.is_isotopologue: 735 add_match_dict_data( 736 index, 737 ms_peak, 738 m_formula, 739 additional_columns=additional_columns, 740 ) 741 742 for ( 743 iso_mspeak_index, 744 iso_mf_formula, 745 ) in m_formula.mspeak_mf_isotopologues_indexes: 746 iso_ms_peak = mass_spectrum[iso_mspeak_index] 747 add_match_dict_data( 748 iso_mspeak_index, 749 iso_ms_peak, 750 iso_mf_formula, 751 additional_columns=additional_columns, 752 ) 753 else: 754 if include_no_match and no_match_inline: 755 add_no_match_dict_data(index, ms_peak) 756 757 if include_no_match and not no_match_inline: 758 for index, ms_peak in enumerate(mass_spectrum): 759 if not ms_peak: 760 add_no_match_dict_data(index, ms_peak) 761 # reset score method as the one chosen in the output 762 mass_spectrum.molecular_search_settings.score_method = current_method 763 764 else: 765 for index, ms_peak in enumerate(mass_spectrum): 766 # check if there is a molecular formula candidate for the msPeak 767 768 if ms_peak: 769 # m_formula = ms_peak.molecular_formula_lowest_error 770 for m_formula in ms_peak: 771 if mass_spectrum.molecular_search_settings.output_min_score > 0: 772 if ( 773 m_formula.confidence_score 774 >= mass_spectrum.molecular_search_settings.output_min_score 775 ): 776 if m_formula.is_isotopologue: # isotopologues inline 777 if include_isotopologues and isotopologue_inline: 778 add_match_dict_data( 779 index, 780 ms_peak, 781 m_formula, 782 additional_columns=additional_columns, 783 ) 784 else: 785 add_match_dict_data( 786 index, 787 ms_peak, 788 m_formula, 789 additional_columns=additional_columns, 790 ) # add monoisotopic peak 791 792 # cutoff because of low score 793 else: 794 add_no_match_dict_data(index, ms_peak) 795 796 else: 797 if m_formula.is_isotopologue: # isotopologues inline 798 if include_isotopologues and isotopologue_inline: 799 add_match_dict_data( 800 index, 801 ms_peak, 802 m_formula, 803 additional_columns=additional_columns, 804 ) 805 else: 806 add_match_dict_data( 807 index, 808 ms_peak, 809 m_formula, 810 additional_columns=additional_columns, 811 ) # add monoisotopic peak 812 else: 813 # include not_match 814 if include_no_match and no_match_inline: 815 add_no_match_dict_data(index, ms_peak) 816 817 if include_isotopologues and not isotopologue_inline: 818 for index, ms_peak in enumerate(mass_spectrum): 819 for m_formula in ms_peak: 820 if m_formula.is_isotopologue: 821 if ( 822 m_formula.confidence_score 823 >= mass_spectrum.molecular_search_settings.output_min_score 824 ): 825 add_match_dict_data( 826 index, 827 ms_peak, 828 m_formula, 829 additional_columns=additional_columns, 830 ) 831 832 if include_no_match and not no_match_inline: 833 for index, ms_peak in enumerate(mass_spectrum): 834 if not ms_peak: 835 add_no_match_dict_data(index, ms_peak) 836 837 # remove duplicated add_match data possibly introduced on the output_score_filter step 838 res = [] 839 [res.append(x) for x in dict_data_list if x not in res] 840 841 return res
21class HighResMassSpecExport(Thread): 22 """A class for exporting high-resolution mass spectra. 23 24 Parameters 25 ---------- 26 out_file_path : str 27 The output file path. 28 mass_spectrum : MassSpectrum 29 The mass spectrum to export. 30 output_type : str, optional 31 The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'. 32 33 Attributes 34 ---------- 35 output_file : Path 36 The output file path. 37 output_type : str 38 The type of output file. 39 mass_spectrum : MassSpectrum 40 The mass spectrum to export. 41 atoms_order_list : list 42 The list of assigned atoms in the order specified by Atoms.atoms_order list. 43 columns_label : list 44 The column labels in order. 45 46 Methods 47 ------- 48 * save(). 49 Save the mass spectrum data to the output file. 50 * run(). 51 Run the export process. 52 * get_pandas_df(). 53 Returns the mass spectrum data as a pandas DataFrame. 54 * write_settings(output_path, mass_spectrum). 55 Writes the settings of the mass spectrum to a JSON file. 56 * to_pandas(write_metadata=True). 57 Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file. 58 * to_excel(write_metadata=True). 59 Exports the mass spectrum data to an Excel file. 60 * to_csv(write_metadata=True). 61 Exports the mass spectrum data to a CSV file. 62 * to_json(). 63 Exports the mass spectrum data to a JSON string. 64 * to_hdf(). 65 Exports the mass spectrum data to an HDF5 file. 66 * parameters_to_toml(). 67 Converts the mass spectrum parameters to a TOML string. 68 * parameters_to_json(). 69 Converts the mass spectrum parameters to a JSON string. 70 * get_mass_spec_attrs(mass_spectrum). 71 Returns the mass spectrum attributes as a dictionary. 72 * get_all_used_atoms_in_order(mass_spectrum). 73 Returns the list of assigned atoms in the order specified by Atoms.atoms_order list. 74 * list_dict_to_list(mass_spectrum, is_hdf5=False). 75 Returns the mass spectrum data as a list of dictionaries. 76 * get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False). 77 Returns the mass spectrum data as a list of dictionaries. 78 79 """ 80 81 def __init__(self, out_file_path, mass_spectrum, output_type="excel"): 82 Thread.__init__(self) 83 84 self.output_file = Path(out_file_path) 85 86 # 'excel', 'csv' or 'pandas' 87 self.output_type = output_type 88 89 self.mass_spectrum = mass_spectrum 90 91 # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list 92 self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum) 93 94 self._init_columns() 95 96 def _init_columns(self): 97 """Initialize the columns for the mass spectrum output.""" 98 # column labels in order 99 self.columns_label = [ 100 "Index", 101 "m/z", 102 "Calibrated m/z", 103 "Calculated m/z", 104 "Peak Height", 105 "Peak Area", 106 "Resolving Power", 107 "S/N", 108 "Ion Charge", 109 "m/z Error (ppm)", 110 "m/z Error Score", 111 "Isotopologue Similarity", 112 "Confidence Score", 113 "DBE", 114 "O/C", 115 "H/C", 116 "Heteroatom Class", 117 "Ion Type", 118 "Adduct", 119 "Is Isotopologue", 120 "Mono Isotopic Index", 121 "Molecular Formula", 122 ] 123 124 @property 125 def output_type(self): 126 """Returns the output type of the mass spectrum.""" 127 return self._output_type 128 129 @output_type.setter 130 def output_type(self, output_type): 131 output_types = ["excel", "csv", "pandas", "hdf5"] 132 if output_type in output_types: 133 self._output_type = output_type 134 else: 135 raise TypeError( 136 'Supported types are "excel", "csv" or "pandas", %s entered' 137 % output_type 138 ) 139 140 def save(self): 141 """Save the mass spectrum data to the output file. 142 143 Raises 144 ------ 145 ValueError 146 If the output type is not supported. 147 """ 148 149 if self.output_type == "excel": 150 self.to_excel() 151 elif self.output_type == "csv": 152 self.to_csv() 153 elif self.output_type == "pandas": 154 self.to_pandas() 155 elif self.output_type == "hdf5": 156 self.to_hdf() 157 else: 158 raise ValueError( 159 "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'" 160 % self.output_type 161 ) 162 163 def run(self): 164 """Run the export process. 165 166 This method is called when the thread starts. 167 It calls the save method to perform the export.""" 168 self.save() 169 170 def get_pandas_df(self, additional_columns=None): 171 """Returns the mass spectrum data as a pandas DataFrame. 172 173 Parameters 174 ---------- 175 additional_columns : list, optional 176 Additional columns to include in the DataFrame. Defaults to None. 177 Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'. 178 179 Returns 180 ------- 181 DataFrame 182 The mass spectrum data as a pandas DataFrame. 183 """ 184 if additional_columns is not None: 185 possible_additional_columns = [ 186 "Aromaticity Index", 187 "NOSC", 188 "Aromaticity Index (modified)", 189 ] 190 if additional_columns: 191 for column in additional_columns: 192 if column not in possible_additional_columns: 193 raise ValueError("Invalid additional column: %s" % column) 194 columns = ( 195 self.columns_label 196 + additional_columns 197 + self.get_all_used_atoms_in_order(self.mass_spectrum) 198 ) 199 else: 200 columns = self.columns_label + self.get_all_used_atoms_in_order( 201 self.mass_spectrum 202 ) 203 dict_data_list = self.get_list_dict_data( 204 self.mass_spectrum, additional_columns=additional_columns 205 ) 206 df = DataFrame(dict_data_list, columns=columns) 207 df.name = self.output_file 208 return df 209 210 def write_settings(self, output_path, mass_spectrum): 211 """Writes the settings of the mass spectrum to a JSON file. 212 213 Parameters 214 ---------- 215 output_path : str 216 The output file path. 217 mass_spectrum : MassSpectrum 218 The mass spectrum to export. 219 """ 220 221 import json 222 223 dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum) 224 225 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum) 226 dict_setting["analyzer"] = mass_spectrum.analyzer 227 dict_setting["instrument_label"] = mass_spectrum.instrument_label 228 dict_setting["sample_name"] = mass_spectrum.sample_name 229 230 with open( 231 output_path.with_suffix(".json"), 232 "w", 233 encoding="utf8", 234 ) as outfile: 235 output = json.dumps( 236 dict_setting, sort_keys=True, indent=4, separators=(",", ": ") 237 ) 238 outfile.write(output) 239 240 def to_pandas(self, write_metadata=True): 241 """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file. 242 243 Parameters 244 ---------- 245 write_metadata : bool, optional 246 Whether to write the metadata to a JSON file. Defaults to True. 247 """ 248 249 columns = self.columns_label + self.get_all_used_atoms_in_order( 250 self.mass_spectrum 251 ) 252 253 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 254 255 df = DataFrame(dict_data_list, columns=columns) 256 257 df.to_pickle(self.output_file.with_suffix(".pkl")) 258 259 if write_metadata: 260 self.write_settings(self.output_file, self.mass_spectrum) 261 262 def to_excel(self, write_metadata=True): 263 """Exports the mass spectrum data to an Excel file. 264 265 Parameters 266 ---------- 267 write_metadata : bool, optional 268 Whether to write the metadata to a JSON file. Defaults to True. 269 """ 270 271 columns = self.columns_label + self.get_all_used_atoms_in_order( 272 self.mass_spectrum 273 ) 274 275 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 276 277 df = DataFrame(dict_data_list, columns=columns) 278 279 df.to_excel(self.output_file.with_suffix(".xlsx")) 280 281 if write_metadata: 282 self.write_settings(self.output_file, self.mass_spectrum) 283 284 def to_csv(self, write_metadata=True): 285 """Exports the mass spectrum data to a CSV file. 286 287 Parameters 288 ---------- 289 write_metadata : bool, optional 290 Whether to write the metadata to a JSON file. Defaults to True. 291 """ 292 293 columns = self.columns_label + self.get_all_used_atoms_in_order( 294 self.mass_spectrum 295 ) 296 297 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 298 299 import csv 300 301 try: 302 with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile: 303 writer = csv.DictWriter(csvfile, fieldnames=columns) 304 writer.writeheader() 305 for data in dict_data_list: 306 writer.writerow(data) 307 if write_metadata: 308 self.write_settings(self.output_file, self.mass_spectrum) 309 310 except IOError as ioerror: 311 print(ioerror) 312 313 def to_json(self): 314 """Exports the mass spectrum data to a JSON string.""" 315 316 columns = self.columns_label + self.get_all_used_atoms_in_order( 317 self.mass_spectrum 318 ) 319 320 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 321 322 df = DataFrame(dict_data_list, columns=columns) 323 324 # for key, values in dict_data.items(): 325 # if not values: dict_data[key] = NaN 326 327 # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': ')) 328 return df.to_json(orient="records") 329 330 def add_mass_spectrum_to_hdf5( 331 self, 332 hdf_handle, 333 mass_spectrum, 334 group_key, 335 mass_spectra_group=None, 336 export_raw=True, 337 ): 338 """Adds the mass spectrum data to an HDF5 file. 339 340 Parameters 341 ---------- 342 hdf_handle : h5py.File 343 The HDF5 file handle. 344 mass_spectrum : MassSpectrum 345 The mass spectrum to add to the HDF5 file. 346 group_key : str 347 The group key (where to add the mass spectrum data within the HDF5 file). 348 mass_spectra_group : h5py.Group, optional 349 The mass spectra group. Defaults to None (no group, mass spectrum is added to the root). 350 export_raw : bool, optional 351 Whether to export the raw data. Defaults to True. 352 If False, only the processed data (peaks) is exported (essentially centroided data). 353 """ 354 if mass_spectra_group is None: 355 # Check if the file has the necessary attributes and add them if not 356 # This assumes that if there is a mass_spectra_group, these attributes were already added to the file 357 if not hdf_handle.attrs.get("date_utc"): 358 timenow = str( 359 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 360 ) 361 hdf_handle.attrs["date_utc"] = timenow 362 hdf_handle.attrs["file_name"] = mass_spectrum.filename.name 363 hdf_handle.attrs["data_structure"] = "mass_spectrum" 364 hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer 365 hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label 366 hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name 367 368 list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True) 369 370 dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum) 371 372 setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum) 373 374 columns_labels = json.dumps( 375 self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum), 376 sort_keys=False, 377 indent=4, 378 separators=(",", ": "), 379 ) 380 381 group_key = group_key 382 383 if mass_spectra_group is not None: 384 hdf_handle = mass_spectra_group 385 386 if group_key not in hdf_handle.keys(): 387 scan_group = hdf_handle.create_group(group_key) 388 389 # If there is raw data (from profile data) save it 390 if not mass_spectrum.is_centroid and export_raw: 391 mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32) 392 393 mz_abun_array[0] = mass_spectrum.abundance_profile 394 mz_abun_array[1] = mass_spectrum.mz_exp_profile 395 396 raw_ms_dataset = scan_group.create_dataset( 397 "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 398 ) 399 400 # Add metadata to the raw_ms dataset when it exists 401 raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 402 403 if isinstance(mass_spectrum, MassSpecfromFreq): 404 raw_ms_dataset.attrs["TransientSetting"] = json.dumps( 405 setting_dicts.get("TransientSetting"), 406 sort_keys=False, 407 indent=4, 408 separators=(",", ": "), 409 ) 410 else: 411 # For centroided data, store metadata directly on the scan group 412 scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 413 414 if isinstance(mass_spectrum, MassSpecfromFreq): 415 scan_group.attrs["TransientSetting"] = json.dumps( 416 setting_dicts.get("TransientSetting"), 417 sort_keys=False, 418 indent=4, 419 separators=(",", ": "), 420 ) 421 422 else: 423 scan_group = hdf_handle.get(group_key) 424 425 # if there is not processed data len = 0, otherwise len() will return next index 426 index_processed_data = str(len(scan_group.keys())) 427 428 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 429 430 # Convert list_results to numpy array and optimize data types 431 array_results = np.array(list_results) 432 433 # Apply data type optimization for numeric columns 434 if array_results.dtype == np.float64: 435 array_results = array_results.astype(np.float32) 436 elif array_results.dtype == np.int64: 437 array_results = array_results.astype(np.int32) 438 439 processed_dset = scan_group.create_dataset( 440 index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 441 ) 442 443 processed_dset.attrs["date_utc"] = timenow 444 445 processed_dset.attrs["ColumnsLabels"] = columns_labels 446 447 processed_dset.attrs["MoleculaSearchSetting"] = json.dumps( 448 setting_dicts.get("MoleculaSearch"), 449 sort_keys=False, 450 indent=4, 451 separators=(",", ": "), 452 ) 453 454 processed_dset.attrs["MassSpecPeakSetting"] = json.dumps( 455 setting_dicts.get("MassSpecPeak"), 456 sort_keys=False, 457 indent=4, 458 separators=(",", ": "), 459 ) 460 461 processed_dset.attrs["MassSpectrumSetting"] = json.dumps( 462 setting_dicts.get("MassSpectrum"), 463 sort_keys=False, 464 indent=4, 465 separators=(",", ": "), 466 ) 467 468 def to_hdf(self): 469 """Exports the mass spectrum data to an HDF5 file.""" 470 471 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 472 self.add_mass_spectrum_to_hdf5( 473 hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number) 474 ) 475 476 def parameters_to_toml(self): 477 """Converts the mass spectrum parameters to a TOML string. 478 479 Returns 480 ------- 481 str 482 The TOML string of the mass spectrum parameters. 483 """ 484 485 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 486 487 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 488 dict_setting["analyzer"] = self.mass_spectrum.analyzer 489 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 490 dict_setting["sample_name"] = self.mass_spectrum.sample_name 491 492 output = toml.dumps(dict_setting) 493 494 return output 495 496 def parameters_to_json(self): 497 """Converts the mass spectrum parameters to a JSON string. 498 499 Returns 500 ------- 501 str 502 The JSON string of the mass spectrum parameters. 503 """ 504 505 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 506 507 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 508 dict_setting["analyzer"] = self.mass_spectrum.analyzer 509 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 510 dict_setting["sample_name"] = self.mass_spectrum.sample_name 511 512 output = json.dumps(dict_setting) 513 514 return output 515 516 def get_mass_spec_attrs(self, mass_spectrum): 517 """Returns the mass spectrum attributes as a dictionary. 518 519 Parameters 520 ---------- 521 mass_spectrum : MassSpectrum 522 The mass spectrum to export. 523 524 Returns 525 ------- 526 dict 527 The mass spectrum attributes. 528 """ 529 530 dict_ms_attrs = {} 531 dict_ms_attrs["polarity"] = mass_spectrum.polarity 532 dict_ms_attrs["rt"] = mass_spectrum.retention_time 533 dict_ms_attrs["tic"] = mass_spectrum.tic 534 dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan 535 dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt 536 dict_ms_attrs["Aterm"] = mass_spectrum.Aterm 537 dict_ms_attrs["Bterm"] = mass_spectrum.Bterm 538 dict_ms_attrs["Cterm"] = mass_spectrum.Cterm 539 dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise 540 dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std 541 542 return dict_ms_attrs 543 544 def get_all_used_atoms_in_order(self, mass_spectrum): 545 """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list. 546 547 Parameters 548 ---------- 549 mass_spectrum : MassSpectrum 550 The mass spectrum to export. 551 552 Returns 553 ------- 554 list 555 The list of assigned atoms in the order specified by Atoms.atoms_order list. 556 """ 557 558 atoms_in_order = Atoms.atoms_order 559 all_used_atoms = set() 560 if mass_spectrum: 561 for ms_peak in mass_spectrum: 562 if ms_peak: 563 for m_formula in ms_peak: 564 for atom in m_formula.atoms: 565 all_used_atoms.add(atom) 566 567 def sort_method(atom): 568 return [atoms_in_order.index(atom)] 569 570 return sorted(all_used_atoms, key=sort_method) 571 572 def list_dict_to_list(self, mass_spectrum, is_hdf5=False): 573 """Returns the mass spectrum data as a list of dictionaries. 574 575 Parameters 576 ---------- 577 mass_spectrum : MassSpectrum 578 The mass spectrum to export. 579 is_hdf5 : bool, optional 580 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 581 582 Returns 583 ------- 584 list 585 The mass spectrum data as a list of dictionaries. 586 """ 587 588 column_labels = self.columns_label + self.get_all_used_atoms_in_order( 589 mass_spectrum 590 ) 591 592 dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5) 593 594 all_lines = [] 595 for dict_res in dict_list: 596 result_line = [NaN] * len(column_labels) 597 598 for label, value in dict_res.items(): 599 label_index = column_labels.index(label) 600 result_line[label_index] = value 601 602 all_lines.append(result_line) 603 604 return all_lines 605 606 def get_list_dict_data( 607 self, 608 mass_spectrum, 609 include_no_match=True, 610 include_isotopologues=True, 611 isotopologue_inline=True, 612 no_match_inline=False, 613 is_hdf5=False, 614 additional_columns=None, 615 ): 616 """Returns the mass spectrum data as a list of dictionaries. 617 618 Parameters 619 ---------- 620 mass_spectrum : MassSpectrum 621 The mass spectrum to export. 622 include_no_match : bool, optional 623 Whether to include unassigned (no match) data. Defaults to True. 624 include_isotopologues : bool, optional 625 Whether to include isotopologues. Defaults to True. 626 isotopologue_inline : bool, optional 627 Whether to include isotopologues inline. Defaults to True. 628 no_match_inline : bool, optional 629 Whether to include unassigned (no match) data inline. Defaults to False. 630 is_hdf5 : bool, optional 631 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 632 633 Returns 634 ------- 635 list 636 The mass spectrum data as a list of dictionaries. 637 """ 638 639 dict_data_list = [] 640 641 if is_hdf5: 642 encode = ".encode('utf-8')" 643 else: 644 encode = "" 645 646 def add_no_match_dict_data(index, ms_peak): 647 """ 648 Export dictionary of mspeak info for unassigned (no match) data 649 """ 650 dict_result = { 651 "Index": index, 652 "m/z": ms_peak._mz_exp, 653 "Calibrated m/z": ms_peak.mz_exp, 654 "Peak Height": ms_peak.abundance, 655 "Peak Area": ms_peak.area, 656 "Resolving Power": ms_peak.resolving_power, 657 "S/N": ms_peak.signal_to_noise, 658 "Ion Charge": ms_peak.ion_charge, 659 "Heteroatom Class": eval("Labels.unassigned{}".format(encode)), 660 } 661 662 dict_data_list.append(dict_result) 663 664 def add_match_dict_data(index, ms_peak, mformula, additional_columns=None): 665 """ 666 Export dictionary of mspeak info for assigned (match) data 667 """ 668 formula_dict = mformula.to_dict() 669 670 dict_result = { 671 "Index": index, 672 "m/z": ms_peak._mz_exp, 673 "Calibrated m/z": ms_peak.mz_exp, 674 "Calculated m/z": mformula.mz_calc, 675 "Peak Height": ms_peak.abundance, 676 "Peak Area": ms_peak.area, 677 "Resolving Power": ms_peak.resolving_power, 678 "S/N": ms_peak.signal_to_noise, 679 "Ion Charge": ms_peak.ion_charge, 680 "m/z Error (ppm)": mformula.mz_error, 681 "Confidence Score": mformula.confidence_score, 682 "Isotopologue Similarity": mformula.isotopologue_similarity, 683 "m/z Error Score": mformula.average_mz_error_score, 684 "DBE": mformula.dbe, 685 "Heteroatom Class": eval("mformula.class_label{}".format(encode)), 686 "H/C": mformula.H_C, 687 "O/C": mformula.O_C, 688 "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)), 689 "Is Isotopologue": int(mformula.is_isotopologue), 690 "Molecular Formula": eval("mformula.string{}".format(encode)), 691 } 692 if additional_columns is not None: 693 possible_dict = { 694 "Aromaticity Index": mformula.A_I, 695 "NOSC": mformula.nosc, 696 "Aromaticity Index (modified)": mformula.A_I_mod, 697 } 698 for column in additional_columns: 699 dict_result[column] = possible_dict.get(column) 700 701 if mformula.adduct_atom: 702 dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode)) 703 704 if mformula.is_isotopologue: 705 dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic 706 707 if self.atoms_order_list is None: 708 atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum) 709 else: 710 atoms_order_list = self.atoms_order_list 711 712 for atom in atoms_order_list: 713 if atom in formula_dict.keys(): 714 dict_result[atom] = formula_dict.get(atom) 715 716 dict_data_list.append(dict_result) 717 718 score_methods = mass_spectrum.molecular_search_settings.score_methods 719 selected_score_method = ( 720 mass_spectrum.molecular_search_settings.output_score_method 721 ) 722 723 if selected_score_method in score_methods: 724 # temp set score method as the one chosen in the output 725 current_method = mass_spectrum.molecular_search_settings.score_method 726 mass_spectrum.molecular_search_settings.score_method = selected_score_method 727 728 for index, ms_peak in enumerate(mass_spectrum): 729 # print(ms_peak.mz_exp) 730 731 if ms_peak: 732 m_formula = ms_peak.best_molecular_formula_candidate 733 734 if m_formula: 735 if not m_formula.is_isotopologue: 736 add_match_dict_data( 737 index, 738 ms_peak, 739 m_formula, 740 additional_columns=additional_columns, 741 ) 742 743 for ( 744 iso_mspeak_index, 745 iso_mf_formula, 746 ) in m_formula.mspeak_mf_isotopologues_indexes: 747 iso_ms_peak = mass_spectrum[iso_mspeak_index] 748 add_match_dict_data( 749 iso_mspeak_index, 750 iso_ms_peak, 751 iso_mf_formula, 752 additional_columns=additional_columns, 753 ) 754 else: 755 if include_no_match and no_match_inline: 756 add_no_match_dict_data(index, ms_peak) 757 758 if include_no_match and not no_match_inline: 759 for index, ms_peak in enumerate(mass_spectrum): 760 if not ms_peak: 761 add_no_match_dict_data(index, ms_peak) 762 # reset score method as the one chosen in the output 763 mass_spectrum.molecular_search_settings.score_method = current_method 764 765 else: 766 for index, ms_peak in enumerate(mass_spectrum): 767 # check if there is a molecular formula candidate for the msPeak 768 769 if ms_peak: 770 # m_formula = ms_peak.molecular_formula_lowest_error 771 for m_formula in ms_peak: 772 if mass_spectrum.molecular_search_settings.output_min_score > 0: 773 if ( 774 m_formula.confidence_score 775 >= mass_spectrum.molecular_search_settings.output_min_score 776 ): 777 if m_formula.is_isotopologue: # isotopologues inline 778 if include_isotopologues and isotopologue_inline: 779 add_match_dict_data( 780 index, 781 ms_peak, 782 m_formula, 783 additional_columns=additional_columns, 784 ) 785 else: 786 add_match_dict_data( 787 index, 788 ms_peak, 789 m_formula, 790 additional_columns=additional_columns, 791 ) # add monoisotopic peak 792 793 # cutoff because of low score 794 else: 795 add_no_match_dict_data(index, ms_peak) 796 797 else: 798 if m_formula.is_isotopologue: # isotopologues inline 799 if include_isotopologues and isotopologue_inline: 800 add_match_dict_data( 801 index, 802 ms_peak, 803 m_formula, 804 additional_columns=additional_columns, 805 ) 806 else: 807 add_match_dict_data( 808 index, 809 ms_peak, 810 m_formula, 811 additional_columns=additional_columns, 812 ) # add monoisotopic peak 813 else: 814 # include not_match 815 if include_no_match and no_match_inline: 816 add_no_match_dict_data(index, ms_peak) 817 818 if include_isotopologues and not isotopologue_inline: 819 for index, ms_peak in enumerate(mass_spectrum): 820 for m_formula in ms_peak: 821 if m_formula.is_isotopologue: 822 if ( 823 m_formula.confidence_score 824 >= mass_spectrum.molecular_search_settings.output_min_score 825 ): 826 add_match_dict_data( 827 index, 828 ms_peak, 829 m_formula, 830 additional_columns=additional_columns, 831 ) 832 833 if include_no_match and not no_match_inline: 834 for index, ms_peak in enumerate(mass_spectrum): 835 if not ms_peak: 836 add_no_match_dict_data(index, ms_peak) 837 838 # remove duplicated add_match data possibly introduced on the output_score_filter step 839 res = [] 840 [res.append(x) for x in dict_data_list if x not in res] 841 842 return res
A class for exporting high-resolution mass spectra.
Parameters
- out_file_path (str): The output file path.
- mass_spectrum (MassSpectrum): The mass spectrum to export.
- output_type (str, optional): The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'.
Attributes
- output_file (Path): The output file path.
- output_type (str): The type of output file.
- mass_spectrum (MassSpectrum): The mass spectrum to export.
- atoms_order_list (list): The list of assigned atoms in the order specified by Atoms.atoms_order list.
- columns_label (list): The column labels in order.
Methods
- save(). Save the mass spectrum data to the output file.
- run(). Run the export process.
- get_pandas_df(). Returns the mass spectrum data as a pandas DataFrame.
- write_settings(output_path, mass_spectrum). Writes the settings of the mass spectrum to a JSON file.
- to_pandas(write_metadata=True). Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
- to_excel(write_metadata=True). Exports the mass spectrum data to an Excel file.
- to_csv(write_metadata=True). Exports the mass spectrum data to a CSV file.
- to_json(). Exports the mass spectrum data to a JSON string.
- to_hdf(). Exports the mass spectrum data to an HDF5 file.
- parameters_to_toml(). Converts the mass spectrum parameters to a TOML string.
- parameters_to_json(). Converts the mass spectrum parameters to a JSON string.
- get_mass_spec_attrs(mass_spectrum). Returns the mass spectrum attributes as a dictionary.
- get_all_used_atoms_in_order(mass_spectrum). Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
- list_dict_to_list(mass_spectrum, is_hdf5=False). Returns the mass spectrum data as a list of dictionaries.
- get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False). Returns the mass spectrum data as a list of dictionaries.
81 def __init__(self, out_file_path, mass_spectrum, output_type="excel"): 82 Thread.__init__(self) 83 84 self.output_file = Path(out_file_path) 85 86 # 'excel', 'csv' or 'pandas' 87 self.output_type = output_type 88 89 self.mass_spectrum = mass_spectrum 90 91 # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list 92 self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum) 93 94 self._init_columns()
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
140 def save(self): 141 """Save the mass spectrum data to the output file. 142 143 Raises 144 ------ 145 ValueError 146 If the output type is not supported. 147 """ 148 149 if self.output_type == "excel": 150 self.to_excel() 151 elif self.output_type == "csv": 152 self.to_csv() 153 elif self.output_type == "pandas": 154 self.to_pandas() 155 elif self.output_type == "hdf5": 156 self.to_hdf() 157 else: 158 raise ValueError( 159 "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'" 160 % self.output_type 161 )
Save the mass spectrum data to the output file.
Raises
- ValueError: If the output type is not supported.
163 def run(self): 164 """Run the export process. 165 166 This method is called when the thread starts. 167 It calls the save method to perform the export.""" 168 self.save()
Run the export process.
This method is called when the thread starts. It calls the save method to perform the export.
170 def get_pandas_df(self, additional_columns=None): 171 """Returns the mass spectrum data as a pandas DataFrame. 172 173 Parameters 174 ---------- 175 additional_columns : list, optional 176 Additional columns to include in the DataFrame. Defaults to None. 177 Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'. 178 179 Returns 180 ------- 181 DataFrame 182 The mass spectrum data as a pandas DataFrame. 183 """ 184 if additional_columns is not None: 185 possible_additional_columns = [ 186 "Aromaticity Index", 187 "NOSC", 188 "Aromaticity Index (modified)", 189 ] 190 if additional_columns: 191 for column in additional_columns: 192 if column not in possible_additional_columns: 193 raise ValueError("Invalid additional column: %s" % column) 194 columns = ( 195 self.columns_label 196 + additional_columns 197 + self.get_all_used_atoms_in_order(self.mass_spectrum) 198 ) 199 else: 200 columns = self.columns_label + self.get_all_used_atoms_in_order( 201 self.mass_spectrum 202 ) 203 dict_data_list = self.get_list_dict_data( 204 self.mass_spectrum, additional_columns=additional_columns 205 ) 206 df = DataFrame(dict_data_list, columns=columns) 207 df.name = self.output_file 208 return df
Returns the mass spectrum data as a pandas DataFrame.
Parameters
- additional_columns (list, optional): Additional columns to include in the DataFrame. Defaults to None. Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'.
Returns
- DataFrame: The mass spectrum data as a pandas DataFrame.
210 def write_settings(self, output_path, mass_spectrum): 211 """Writes the settings of the mass spectrum to a JSON file. 212 213 Parameters 214 ---------- 215 output_path : str 216 The output file path. 217 mass_spectrum : MassSpectrum 218 The mass spectrum to export. 219 """ 220 221 import json 222 223 dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum) 224 225 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum) 226 dict_setting["analyzer"] = mass_spectrum.analyzer 227 dict_setting["instrument_label"] = mass_spectrum.instrument_label 228 dict_setting["sample_name"] = mass_spectrum.sample_name 229 230 with open( 231 output_path.with_suffix(".json"), 232 "w", 233 encoding="utf8", 234 ) as outfile: 235 output = json.dumps( 236 dict_setting, sort_keys=True, indent=4, separators=(",", ": ") 237 ) 238 outfile.write(output)
Writes the settings of the mass spectrum to a JSON file.
Parameters
- output_path (str): The output file path.
- mass_spectrum (MassSpectrum): The mass spectrum to export.
240 def to_pandas(self, write_metadata=True): 241 """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file. 242 243 Parameters 244 ---------- 245 write_metadata : bool, optional 246 Whether to write the metadata to a JSON file. Defaults to True. 247 """ 248 249 columns = self.columns_label + self.get_all_used_atoms_in_order( 250 self.mass_spectrum 251 ) 252 253 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 254 255 df = DataFrame(dict_data_list, columns=columns) 256 257 df.to_pickle(self.output_file.with_suffix(".pkl")) 258 259 if write_metadata: 260 self.write_settings(self.output_file, self.mass_spectrum)
Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
Parameters
- write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
262 def to_excel(self, write_metadata=True): 263 """Exports the mass spectrum data to an Excel file. 264 265 Parameters 266 ---------- 267 write_metadata : bool, optional 268 Whether to write the metadata to a JSON file. Defaults to True. 269 """ 270 271 columns = self.columns_label + self.get_all_used_atoms_in_order( 272 self.mass_spectrum 273 ) 274 275 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 276 277 df = DataFrame(dict_data_list, columns=columns) 278 279 df.to_excel(self.output_file.with_suffix(".xlsx")) 280 281 if write_metadata: 282 self.write_settings(self.output_file, self.mass_spectrum)
Exports the mass spectrum data to an Excel file.
Parameters
- write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
284 def to_csv(self, write_metadata=True): 285 """Exports the mass spectrum data to a CSV file. 286 287 Parameters 288 ---------- 289 write_metadata : bool, optional 290 Whether to write the metadata to a JSON file. Defaults to True. 291 """ 292 293 columns = self.columns_label + self.get_all_used_atoms_in_order( 294 self.mass_spectrum 295 ) 296 297 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 298 299 import csv 300 301 try: 302 with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile: 303 writer = csv.DictWriter(csvfile, fieldnames=columns) 304 writer.writeheader() 305 for data in dict_data_list: 306 writer.writerow(data) 307 if write_metadata: 308 self.write_settings(self.output_file, self.mass_spectrum) 309 310 except IOError as ioerror: 311 print(ioerror)
Exports the mass spectrum data to a CSV file.
Parameters
- write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
313 def to_json(self): 314 """Exports the mass spectrum data to a JSON string.""" 315 316 columns = self.columns_label + self.get_all_used_atoms_in_order( 317 self.mass_spectrum 318 ) 319 320 dict_data_list = self.get_list_dict_data(self.mass_spectrum) 321 322 df = DataFrame(dict_data_list, columns=columns) 323 324 # for key, values in dict_data.items(): 325 # if not values: dict_data[key] = NaN 326 327 # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': ')) 328 return df.to_json(orient="records")
Exports the mass spectrum data to a JSON string.
330 def add_mass_spectrum_to_hdf5( 331 self, 332 hdf_handle, 333 mass_spectrum, 334 group_key, 335 mass_spectra_group=None, 336 export_raw=True, 337 ): 338 """Adds the mass spectrum data to an HDF5 file. 339 340 Parameters 341 ---------- 342 hdf_handle : h5py.File 343 The HDF5 file handle. 344 mass_spectrum : MassSpectrum 345 The mass spectrum to add to the HDF5 file. 346 group_key : str 347 The group key (where to add the mass spectrum data within the HDF5 file). 348 mass_spectra_group : h5py.Group, optional 349 The mass spectra group. Defaults to None (no group, mass spectrum is added to the root). 350 export_raw : bool, optional 351 Whether to export the raw data. Defaults to True. 352 If False, only the processed data (peaks) is exported (essentially centroided data). 353 """ 354 if mass_spectra_group is None: 355 # Check if the file has the necessary attributes and add them if not 356 # This assumes that if there is a mass_spectra_group, these attributes were already added to the file 357 if not hdf_handle.attrs.get("date_utc"): 358 timenow = str( 359 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 360 ) 361 hdf_handle.attrs["date_utc"] = timenow 362 hdf_handle.attrs["file_name"] = mass_spectrum.filename.name 363 hdf_handle.attrs["data_structure"] = "mass_spectrum" 364 hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer 365 hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label 366 hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name 367 368 list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True) 369 370 dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum) 371 372 setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum) 373 374 columns_labels = json.dumps( 375 self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum), 376 sort_keys=False, 377 indent=4, 378 separators=(",", ": "), 379 ) 380 381 group_key = group_key 382 383 if mass_spectra_group is not None: 384 hdf_handle = mass_spectra_group 385 386 if group_key not in hdf_handle.keys(): 387 scan_group = hdf_handle.create_group(group_key) 388 389 # If there is raw data (from profile data) save it 390 if not mass_spectrum.is_centroid and export_raw: 391 mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32) 392 393 mz_abun_array[0] = mass_spectrum.abundance_profile 394 mz_abun_array[1] = mass_spectrum.mz_exp_profile 395 396 raw_ms_dataset = scan_group.create_dataset( 397 "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 398 ) 399 400 # Add metadata to the raw_ms dataset when it exists 401 raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 402 403 if isinstance(mass_spectrum, MassSpecfromFreq): 404 raw_ms_dataset.attrs["TransientSetting"] = json.dumps( 405 setting_dicts.get("TransientSetting"), 406 sort_keys=False, 407 indent=4, 408 separators=(",", ": "), 409 ) 410 else: 411 # For centroided data, store metadata directly on the scan group 412 scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs) 413 414 if isinstance(mass_spectrum, MassSpecfromFreq): 415 scan_group.attrs["TransientSetting"] = json.dumps( 416 setting_dicts.get("TransientSetting"), 417 sort_keys=False, 418 indent=4, 419 separators=(",", ": "), 420 ) 421 422 else: 423 scan_group = hdf_handle.get(group_key) 424 425 # if there is not processed data len = 0, otherwise len() will return next index 426 index_processed_data = str(len(scan_group.keys())) 427 428 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 429 430 # Convert list_results to numpy array and optimize data types 431 array_results = np.array(list_results) 432 433 # Apply data type optimization for numeric columns 434 if array_results.dtype == np.float64: 435 array_results = array_results.astype(np.float32) 436 elif array_results.dtype == np.int64: 437 array_results = array_results.astype(np.int32) 438 439 processed_dset = scan_group.create_dataset( 440 index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True 441 ) 442 443 processed_dset.attrs["date_utc"] = timenow 444 445 processed_dset.attrs["ColumnsLabels"] = columns_labels 446 447 processed_dset.attrs["MoleculaSearchSetting"] = json.dumps( 448 setting_dicts.get("MoleculaSearch"), 449 sort_keys=False, 450 indent=4, 451 separators=(",", ": "), 452 ) 453 454 processed_dset.attrs["MassSpecPeakSetting"] = json.dumps( 455 setting_dicts.get("MassSpecPeak"), 456 sort_keys=False, 457 indent=4, 458 separators=(",", ": "), 459 ) 460 461 processed_dset.attrs["MassSpectrumSetting"] = json.dumps( 462 setting_dicts.get("MassSpectrum"), 463 sort_keys=False, 464 indent=4, 465 separators=(",", ": "), 466 )
Adds the mass spectrum data to an HDF5 file.
Parameters
- hdf_handle (h5py.File): The HDF5 file handle.
- mass_spectrum (MassSpectrum): The mass spectrum to add to the HDF5 file.
- group_key (str): The group key (where to add the mass spectrum data within the HDF5 file).
- mass_spectra_group (h5py.Group, optional): The mass spectra group. Defaults to None (no group, mass spectrum is added to the root).
- export_raw (bool, optional): Whether to export the raw data. Defaults to True. If False, only the processed data (peaks) is exported (essentially centroided data).
468 def to_hdf(self): 469 """Exports the mass spectrum data to an HDF5 file.""" 470 471 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 472 self.add_mass_spectrum_to_hdf5( 473 hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number) 474 )
Exports the mass spectrum data to an HDF5 file.
476 def parameters_to_toml(self): 477 """Converts the mass spectrum parameters to a TOML string. 478 479 Returns 480 ------- 481 str 482 The TOML string of the mass spectrum parameters. 483 """ 484 485 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 486 487 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 488 dict_setting["analyzer"] = self.mass_spectrum.analyzer 489 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 490 dict_setting["sample_name"] = self.mass_spectrum.sample_name 491 492 output = toml.dumps(dict_setting) 493 494 return output
Converts the mass spectrum parameters to a TOML string.
Returns
- str: The TOML string of the mass spectrum parameters.
496 def parameters_to_json(self): 497 """Converts the mass spectrum parameters to a JSON string. 498 499 Returns 500 ------- 501 str 502 The JSON string of the mass spectrum parameters. 503 """ 504 505 dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum) 506 507 dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum) 508 dict_setting["analyzer"] = self.mass_spectrum.analyzer 509 dict_setting["instrument_label"] = self.mass_spectrum.instrument_label 510 dict_setting["sample_name"] = self.mass_spectrum.sample_name 511 512 output = json.dumps(dict_setting) 513 514 return output
Converts the mass spectrum parameters to a JSON string.
Returns
- str: The JSON string of the mass spectrum parameters.
516 def get_mass_spec_attrs(self, mass_spectrum): 517 """Returns the mass spectrum attributes as a dictionary. 518 519 Parameters 520 ---------- 521 mass_spectrum : MassSpectrum 522 The mass spectrum to export. 523 524 Returns 525 ------- 526 dict 527 The mass spectrum attributes. 528 """ 529 530 dict_ms_attrs = {} 531 dict_ms_attrs["polarity"] = mass_spectrum.polarity 532 dict_ms_attrs["rt"] = mass_spectrum.retention_time 533 dict_ms_attrs["tic"] = mass_spectrum.tic 534 dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan 535 dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt 536 dict_ms_attrs["Aterm"] = mass_spectrum.Aterm 537 dict_ms_attrs["Bterm"] = mass_spectrum.Bterm 538 dict_ms_attrs["Cterm"] = mass_spectrum.Cterm 539 dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise 540 dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std 541 542 return dict_ms_attrs
Returns the mass spectrum attributes as a dictionary.
Parameters
- mass_spectrum (MassSpectrum): The mass spectrum to export.
Returns
- dict: The mass spectrum attributes.
544 def get_all_used_atoms_in_order(self, mass_spectrum): 545 """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list. 546 547 Parameters 548 ---------- 549 mass_spectrum : MassSpectrum 550 The mass spectrum to export. 551 552 Returns 553 ------- 554 list 555 The list of assigned atoms in the order specified by Atoms.atoms_order list. 556 """ 557 558 atoms_in_order = Atoms.atoms_order 559 all_used_atoms = set() 560 if mass_spectrum: 561 for ms_peak in mass_spectrum: 562 if ms_peak: 563 for m_formula in ms_peak: 564 for atom in m_formula.atoms: 565 all_used_atoms.add(atom) 566 567 def sort_method(atom): 568 return [atoms_in_order.index(atom)] 569 570 return sorted(all_used_atoms, key=sort_method)
Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
Parameters
- mass_spectrum (MassSpectrum): The mass spectrum to export.
Returns
- list: The list of assigned atoms in the order specified by Atoms.atoms_order list.
572 def list_dict_to_list(self, mass_spectrum, is_hdf5=False): 573 """Returns the mass spectrum data as a list of dictionaries. 574 575 Parameters 576 ---------- 577 mass_spectrum : MassSpectrum 578 The mass spectrum to export. 579 is_hdf5 : bool, optional 580 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 581 582 Returns 583 ------- 584 list 585 The mass spectrum data as a list of dictionaries. 586 """ 587 588 column_labels = self.columns_label + self.get_all_used_atoms_in_order( 589 mass_spectrum 590 ) 591 592 dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5) 593 594 all_lines = [] 595 for dict_res in dict_list: 596 result_line = [NaN] * len(column_labels) 597 598 for label, value in dict_res.items(): 599 label_index = column_labels.index(label) 600 result_line[label_index] = value 601 602 all_lines.append(result_line) 603 604 return all_lines
Returns the mass spectrum data as a list of dictionaries.
Parameters
- mass_spectrum (MassSpectrum): The mass spectrum to export.
- is_hdf5 (bool, optional): Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
Returns
- list: The mass spectrum data as a list of dictionaries.
606 def get_list_dict_data( 607 self, 608 mass_spectrum, 609 include_no_match=True, 610 include_isotopologues=True, 611 isotopologue_inline=True, 612 no_match_inline=False, 613 is_hdf5=False, 614 additional_columns=None, 615 ): 616 """Returns the mass spectrum data as a list of dictionaries. 617 618 Parameters 619 ---------- 620 mass_spectrum : MassSpectrum 621 The mass spectrum to export. 622 include_no_match : bool, optional 623 Whether to include unassigned (no match) data. Defaults to True. 624 include_isotopologues : bool, optional 625 Whether to include isotopologues. Defaults to True. 626 isotopologue_inline : bool, optional 627 Whether to include isotopologues inline. Defaults to True. 628 no_match_inline : bool, optional 629 Whether to include unassigned (no match) data inline. Defaults to False. 630 is_hdf5 : bool, optional 631 Whether the mass spectrum is being exported to an HDF5 file. Defaults to False. 632 633 Returns 634 ------- 635 list 636 The mass spectrum data as a list of dictionaries. 637 """ 638 639 dict_data_list = [] 640 641 if is_hdf5: 642 encode = ".encode('utf-8')" 643 else: 644 encode = "" 645 646 def add_no_match_dict_data(index, ms_peak): 647 """ 648 Export dictionary of mspeak info for unassigned (no match) data 649 """ 650 dict_result = { 651 "Index": index, 652 "m/z": ms_peak._mz_exp, 653 "Calibrated m/z": ms_peak.mz_exp, 654 "Peak Height": ms_peak.abundance, 655 "Peak Area": ms_peak.area, 656 "Resolving Power": ms_peak.resolving_power, 657 "S/N": ms_peak.signal_to_noise, 658 "Ion Charge": ms_peak.ion_charge, 659 "Heteroatom Class": eval("Labels.unassigned{}".format(encode)), 660 } 661 662 dict_data_list.append(dict_result) 663 664 def add_match_dict_data(index, ms_peak, mformula, additional_columns=None): 665 """ 666 Export dictionary of mspeak info for assigned (match) data 667 """ 668 formula_dict = mformula.to_dict() 669 670 dict_result = { 671 "Index": index, 672 "m/z": ms_peak._mz_exp, 673 "Calibrated m/z": ms_peak.mz_exp, 674 "Calculated m/z": mformula.mz_calc, 675 "Peak Height": ms_peak.abundance, 676 "Peak Area": ms_peak.area, 677 "Resolving Power": ms_peak.resolving_power, 678 "S/N": ms_peak.signal_to_noise, 679 "Ion Charge": ms_peak.ion_charge, 680 "m/z Error (ppm)": mformula.mz_error, 681 "Confidence Score": mformula.confidence_score, 682 "Isotopologue Similarity": mformula.isotopologue_similarity, 683 "m/z Error Score": mformula.average_mz_error_score, 684 "DBE": mformula.dbe, 685 "Heteroatom Class": eval("mformula.class_label{}".format(encode)), 686 "H/C": mformula.H_C, 687 "O/C": mformula.O_C, 688 "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)), 689 "Is Isotopologue": int(mformula.is_isotopologue), 690 "Molecular Formula": eval("mformula.string{}".format(encode)), 691 } 692 if additional_columns is not None: 693 possible_dict = { 694 "Aromaticity Index": mformula.A_I, 695 "NOSC": mformula.nosc, 696 "Aromaticity Index (modified)": mformula.A_I_mod, 697 } 698 for column in additional_columns: 699 dict_result[column] = possible_dict.get(column) 700 701 if mformula.adduct_atom: 702 dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode)) 703 704 if mformula.is_isotopologue: 705 dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic 706 707 if self.atoms_order_list is None: 708 atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum) 709 else: 710 atoms_order_list = self.atoms_order_list 711 712 for atom in atoms_order_list: 713 if atom in formula_dict.keys(): 714 dict_result[atom] = formula_dict.get(atom) 715 716 dict_data_list.append(dict_result) 717 718 score_methods = mass_spectrum.molecular_search_settings.score_methods 719 selected_score_method = ( 720 mass_spectrum.molecular_search_settings.output_score_method 721 ) 722 723 if selected_score_method in score_methods: 724 # temp set score method as the one chosen in the output 725 current_method = mass_spectrum.molecular_search_settings.score_method 726 mass_spectrum.molecular_search_settings.score_method = selected_score_method 727 728 for index, ms_peak in enumerate(mass_spectrum): 729 # print(ms_peak.mz_exp) 730 731 if ms_peak: 732 m_formula = ms_peak.best_molecular_formula_candidate 733 734 if m_formula: 735 if not m_formula.is_isotopologue: 736 add_match_dict_data( 737 index, 738 ms_peak, 739 m_formula, 740 additional_columns=additional_columns, 741 ) 742 743 for ( 744 iso_mspeak_index, 745 iso_mf_formula, 746 ) in m_formula.mspeak_mf_isotopologues_indexes: 747 iso_ms_peak = mass_spectrum[iso_mspeak_index] 748 add_match_dict_data( 749 iso_mspeak_index, 750 iso_ms_peak, 751 iso_mf_formula, 752 additional_columns=additional_columns, 753 ) 754 else: 755 if include_no_match and no_match_inline: 756 add_no_match_dict_data(index, ms_peak) 757 758 if include_no_match and not no_match_inline: 759 for index, ms_peak in enumerate(mass_spectrum): 760 if not ms_peak: 761 add_no_match_dict_data(index, ms_peak) 762 # reset score method as the one chosen in the output 763 mass_spectrum.molecular_search_settings.score_method = current_method 764 765 else: 766 for index, ms_peak in enumerate(mass_spectrum): 767 # check if there is a molecular formula candidate for the msPeak 768 769 if ms_peak: 770 # m_formula = ms_peak.molecular_formula_lowest_error 771 for m_formula in ms_peak: 772 if mass_spectrum.molecular_search_settings.output_min_score > 0: 773 if ( 774 m_formula.confidence_score 775 >= mass_spectrum.molecular_search_settings.output_min_score 776 ): 777 if m_formula.is_isotopologue: # isotopologues inline 778 if include_isotopologues and isotopologue_inline: 779 add_match_dict_data( 780 index, 781 ms_peak, 782 m_formula, 783 additional_columns=additional_columns, 784 ) 785 else: 786 add_match_dict_data( 787 index, 788 ms_peak, 789 m_formula, 790 additional_columns=additional_columns, 791 ) # add monoisotopic peak 792 793 # cutoff because of low score 794 else: 795 add_no_match_dict_data(index, ms_peak) 796 797 else: 798 if m_formula.is_isotopologue: # isotopologues inline 799 if include_isotopologues and isotopologue_inline: 800 add_match_dict_data( 801 index, 802 ms_peak, 803 m_formula, 804 additional_columns=additional_columns, 805 ) 806 else: 807 add_match_dict_data( 808 index, 809 ms_peak, 810 m_formula, 811 additional_columns=additional_columns, 812 ) # add monoisotopic peak 813 else: 814 # include not_match 815 if include_no_match and no_match_inline: 816 add_no_match_dict_data(index, ms_peak) 817 818 if include_isotopologues and not isotopologue_inline: 819 for index, ms_peak in enumerate(mass_spectrum): 820 for m_formula in ms_peak: 821 if m_formula.is_isotopologue: 822 if ( 823 m_formula.confidence_score 824 >= mass_spectrum.molecular_search_settings.output_min_score 825 ): 826 add_match_dict_data( 827 index, 828 ms_peak, 829 m_formula, 830 additional_columns=additional_columns, 831 ) 832 833 if include_no_match and not no_match_inline: 834 for index, ms_peak in enumerate(mass_spectrum): 835 if not ms_peak: 836 add_no_match_dict_data(index, ms_peak) 837 838 # remove duplicated add_match data possibly introduced on the output_score_filter step 839 res = [] 840 [res.append(x) for x in dict_data_list if x not in res] 841 842 return res
Returns the mass spectrum data as a list of dictionaries.
Parameters
- mass_spectrum (MassSpectrum): The mass spectrum to export.
- include_no_match (bool, optional): Whether to include unassigned (no match) data. Defaults to True.
- include_isotopologues (bool, optional): Whether to include isotopologues. Defaults to True.
- isotopologue_inline (bool, optional): Whether to include isotopologues inline. Defaults to True.
- no_match_inline (bool, optional): Whether to include unassigned (no match) data inline. Defaults to False.
- is_hdf5 (bool, optional): Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
Returns
- list: The mass spectrum data as a list of dictionaries.
Inherited Members
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id