# corems.mass_spectra.output.export
1__author__ = "Yuri E. Corilo" 2__date__ = "Dec 14, 2010" 3 4 5import csv 6import json 7import re 8import uuid 9import warnings 10from datetime import datetime, timezone 11from pathlib import Path 12 13import h5py 14import numpy as np 15import pandas as pd 16from openpyxl import load_workbook 17from pandas import DataFrame, ExcelWriter, read_excel 18 19from corems import __version__, corems_md5 20from corems.encapsulation.output import parameter_to_dict 21from corems.encapsulation.output.parameter_to_json import ( 22 dump_lcms_settings_json, 23 dump_lcms_settings_toml, 24) 25from corems.mass_spectrum.output.export import HighResMassSpecExport 26from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula 27from corems.molecular_id.calc.SpectralSimilarity import methods_name 28 29ion_type_dict = { 30 # adduct : [atoms to add, atoms to subtract when calculating formula of ion 31 "M+": [{}, {}], 32 "protonated": [{"H": 1}, {}], 33 "[M+H]+": [{"H": 1}, {}], 34 "[M+NH4]+": [{"N": 1, "H": 4}, {}], # ammonium 35 "[M+Na]+": [{"Na": 1}, {}], 36 "[M+K]+": [{"K": 1}, {}], 37 "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}], 38 "[M+2Na-H]+": [{"Na": 2}, {"H": 1}], 39 "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}], 40 "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}], 41 "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}], 42 "[M+H-H2O]+": [{}, {"H": 1, "O": 1}], 43 "de-protonated": [{}, {"H": 1}], 44 "[M-H]-": [{}, {"H": 1}], 45 "[M+Cl]-": [{"Cl": 1}, {}], 46 "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}], # formate 47 "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}], # acetate 48 "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}], 49 "[M+K-2H]-": [{"K": 1}, {"H": 2}], 50 "[M+Na-2H]-": [{"Na": 1}, {"H": 2}], 51} 52 53 54class LowResGCMSExport: 55 """A class to export low resolution GC-MS data. 56 57 This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame. 
58 59 Parameters: 60 ---------- 61 out_file_path : str 62 The output file path. 63 gcms : object 64 The low resolution GCMS object. 65 66 Attributes: 67 ---------- 68 output_file : Path 69 The output file path as a Path object. 70 gcms : object 71 The low resolution GCMS object. 72 73 Methods: 74 ------- 75 * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame. 76 * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string. 77 * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file. 78 * to_excel(write_mode='a', write_metadata=True, id_label="corems:"), 79 Export the data to an Excel file. 80 * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). 81 Export the data to a CSV file. 82 * to_hdf(id_label="corems:"). 83 Export the data to an HDF5 file. 84 * get_data_stats(gcms). 85 Get statistics about the GCMS data. 86 87 """ 88 89 def __init__(self, out_file_path, gcms): 90 self.output_file = Path(out_file_path) 91 92 self.gcms = gcms 93 94 self._init_columns() 95 96 def _init_columns(self): 97 """Initialize the column names for the exported data. 98 99 Returns: 100 ------- 101 list 102 The list of column names. 
103 """ 104 105 columns = [ 106 "Sample name", 107 "Peak Index", 108 "Retention Time", 109 "Retention Time Ref", 110 "Peak Height", 111 "Peak Area", 112 "Retention index", 113 "Retention index Ref", 114 "Retention Index Score", 115 "Similarity Score", 116 "Spectral Similarity Score", 117 "Compound Name", 118 "Chebi ID", 119 "Kegg Compound ID", 120 "Inchi", 121 "Inchi Key", 122 "Smiles", 123 "Molecular Formula", 124 "IUPAC Name", 125 "Traditional Name", 126 "Common Name", 127 "Derivatization", 128 ] 129 130 if self.gcms.molecular_search_settings.exploratory_mode: 131 columns.extend( 132 [ 133 "Weighted Cosine Correlation", 134 "Cosine Correlation", 135 "Stein Scott Similarity", 136 "Pearson Correlation", 137 "Spearman Correlation", 138 "Kendall Tau Correlation", 139 "Euclidean Distance", 140 "Manhattan Distance", 141 "Jaccard Distance", 142 "DWT Correlation", 143 "DFT Correlation", 144 ] 145 ) 146 147 columns.extend(list(methods_name.values())) 148 149 return columns 150 151 def get_pandas_df(self, id_label="corems:"): 152 """Get the exported data as a Pandas DataFrame. 153 154 Parameters: 155 ---------- 156 id_label : str, optional 157 The ID label for the data. Default is "corems:". 158 159 Returns: 160 ------- 161 DataFrame 162 The exported data as a Pandas DataFrame. 163 """ 164 165 columns = self._init_columns() 166 167 dict_data_list = self.get_list_dict_data(self.gcms) 168 169 df = DataFrame(dict_data_list, columns=columns) 170 171 df.name = self.gcms.sample_name 172 173 return df 174 175 def get_json(self, nan=False, id_label="corems:"): 176 """Get the exported data as a JSON string. 177 178 Parameters: 179 ---------- 180 nan : bool, optional 181 Whether to include NaN values in the JSON string. Default is False. 182 id_label : str, optional 183 The ID label for the data. Default is "corems:". 
184 185 """ 186 187 import json 188 189 dict_data_list = self.get_list_dict_data(self.gcms) 190 191 return json.dumps( 192 dict_data_list, sort_keys=False, indent=4, separators=(",", ": ") 193 ) 194 195 def to_pandas(self, write_metadata=True, id_label="corems:"): 196 """Export the data to a Pandas DataFrame and save it as a pickle file. 197 198 Parameters: 199 ---------- 200 write_metadata : bool, optional 201 Whether to write metadata to the output file. 202 id_label : str, optional 203 The ID label for the data. 204 """ 205 206 columns = self._init_columns() 207 208 dict_data_list = self.get_list_dict_data(self.gcms) 209 210 df = DataFrame(dict_data_list, columns=columns) 211 212 df.to_pickle(self.output_file.with_suffix(".pkl")) 213 214 if write_metadata: 215 self.write_settings( 216 self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:" 217 ) 218 219 def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"): 220 """Export the data to an Excel file. 221 222 Parameters: 223 ---------- 224 write_mode : str, optional 225 The write mode for the Excel file. Default is 'a' (append). 226 write_metadata : bool, optional 227 Whether to write metadata to the output file. Default is True. 228 id_label : str, optional 229 The ID label for the data. Default is "corems:". 
230 """ 231 232 out_put_path = self.output_file.with_suffix(".xlsx") 233 234 columns = self._init_columns() 235 236 dict_data_list = self.get_list_dict_data(self.gcms) 237 238 df = DataFrame(dict_data_list, columns=columns) 239 240 if write_mode == "a" and out_put_path.exists(): 241 writer = ExcelWriter(out_put_path, engine="openpyxl") 242 # try to open an existing workbook 243 writer.book = load_workbook(out_put_path) 244 # copy existing sheets 245 writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets) 246 # read existing file 247 reader = read_excel(out_put_path) 248 # write out the new sheet 249 df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1) 250 251 writer.close() 252 else: 253 df.to_excel( 254 self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl" 255 ) 256 257 if write_metadata: 258 self.write_settings(out_put_path, self.gcms, id_label=id_label) 259 260 def to_csv( 261 self, 262 separate_output=False, 263 write_mode="w", 264 write_metadata=True, 265 id_label="corems:", 266 ): 267 """Export the data to a CSV file. 268 269 Parameters: 270 ---------- 271 separate_output : bool, optional 272 Whether to separate the output into multiple files. Default is False. 273 write_mode : str, optional 274 The write mode for the CSV file. Default is 'w' (write). 275 write_metadata : bool, optional 276 Whether to write metadata to the output file. Default is True. 277 id_label : str, optional 278 The ID label for the data. Default is "corems:". 
279 """ 280 281 if separate_output: 282 # set write mode to write 283 # this mode will overwrite the file without warning 284 write_mode = "w" 285 else: 286 # set write mode to append 287 write_mode = "a" 288 289 columns = self._init_columns() 290 291 dict_data_list = self.get_list_dict_data(self.gcms) 292 293 out_put_path = self.output_file.with_suffix(".csv") 294 295 write_header = not out_put_path.exists() 296 297 try: 298 with open(out_put_path, write_mode, newline="") as csvfile: 299 writer = csv.DictWriter(csvfile, fieldnames=columns) 300 if write_header: 301 writer.writeheader() 302 for data in dict_data_list: 303 writer.writerow(data) 304 305 if write_metadata: 306 self.write_settings(out_put_path, self.gcms, id_label=id_label) 307 308 except IOError as ioerror: 309 print(ioerror) 310 311 def to_hdf(self, id_label="corems:"): 312 """Export the data to an HDF5 file. 313 314 Parameters: 315 ---------- 316 id_label : str, optional 317 The ID label for the data. Default is "corems:". 
318 """ 319 320 # save sample at a time 321 def add_compound(gc_peak, compound_obj): 322 modifier = compound_obj.classify if compound_obj.classify else "" 323 compound_group = compound_obj.name.replace("/", "") + " " + modifier 324 325 if compound_group not in peak_group: 326 compound_group = peak_group.create_group(compound_group) 327 328 # compound_group.attrs["retention_time"] = compound_obj.retention_time 329 compound_group.attrs["retention_index"] = compound_obj.ri 330 compound_group.attrs["retention_index_score"] = compound_obj.ri_score 331 compound_group.attrs["spectral_similarity_score"] = ( 332 compound_obj.spectral_similarity_score 333 ) 334 compound_group.attrs["similarity_score"] = compound_obj.similarity_score 335 336 compond_mz = compound_group.create_dataset( 337 "mz", data=np.array(compound_obj.mz), dtype="f8" 338 ) 339 compond_abundance = compound_group.create_dataset( 340 "abundance", data=np.array(compound_obj.abundance), dtype="f8" 341 ) 342 343 if self.gcms.molecular_search_settings.exploratory_mode: 344 compound_group.attrs["Spectral Similarities"] = json.dumps( 345 compound_obj.spectral_similarity_scores, 346 sort_keys=False, 347 indent=4, 348 separators=(",", ":"), 349 ) 350 else: 351 warnings.warn("Skipping duplicate reference compound.") 352 353 import json 354 from datetime import datetime, timezone 355 356 import h5py 357 import numpy as np 358 359 output_path = self.output_file.with_suffix(".hdf5") 360 361 with h5py.File(output_path, "w") as hdf_handle: 362 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 363 hdf_handle.attrs["time_stamp"] = timenow 364 hdf_handle.attrs["data_structure"] = "gcms" 365 hdf_handle.attrs["analyzer"] = self.gcms.analyzer 366 hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label 367 368 hdf_handle.attrs["sample_id"] = "self.gcms.id" 369 hdf_handle.attrs["sample_name"] = self.gcms.sample_name 370 hdf_handle.attrs["input_data"] = str(self.gcms.file_location) 371 
hdf_handle.attrs["output_data"] = str(output_path) 372 hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex 373 hdf_handle.attrs["corems_version"] = __version__ 374 375 hdf_handle.attrs["Stats"] = json.dumps( 376 self.get_data_stats(self.gcms), 377 sort_keys=False, 378 indent=4, 379 separators=(",", ": "), 380 ) 381 hdf_handle.attrs["Calibration"] = json.dumps( 382 self.get_calibration_stats(self.gcms, id_label), 383 sort_keys=False, 384 indent=4, 385 separators=(",", ": "), 386 ) 387 hdf_handle.attrs["Blank"] = json.dumps( 388 self.get_blank_stats(self.gcms), 389 sort_keys=False, 390 indent=4, 391 separators=(",", ": "), 392 ) 393 394 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms) 395 hdf_handle.attrs["CoreMSParameters"] = json.dumps( 396 corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ") 397 ) 398 399 scans_dataset = hdf_handle.create_dataset( 400 "scans", data=np.array(self.gcms.scans_number), dtype="f8" 401 ) 402 rt_dataset = hdf_handle.create_dataset( 403 "rt", data=np.array(self.gcms.retention_time), dtype="f8" 404 ) 405 tic_dataset = hdf_handle.create_dataset( 406 "tic", data=np.array(self.gcms.tic), dtype="f8" 407 ) 408 processed_tic_dataset = hdf_handle.create_dataset( 409 "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8" 410 ) 411 412 output_score_method = ( 413 self.gcms.molecular_search_settings.output_score_method 414 ) 415 416 for gc_peak in self.gcms: 417 # print(gc_peak.retention_time) 418 # print(gc_peak.tic) 419 420 # check if there is a compound candidate 421 peak_group = hdf_handle.create_group(str(gc_peak.retention_time)) 422 peak_group.attrs["deconvolution"] = int( 423 self.gcms.chromatogram_settings.use_deconvolution 424 ) 425 426 peak_group.attrs["start_scan"] = gc_peak.start_scan 427 peak_group.attrs["apex_scan"] = gc_peak.apex_scan 428 peak_group.attrs["final_scan"] = gc_peak.final_scan 429 430 peak_group.attrs["retention_index"] = gc_peak.ri 431 
peak_group.attrs["retention_time"] = gc_peak.retention_time 432 peak_group.attrs["area"] = gc_peak.area 433 434 mz = peak_group.create_dataset( 435 "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8" 436 ) 437 abundance = peak_group.create_dataset( 438 "abundance", 439 data=np.array(gc_peak.mass_spectrum.abundance), 440 dtype="f8", 441 ) 442 443 if gc_peak: 444 if output_score_method == "highest_sim_score": 445 compound_obj = gc_peak.highest_score_compound 446 add_compound(gc_peak, compound_obj) 447 448 elif output_score_method == "highest_ss": 449 compound_obj = gc_peak.highest_ss_compound 450 add_compound(gc_peak, compound_obj) 451 452 else: 453 for compound_obj in gc_peak: 454 add_compound(gc_peak, compound_obj) 455 456 def get_data_stats(self, gcms): 457 """Get statistics about the GCMS data. 458 459 Parameters: 460 ---------- 461 gcms : object 462 The low resolution GCMS object. 463 464 Returns: 465 ------- 466 dict 467 A dictionary containing the data statistics. 468 """ 469 470 matched_peaks = gcms.matched_peaks 471 no_matched_peaks = gcms.no_matched_peaks 472 unique_metabolites = gcms.unique_metabolites 473 474 peak_matchs_above_0p85 = 0 475 unique_peak_match_above_0p85 = 0 476 for match_peak in matched_peaks: 477 gc_peak_above_85 = 0 478 matches_above_85 = list( 479 filter(lambda m: m.similarity_score >= 0.85, match_peak) 480 ) 481 if matches_above_85: 482 peak_matchs_above_0p85 += 1 483 if len(matches_above_85) == 1: 484 unique_peak_match_above_0p85 += 1 485 486 data_stats = {} 487 data_stats["average_signal_noise"] = "ni" 488 data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range 489 data_stats["total_number_peaks"] = len(gcms) 490 data_stats["total_peaks_matched"] = len(matched_peaks) 491 data_stats["total_peaks_without_matches"] = len(no_matched_peaks) 492 data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85 493 data_stats["single_matches_above_similarity_score_0.85"] = ( 494 unique_peak_match_above_0p85 495 
) 496 data_stats["unique_metabolites"] = len(unique_metabolites) 497 498 return data_stats 499 500 def get_calibration_stats(self, gcms, id_label): 501 """Get statistics about the GC-MS calibration. 502 503 Parameters: 504 ---------- 505 """ 506 calibration_parameters = {} 507 508 calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref 509 calibration_parameters["data_url"] = str(gcms.cal_file_path) 510 calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path) 511 calibration_parameters["data_name"] = str(gcms.cal_file_path.stem) 512 calibration_parameters["calibration_method"] = "" 513 514 return calibration_parameters 515 516 def get_blank_stats(self, gcms): 517 """Get statistics about the GC-MS blank.""" 518 blank_parameters = {} 519 520 blank_parameters["data_name"] = "ni" 521 blank_parameters["blank_id"] = "ni" 522 blank_parameters["data_url"] = "ni" 523 blank_parameters["has_input"] = "ni" 524 blank_parameters["common_features_to_blank"] = "ni" 525 526 return blank_parameters 527 528 def get_instrument_metadata(self, gcms): 529 """Get metadata about the GC-MS instrument.""" 530 instrument_metadata = {} 531 532 instrument_metadata["analyzer"] = gcms.analyzer 533 instrument_metadata["instrument_label"] = gcms.instrument_label 534 instrument_metadata["instrument_id"] = uuid.uuid4().hex 535 536 return instrument_metadata 537 538 def get_data_metadata(self, gcms, id_label, output_path): 539 """Get metadata about the GC-MS data. 540 541 Parameters: 542 ---------- 543 gcms : object 544 The low resolution GCMS object. 545 id_label : str 546 The ID label for the data. 547 output_path : str 548 The output file path. 549 550 Returns: 551 ------- 552 dict 553 A dictionary containing the data metadata. 
554 """ 555 if isinstance(output_path, str): 556 output_path = Path(output_path) 557 558 paramaters_path = output_path.with_suffix(".json") 559 560 if paramaters_path.exists(): 561 with paramaters_path.open() as current_param: 562 metadata = json.load(current_param) 563 data_metadata = metadata.get("Data") 564 else: 565 data_metadata = {} 566 data_metadata["data_name"] = [] 567 data_metadata["input_data_url"] = [] 568 data_metadata["has_input"] = [] 569 570 data_metadata["data_name"].append(gcms.sample_name) 571 data_metadata["input_data_url"].append(str(gcms.file_location)) 572 data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location)) 573 574 data_metadata["output_data_name"] = str(output_path.stem) 575 data_metadata["output_data_url"] = str(output_path) 576 data_metadata["has_output"] = id_label + corems_md5(output_path) 577 578 return data_metadata 579 580 def get_parameters_json(self, gcms, id_label, output_path): 581 """Get the parameters as a JSON string. 582 583 Parameters: 584 ---------- 585 gcms : GCMS object 586 The low resolution GCMS object. 587 id_label : str 588 The ID label for the data. 589 output_path : str 590 The output file path. 591 592 Returns: 593 ------- 594 str 595 The parameters as a JSON string. 
596 """ 597 598 output_parameters_dict = {} 599 output_parameters_dict["Data"] = self.get_data_metadata( 600 gcms, id_label, output_path 601 ) 602 output_parameters_dict["Stats"] = self.get_data_stats(gcms) 603 output_parameters_dict["Calibration"] = self.get_calibration_stats( 604 gcms, id_label 605 ) 606 output_parameters_dict["Blank"] = self.get_blank_stats(gcms) 607 output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms) 608 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms) 609 corems_dict_setting["corems_version"] = __version__ 610 output_parameters_dict["CoreMSParameters"] = corems_dict_setting 611 output_parameters_dict["has_metabolite"] = gcms.metabolites_data 612 output = json.dumps( 613 output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ") 614 ) 615 616 return output 617 618 def write_settings(self, output_path, gcms, id_label="emsl:"): 619 """Write the settings to a JSON file. 620 621 Parameters: 622 ---------- 623 output_path : str 624 The output file path. 625 gcms : GCMS object 626 The low resolution GCMS object. 627 id_label : str 628 The ID label for the data. Default is "emsl:". 629 630 """ 631 632 output = self.get_parameters_json(gcms, id_label, output_path) 633 634 with open( 635 output_path.with_suffix(".json"), 636 "w", 637 encoding="utf8", 638 ) as outfile: 639 outfile.write(output) 640 641 def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False): 642 """Get the exported data as a list of dictionaries. 643 644 Parameters: 645 ---------- 646 gcms : object 647 The low resolution GCMS object. 648 include_no_match : bool, optional 649 Whether to include no match data. Default is True. 650 no_match_inline : bool, optional 651 Whether to include no match data inline. Default is False. 652 653 Returns: 654 ------- 655 list 656 The exported data as a list of dictionaries. 
657 """ 658 659 output_score_method = gcms.molecular_search_settings.output_score_method 660 661 dict_data_list = [] 662 663 def add_match_dict_data(): 664 derivatization = "{}:{}:{}".format( 665 compound_obj.classify, 666 compound_obj.derivativenum, 667 compound_obj.derivatization, 668 ) 669 out_dict = { 670 "Sample name": gcms.sample_name, 671 "Peak Index": gcpeak_index, 672 "Retention Time": gc_peak.retention_time, 673 "Retention Time Ref": compound_obj.retention_time, 674 "Peak Height": gc_peak.tic, 675 "Peak Area": gc_peak.area, 676 "Retention index": gc_peak.ri, 677 "Retention index Ref": compound_obj.ri, 678 "Retention Index Score": compound_obj.ri_score, 679 "Spectral Similarity Score": compound_obj.spectral_similarity_score, 680 "Similarity Score": compound_obj.similarity_score, 681 "Compound Name": compound_obj.name, 682 "Chebi ID": compound_obj.metadata.chebi, 683 "Kegg Compound ID": compound_obj.metadata.kegg, 684 "Inchi": compound_obj.metadata.inchi, 685 "Inchi Key": compound_obj.metadata.inchikey, 686 "Smiles": compound_obj.metadata.smiles, 687 "Molecular Formula": compound_obj.formula, 688 "IUPAC Name": compound_obj.metadata.iupac_name, 689 "Traditional Name": compound_obj.metadata.traditional_name, 690 "Common Name": compound_obj.metadata.common_name, 691 "Derivatization": derivatization, 692 } 693 694 if self.gcms.molecular_search_settings.exploratory_mode: 695 out_dict.update( 696 { 697 "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get( 698 "weighted_cosine_correlation" 699 ), 700 "Cosine Correlation": compound_obj.spectral_similarity_scores.get( 701 "cosine_correlation" 702 ), 703 "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get( 704 "stein_scott_similarity" 705 ), 706 "Pearson Correlation": compound_obj.spectral_similarity_scores.get( 707 "pearson_correlation" 708 ), 709 "Spearman Correlation": compound_obj.spectral_similarity_scores.get( 710 "spearman_correlation" 711 ), 712 "Kendall Tau 
Correlation": compound_obj.spectral_similarity_scores.get( 713 "kendall_tau_correlation" 714 ), 715 "DFT Correlation": compound_obj.spectral_similarity_scores.get( 716 "dft_correlation" 717 ), 718 "DWT Correlation": compound_obj.spectral_similarity_scores.get( 719 "dwt_correlation" 720 ), 721 "Euclidean Distance": compound_obj.spectral_similarity_scores.get( 722 "euclidean_distance" 723 ), 724 "Manhattan Distance": compound_obj.spectral_similarity_scores.get( 725 "manhattan_distance" 726 ), 727 "Jaccard Distance": compound_obj.spectral_similarity_scores.get( 728 "jaccard_distance" 729 ), 730 } 731 ) 732 for method in methods_name: 733 out_dict[methods_name.get(method)] = ( 734 compound_obj.spectral_similarity_scores.get(method) 735 ) 736 737 dict_data_list.append(out_dict) 738 739 def add_no_match_dict_data(): 740 dict_data_list.append( 741 { 742 "Sample name": gcms.sample_name, 743 "Peak Index": gcpeak_index, 744 "Retention Time": gc_peak.retention_time, 745 "Peak Height": gc_peak.tic, 746 "Peak Area": gc_peak.area, 747 "Retention index": gc_peak.ri, 748 } 749 ) 750 751 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 752 # check if there is a compound candidate 753 if gc_peak: 754 if output_score_method == "highest_sim_score": 755 compound_obj = gc_peak.highest_score_compound 756 add_match_dict_data() 757 758 elif output_score_method == "highest_ss": 759 compound_obj = gc_peak.highest_ss_compound 760 add_match_dict_data() 761 762 else: 763 for compound_obj in gc_peak: 764 add_match_dict_data() # add monoisotopic peak 765 766 else: 767 # include not_match 768 if include_no_match and no_match_inline: 769 add_no_match_dict_data() 770 771 if include_no_match and not no_match_inline: 772 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 773 if not gc_peak: 774 add_no_match_dict_data() 775 776 return dict_data_list 777 778 779class HighResMassSpectraExport(HighResMassSpecExport): 780 """A class to export high resolution mass spectra data. 
781 782 This class provides methods to export high resolution mass spectra data to various formats 783 such as Excel, CSV, HDF5, and Pandas DataFrame. 784 785 Parameters 786 ---------- 787 out_file_path : str | Path 788 The output file path. 789 mass_spectra : object 790 The high resolution mass spectra object. 791 output_type : str, optional 792 The output type. Default is 'excel'. 793 794 Attributes 795 ---------- 796 output_file : Path 797 The output file path without suffix 798 dir_loc : Path 799 The directory location for the output file, 800 by default this will be the output_file + ".corems" and all output files will be 801 written into this location 802 mass_spectra : MassSpectraBase 803 The high resolution mass spectra object. 804 """ 805 806 def __init__(self, out_file_path, mass_spectra, output_type="excel"): 807 super().__init__( 808 out_file_path=out_file_path, mass_spectrum=None, output_type=output_type 809 ) 810 811 self.dir_loc = Path(out_file_path + ".corems") 812 self.dir_loc.mkdir(exist_ok=True) 813 # Place the output file in the directory 814 self.output_file = self.dir_loc / Path(out_file_path).name 815 self._output_type = output_type # 'excel', 'csv', 'pandas' or 'hdf5' 816 self.mass_spectra = mass_spectra 817 self.atoms_order_list = None 818 self._init_columns() 819 820 def get_pandas_df(self): 821 """Get the mass spectra as a list of Pandas DataFrames.""" 822 823 list_df = [] 824 825 for mass_spectrum in self.mass_spectra: 826 columns = self.columns_label + self.get_all_used_atoms_in_order( 827 mass_spectrum 828 ) 829 830 dict_data_list = self.get_list_dict_data(mass_spectrum) 831 832 df = DataFrame(dict_data_list, columns=columns) 833 834 scan_number = mass_spectrum.scan_number 835 836 df.name = str(self.output_file) + "_" + str(scan_number) 837 838 list_df.append(df) 839 840 return list_df 841 842 def to_pandas(self, write_metadata=True): 843 """Export the data to a Pandas DataFrame and save it as a pickle file. 
844 845 Parameters: 846 ---------- 847 write_metadata : bool, optional 848 Whether to write metadata to the output file. Default is True. 849 """ 850 851 for mass_spectrum in self.mass_spectra: 852 columns = self.columns_label + self.get_all_used_atoms_in_order( 853 mass_spectrum 854 ) 855 856 dict_data_list = self.get_list_dict_data(mass_spectrum) 857 858 df = DataFrame(dict_data_list, columns=columns) 859 860 scan_number = mass_spectrum.scan_number 861 862 out_filename = Path( 863 "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl") 864 ) 865 866 df.to_pickle(self.dir_loc / out_filename) 867 868 if write_metadata: 869 self.write_settings( 870 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 871 ) 872 873 def to_excel(self, write_metadata=True): 874 """Export the data to an Excel file. 875 876 Parameters: 877 ---------- 878 write_metadata : bool, optional 879 Whether to write metadata to the output file. Default is True. 880 """ 881 for mass_spectrum in self.mass_spectra: 882 columns = self.columns_label + self.get_all_used_atoms_in_order( 883 mass_spectrum 884 ) 885 886 dict_data_list = self.get_list_dict_data(mass_spectrum) 887 888 df = DataFrame(dict_data_list, columns=columns) 889 890 scan_number = mass_spectrum.scan_number 891 892 out_filename = Path( 893 "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx") 894 ) 895 896 df.to_excel(self.dir_loc / out_filename) 897 898 if write_metadata: 899 self.write_settings( 900 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 901 ) 902 903 def to_csv(self, write_metadata=True): 904 """Export the data to a CSV file. 905 906 Parameters: 907 ---------- 908 write_metadata : bool, optional 909 Whether to write metadata to the output file. Default is True. 
910 """ 911 import csv 912 913 for mass_spectrum in self.mass_spectra: 914 columns = self.columns_label + self.get_all_used_atoms_in_order( 915 mass_spectrum 916 ) 917 918 scan_number = mass_spectrum.scan_number 919 920 dict_data_list = self.get_list_dict_data(mass_spectrum) 921 922 out_filename = Path( 923 "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv") 924 ) 925 926 with open(self.dir_loc / out_filename, "w", newline="") as csvfile: 927 writer = csv.DictWriter(csvfile, fieldnames=columns) 928 writer.writeheader() 929 for data in dict_data_list: 930 writer.writerow(data) 931 932 if write_metadata: 933 self.write_settings( 934 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 935 ) 936 937 def get_mass_spectra_attrs(self): 938 """Get the mass spectra attributes as a JSON string. 939 940 Parameters: 941 ---------- 942 mass_spectra : object 943 The high resolution mass spectra object. 944 945 Returns: 946 ------- 947 str 948 The mass spectra attributes as a JSON string. 949 """ 950 dict_ms_attrs = {} 951 dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer 952 dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label 953 dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name 954 955 return json.dumps( 956 dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ") 957 ) 958 959 def to_hdf(self, overwrite=False, export_raw=True): 960 """Export the data to an HDF5 file. 961 962 Parameters 963 ---------- 964 overwrite : bool, optional 965 Whether to overwrite the output file. Default is False. 966 export_raw : bool, optional 967 Whether to export the raw mass spectra data. Default is True. 
        """
        # Optionally remove any existing output so the file is rebuilt from scratch
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        # Open in append mode: reruns add to an existing archive unless overwritten above
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Only stamp file-level metadata once (first write to this file)
            if not hdf_handle.attrs.get("date_utc"):
                # Set metadata for all mass spectra
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                # NOTE(review): Path._str is a private pathlib attribute — consider str(...); verify
                hdf_handle.attrs["original_file_location"] = (
                    self.mass_spectra.file_location._str
                )

            # Reuse the group on append, create it on first write
            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            # One subgroup per scan, keyed by the integer scan number as a string
            for mass_spectrum in self.mass_spectra:
                group_key = str(int(mass_spectrum.scan_number))

                self.add_mass_spectrum_to_hdf5(
                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
                )


class LCMSExport(HighResMassSpectraExport):
    """A class to export high resolution LC-MS data.

    This class provides methods to export high resolution LC-MS data to HDF5.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    lcms_object : LCMSBase
        The high resolution lc-ms object.
    """

    def __init__(self, out_file_path, mass_spectra):
        # LC-MS exports are HDF5-only; fix the output type here
        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
        """Export the data to an HDF5.

        Writes (in order): per-scan mass spectra, scan_info, optionally the
        unprocessed MS1 data, mass features, EICs, and spectral search results.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        save_parameters : bool, optional
            Whether to save the parameters as a separate json or toml file. Default is True.
        parameter_format : str, optional
            The format to save the parameters in. Default is 'toml'.

        Raises
        ------
        ValueError
            If parameter_format is not 'json' or 'toml'.
        """
        export_profile_spectra = (
            self.mass_spectra.parameters.lc_ms.export_profile_spectra
        )

        # Parameterized export: all spectra (default) or only relevant spectra
        export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra
        if export_only_relevant:
            relevant_scan_numbers = set()
            # Add MS1 spectra associated with mass features (apex scans) and best MS2 spectra
            for mass_feature in self.mass_spectra.mass_features.values():
                relevant_scan_numbers.add(mass_feature.apex_scan)
                if mass_feature.best_ms2 is not None:
                    relevant_scan_numbers.add(mass_feature.best_ms2.scan_number)
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # File-level metadata is written only on first write to this file
            if not hdf_handle.attrs.get("date_utc"):
                # Set metadata for all mass spectra
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                # NOTE(review): Path._str is a private pathlib attribute — consider str(...); verify
                hdf_handle.attrs["original_file_location"] = (
                    self.mass_spectra.file_location._str
                )

            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            # Export logic based on parameter
            for mass_spectrum in self.mass_spectra:
                if export_only_relevant:
                    # Only scans tied to a mass feature (apex MS1 or best MS2) are kept
                    if mass_spectrum.scan_number in relevant_scan_numbers:
                        group_key = str(int(mass_spectrum.scan_number))
                        self.add_mass_spectrum_to_hdf5(
                            hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                        )
                else:
                    group_key = str(int(mass_spectrum.scan_number))
                    self.add_mass_spectrum_to_hdf5(
                        hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                    )

        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Add scan_info to hdf5 file
            if "scan_info" not in hdf_handle:
                scan_info_group = hdf_handle.create_group("scan_info")
                for k, v in self.mass_spectra._scan_info.items():
                    array = np.array(list(v.values()))
                    if array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded bytes first
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                    else:
                        # Apply data type optimization for numeric data
                        # (downcast 64-bit to 32-bit to halve on-disk size)
                        if array.dtype == np.float64:
                            array = array.astype(np.float32)
                        elif array.dtype == np.int64:
                            array = array.astype(np.int32)
                        scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms_unprocessed to hdf5 file
            export_unprocessed_ms1 = (
                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
            )
            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
                if "ms_unprocessed" not in hdf_handle:
                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
                else:
                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
                for k, v in self.mass_spectra._ms_unprocessed.items():
                    array = np.array(v)
                    # Apply data type optimization and compression
                    if array.dtype == np.int64:
                        array = array.astype(np.int32)
                    elif array.dtype == np.float64:
                        array = array.astype(np.float32)
                    elif array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded strings
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                        continue
                    ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add LCMS mass features to hdf5 file
            if len(self.mass_spectra.mass_features) > 0:
                if "mass_features" not in hdf_handle:
                    mass_features_group = hdf_handle.create_group("mass_features")
                else:
                    mass_features_group = hdf_handle.get("mass_features")

                # Create group for each mass feature, with key as the mass feature id
                for k, v in self.mass_spectra.mass_features.items():
                    mass_features_group.create_group(str(k))
                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
                            # Heavy/object-valued attributes are skipped entirely
                            if k2 not in [
                                "chromatogram_parent",
                                "ms2_mass_spectra",
                                "mass_spectrum",
                                "_eic_data",
                                "ms2_similarity_results",
                            ]:
                                # Known array-valued attributes become compressed datasets
                                if k2 == "ms2_scan_numbers":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_half_height_width":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_ms_deconvoluted_idx":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "associated_mass_features_deconvoluted":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_noise_score":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                # Scalar attributes become HDF5 group attributes
                                elif (
                                    isinstance(v2, int)
                                    or isinstance(v2, float)
                                    or isinstance(v2, str)
                                    or isinstance(v2, np.integer)
                                    or isinstance(v2, np.float32)
                                    or isinstance(v2, np.float64)
                                    or isinstance(v2, np.bool_)
                                ):
                                    # Convert numpy types to smaller precision for storage
                                    if isinstance(v2, np.int64):
                                        v2 = np.int32(v2)
                                    elif isinstance(v2, np.float64):
                                        v2 = np.float32(v2)
                                    mass_features_group[str(k)].attrs[str(k2)] = v2
                                else:
                                    raise TypeError(
                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                    )

            # Add EIC data to hdf5 file
            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
            if len(self.mass_spectra.eics) > 0 and export_eics:
                if "eics" not in hdf_handle:
                    eic_group = hdf_handle.create_group("eics")
                else:
                    eic_group = hdf_handle.get("eics")

                # Create group for each eic
                for k, v in self.mass_spectra.eics.items():
                    eic_group.create_group(str(k))
                    # The dict key (the EIC m/z) is kept as a group attribute
                    eic_group[str(k)].attrs["mz"] = k
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            array = np.array(v2)
                            # Apply data type optimization and compression
                            if array.dtype == np.int64:
                                array = array.astype(np.int32)
                            elif array.dtype == np.float64:
                                array = array.astype(np.float32)
                            elif array.dtype.str[0:2] == "<U":
                                # Convert Unicode strings to UTF-8 encoded strings
                                string_data = [str(item) for item in array]
                                string_dtype = h5py.string_dtype(encoding='utf-8')
                                eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                                continue
                            eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms2_search results to hdf5 file (parameterized)
            if len(self.mass_spectra.spectral_search_results) > 0:
                if "spectral_search_results" not in hdf_handle:
                    spectral_search_results = hdf_handle.create_group(
                        "spectral_search_results"
                    )
                else:
                    spectral_search_results = hdf_handle.get("spectral_search_results")
                # Create group for each search result by ms2_scan / precursor_mz
                for k, v in self.mass_spectra.spectral_search_results.items():
                    # If parameter is set, only export spectral search results for relevant scans
                    if export_only_relevant and k not in relevant_scan_numbers:
                        continue
                    spectral_search_results.create_group(str(k))
                    for k2, v2 in v.items():
                        spectral_search_results[str(k)].create_group(str(k2))
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "precursor_mz"
                        ] = v2.precursor_mz
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "query_spectrum_id"
                        ] = v2.query_spectrum_id
                        # Loop through each of the attributes and add them as datasets (if array)
                        for k3, v3 in v2.__dict__.items():
                            if v3 is not None and k3 not in [
                                "query_spectrum",
                                "precursor_mz",
                                "query_spectrum_id",
                            ]:
                                # Fragment-type lists are flattened to comma-joined strings
                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                    v3 = [", ".join(x) for x in v3]
                                # Only export fully populated attribute lists
                                if all(v3 is not None for v3 in v3):
                                    array = np.array(v3)
                                    if array.dtype.str[0:2] == "<U":
                                        # Store strings as fixed-width bytes
                                        array = array.astype("S")
                                    spectral_search_results[str(k)][str(k2)].create_dataset(
                                        str(k3), data=array
                                    )
        # Optionally dump the processing parameters next to the HDF5 output
        if save_parameters:
            # Check if parameter_format is valid
            if parameter_format not in ["json", "toml"]:
                raise ValueError("parameter_format must be 'json' or 'toml'")

            if parameter_format == "json":
                dump_lcms_settings_json(
                    filename=self.output_file.with_suffix(".json"),
                    lcms_obj=self.mass_spectra,
                )
            elif parameter_format == "toml":
                dump_lcms_settings_toml(
                    filename=self.output_file.with_suffix(".toml"),
                    lcms_obj=self.mass_spectra,
                )


class LCMSMetabolomicsExport(LCMSExport):
    """A class to export LCMS metabolite data.

    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra)
        # Module-level adduct table, kept on the instance for convenience
        self.ion_type_dict = ion_type_dict

    @staticmethod
    def get_ion_formula(neutral_formula, ion_type):
        """From a neutral formula and an ion type, return the formula of the ion.

        Notes
        -----
        This is a static method.
        If the neutral_formula is not a string, this method will return None.

        Parameters
        ----------
        neutral_formula : str
            The neutral formula, this should be a string form from the MolecularFormula class
            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
            e.g. MgCl2 is parsed as 'Mg Cl2'.
        ion_type : str
            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
            See the self.ion_type_dict for the available ion types.

        Returns
        -------
        str
            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
        """
        # If neutral_formula is not a string, return None
        if not isinstance(neutral_formula, str):
            return None

        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
        if re.search(r"\s", neutral_formula):
            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
        else:
            # Split a compact formula (e.g. 'C2H4O2') on capital letters, then
            # separate element symbols from their (optional, default 1) counts
            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
            neutral_formula = MolecularFormula(
                dict(
                    zip(
                        [x[0] for x in elements],
                        [int(x[0]) if x else 1 for x in counts],
                    )
                ),
                ion_charge=0,
            )
        neutral_formula_dict = neutral_formula.to_dict().copy()

        # Apply the adduct: add atoms first, then subtract atoms
        adduct_add_dict = ion_type_dict[ion_type][0]
        for key in adduct_add_dict:
            if key in neutral_formula_dict.keys():
                neutral_formula_dict[key] += adduct_add_dict[key]
            else:
                neutral_formula_dict[key] = adduct_add_dict[key]

        adduct_subtract = ion_type_dict[ion_type][1]
        for key in adduct_subtract:
            neutral_formula_dict[key] -= adduct_subtract[key]

        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

    @staticmethod
    def get_isotope_type(ion_formula):
        """From an ion formula, return the 13C isotope type of the ion.

        Notes
        -----
        This is a static method.
        If the ion_formula is not a string, this method will return None.
        This is currently only functional for 13C isotopes.

        Parameters
        ----------
        ion_formula : str
            The formula of the ion, expected to be a string like 'C2 H4 O2'.

        Returns
        -------
        str
            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.

        Raises
        ------
        ValueError
            If the ion_formula is a string without spaces (i.e. not in
            MolecularFormula string form like "C2 H4 O2").
        """
        if not isinstance(ion_formula, str):
            return None

        # Only space-delimited MolecularFormula strings are supported here
        if re.search(r"\s", ion_formula):
            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
        else:
            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
        ion_formula_dict = ion_formula.to_dict().copy()

        # A '13C' key in the formula dict encodes the 13C isotopologue count
        try:
            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
        except KeyError:
            iso_class = None

        return iso_class

    def report_to_csv(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations and save it as a CSV file.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.
        """
        report = self.to_report(molecular_metadata=molecular_metadata)
        out_file = self.output_file.with_suffix(".csv")
        report.to_csv(out_file, index=False)

    def clean_ms1_report(self, ms1_summary_full):
        """Clean the MS1 report.

        Derives the ion formula and 13C isotopologue type for each row, then
        keeps a reduced, reordered column set indexed by mf_id.

        Parameters
        ----------
        ms1_summary_full : DataFrame
            The full MS1 summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned MS1 summary DataFrame.
        """
        ms1_summary_full = ms1_summary_full.reset_index()
        cols_to_keep = [
            "mf_id",
            "Molecular Formula",
            "Ion Type",
            "Calculated m/z",
            "m/z Error (ppm)",
            "m/z Error Score",
            "Is Isotopologue",
            "Isotopologue Similarity",
            "Confidence Score",
        ]
        ms1_summary = ms1_summary_full[cols_to_keep].copy()
        # Ion formula = neutral molecular formula + adduct atoms
        ms1_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
        ]
        ms1_summary["isotopologue_type"] = [
            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
        ]

        # Reorder columns
        # (this also drops "Molecular Formula", "Ion Type", and "Is Isotopologue")
        ms1_summary = ms1_summary[
            [
                "mf_id",
                "ion_formula",
                "isotopologue_type",
                "Calculated m/z",
                "m/z Error (ppm)",
                "m/z Error Score",
                "Isotopologue Similarity",
                "Confidence Score",
            ]
        ]

        # Set the index to mf_id
        ms1_summary = ms1_summary.set_index("mf_id")

        return ms1_summary

    def summarize_ms2_report(self, ms2_annot_report):
        """
        Summarize the MS2 report.

        Notes
        -----
        This method is currently a stub: it has no body and returns None.

        Parameters
        ----------
        ms2_annot_report : DataFrame
            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.

        Returns
        -------
        """

    def summarize_metabolomics_report(self, ms2_annot_report):
        """Summarize the MS2 hits for a metabolomics report

        Keeps, for each (mf_id, ref_mol_id) pair, the single annotation row
        with the highest entropy similarity, plus a count of contributing spectra.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized metabolomics report.
        """
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "cas",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        # Drop only the columns that are actually present
        ms2_annot = ms2_annot_report.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
        )

        # Prepare information about the search results, pulling out the best hit for the single report
        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
        ms2_annot = ms2_annot.reset_index()
        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
        ms2_annot["n_spectra_contributing"] = (
            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
            .transform("nunique")
        )
        # Sort by entropy similarity
        ms2_annot = ms2_annot.sort_values(
            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
        )
        # After the sort, the first row per (mf_id, ref_mol_id) is the best hit
        best_entropy = ms2_annot.drop_duplicates(
            subset=["mf_id", "ref_mol_id"], keep="first"
        )

        return best_entropy

    def clean_ms2_report(self, metabolite_summary):
        """Clean the MS2 report.

        Adds the ion formula, reorders the columns, and indexes by mf_id.

        Parameters
        ----------
        metabolite_summary : DataFrame
            The full metabolomics summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned metabolomics summary DataFrame.
        """
        metabolite_summary = metabolite_summary.reset_index()
        metabolite_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
        ]

        col_order = [
            "mf_id",
            "ion_formula",
            "ref_ion_type",
            "formula",
            "inchikey",
            "name",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "cas",
            "database_name",
            "ref_ms_id",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "n_spectra_contributing",
        ]

        # Reorder columns (silently skipping any that are absent)
        metabolite_summary = metabolite_summary[
            [col for col in col_order if col in metabolite_summary.columns]
        ]

        # Convert chebi (if present) to int:
        if "chebi" in metabolite_summary.columns:
            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
                "Int64", errors="ignore"
            )

        # Set the index to mf_id
        metabolite_summary = metabolite_summary.set_index("mf_id")

        return metabolite_summary

    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
        """Combine the mass feature report with the MS1 and MS2 reports.

        Merges MS1 annotations on (mf_id, isotopologue_type), then merges MS2
        annotations either on (mf_id, ion_formula) when MS1 supplied an ion
        formula or on mf_id alone otherwise; finally renames columns to
        human-readable report headers and sorts the rows.

        Parameters
        ----------
        mf_report : DataFrame
            The mass feature report DataFrame.
        ms1_annot_report : DataFrame
            The MS1 annotation report DataFrame.
        ms2_annot_report : DataFrame
            The MS2 annotation report DataFrame.
        """
        # If there is an ms1_annot_report, merge it with the mf_report
        if not ms1_annot_report.empty:
            # MS1 has been run and has molecular formula information
            mf_report = pd.merge(
                mf_report,
                ms1_annot_report,
                how="left",
                on=["mf_id", "isotopologue_type"],
            )
        if ms2_annot_report is not None:
            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
            mf_no_ion_formula = pd.merge(
                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
            )

            # pull out the records with ion_formula
            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
            mf_with_ion_formula = pd.merge(
                mf_with_ion_formula,
                ms2_annot_report,
                how="left",
                on=["mf_id", "ion_formula"],
            )

            # put back together
            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])

        # Rename colums
        rename_dict = {
            "mf_id": "Mass Feature ID",
            "scan_time": "Retention Time (min)",
            "mz": "m/z",
            "apex_scan": "Apex Scan Number",
            "intensity": "Intensity",
            "persistence": "Persistence",
            "area": "Area",
            "half_height_width": "Half Height Width (min)",
            "tailing_factor": "Tailing Factor",
            "dispersity_index": "Dispersity Index",
            "ms2_spectrum": "MS2 Spectrum",
            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
            "isotopologue_type": "Isotopologue Type",
            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
            "associated_mass_features": "Associated Mass Features after Deconvolution",
            "ion_formula": "Ion Formula",
            "formula": "Molecular Formula",
            "ref_ion_type": "Ion Type",
            "annot_level": "Lipid Annotation Level",
            "lipid_molecular_species_id": "Lipid Molecular Species",
            "lipid_summed_name": "Lipid Species",
            "lipid_subclass": "Lipid Subclass",
            "lipid_class": "Lipid Class",
            "lipid_category": "Lipid Category",
            "entropy_similarity": "Entropy Similarity",
            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
            "n_spectra_contributing": "Spectra with Annotation (n)",
        }
        mf_report = mf_report.rename(columns=rename_dict)
        mf_report["Sample Name"] = self.mass_spectra.sample_name
        mf_report["Polarity"] = self.mass_spectra.polarity
        # Move the identifying columns to the front
        mf_report = mf_report[
            ["Mass Feature ID", "Sample Name", "Polarity"]
            + [
                col
                for col in mf_report.columns
                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
            ]
        ]

        # Reorder rows by "Mass Feature ID", then "Entropy Similarity" (descending), then "Confidence Score" (descending)
        if "Entropy Similarity" in mf_report.columns:
            mf_report = mf_report.sort_values(
                by=["Mass Feature ID", "Entropy Similarity", "Confidence Score"],
                ascending=[True, False, False],
            )
        elif "Confidence Score" in mf_report.columns:
            mf_report = mf_report.sort_values(
                by=["Mass Feature ID", "Confidence Score"],
                ascending=[True, False],
            )
        # If neither "Entropy Similarity" nor "Confidence Score" are in the columns, just sort by "Mass Feature ID"
        else:
            mf_report = mf_report.sort_values("Mass Feature ID")

        # Reset index
        mf_report = mf_report.reset_index(drop=True)

        return mf_report

    def to_report(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.

        Returns
        -------
        DataFrame
            The report as a Pandas DataFrame.
        """
        # Get mass feature dataframe
        mf_report = self.mass_spectra.mass_features_to_df()
        mf_report = mf_report.reset_index(drop=False)

        # Get and clean ms1 annotation dataframe
        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
        ms1_annot_report = ms1_annot_report.reset_index(drop=False)

        # Get, summarize, and clean ms2 annotation dataframe
        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
            molecular_metadata=molecular_metadata
        )
        # MS2 annotations are only usable when molecular metadata is supplied
        if ms2_annot_report is not None and molecular_metadata is not None:
            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
        else:
            ms2_annot_report = None

        report = self.combine_reports(
            mf_report=mf_report,
            ms1_annot_report=ms1_annot_report,
            ms2_annot_report=ms2_annot_report
        )

        return report


class LipidomicsExport(LCMSMetabolomicsExport):
    """A class to export lipidomics data.

    This class provides methods to export lipidomics data to various formats and summarize the lipid report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra)

    def summarize_lipid_report(self, ms2_annot):
        """Summarize the lipid report.

        Attempts, per mass feature, a consensus annotation at the molecular
        lipid (MLF) level first, then at the species level, and finally keeps
        non-consensus hits annotated to the most specific level supportable.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized lipid report.
        """
        # Drop unnecessary columns for easier viewing
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "metabref_mol_id",
            "metabref_precursor_mz",
            "cas",
            "inchikey",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        ms2_annot = ms2_annot.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
        )

        # If ion_types_excluded is not empty, remove those ion types
        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
            "ms2"
        ].molecular_search.ion_types_excluded
        if len(ion_types_excluded) > 0:
            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

        # If mf_id is not present, check that the index name is mf_id and reset the index
        if "mf_id" not in ms2_annot.columns:
            if ms2_annot.index.name == "mf_id":
                ms2_annot = ms2_annot.reset_index()
            else:
                raise ValueError("mf_id is not present in the dataframe")

        # Attempt to get consensus annotations to the MLF level
        mlf_results_all = []
        for mf_id in ms2_annot["mf_id"].unique():
            mlf_results_perid = []
            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
            ms2_annot_mf["n_spectra_contributing"] = ms2_annot_mf.query_spectrum_id.nunique()

            # NOTE(review): iterates scan ids from the FULL table, not just this
            # mass feature's (ms2_annot_mf) — foreign scans give empty subsets
            # that fall out at the nunique() check below, but this wastes work;
            # confirm whether ms2_annot_mf["query_spectrum_id"] was intended
            for query_scan in ms2_annot["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_mf[
                    ms2_annot_mf["query_spectrum_id"] == query_scan
                ].copy()

                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
                    # NOTE(review): this inner check duplicates the outer condition
                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                        # Rank columns [HIGHER = BETTER]: best entropy, best
                        # reference-match fraction, and MLF-type fragments
                        ms2_annot_sub["entropy_max"] = (
                            ms2_annot_sub["entropy_similarity"]
                            == ms2_annot_sub["entropy_similarity"].max()
                        )
                        ms2_annot_sub["ref_match_fract_max"] = (
                            ms2_annot_sub["ref_mz_in_query_fract"]
                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
                        )
                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
                            "query_frag_types"
                        ].apply(lambda x: True if "MLF" in x else False)

                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                        ms2_annot_sub["consensus"] = ms2_annot_sub[
                            ["entropy_max", "ref_match_fract_max", "frag_max"]
                        ].all(axis=1)

                        # If there is a consensus, take the row with the highest entropy_similarity
                        if ms2_annot_sub["consensus"].any():
                            ms2_annot_sub = ms2_annot_sub[
                                ms2_annot_sub["entropy_similarity"]
                                == ms2_annot_sub["entropy_similarity"].max()
                            ].head(1)
                            mlf_results_perid.append(ms2_annot_sub)
            if len(mlf_results_perid) == 0:
                mlf_results_perid = pd.DataFrame()
            else:
                mlf_results_perid = pd.concat(mlf_results_perid)
                # Consensus across scans only holds if all scans agree on the name
                if mlf_results_perid["name"].nunique() == 1:
                    mlf_results_perid = mlf_results_perid[
                        mlf_results_perid["entropy_similarity"]
                        == mlf_results_perid["entropy_similarity"].max()
                    ].head(1)
                else:
                    mlf_results_perid = pd.DataFrame()
            mlf_results_all.append(mlf_results_perid)

        # These are the consensus annotations to the MLF level
        if len(mlf_results_all) > 0:
            mlf_results_all = pd.concat(mlf_results_all)
            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
        else:
            # Make an empty dataframe
            mlf_results_all = ms2_annot.head(0)

        # For remaining mf_ids, try to get a consensus annotation to the species level
        species_results_all = []
        # Remove mf_ids that have consensus annotations to the MLF level
        ms2_annot_spec = ms2_annot[
            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
        ]
        for mf_id in ms2_annot_spec["mf_id"].unique():
            # Do all the hits have the same lipid_summed_name?
            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # Grab the highest entropy_similarity result
                ms2_annot_sub = ms2_annot_sub[
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                ].head(1)
                species_results_all.append(ms2_annot_sub)

        # These are the consensus annotations to the species level
        if len(species_results_all) > 0:
            species_results_all = pd.concat(species_results_all)
            species_results_all["annot_level"] = "species"
        else:
            # Make an empty dataframe
            species_results_all = ms2_annot.head(0)

        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
        # Remove mf_ids that have consensus annotations to the species level
        ms2_annot_remaining = ms2_annot_spec[
            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
        ]
        no_consensus = []
        for mf_id in ms2_annot_remaining["mf_id"].unique():
            id_sub = []
            id_no_con = []
            ms2_annot_sub_mf = ms2_annot_remaining[
                ms2_annot_remaining["mf_id"] == mf_id
            ].copy()
            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_sub_mf[
                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
                ].copy()

                # New columns for ranking [HIGHER RANK = BETTER]
                ms2_annot_sub["entropy_max"] = (
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                )
                ms2_annot_sub["ref_match_fract_max"] = (
                    ms2_annot_sub["ref_mz_in_query_fract"]
                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
                )
                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                    lambda x: True if "MLF" in x else False
                )

                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                ms2_annot_sub["consensus"] = ms2_annot_sub[
                    ["entropy_max", "ref_match_fract_max", "frag_max"]
                ].all(axis=1)
                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
                id_sub.append(ms2_annot_sub_con)
                id_no_con.append(ms2_annot_sub)
            id_sub = pd.concat(id_sub)
            id_no_con = pd.concat(id_no_con)

            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
            if (
                id_sub["query_frag_types"]
                .apply(lambda x: True if "MLF" in x else False)
                .all()
                and len(id_sub) > 0
            ):
                # Keep the best hit per distinct molecular-species name
                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
                id_sub = id_sub.loc[idx]
                # Reorder so highest entropy_similarity is first
                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
                id_sub["annot_level"] = id_sub["structure_level"]
                no_consensus.append(id_sub)

            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
            elif len(id_sub) == 0:
                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                    summed_sub = id_no_con[
                        id_no_con["lipid_summed_name"] == lipid_summed_name
                    ]
                    # Any consensus to MLF?
                    if summed_sub["consensus"].any():
                        summed_sub = summed_sub[summed_sub["consensus"]]
                        summed_sub["annot_level"] = summed_sub["structure_level"]
                        no_consensus.append(summed_sub)
                    else:
                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
                        summed_sub = summed_sub[
                            summed_sub["entropy_similarity"]
                            == summed_sub["entropy_similarity"].max()
                        ].head(1)
                        # get first row
                        summed_sub["annot_level"] = "species"
                        summed_sub["name"] = ""
                        no_consensus.append(summed_sub)
            else:
                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)

        if len(no_consensus) > 0:
            no_consensus = pd.concat(no_consensus)
        else:
            no_consensus = ms2_annot.head(0)

        # Combine all the consensus annotations and reformat the dataframe for output
        species_results_all = species_results_all.drop(columns=["name"])
        species_results_all["lipid_molecular_species_id"] = ""
        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
        consensus_annotations = pd.concat(
            [mlf_results_all, species_results_all, no_consensus]
        )
        consensus_annotations = consensus_annotations.sort_values(
            "mf_id", ascending=True
        )
        cols_to_keep = [
            "mf_id",
            "ref_ion_type",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "lipid_molecular_species_id",
            "lipid_summed_name",
            "lipid_subclass",
            "lipid_class",
            "lipid_category",
            "formula",
            "annot_level",
            "n_spectra_contributing",
        ]
        consensus_annotations = consensus_annotations[cols_to_keep]
        consensus_annotations = consensus_annotations.set_index("mf_id")

        return consensus_annotations

    def clean_ms2_report(self, lipid_summary):
        """Clean the MS2 report.

        Adds the ion formula, reorders the lipid columns, and indexes by mf_id.

        Parameters
        ----------
        lipid_summary : DataFrame
            The full lipid summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned lipid summary DataFrame.
        """
        lipid_summary = lipid_summary.reset_index()
        # Ion formula = neutral formula + reference adduct atoms
        lipid_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
        ]

        # Reorder columns
        lipid_summary = lipid_summary[
            [
                "mf_id",
                "ion_formula",
                "ref_ion_type",
                "formula",
                "annot_level",
                "lipid_molecular_species_id",
                "lipid_summed_name",
                "lipid_subclass",
                "lipid_class",
                "lipid_category",
                "entropy_similarity",
                "ref_mz_in_query_fract",
                "n_spectra_contributing",
            ]
        ]

        # Set the index to mf_id
        lipid_summary = lipid_summary.set_index("mf_id")

        return lipid_summary

    def to_report(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.

        Returns
        -------
        DataFrame
            The report of the mass features and their annotations.

        Notes
        -----
        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
2070 """ 2071 # Get mass feature dataframe 2072 mf_report = self.mass_spectra.mass_features_to_df() 2073 mf_report = mf_report.reset_index(drop=False) 2074 2075 # Get and clean ms1 annotation dataframe 2076 ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy() 2077 ms1_annot_report = self.clean_ms1_report(ms1_annot_report) 2078 ms1_annot_report = ms1_annot_report.reset_index(drop=False) 2079 2080 # Get, summarize, and clean ms2 annotation dataframe 2081 ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df( 2082 molecular_metadata=molecular_metadata 2083 ) 2084 if ms2_annot_report is not None and molecular_metadata is not None: 2085 ms2_annot_report = self.summarize_lipid_report(ms2_annot_report) 2086 ms2_annot_report = self.clean_ms2_report(ms2_annot_report) 2087 ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all") 2088 ms2_annot_report = ms2_annot_report.reset_index(drop=False) 2089 report = self.combine_reports( 2090 mf_report=mf_report, 2091 ms1_annot_report=ms1_annot_report, 2092 ms2_annot_report=ms2_annot_report 2093 ) 2094 return report 2095 2096
class LowResGCMSExport:
    """Export low resolution GC-MS data.

    Provides exports of the peak/compound table to Excel, CSV, HDF5, pickle
    (via pandas) and JSON, plus the accompanying CoreMS settings/metadata.

    Parameters:
    ----------
    out_file_path : str
        The output file path.
    gcms : object
        The low resolution GCMS object.

    Attributes:
    ----------
    output_file : Path
        The output file path as a Path object.
    gcms : object
        The low resolution GCMS object.

    Methods:
    -------
    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
    * to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
    * to_hdf(id_label="corems:"). Export the data to an HDF5 file.
    * get_data_stats(gcms). Get statistics about the GCMS data.
    """

    def __init__(self, out_file_path, gcms):
        self.output_file = Path(out_file_path)
        self.gcms = gcms
        self._init_columns()

    def _init_columns(self):
        """Build and return the column names for the exported data.

        Returns:
        -------
        list
            The list of column names; extended with exploratory similarity
            metrics when exploratory_mode is enabled.
        """
        columns = [
            "Sample name",
            "Peak Index",
            "Retention Time",
            "Retention Time Ref",
            "Peak Height",
            "Peak Area",
            "Retention index",
            "Retention index Ref",
            "Retention Index Score",
            "Similarity Score",
            "Spectral Similarity Score",
            "Compound Name",
            "Chebi ID",
            "Kegg Compound ID",
            "Inchi",
            "Inchi Key",
            "Smiles",
            "Molecular Formula",
            "IUPAC Name",
            "Traditional Name",
            "Common Name",
            "Derivatization",
        ]

        if self.gcms.molecular_search_settings.exploratory_mode:
            columns.extend(
                [
                    "Weighted Cosine Correlation",
                    "Cosine Correlation",
                    "Stein Scott Similarity",
                    "Pearson Correlation",
                    "Spearman Correlation",
                    "Kendall Tau Correlation",
                    "Euclidean Distance",
                    "Manhattan Distance",
                    "Jaccard Distance",
                    "DWT Correlation",
                    "DFT Correlation",
                ]
            )
            # also include any additional configured similarity methods
            columns.extend(list(methods_name.values()))

        return columns

    def get_pandas_df(self, id_label="corems:"):
        """Get the exported data as a Pandas DataFrame.

        Parameters:
        ----------
        id_label : str, optional
            The ID label for the data. Default is "corems:". (Currently unused here.)

        Returns:
        -------
        DataFrame
            The exported data as a Pandas DataFrame.
        """
        columns = self._init_columns()
        dict_data_list = self.get_list_dict_data(self.gcms)
        df = DataFrame(dict_data_list, columns=columns)
        df.name = self.gcms.sample_name
        return df

    def get_json(self, nan=False, id_label="corems:"):
        """Get the exported data as a JSON string.

        Parameters:
        ----------
        nan : bool, optional
            Whether to include NaN values in the JSON string. Default is False.
            NOTE(review): currently not honored by the implementation.
        id_label : str, optional
            The ID label for the data. Default is "corems:". (Currently unused here.)

        Returns:
        -------
        str
            The exported data serialized as pretty-printed JSON.
        """
        dict_data_list = self.get_list_dict_data(self.gcms)
        return json.dumps(
            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
        )

    def to_pandas(self, write_metadata=True, id_label="corems:"):
        """Export the data to a Pandas DataFrame and save it as a pickle file.

        Parameters:
        ----------
        write_metadata : bool, optional
            Whether to write metadata (settings JSON) next to the output file.
        id_label : str, optional
            The ID label for the data.
        """
        columns = self._init_columns()
        dict_data_list = self.get_list_dict_data(self.gcms)
        df = DataFrame(dict_data_list, columns=columns)

        pkl_path = self.output_file.with_suffix(".pkl")
        df.to_pickle(pkl_path)

        if write_metadata:
            # pass the caller's id_label through (was hard-coded to "corems:")
            self.write_settings(pkl_path, self.gcms, id_label=id_label)

    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
        """Export the data to an Excel file.

        Parameters:
        ----------
        write_mode : str, optional
            The write mode for the Excel file. Default is 'a' (append rows to
            an existing workbook); any other value, or a missing workbook,
            writes a fresh file.
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """
        out_put_path = self.output_file.with_suffix(".xlsx")
        columns = self._init_columns()
        dict_data_list = self.get_list_dict_data(self.gcms)
        df = DataFrame(dict_data_list, columns=columns)

        if write_mode == "a" and out_put_path.exists():
            # Count existing rows first, then append below them.
            # ExcelWriter(mode="a", if_sheet_exists="overlay") replaces the
            # old direct assignment to writer.book/writer.sheets, which is
            # not supported by modern pandas/openpyxl.
            existing = read_excel(out_put_path)
            with ExcelWriter(
                out_put_path, engine="openpyxl", mode="a", if_sheet_exists="overlay"
            ) as writer:
                df.to_excel(
                    writer, index=False, header=False, startrow=len(existing) + 1
                )
        else:
            df.to_excel(out_put_path, index=False, engine="openpyxl")

        if write_metadata:
            self.write_settings(out_put_path, self.gcms, id_label=id_label)

    def to_csv(
        self,
        separate_output=False,
        write_mode="w",
        write_metadata=True,
        id_label="corems:",
    ):
        """Export the data to a CSV file.

        Parameters:
        ----------
        separate_output : bool, optional
            Whether to separate the output into multiple files. Default is False.
        write_mode : str, optional
            The write mode for the CSV file. Default is 'w' (write).
            NOTE(review): this argument is always overridden by
            ``separate_output`` below; kept for interface compatibility.
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """
        if separate_output:
            # set write mode to write
            # this mode will overwrite the file without warning
            write_mode = "w"
        else:
            # set write mode to append
            write_mode = "a"

        columns = self._init_columns()
        dict_data_list = self.get_list_dict_data(self.gcms)
        out_put_path = self.output_file.with_suffix(".csv")

        # only emit the header when creating a new file
        write_header = not out_put_path.exists()

        try:
            with open(out_put_path, write_mode, newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=columns)
                if write_header:
                    writer.writeheader()
                writer.writerows(dict_data_list)

            if write_metadata:
                self.write_settings(out_put_path, self.gcms, id_label=id_label)

        except IOError as ioerror:
            print(ioerror)

    def to_hdf(self, id_label="corems:"):
        """Export the data to an HDF5 file.

        Parameters:
        ----------
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """

        # save one sample per file; one HDF group per GC peak
        def add_compound(gc_peak, compound_obj):
            # group name: compound name ("/" stripped, it is the HDF path
            # separator) plus the classification modifier
            modifier = compound_obj.classify if compound_obj.classify else ""
            group_name = compound_obj.name.replace("/", "") + " " + modifier

            if group_name not in peak_group:
                compound_group = peak_group.create_group(group_name)
                compound_group.attrs["retention_index"] = compound_obj.ri
                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
                compound_group.attrs["spectral_similarity_score"] = (
                    compound_obj.spectral_similarity_score
                )
                compound_group.attrs["similarity_score"] = compound_obj.similarity_score

                compound_group.create_dataset(
                    "mz", data=np.array(compound_obj.mz), dtype="f8"
                )
                compound_group.create_dataset(
                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
                )

                if self.gcms.molecular_search_settings.exploratory_mode:
                    compound_group.attrs["Spectral Similarities"] = json.dumps(
                        compound_obj.spectral_similarity_scores,
                        sort_keys=False,
                        indent=4,
                        separators=(",", ":"),
                    )
            else:
                warnings.warn("Skipping duplicate reference compound.")

        output_path = self.output_file.with_suffix(".hdf5")

        with h5py.File(output_path, "w") as hdf_handle:
            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
            hdf_handle.attrs["time_stamp"] = timenow
            hdf_handle.attrs["data_structure"] = "gcms"
            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label

            # was the literal string "self.gcms.id"; store the actual id
            hdf_handle.attrs["sample_id"] = self.gcms.id
            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
            hdf_handle.attrs["output_data"] = str(output_path)
            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
            hdf_handle.attrs["corems_version"] = __version__

            hdf_handle.attrs["Stats"] = json.dumps(
                self.get_data_stats(self.gcms),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )
            hdf_handle.attrs["Calibration"] = json.dumps(
                self.get_calibration_stats(self.gcms, id_label),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )
            hdf_handle.attrs["Blank"] = json.dumps(
                self.get_blank_stats(self.gcms),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )

            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
            )

            hdf_handle.create_dataset(
                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
            )
            hdf_handle.create_dataset(
                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
            )
            hdf_handle.create_dataset(
                "tic", data=np.array(self.gcms.tic), dtype="f8"
            )
            hdf_handle.create_dataset(
                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
            )

            output_score_method = (
                self.gcms.molecular_search_settings.output_score_method
            )

            for gc_peak in self.gcms:
                # one group per chromatographic peak, keyed by retention time
                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
                peak_group.attrs["deconvolution"] = int(
                    self.gcms.chromatogram_settings.use_deconvolution
                )

                peak_group.attrs["start_scan"] = gc_peak.start_scan
                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
                peak_group.attrs["final_scan"] = gc_peak.final_scan

                peak_group.attrs["retention_index"] = gc_peak.ri
                peak_group.attrs["retention_time"] = gc_peak.retention_time
                peak_group.attrs["area"] = gc_peak.area

                peak_group.create_dataset(
                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
                )
                peak_group.create_dataset(
                    "abundance",
                    data=np.array(gc_peak.mass_spectrum.abundance),
                    dtype="f8",
                )

                # truthy gc_peak means at least one compound candidate
                if gc_peak:
                    if output_score_method == "highest_sim_score":
                        add_compound(gc_peak, gc_peak.highest_score_compound)
                    elif output_score_method == "highest_ss":
                        add_compound(gc_peak, gc_peak.highest_ss_compound)
                    else:
                        for compound_obj in gc_peak:
                            add_compound(gc_peak, compound_obj)

    def get_data_stats(self, gcms):
        """Get statistics about the GCMS data.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.

        Returns:
        -------
        dict
            A dictionary containing the data statistics.
        """
        matched_peaks = gcms.matched_peaks
        no_matched_peaks = gcms.no_matched_peaks
        unique_metabolites = gcms.unique_metabolites

        peak_matchs_above_0p85 = 0
        unique_peak_match_above_0p85 = 0
        for match_peak in matched_peaks:
            matches_above_85 = [
                m for m in match_peak if m.similarity_score >= 0.85
            ]
            if matches_above_85:
                peak_matchs_above_0p85 += 1
                if len(matches_above_85) == 1:
                    unique_peak_match_above_0p85 += 1

        data_stats = {}
        data_stats["average_signal_noise"] = "ni"  # "ni" = not implemented
        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
        data_stats["total_number_peaks"] = len(gcms)
        data_stats["total_peaks_matched"] = len(matched_peaks)
        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
        data_stats["single_matches_above_similarity_score_0.85"] = (
            unique_peak_match_above_0p85
        )
        data_stats["unique_metabolites"] = len(unique_metabolites)

        return data_stats

    def get_calibration_stats(self, gcms, id_label):
        """Get statistics about the GC-MS retention-index calibration.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.
        id_label : str
            The ID label prefix for the calibration file checksum.

        Returns:
        -------
        dict
            A dictionary of calibration metadata.
        """
        calibration_parameters = {}

        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
        calibration_parameters["data_url"] = str(gcms.cal_file_path)
        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
        calibration_parameters["calibration_method"] = ""

        return calibration_parameters

    def get_blank_stats(self, gcms):
        """Get statistics about the GC-MS blank (all placeholders, "ni" = not implemented)."""
        blank_parameters = {}

        blank_parameters["data_name"] = "ni"
        blank_parameters["blank_id"] = "ni"
        blank_parameters["data_url"] = "ni"
        blank_parameters["has_input"] = "ni"
        blank_parameters["common_features_to_blank"] = "ni"

        return blank_parameters

    def get_instrument_metadata(self, gcms):
        """Get metadata about the GC-MS instrument."""
        instrument_metadata = {}

        instrument_metadata["analyzer"] = gcms.analyzer
        instrument_metadata["instrument_label"] = gcms.instrument_label
        instrument_metadata["instrument_id"] = uuid.uuid4().hex

        return instrument_metadata

    def get_data_metadata(self, gcms, id_label, output_path):
        """Get metadata about the GC-MS data.

        Merges into an existing <output_path>.json "Data" section when one is
        present, so repeated exports accumulate their input provenance.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data.
        output_path : str
            The output file path.

        Returns:
        -------
        dict
            A dictionary containing the data metadata.
        """
        if isinstance(output_path, str):
            output_path = Path(output_path)

        parameters_path = output_path.with_suffix(".json")

        data_metadata = None
        if parameters_path.exists():
            with parameters_path.open() as current_param:
                metadata = json.load(current_param)
                data_metadata = metadata.get("Data")
        if not data_metadata:
            # fresh metadata (or existing JSON without a "Data" section)
            data_metadata = {
                "data_name": [],
                "input_data_url": [],
                "has_input": [],
            }

        data_metadata["data_name"].append(gcms.sample_name)
        data_metadata["input_data_url"].append(str(gcms.file_location))
        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))

        data_metadata["output_data_name"] = str(output_path.stem)
        data_metadata["output_data_url"] = str(output_path)
        data_metadata["has_output"] = id_label + corems_md5(output_path)

        return data_metadata

    def get_parameters_json(self, gcms, id_label, output_path):
        """Get the parameters as a JSON string.

        Parameters:
        ----------
        gcms : GCMS object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data.
        output_path : str
            The output file path.

        Returns:
        -------
        str
            The parameters as a JSON string.
        """
        output_parameters_dict = {}
        output_parameters_dict["Data"] = self.get_data_metadata(
            gcms, id_label, output_path
        )
        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
        output_parameters_dict["Calibration"] = self.get_calibration_stats(
            gcms, id_label
        )
        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
        corems_dict_setting["corems_version"] = __version__
        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
        output_parameters_dict["has_metabolite"] = gcms.metabolites_data

        return json.dumps(
            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
        )

    def write_settings(self, output_path, gcms, id_label="emsl:"):
        """Write the settings to a JSON file alongside the data output.

        Parameters:
        ----------
        output_path : str or Path
            The output file path (suffix is replaced with .json).
        gcms : GCMS object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data. Default is "emsl:".
        """
        output = self.get_parameters_json(gcms, id_label, output_path)

        with open(
            output_path.with_suffix(".json"),
            "w",
            encoding="utf8",
        ) as outfile:
            outfile.write(output)

    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
        """Get the exported data as a list of dictionaries.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.
        include_no_match : bool, optional
            Whether to include peaks without compound matches. Default is True.
        no_match_inline : bool, optional
            Whether unmatched peaks are interleaved in peak order (True) or
            appended after all matched peaks (False). Default is False.

        Returns:
        -------
        list
            The exported data as a list of dictionaries (one per match, or
            a reduced dict for unmatched peaks).
        """
        output_score_method = gcms.molecular_search_settings.output_score_method
        dict_data_list = []

        def add_match_dict_data(peak_index, gc_peak, compound_obj):
            # one output row per (peak, compound candidate) pair
            derivatization = "{}:{}:{}".format(
                compound_obj.classify,
                compound_obj.derivativenum,
                compound_obj.derivatization,
            )
            out_dict = {
                "Sample name": gcms.sample_name,
                "Peak Index": peak_index,
                "Retention Time": gc_peak.retention_time,
                "Retention Time Ref": compound_obj.retention_time,
                "Peak Height": gc_peak.tic,
                "Peak Area": gc_peak.area,
                "Retention index": gc_peak.ri,
                "Retention index Ref": compound_obj.ri,
                "Retention Index Score": compound_obj.ri_score,
                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
                "Similarity Score": compound_obj.similarity_score,
                "Compound Name": compound_obj.name,
                "Chebi ID": compound_obj.metadata.chebi,
                "Kegg Compound ID": compound_obj.metadata.kegg,
                "Inchi": compound_obj.metadata.inchi,
                "Inchi Key": compound_obj.metadata.inchikey,
                "Smiles": compound_obj.metadata.smiles,
                "Molecular Formula": compound_obj.formula,
                "IUPAC Name": compound_obj.metadata.iupac_name,
                "Traditional Name": compound_obj.metadata.traditional_name,
                "Common Name": compound_obj.metadata.common_name,
                "Derivatization": derivatization,
            }

            if self.gcms.molecular_search_settings.exploratory_mode:
                scores = compound_obj.spectral_similarity_scores
                out_dict.update(
                    {
                        "Weighted Cosine Correlation": scores.get(
                            "weighted_cosine_correlation"
                        ),
                        "Cosine Correlation": scores.get("cosine_correlation"),
                        "Stein Scott Similarity": scores.get("stein_scott_similarity"),
                        "Pearson Correlation": scores.get("pearson_correlation"),
                        "Spearman Correlation": scores.get("spearman_correlation"),
                        "Kendall Tau Correlation": scores.get(
                            "kendall_tau_correlation"
                        ),
                        "DFT Correlation": scores.get("dft_correlation"),
                        "DWT Correlation": scores.get("dwt_correlation"),
                        "Euclidean Distance": scores.get("euclidean_distance"),
                        "Manhattan Distance": scores.get("manhattan_distance"),
                        "Jaccard Distance": scores.get("jaccard_distance"),
                    }
                )
                for method in methods_name:
                    out_dict[methods_name.get(method)] = scores.get(method)

            dict_data_list.append(out_dict)

        def add_no_match_dict_data(peak_index, gc_peak):
            # reduced row for a peak with no compound candidate
            dict_data_list.append(
                {
                    "Sample name": gcms.sample_name,
                    "Peak Index": peak_index,
                    "Retention Time": gc_peak.retention_time,
                    "Peak Height": gc_peak.tic,
                    "Peak Area": gc_peak.area,
                    "Retention index": gc_peak.ri,
                }
            )

        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
            # truthy gc_peak means at least one compound candidate
            if gc_peak:
                if output_score_method == "highest_sim_score":
                    add_match_dict_data(
                        gcpeak_index, gc_peak, gc_peak.highest_score_compound
                    )
                elif output_score_method == "highest_ss":
                    add_match_dict_data(
                        gcpeak_index, gc_peak, gc_peak.highest_ss_compound
                    )
                else:
                    for compound_obj in gc_peak:
                        add_match_dict_data(gcpeak_index, gc_peak, compound_obj)
            elif include_no_match and no_match_inline:
                add_no_match_dict_data(gcpeak_index, gc_peak)

        if include_no_match and not no_match_inline:
            # append unmatched peaks after all matched rows
            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
                if not gc_peak:
                    add_no_match_dict_data(gcpeak_index, gc_peak)

        return dict_data_list
A class to export low resolution GC-MS data.
This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
Parameters:
out_file_path : str The output file path. gcms : object The low resolution GCMS object.
Attributes:
output_file : Path The output file path as a Path object. gcms : object The low resolution GCMS object.
Methods:
- get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
- get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
- to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
- to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
- to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
- to_hdf(id_label="corems:"). Export the data to an HDF5 file.
- get_data_stats(gcms). Get statistics about the GCMS data.
def get_pandas_df(self, id_label="corems:"):
    """Return the exported data as a Pandas DataFrame.

    Parameters:
    ----------
    id_label : str, optional
        The ID label for the data. Default is "corems:".

    Returns:
    -------
    DataFrame
        The exported rows, named after the sample.
    """
    frame = DataFrame(
        self.get_list_dict_data(self.gcms), columns=self._init_columns()
    )
    frame.name = self.gcms.sample_name
    return frame
Get the exported data as a Pandas DataFrame.
Parameters:
id_label : str, optional The ID label for the data. Default is "corems:".
Returns:
DataFrame The exported data as a Pandas DataFrame.
def get_json(self, nan=False, id_label="corems:"):
    """Return the exported data serialized as a JSON string.

    Parameters:
    ----------
    nan : bool, optional
        Whether to include NaN values in the JSON string. Default is False.
    id_label : str, optional
        The ID label for the data. Default is "corems:".
    """
    import json

    records = self.get_list_dict_data(self.gcms)
    return json.dumps(records, sort_keys=False, indent=4, separators=(",", ": "))
Get the exported data as a JSON string.
Parameters:
nan : bool, optional Whether to include NaN values in the JSON string. Default is False. id_label : str, optional The ID label for the data. Default is "corems:".
def to_pandas(self, write_metadata=True, id_label="corems:"):
    """Export the data to a Pandas DataFrame and save it as a pickle file.

    Parameters:
    ----------
    write_metadata : bool, optional
        Whether to write the settings JSON next to the pickle file.
    id_label : str, optional
        The ID label for the data.
    """
    columns = self._init_columns()
    dict_data_list = self.get_list_dict_data(self.gcms)
    df = DataFrame(dict_data_list, columns=columns)

    pkl_path = self.output_file.with_suffix(".pkl")
    df.to_pickle(pkl_path)

    if write_metadata:
        # bug fix: propagate the caller's id_label instead of the
        # hard-coded "corems:" literal
        self.write_settings(pkl_path, self.gcms, id_label=id_label)
Export the data to a Pandas DataFrame and save it as a pickle file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. id_label : str, optional The ID label for the data.
def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
    """Export the data to an Excel file.

    Parameters:
    ----------
    write_mode : str, optional
        The write mode for the Excel file. Default is 'a' (append rows to an
        existing workbook); any other value, or a missing workbook, writes a
        fresh file.
    write_metadata : bool, optional
        Whether to write metadata to the output file. Default is True.
    id_label : str, optional
        The ID label for the data. Default is "corems:".
    """
    out_put_path = self.output_file.with_suffix(".xlsx")
    columns = self._init_columns()
    dict_data_list = self.get_list_dict_data(self.gcms)
    df = DataFrame(dict_data_list, columns=columns)

    if write_mode == "a" and out_put_path.exists():
        # Count existing rows first, then append below them.
        # ExcelWriter(mode="a", if_sheet_exists="overlay") replaces the old
        # direct assignment to writer.book/writer.sheets, which modern
        # pandas/openpyxl no longer supports.
        existing = read_excel(out_put_path)
        with ExcelWriter(
            out_put_path, engine="openpyxl", mode="a", if_sheet_exists="overlay"
        ) as writer:
            df.to_excel(writer, index=False, header=False, startrow=len(existing) + 1)
    else:
        df.to_excel(out_put_path, index=False, engine="openpyxl")

    if write_metadata:
        self.write_settings(out_put_path, self.gcms, id_label=id_label)
Export the data to an Excel file.
Parameters:
write_mode : str, optional The write mode for the Excel file. Default is 'a' (append). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".
def to_csv(
    self,
    separate_output=False,
    write_mode="w",
    write_metadata=True,
    id_label="corems:",
):
    """Export the data to a CSV file.

    Parameters:
    ----------
    separate_output : bool, optional
        Whether to separate the output into multiple files. Default is False.
    write_mode : str, optional
        The write mode for the CSV file. Default is 'w' (write).
        NOTE(review): always overridden by ``separate_output`` below.
    write_metadata : bool, optional
        Whether to write metadata to the output file. Default is True.
    id_label : str, optional
        The ID label for the data. Default is "corems:".
    """
    # separate output overwrites without warning; otherwise append
    write_mode = "w" if separate_output else "a"

    fieldnames = self._init_columns()
    rows = self.get_list_dict_data(self.gcms)
    csv_path = self.output_file.with_suffix(".csv")

    # only emit the header when creating a new file
    need_header = not csv_path.exists()

    try:
        with open(csv_path, write_mode, newline="") as handle:
            dict_writer = csv.DictWriter(handle, fieldnames=fieldnames)
            if need_header:
                dict_writer.writeheader()
            dict_writer.writerows(rows)

        if write_metadata:
            self.write_settings(csv_path, self.gcms, id_label=id_label)

    except IOError as ioerror:
        print(ioerror)
Export the data to a CSV file.
Parameters:
separate_output : bool, optional Whether to separate the output into multiple files. Default is False. write_mode : str, optional The write mode for the CSV file. Default is 'w' (write). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".
312 def to_hdf(self, id_label="corems:"): 313 """Export the data to an HDF5 file. 314 315 Parameters: 316 ---------- 317 id_label : str, optional 318 The ID label for the data. Default is "corems:". 319 """ 320 321 # save sample at a time 322 def add_compound(gc_peak, compound_obj): 323 modifier = compound_obj.classify if compound_obj.classify else "" 324 compound_group = compound_obj.name.replace("/", "") + " " + modifier 325 326 if compound_group not in peak_group: 327 compound_group = peak_group.create_group(compound_group) 328 329 # compound_group.attrs["retention_time"] = compound_obj.retention_time 330 compound_group.attrs["retention_index"] = compound_obj.ri 331 compound_group.attrs["retention_index_score"] = compound_obj.ri_score 332 compound_group.attrs["spectral_similarity_score"] = ( 333 compound_obj.spectral_similarity_score 334 ) 335 compound_group.attrs["similarity_score"] = compound_obj.similarity_score 336 337 compond_mz = compound_group.create_dataset( 338 "mz", data=np.array(compound_obj.mz), dtype="f8" 339 ) 340 compond_abundance = compound_group.create_dataset( 341 "abundance", data=np.array(compound_obj.abundance), dtype="f8" 342 ) 343 344 if self.gcms.molecular_search_settings.exploratory_mode: 345 compound_group.attrs["Spectral Similarities"] = json.dumps( 346 compound_obj.spectral_similarity_scores, 347 sort_keys=False, 348 indent=4, 349 separators=(",", ":"), 350 ) 351 else: 352 warnings.warn("Skipping duplicate reference compound.") 353 354 import json 355 from datetime import datetime, timezone 356 357 import h5py 358 import numpy as np 359 360 output_path = self.output_file.with_suffix(".hdf5") 361 362 with h5py.File(output_path, "w") as hdf_handle: 363 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 364 hdf_handle.attrs["time_stamp"] = timenow 365 hdf_handle.attrs["data_structure"] = "gcms" 366 hdf_handle.attrs["analyzer"] = self.gcms.analyzer 367 hdf_handle.attrs["instrument_label"] = 
self.gcms.instrument_label 368 369 hdf_handle.attrs["sample_id"] = "self.gcms.id" 370 hdf_handle.attrs["sample_name"] = self.gcms.sample_name 371 hdf_handle.attrs["input_data"] = str(self.gcms.file_location) 372 hdf_handle.attrs["output_data"] = str(output_path) 373 hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex 374 hdf_handle.attrs["corems_version"] = __version__ 375 376 hdf_handle.attrs["Stats"] = json.dumps( 377 self.get_data_stats(self.gcms), 378 sort_keys=False, 379 indent=4, 380 separators=(",", ": "), 381 ) 382 hdf_handle.attrs["Calibration"] = json.dumps( 383 self.get_calibration_stats(self.gcms, id_label), 384 sort_keys=False, 385 indent=4, 386 separators=(",", ": "), 387 ) 388 hdf_handle.attrs["Blank"] = json.dumps( 389 self.get_blank_stats(self.gcms), 390 sort_keys=False, 391 indent=4, 392 separators=(",", ": "), 393 ) 394 395 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms) 396 hdf_handle.attrs["CoreMSParameters"] = json.dumps( 397 corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ") 398 ) 399 400 scans_dataset = hdf_handle.create_dataset( 401 "scans", data=np.array(self.gcms.scans_number), dtype="f8" 402 ) 403 rt_dataset = hdf_handle.create_dataset( 404 "rt", data=np.array(self.gcms.retention_time), dtype="f8" 405 ) 406 tic_dataset = hdf_handle.create_dataset( 407 "tic", data=np.array(self.gcms.tic), dtype="f8" 408 ) 409 processed_tic_dataset = hdf_handle.create_dataset( 410 "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8" 411 ) 412 413 output_score_method = ( 414 self.gcms.molecular_search_settings.output_score_method 415 ) 416 417 for gc_peak in self.gcms: 418 # print(gc_peak.retention_time) 419 # print(gc_peak.tic) 420 421 # check if there is a compound candidate 422 peak_group = hdf_handle.create_group(str(gc_peak.retention_time)) 423 peak_group.attrs["deconvolution"] = int( 424 self.gcms.chromatogram_settings.use_deconvolution 425 ) 426 427 peak_group.attrs["start_scan"] = 
gc_peak.start_scan 428 peak_group.attrs["apex_scan"] = gc_peak.apex_scan 429 peak_group.attrs["final_scan"] = gc_peak.final_scan 430 431 peak_group.attrs["retention_index"] = gc_peak.ri 432 peak_group.attrs["retention_time"] = gc_peak.retention_time 433 peak_group.attrs["area"] = gc_peak.area 434 435 mz = peak_group.create_dataset( 436 "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8" 437 ) 438 abundance = peak_group.create_dataset( 439 "abundance", 440 data=np.array(gc_peak.mass_spectrum.abundance), 441 dtype="f8", 442 ) 443 444 if gc_peak: 445 if output_score_method == "highest_sim_score": 446 compound_obj = gc_peak.highest_score_compound 447 add_compound(gc_peak, compound_obj) 448 449 elif output_score_method == "highest_ss": 450 compound_obj = gc_peak.highest_ss_compound 451 add_compound(gc_peak, compound_obj) 452 453 else: 454 for compound_obj in gc_peak: 455 add_compound(gc_peak, compound_obj)
Export the data to an HDF5 file.
Parameters:
id_label : str, optional The ID label for the data. Default is "corems:".
457 def get_data_stats(self, gcms): 458 """Get statistics about the GCMS data. 459 460 Parameters: 461 ---------- 462 gcms : object 463 The low resolution GCMS object. 464 465 Returns: 466 ------- 467 dict 468 A dictionary containing the data statistics. 469 """ 470 471 matched_peaks = gcms.matched_peaks 472 no_matched_peaks = gcms.no_matched_peaks 473 unique_metabolites = gcms.unique_metabolites 474 475 peak_matchs_above_0p85 = 0 476 unique_peak_match_above_0p85 = 0 477 for match_peak in matched_peaks: 478 gc_peak_above_85 = 0 479 matches_above_85 = list( 480 filter(lambda m: m.similarity_score >= 0.85, match_peak) 481 ) 482 if matches_above_85: 483 peak_matchs_above_0p85 += 1 484 if len(matches_above_85) == 1: 485 unique_peak_match_above_0p85 += 1 486 487 data_stats = {} 488 data_stats["average_signal_noise"] = "ni" 489 data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range 490 data_stats["total_number_peaks"] = len(gcms) 491 data_stats["total_peaks_matched"] = len(matched_peaks) 492 data_stats["total_peaks_without_matches"] = len(no_matched_peaks) 493 data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85 494 data_stats["single_matches_above_similarity_score_0.85"] = ( 495 unique_peak_match_above_0p85 496 ) 497 data_stats["unique_metabolites"] = len(unique_metabolites) 498 499 return data_stats
Get statistics about the GCMS data.
Parameters:
gcms : object The low resolution GCMS object.
Returns:
dict A dictionary containing the data statistics.
    def get_calibration_stats(self, gcms, id_label):
        """Get statistics about the GC-MS retention-index calibration.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object; must expose ``ri_pairs_ref`` and
            ``cal_file_path``.
        id_label : str
            The ID label prefix used to build the "has_input" identifier.

        Returns:
        -------
        dict
            A dictionary with the RT/RI reference pairs and the calibration
            file provenance ("calibration_method" is currently left empty).
        """
        calibration_parameters = {}

        # RT -> RI reference pairs used for the retention-index calibration
        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
        calibration_parameters["data_url"] = str(gcms.cal_file_path)
        # MD5 of the calibration file, prefixed with the id label for provenance
        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
        calibration_parameters["calibration_method"] = ""

        return calibration_parameters
Get statistics about the GC-MS retention-index calibration.
Parameters:
gcms : object The low resolution GCMS object. id_label : str The ID label prefix used to build the calibration provenance identifier.
517 def get_blank_stats(self, gcms): 518 """Get statistics about the GC-MS blank.""" 519 blank_parameters = {} 520 521 blank_parameters["data_name"] = "ni" 522 blank_parameters["blank_id"] = "ni" 523 blank_parameters["data_url"] = "ni" 524 blank_parameters["has_input"] = "ni" 525 blank_parameters["common_features_to_blank"] = "ni" 526 527 return blank_parameters
Get statistics about the GC-MS blank.
529 def get_instrument_metadata(self, gcms): 530 """Get metadata about the GC-MS instrument.""" 531 instrument_metadata = {} 532 533 instrument_metadata["analyzer"] = gcms.analyzer 534 instrument_metadata["instrument_label"] = gcms.instrument_label 535 instrument_metadata["instrument_id"] = uuid.uuid4().hex 536 537 return instrument_metadata
Get metadata about the GC-MS instrument.
539 def get_data_metadata(self, gcms, id_label, output_path): 540 """Get metadata about the GC-MS data. 541 542 Parameters: 543 ---------- 544 gcms : object 545 The low resolution GCMS object. 546 id_label : str 547 The ID label for the data. 548 output_path : str 549 The output file path. 550 551 Returns: 552 ------- 553 dict 554 A dictionary containing the data metadata. 555 """ 556 if isinstance(output_path, str): 557 output_path = Path(output_path) 558 559 paramaters_path = output_path.with_suffix(".json") 560 561 if paramaters_path.exists(): 562 with paramaters_path.open() as current_param: 563 metadata = json.load(current_param) 564 data_metadata = metadata.get("Data") 565 else: 566 data_metadata = {} 567 data_metadata["data_name"] = [] 568 data_metadata["input_data_url"] = [] 569 data_metadata["has_input"] = [] 570 571 data_metadata["data_name"].append(gcms.sample_name) 572 data_metadata["input_data_url"].append(str(gcms.file_location)) 573 data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location)) 574 575 data_metadata["output_data_name"] = str(output_path.stem) 576 data_metadata["output_data_url"] = str(output_path) 577 data_metadata["has_output"] = id_label + corems_md5(output_path) 578 579 return data_metadata
Get metadata about the GC-MS data.
Parameters:
gcms : object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.
Returns:
dict A dictionary containing the data metadata.
581 def get_parameters_json(self, gcms, id_label, output_path): 582 """Get the parameters as a JSON string. 583 584 Parameters: 585 ---------- 586 gcms : GCMS object 587 The low resolution GCMS object. 588 id_label : str 589 The ID label for the data. 590 output_path : str 591 The output file path. 592 593 Returns: 594 ------- 595 str 596 The parameters as a JSON string. 597 """ 598 599 output_parameters_dict = {} 600 output_parameters_dict["Data"] = self.get_data_metadata( 601 gcms, id_label, output_path 602 ) 603 output_parameters_dict["Stats"] = self.get_data_stats(gcms) 604 output_parameters_dict["Calibration"] = self.get_calibration_stats( 605 gcms, id_label 606 ) 607 output_parameters_dict["Blank"] = self.get_blank_stats(gcms) 608 output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms) 609 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms) 610 corems_dict_setting["corems_version"] = __version__ 611 output_parameters_dict["CoreMSParameters"] = corems_dict_setting 612 output_parameters_dict["has_metabolite"] = gcms.metabolites_data 613 output = json.dumps( 614 output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ") 615 ) 616 617 return output
Get the parameters as a JSON string.
Parameters:
gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.
Returns:
str The parameters as a JSON string.
619 def write_settings(self, output_path, gcms, id_label="emsl:"): 620 """Write the settings to a JSON file. 621 622 Parameters: 623 ---------- 624 output_path : str 625 The output file path. 626 gcms : GCMS object 627 The low resolution GCMS object. 628 id_label : str 629 The ID label for the data. Default is "emsl:". 630 631 """ 632 633 output = self.get_parameters_json(gcms, id_label, output_path) 634 635 with open( 636 output_path.with_suffix(".json"), 637 "w", 638 encoding="utf8", 639 ) as outfile: 640 outfile.write(output)
Write the settings to a JSON file.
Parameters:
output_path : str The output file path. gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. Default is "emsl:".
    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
        """Get the exported data as a list of dictionaries.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.
        include_no_match : bool, optional
            Whether to include no match data. Default is True.
        no_match_inline : bool, optional
            Whether to include no match data inline (in retention-time order,
            interleaved with matched peaks) rather than appended at the end.
            Default is False.

        Returns:
        -------
        list
            The exported data as a list of dictionaries, one per
            (peak, compound) pair, plus optional rows for unmatched peaks.
        """

        output_score_method = gcms.molecular_search_settings.output_score_method

        dict_data_list = []

        # NOTE: both helpers below are closures over the enclosing loop
        # variables (gcpeak_index, gc_peak, compound_obj) — they must only be
        # called from inside the loops that bind those names.
        def add_match_dict_data():
            # classify:derivative-number:derivatization, joined for one column
            derivatization = "{}:{}:{}".format(
                compound_obj.classify,
                compound_obj.derivativenum,
                compound_obj.derivatization,
            )
            out_dict = {
                "Sample name": gcms.sample_name,
                "Peak Index": gcpeak_index,
                "Retention Time": gc_peak.retention_time,
                "Retention Time Ref": compound_obj.retention_time,
                "Peak Height": gc_peak.tic,
                "Peak Area": gc_peak.area,
                "Retention index": gc_peak.ri,
                "Retention index Ref": compound_obj.ri,
                "Retention Index Score": compound_obj.ri_score,
                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
                "Similarity Score": compound_obj.similarity_score,
                "Compound Name": compound_obj.name,
                "Chebi ID": compound_obj.metadata.chebi,
                "Kegg Compound ID": compound_obj.metadata.kegg,
                "Inchi": compound_obj.metadata.inchi,
                "Inchi Key": compound_obj.metadata.inchikey,
                "Smiles": compound_obj.metadata.smiles,
                "Molecular Formula": compound_obj.formula,
                "IUPAC Name": compound_obj.metadata.iupac_name,
                "Traditional Name": compound_obj.metadata.traditional_name,
                "Common Name": compound_obj.metadata.common_name,
                "Derivatization": derivatization,
            }

            # In exploratory mode every spectral-similarity metric gets its
            # own column in addition to the summary score above.
            if self.gcms.molecular_search_settings.exploratory_mode:
                out_dict.update(
                    {
                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "weighted_cosine_correlation"
                        ),
                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "cosine_correlation"
                        ),
                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
                            "stein_scott_similarity"
                        ),
                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
                            "pearson_correlation"
                        ),
                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
                            "spearman_correlation"
                        ),
                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
                            "kendall_tau_correlation"
                        ),
                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dft_correlation"
                        ),
                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dwt_correlation"
                        ),
                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
                            "euclidean_distance"
                        ),
                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
                            "manhattan_distance"
                        ),
                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
                            "jaccard_distance"
                        ),
                    }
                )
                # any additional registered similarity methods, by label
                for method in methods_name:
                    out_dict[methods_name.get(method)] = (
                        compound_obj.spectral_similarity_scores.get(method)
                    )

            dict_data_list.append(out_dict)

        def add_no_match_dict_data():
            # Minimal row for a peak with no compound candidate: only the
            # peak-level columns; compound columns stay absent (NaN in pandas).
            dict_data_list.append(
                {
                    "Sample name": gcms.sample_name,
                    "Peak Index": gcpeak_index,
                    "Retention Time": gc_peak.retention_time,
                    "Peak Height": gc_peak.tic,
                    "Peak Area": gc_peak.area,
                    "Retention index": gc_peak.ri,
                }
            )

        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
            # check if there is a compound candidate
            if gc_peak:
                if output_score_method == "highest_sim_score":
                    compound_obj = gc_peak.highest_score_compound
                    add_match_dict_data()

                elif output_score_method == "highest_ss":
                    compound_obj = gc_peak.highest_ss_compound
                    add_match_dict_data()

                else:
                    # default: one output row per candidate compound
                    for compound_obj in gc_peak:
                        add_match_dict_data()  # add monoisotopic peak

            else:
                # include not_match
                if include_no_match and no_match_inline:
                    add_no_match_dict_data()

        # Non-inline mode: unmatched peaks are collected in a second pass so
        # they appear after all matched rows.
        if include_no_match and not no_match_inline:
            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
                if not gc_peak:
                    add_no_match_dict_data()

        return dict_data_list
Get the exported data as a list of dictionaries.
Parameters:
gcms : object The low resolution GCMS object. include_no_match : bool, optional Whether to include no match data. Default is True. no_match_inline : bool, optional Whether to include no match data inline. Default is False.
Returns:
list The exported data as a list of dictionaries.
class HighResMassSpectraExport(HighResMassSpecExport):
    """A class to export high resolution mass spectra data.

    This class provides methods to export high resolution mass spectra data to various formats
    such as Excel, CSV, HDF5, and Pandas DataFrame.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path.
    mass_spectra : object
        The high resolution mass spectra object.
    output_type : str, optional
        The output type. Default is 'excel'.

    Attributes
    ----------
    output_file : Path
        The output file path without suffix
    dir_loc : Path
        The directory location for the output file,
        by default this will be the output_file + ".corems" and all output files will be
        written into this location
    mass_spectra : MassSpectraBase
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
        # The base exporter is bound to one spectrum at a time; bind None here
        # and feed spectra from self.mass_spectra inside each export method.
        super().__init__(
            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
        )

        # NOTE(review): "+" assumes out_file_path is a str; a Path argument
        # (allowed by the class docstring) would raise TypeError — confirm callers.
        self.dir_loc = Path(out_file_path + ".corems")
        self.dir_loc.mkdir(exist_ok=True)
        # Place the output file in the directory
        self.output_file = self.dir_loc / Path(out_file_path).name
        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
        self.mass_spectra = mass_spectra
        self.atoms_order_list = None
        self._init_columns()

    def get_pandas_df(self):
        """Get the mass spectra as a list of Pandas DataFrames."""

        list_df = []

        for mass_spectrum in self.mass_spectra:
            # base label columns plus one column per atom used by this spectrum
            columns = self.columns_label + self.get_all_used_atoms_in_order(
                mass_spectrum
            )

            dict_data_list = self.get_list_dict_data(mass_spectrum)

            df = DataFrame(dict_data_list, columns=columns)

            scan_number = mass_spectrum.scan_number

            # informal per-frame tag identifying the source scan
            df.name = str(self.output_file) + "_" + str(scan_number)

            list_df.append(df)

        return list_df

    def to_pandas(self, write_metadata=True):
        """Export the data to a Pandas DataFrame and save it as a pickle file.

        One pickle file is written per mass spectrum, suffixed with the scan
        number.

        Parameters:
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """

        for mass_spectrum in self.mass_spectra:
            columns = self.columns_label + self.get_all_used_atoms_in_order(
                mass_spectrum
            )

            dict_data_list = self.get_list_dict_data(mass_spectrum)

            df = DataFrame(dict_data_list, columns=columns)

            scan_number = mass_spectrum.scan_number

            out_filename = Path(
                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
            )

            df.to_pickle(self.dir_loc / out_filename)

            if write_metadata:
                # settings file shares the pickle's stem (suffix stripped)
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def to_excel(self, write_metadata=True):
        """Export the data to an Excel file.

        One workbook is written per mass spectrum, suffixed with the scan
        number.

        Parameters:
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """
        for mass_spectrum in self.mass_spectra:
            columns = self.columns_label + self.get_all_used_atoms_in_order(
                mass_spectrum
            )

            dict_data_list = self.get_list_dict_data(mass_spectrum)

            df = DataFrame(dict_data_list, columns=columns)

            scan_number = mass_spectrum.scan_number

            out_filename = Path(
                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
            )

            df.to_excel(self.dir_loc / out_filename)

            if write_metadata:
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def to_csv(self, write_metadata=True):
        """Export the data to a CSV file.

        One CSV file is written per mass spectrum, suffixed with the scan
        number.

        Parameters:
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """
        # NOTE(review): csv is already imported at module level; this local
        # import is redundant.
        import csv

        for mass_spectrum in self.mass_spectra:
            columns = self.columns_label + self.get_all_used_atoms_in_order(
                mass_spectrum
            )

            scan_number = mass_spectrum.scan_number

            dict_data_list = self.get_list_dict_data(mass_spectrum)

            out_filename = Path(
                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
            )

            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=columns)
                writer.writeheader()
                for data in dict_data_list:
                    writer.writerow(data)

            if write_metadata:
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def get_mass_spectra_attrs(self):
        """Get the mass spectra attributes as a JSON string.

        Returns:
        -------
        str
            The analyzer, instrument label and sample name serialized as a
            JSON string.
        """
        dict_ms_attrs = {}
        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name

        return json.dumps(
            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
        )

    def to_hdf(self, overwrite=False, export_raw=True):
        """Export the data to an HDF5 file.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        export_raw : bool, optional
            Whether to export the raw mass spectra data. Default is True.
        """
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        # "a" mode: appends into an existing file unless removed above
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            if not hdf_handle.attrs.get("date_utc"):
                # Set metadata for all mass spectra
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                # NOTE(review): _str is a private PurePath attribute; prefer
                # str(self.mass_spectra.file_location).
                hdf_handle.attrs["original_file_location"] = (
                    self.mass_spectra.file_location._str
                )

            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            for mass_spectrum in self.mass_spectra:
                # HDF5 group keys must be strings; key each spectrum by scan
                group_key = str(int(mass_spectrum.scan_number))

                self.add_mass_spectrum_to_hdf5(
                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
                )
A class to export high resolution mass spectra data.
This class provides methods to export high resolution mass spectra data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
Parameters
- out_file_path (str | Path): The output file path.
- mass_spectra (object): The high resolution mass spectra object.
- output_type (str, optional): The output type. Default is 'excel'.
Attributes
- output_file (Path): The output file path without suffix
- dir_loc (Path): The directory location for the output file, by default this will be the output_file + ".corems" and all output files will be written into this location
- mass_spectra (MassSpectraBase): The high resolution mass spectra object.
807 def __init__(self, out_file_path, mass_spectra, output_type="excel"): 808 super().__init__( 809 out_file_path=out_file_path, mass_spectrum=None, output_type=output_type 810 ) 811 812 self.dir_loc = Path(out_file_path + ".corems") 813 self.dir_loc.mkdir(exist_ok=True) 814 # Place the output file in the directory 815 self.output_file = self.dir_loc / Path(out_file_path).name 816 self._output_type = output_type # 'excel', 'csv', 'pandas' or 'hdf5' 817 self.mass_spectra = mass_spectra 818 self.atoms_order_list = None 819 self._init_columns()
Initialize the multi-spectrum exporter. Arguments are:
out_file_path is the output file path (str | Path); a directory named out_file_path + ".corems" is created (if absent) and all output files, including the output_file itself, are written inside it.
mass_spectra is the high resolution mass spectra object whose spectra are exported one at a time.
output_type selects the export format: 'excel', 'csv', 'pandas' or 'hdf5'. Defaults to 'excel'.
Subclasses that override the constructor must invoke the base class constructor before using the exporter.
821 def get_pandas_df(self): 822 """Get the mass spectra as a list of Pandas DataFrames.""" 823 824 list_df = [] 825 826 for mass_spectrum in self.mass_spectra: 827 columns = self.columns_label + self.get_all_used_atoms_in_order( 828 mass_spectrum 829 ) 830 831 dict_data_list = self.get_list_dict_data(mass_spectrum) 832 833 df = DataFrame(dict_data_list, columns=columns) 834 835 scan_number = mass_spectrum.scan_number 836 837 df.name = str(self.output_file) + "_" + str(scan_number) 838 839 list_df.append(df) 840 841 return list_df
Get the mass spectra as a list of Pandas DataFrames.
843 def to_pandas(self, write_metadata=True): 844 """Export the data to a Pandas DataFrame and save it as a pickle file. 845 846 Parameters: 847 ---------- 848 write_metadata : bool, optional 849 Whether to write metadata to the output file. Default is True. 850 """ 851 852 for mass_spectrum in self.mass_spectra: 853 columns = self.columns_label + self.get_all_used_atoms_in_order( 854 mass_spectrum 855 ) 856 857 dict_data_list = self.get_list_dict_data(mass_spectrum) 858 859 df = DataFrame(dict_data_list, columns=columns) 860 861 scan_number = mass_spectrum.scan_number 862 863 out_filename = Path( 864 "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl") 865 ) 866 867 df.to_pickle(self.dir_loc / out_filename) 868 869 if write_metadata: 870 self.write_settings( 871 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 872 )
Export the data to a Pandas DataFrame and save it as a pickle file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
874 def to_excel(self, write_metadata=True): 875 """Export the data to an Excel file. 876 877 Parameters: 878 ---------- 879 write_metadata : bool, optional 880 Whether to write metadata to the output file. Default is True. 881 """ 882 for mass_spectrum in self.mass_spectra: 883 columns = self.columns_label + self.get_all_used_atoms_in_order( 884 mass_spectrum 885 ) 886 887 dict_data_list = self.get_list_dict_data(mass_spectrum) 888 889 df = DataFrame(dict_data_list, columns=columns) 890 891 scan_number = mass_spectrum.scan_number 892 893 out_filename = Path( 894 "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx") 895 ) 896 897 df.to_excel(self.dir_loc / out_filename) 898 899 if write_metadata: 900 self.write_settings( 901 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 902 )
Export the data to an Excel file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
904 def to_csv(self, write_metadata=True): 905 """Export the data to a CSV file. 906 907 Parameters: 908 ---------- 909 write_metadata : bool, optional 910 Whether to write metadata to the output file. Default is True. 911 """ 912 import csv 913 914 for mass_spectrum in self.mass_spectra: 915 columns = self.columns_label + self.get_all_used_atoms_in_order( 916 mass_spectrum 917 ) 918 919 scan_number = mass_spectrum.scan_number 920 921 dict_data_list = self.get_list_dict_data(mass_spectrum) 922 923 out_filename = Path( 924 "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv") 925 ) 926 927 with open(self.dir_loc / out_filename, "w", newline="") as csvfile: 928 writer = csv.DictWriter(csvfile, fieldnames=columns) 929 writer.writeheader() 930 for data in dict_data_list: 931 writer.writerow(data) 932 933 if write_metadata: 934 self.write_settings( 935 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 936 )
Export the data to a CSV file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
938 def get_mass_spectra_attrs(self): 939 """Get the mass spectra attributes as a JSON string. 940 941 Parameters: 942 ---------- 943 mass_spectra : object 944 The high resolution mass spectra object. 945 946 Returns: 947 ------- 948 str 949 The mass spectra attributes as a JSON string. 950 """ 951 dict_ms_attrs = {} 952 dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer 953 dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label 954 dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name 955 956 return json.dumps( 957 dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ") 958 )
Get the mass spectra attributes as a JSON string.
Parameters:
mass_spectra : object The high resolution mass spectra object.
Returns:
str The mass spectra attributes as a JSON string.
960 def to_hdf(self, overwrite=False, export_raw=True): 961 """Export the data to an HDF5 file. 962 963 Parameters 964 ---------- 965 overwrite : bool, optional 966 Whether to overwrite the output file. Default is False. 967 export_raw : bool, optional 968 Whether to export the raw mass spectra data. Default is True. 969 """ 970 if overwrite: 971 if self.output_file.with_suffix(".hdf5").exists(): 972 self.output_file.with_suffix(".hdf5").unlink() 973 974 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 975 if not hdf_handle.attrs.get("date_utc"): 976 # Set metadata for all mass spectra 977 timenow = str( 978 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 979 ) 980 hdf_handle.attrs["date_utc"] = timenow 981 hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name 982 hdf_handle.attrs["data_structure"] = "mass_spectra" 983 hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer 984 hdf_handle.attrs["instrument_label"] = ( 985 self.mass_spectra.instrument_label 986 ) 987 hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name 988 hdf_handle.attrs["polarity"] = self.mass_spectra.polarity 989 hdf_handle.attrs["parser_type"] = ( 990 self.mass_spectra.spectra_parser_class.__name__ 991 ) 992 hdf_handle.attrs["original_file_location"] = ( 993 self.mass_spectra.file_location._str 994 ) 995 996 if "mass_spectra" not in hdf_handle: 997 mass_spectra_group = hdf_handle.create_group("mass_spectra") 998 else: 999 mass_spectra_group = hdf_handle.get("mass_spectra") 1000 1001 for mass_spectrum in self.mass_spectra: 1002 group_key = str(int(mass_spectrum.scan_number)) 1003 1004 self.add_mass_spectrum_to_hdf5( 1005 hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw 1006 )
Export the data to an HDF5 file.
Parameters
- overwrite (bool, optional): Whether to overwrite the output file. Default is False.
- export_raw (bool, optional): Whether to export the raw mass spectra data. Default is True.
Inherited Members
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
class LCMSExport(HighResMassSpectraExport):
    """A class to export high resolution LC-MS data.

    This class provides methods to export high resolution LC-MS data to HDF5.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    lcms_object : LCMSBase
        The high resolution lc-ms object.
    """

    def __init__(self, out_file_path, mass_spectra):
        # Force HDF5 output; everything else is inherited from HighResMassSpectraExport.
        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
        """Export the data to an HDF5.

        Writes (in order): file-level metadata, individual mass spectra,
        scan info, unprocessed MS1 data, mass features, EICs, and spectral
        search results, then optionally dumps the processing parameters to a
        sidecar json/toml file. Which pieces are written is controlled by
        ``self.mass_spectra.parameters.lc_ms`` export flags.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        save_parameters : bool, optional
            Whether to save the parameters as a separate json or toml file. Default is True.
        parameter_format : str, optional
            The format to save the parameters in. Default is 'toml'.

        Raises
        ------
        ValueError
            If parameter_format is not 'json' or 'toml'.
        """
        export_profile_spectra = (
            self.mass_spectra.parameters.lc_ms.export_profile_spectra
        )

        # Parameterized export: all spectra (default) or only relevant spectra
        export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra
        if export_only_relevant:
            relevant_scan_numbers = set()
            # Relevant scans = MS1 apex scans of mass features plus their best MS2 scans.
            for mass_feature in self.mass_spectra.mass_features.values():
                relevant_scan_numbers.add(mass_feature.apex_scan)
                if mass_feature.best_ms2 is not None:
                    relevant_scan_numbers.add(mass_feature.best_ms2.scan_number)
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Set file-level metadata only once (first write to this file).
            if not hdf_handle.attrs.get("date_utc"):
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                hdf_handle.attrs["original_file_location"] = (
                    self.mass_spectra.file_location._str
                )

            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            # Export logic based on parameter
            for mass_spectrum in self.mass_spectra:
                if export_only_relevant:
                    # Skip spectra that are not associated with a mass feature.
                    if mass_spectrum.scan_number in relevant_scan_numbers:
                        group_key = str(int(mass_spectrum.scan_number))
                        self.add_mass_spectrum_to_hdf5(
                            hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                        )
                else:
                    group_key = str(int(mass_spectrum.scan_number))
                    self.add_mass_spectrum_to_hdf5(
                        hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                    )

        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Add scan_info to hdf5 file
            # NOTE(review): scan_info datasets are written only when the group is
            # first created; re-exports to an existing file assume it is already
            # populated — confirm this nesting matches the upstream source.
            if "scan_info" not in hdf_handle:
                scan_info_group = hdf_handle.create_group("scan_info")
                for k, v in self.mass_spectra._scan_info.items():
                    array = np.array(list(v.values()))
                    if array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded bytes first
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                    else:
                        # Apply data type optimization for numeric data
                        if array.dtype == np.float64:
                            array = array.astype(np.float32)
                        elif array.dtype == np.int64:
                            array = array.astype(np.int32)
                        scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms_unprocessed to hdf5 file
            export_unprocessed_ms1 = (
                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
            )
            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
                if "ms_unprocessed" not in hdf_handle:
                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
                else:
                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
                for k, v in self.mass_spectra._ms_unprocessed.items():
                    array = np.array(v)
                    # Apply data type optimization and compression
                    if array.dtype == np.int64:
                        array = array.astype(np.int32)
                    elif array.dtype == np.float64:
                        array = array.astype(np.float32)
                    elif array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded strings
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                        continue
                    ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add LCMS mass features to hdf5 file
            if len(self.mass_spectra.mass_features) > 0:
                if "mass_features" not in hdf_handle:
                    mass_features_group = hdf_handle.create_group("mass_features")
                else:
                    mass_features_group = hdf_handle.get("mass_features")

                # Create group for each mass feature, with key as the mass feature id
                for k, v in self.mass_spectra.mass_features.items():
                    mass_features_group.create_group(str(k))
                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
                            if k2 not in [
                                "chromatogram_parent",
                                "ms2_mass_spectra",
                                "mass_spectrum",
                                "_eic_data",
                                "ms2_similarity_results",
                            ]:
                                if k2 == "ms2_scan_numbers":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_half_height_width":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_ms_deconvoluted_idx":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "associated_mass_features_deconvoluted":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_noise_score":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif (
                                    isinstance(v2, int)
                                    or isinstance(v2, float)
                                    or isinstance(v2, str)
                                    or isinstance(v2, np.integer)
                                    or isinstance(v2, np.float32)
                                    or isinstance(v2, np.float64)
                                    or isinstance(v2, np.bool_)
                                ):
                                    # Convert numpy types to smaller precision for storage
                                    if isinstance(v2, np.int64):
                                        v2 = np.int32(v2)
                                    elif isinstance(v2, np.float64):
                                        v2 = np.float32(v2)
                                    mass_features_group[str(k)].attrs[str(k2)] = v2
                                else:
                                    raise TypeError(
                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                    )

            # Add EIC data to hdf5 file
            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
            if len(self.mass_spectra.eics) > 0 and export_eics:
                if "eics" not in hdf_handle:
                    eic_group = hdf_handle.create_group("eics")
                else:
                    eic_group = hdf_handle.get("eics")

                # Create group for each eic
                for k, v in self.mass_spectra.eics.items():
                    eic_group.create_group(str(k))
                    eic_group[str(k)].attrs["mz"] = k
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            array = np.array(v2)
                            # Apply data type optimization and compression
                            if array.dtype == np.int64:
                                array = array.astype(np.int32)
                            elif array.dtype == np.float64:
                                array = array.astype(np.float32)
                            elif array.dtype.str[0:2] == "<U":
                                # Convert Unicode strings to UTF-8 encoded strings
                                string_data = [str(item) for item in array]
                                string_dtype = h5py.string_dtype(encoding='utf-8')
                                eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                                continue
                            eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms2_search results to hdf5 file (parameterized)
            if len(self.mass_spectra.spectral_search_results) > 0:
                if "spectral_search_results" not in hdf_handle:
                    spectral_search_results = hdf_handle.create_group(
                        "spectral_search_results"
                    )
                else:
                    spectral_search_results = hdf_handle.get("spectral_search_results")
                # Create group for each search result by ms2_scan / precursor_mz
                for k, v in self.mass_spectra.spectral_search_results.items():
                    # If parameter is set, only export spectral search results for relevant scans
                    if export_only_relevant and k not in relevant_scan_numbers:
                        continue
                    spectral_search_results.create_group(str(k))
                    for k2, v2 in v.items():
                        spectral_search_results[str(k)].create_group(str(k2))
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "precursor_mz"
                        ] = v2.precursor_mz
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "query_spectrum_id"
                        ] = v2.query_spectrum_id
                        # Loop through each of the attributes and add them as datasets (if array)
                        for k3, v3 in v2.__dict__.items():
                            if v3 is not None and k3 not in [
                                "query_spectrum",
                                "precursor_mz",
                                "query_spectrum_id",
                            ]:
                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                    # Flatten per-hit fragment-type lists to comma-joined strings.
                                    v3 = [", ".join(x) for x in v3]
                                if all(v3 is not None for v3 in v3):
                                    array = np.array(v3)
                                    if array.dtype.str[0:2] == "<U":
                                        array = array.astype("S")
                                    spectral_search_results[str(k)][str(k2)].create_dataset(
                                        str(k3), data=array
                                    )
        if save_parameters:
            # Check if parameter_format is valid
            if parameter_format not in ["json", "toml"]:
                raise ValueError("parameter_format must be 'json' or 'toml'")

            if parameter_format == "json":
                dump_lcms_settings_json(
                    filename=self.output_file.with_suffix(".json"),
                    lcms_obj=self.mass_spectra,
                )
            elif parameter_format == "toml":
                dump_lcms_settings_toml(
                    filename=self.output_file.with_suffix(".toml"),
                    lcms_obj=self.mass_spectra,
                )
A class to export high resolution LC-MS data.
This class provides methods to export high resolution LC-MS data to HDF5.
Parameters
- out_file_path (str | Path): The output file path, do not include the file extension.
- lcms_object (LCMSBase): The high resolution lc-ms object.
1022 def __init__(self, out_file_path, mass_spectra): 1023 super().__init__(out_file_path, mass_spectra, output_type="hdf5")
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
1025 def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"): 1026 """Export the data to an HDF5. 1027 1028 Parameters 1029 ---------- 1030 overwrite : bool, optional 1031 Whether to overwrite the output file. Default is False. 1032 save_parameters : bool, optional 1033 Whether to save the parameters as a separate json or toml file. Default is True. 1034 parameter_format : str, optional 1035 The format to save the parameters in. Default is 'toml'. 1036 1037 Raises 1038 ------ 1039 ValueError 1040 If parameter_format is not 'json' or 'toml'. 1041 """ 1042 export_profile_spectra = ( 1043 self.mass_spectra.parameters.lc_ms.export_profile_spectra 1044 ) 1045 1046 # Parameterized export: all spectra (default) or only relevant spectra 1047 export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra 1048 if export_only_relevant: 1049 relevant_scan_numbers = set() 1050 # Add MS1 spectra associated with mass features (apex scans) and best MS2 spectra 1051 for mass_feature in self.mass_spectra.mass_features.values(): 1052 relevant_scan_numbers.add(mass_feature.apex_scan) 1053 if mass_feature.best_ms2 is not None: 1054 relevant_scan_numbers.add(mass_feature.best_ms2.scan_number) 1055 if overwrite: 1056 if self.output_file.with_suffix(".hdf5").exists(): 1057 self.output_file.with_suffix(".hdf5").unlink() 1058 1059 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 1060 if not hdf_handle.attrs.get("date_utc"): 1061 # Set metadata for all mass spectra 1062 timenow = str( 1063 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 1064 ) 1065 hdf_handle.attrs["date_utc"] = timenow 1066 hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name 1067 hdf_handle.attrs["data_structure"] = "mass_spectra" 1068 hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer 1069 hdf_handle.attrs["instrument_label"] = ( 1070 self.mass_spectra.instrument_label 1071 ) 1072 
hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name 1073 hdf_handle.attrs["polarity"] = self.mass_spectra.polarity 1074 hdf_handle.attrs["parser_type"] = ( 1075 self.mass_spectra.spectra_parser_class.__name__ 1076 ) 1077 hdf_handle.attrs["original_file_location"] = ( 1078 self.mass_spectra.file_location._str 1079 ) 1080 1081 if "mass_spectra" not in hdf_handle: 1082 mass_spectra_group = hdf_handle.create_group("mass_spectra") 1083 else: 1084 mass_spectra_group = hdf_handle.get("mass_spectra") 1085 1086 # Export logic based on parameter 1087 for mass_spectrum in self.mass_spectra: 1088 if export_only_relevant: 1089 if mass_spectrum.scan_number in relevant_scan_numbers: 1090 group_key = str(int(mass_spectrum.scan_number)) 1091 self.add_mass_spectrum_to_hdf5( 1092 hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra 1093 ) 1094 else: 1095 group_key = str(int(mass_spectrum.scan_number)) 1096 self.add_mass_spectrum_to_hdf5( 1097 hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra 1098 ) 1099 1100 # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file 1101 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 1102 # Add scan_info to hdf5 file 1103 if "scan_info" not in hdf_handle: 1104 scan_info_group = hdf_handle.create_group("scan_info") 1105 for k, v in self.mass_spectra._scan_info.items(): 1106 array = np.array(list(v.values())) 1107 if array.dtype.str[0:2] == "<U": 1108 # Convert Unicode strings to UTF-8 encoded bytes first 1109 string_data = [str(item) for item in array] 1110 string_dtype = h5py.string_dtype(encoding='utf-8') 1111 scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True) 1112 else: 1113 # Apply data type optimization for numeric data 1114 if array.dtype == np.float64: 1115 array = array.astype(np.float32) 1116 elif array.dtype == np.int64: 1117 array = 
array.astype(np.int32) 1118 scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True) 1119 1120 # Add ms_unprocessed to hdf5 file 1121 export_unprocessed_ms1 = ( 1122 self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1 1123 ) 1124 if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1: 1125 if "ms_unprocessed" not in hdf_handle: 1126 ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed") 1127 else: 1128 ms_unprocessed_group = hdf_handle.get("ms_unprocessed") 1129 for k, v in self.mass_spectra._ms_unprocessed.items(): 1130 array = np.array(v) 1131 # Apply data type optimization and compression 1132 if array.dtype == np.int64: 1133 array = array.astype(np.int32) 1134 elif array.dtype == np.float64: 1135 array = array.astype(np.float32) 1136 elif array.dtype.str[0:2] == "<U": 1137 # Convert Unicode strings to UTF-8 encoded strings 1138 string_data = [str(item) for item in array] 1139 string_dtype = h5py.string_dtype(encoding='utf-8') 1140 ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True) 1141 continue 1142 ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True) 1143 1144 # Add LCMS mass features to hdf5 file 1145 if len(self.mass_spectra.mass_features) > 0: 1146 if "mass_features" not in hdf_handle: 1147 mass_features_group = hdf_handle.create_group("mass_features") 1148 else: 1149 mass_features_group = hdf_handle.get("mass_features") 1150 1151 # Create group for each mass feature, with key as the mass feature id 1152 for k, v in self.mass_spectra.mass_features.items(): 1153 mass_features_group.create_group(str(k)) 1154 # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array) 1155 for k2, v2 in v.__dict__.items(): 1156 if v2 is not None: 1157 # Check if the attribute is an integer or float and set as an 
attribute in the mass feature group 1158 if k2 not in [ 1159 "chromatogram_parent", 1160 "ms2_mass_spectra", 1161 "mass_spectrum", 1162 "_eic_data", 1163 "ms2_similarity_results", 1164 ]: 1165 if k2 == "ms2_scan_numbers": 1166 array = np.array(v2) 1167 # Convert int64 to int32 1168 if array.dtype == np.int64: 1169 array = array.astype(np.int32) 1170 mass_features_group[str(k)].create_dataset( 1171 str(k2), data=array, compression="gzip", compression_opts=9, chunks=True 1172 ) 1173 elif k2 == "_half_height_width": 1174 array = np.array(v2) 1175 # Convert float64 to float32 1176 if array.dtype == np.float64: 1177 array = array.astype(np.float32) 1178 mass_features_group[str(k)].create_dataset( 1179 str(k2), data=array, compression="gzip", compression_opts=9, chunks=True 1180 ) 1181 elif k2 == "_ms_deconvoluted_idx": 1182 array = np.array(v2) 1183 # Convert int64 to int32 1184 if array.dtype == np.int64: 1185 array = array.astype(np.int32) 1186 mass_features_group[str(k)].create_dataset( 1187 str(k2), data=array, compression="gzip", compression_opts=9, chunks=True 1188 ) 1189 elif k2 == "associated_mass_features_deconvoluted": 1190 array = np.array(v2) 1191 # Convert int64 to int32 1192 if array.dtype == np.int64: 1193 array = array.astype(np.int32) 1194 mass_features_group[str(k)].create_dataset( 1195 str(k2), data=array, compression="gzip", compression_opts=9, chunks=True 1196 ) 1197 elif k2 == "_noise_score": 1198 array = np.array(v2) 1199 # Convert float64 to float32 1200 if array.dtype == np.float64: 1201 array = array.astype(np.float32) 1202 mass_features_group[str(k)].create_dataset( 1203 str(k2), data=array, compression="gzip", compression_opts=9, chunks=True 1204 ) 1205 elif ( 1206 isinstance(v2, int) 1207 or isinstance(v2, float) 1208 or isinstance(v2, str) 1209 or isinstance(v2, np.integer) 1210 or isinstance(v2, np.float32) 1211 or isinstance(v2, np.float64) 1212 or isinstance(v2, np.bool_) 1213 ): 1214 # Convert numpy types to smaller precision for 
storage 1215 if isinstance(v2, np.int64): 1216 v2 = np.int32(v2) 1217 elif isinstance(v2, np.float64): 1218 v2 = np.float32(v2) 1219 mass_features_group[str(k)].attrs[str(k2)] = v2 1220 else: 1221 raise TypeError( 1222 f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file" 1223 ) 1224 1225 # Add EIC data to hdf5 file 1226 export_eics = self.mass_spectra.parameters.lc_ms.export_eics 1227 if len(self.mass_spectra.eics) > 0 and export_eics: 1228 if "eics" not in hdf_handle: 1229 eic_group = hdf_handle.create_group("eics") 1230 else: 1231 eic_group = hdf_handle.get("eics") 1232 1233 # Create group for each eic 1234 for k, v in self.mass_spectra.eics.items(): 1235 eic_group.create_group(str(k)) 1236 eic_group[str(k)].attrs["mz"] = k 1237 # Loop through each of the attributes and add them as datasets (if array) 1238 for k2, v2 in v.__dict__.items(): 1239 if v2 is not None: 1240 array = np.array(v2) 1241 # Apply data type optimization and compression 1242 if array.dtype == np.int64: 1243 array = array.astype(np.int32) 1244 elif array.dtype == np.float64: 1245 array = array.astype(np.float32) 1246 elif array.dtype.str[0:2] == "<U": 1247 # Convert Unicode strings to UTF-8 encoded strings 1248 string_data = [str(item) for item in array] 1249 string_dtype = h5py.string_dtype(encoding='utf-8') 1250 eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True) 1251 continue 1252 eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True) 1253 1254 # Add ms2_search results to hdf5 file (parameterized) 1255 if len(self.mass_spectra.spectral_search_results) > 0: 1256 if "spectral_search_results" not in hdf_handle: 1257 spectral_search_results = hdf_handle.create_group( 1258 "spectral_search_results" 1259 ) 1260 else: 1261 spectral_search_results = hdf_handle.get("spectral_search_results") 1262 # Create group for each search 
result by ms2_scan / precursor_mz 1263 for k, v in self.mass_spectra.spectral_search_results.items(): 1264 # If parameter is set, only export spectral search results for relevant scans 1265 if export_only_relevant and k not in relevant_scan_numbers: 1266 continue 1267 spectral_search_results.create_group(str(k)) 1268 for k2, v2 in v.items(): 1269 spectral_search_results[str(k)].create_group(str(k2)) 1270 spectral_search_results[str(k)][str(k2)].attrs[ 1271 "precursor_mz" 1272 ] = v2.precursor_mz 1273 spectral_search_results[str(k)][str(k2)].attrs[ 1274 "query_spectrum_id" 1275 ] = v2.query_spectrum_id 1276 # Loop through each of the attributes and add them as datasets (if array) 1277 for k3, v3 in v2.__dict__.items(): 1278 if v3 is not None and k3 not in [ 1279 "query_spectrum", 1280 "precursor_mz", 1281 "query_spectrum_id", 1282 ]: 1283 if k3 == "query_frag_types" or k3 == "ref_frag_types": 1284 v3 = [", ".join(x) for x in v3] 1285 if all(v3 is not None for v3 in v3): 1286 array = np.array(v3) 1287 if array.dtype.str[0:2] == "<U": 1288 array = array.astype("S") 1289 spectral_search_results[str(k)][str(k2)].create_dataset( 1290 str(k3), data=array 1291 ) 1292 if save_parameters: 1293 # Check if parameter_format is valid 1294 if parameter_format not in ["json", "toml"]: 1295 raise ValueError("parameter_format must be 'json' or 'toml'") 1296 1297 if parameter_format == "json": 1298 dump_lcms_settings_json( 1299 filename=self.output_file.with_suffix(".json"), 1300 lcms_obj=self.mass_spectra, 1301 ) 1302 elif parameter_format == "toml": 1303 dump_lcms_settings_toml( 1304 filename=self.output_file.with_suffix(".toml"), 1305 lcms_obj=self.mass_spectra, 1306 )
Export the data to an HDF5.
Parameters
- overwrite (bool, optional): Whether to overwrite the output file. Default is False.
- save_parameters (bool, optional): Whether to save the parameters as a separate json or toml file. Default is True.
- parameter_format (str, optional): The format to save the parameters in. Default is 'toml'.
Raises
- ValueError: If parameter_format is not 'json' or 'toml'.
Inherited Members
- HighResMassSpectraExport
- dir_loc
- output_file
- mass_spectra
- atoms_order_list
- get_pandas_df
- to_pandas
- to_excel
- to_csv
- get_mass_spectra_attrs
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
class LCMSMetabolomicsExport(LCMSExport):
    """A class to export LCMS metabolite data.

    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra)
        # Module-level adduct table: ion type -> [atoms to add, atoms to subtract].
        self.ion_type_dict = ion_type_dict

    @staticmethod
    def get_ion_formula(neutral_formula, ion_type):
        """From a neutral formula and an ion type, return the formula of the ion.

        Notes
        -----
        This is a static method.
        If the neutral_formula is not a string, this method will return None.

        Parameters
        ----------
        neutral_formula : str
            The neutral formula, this should be a string form from the MolecularFormula class
            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
            e.g. MgCl2 is parsed as 'Mg Cl2'.
        ion_type : str
            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
            See the self.ion_type_dict for the available ion types.

        Returns
        -------
        str
            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
        """
        # If neutral_formula is not a string, return None
        if not isinstance(neutral_formula, str):
            return None

        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
        if re.search(r"\s", neutral_formula):
            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
        else:
            # Compact formula (e.g. 'MgCl2'): insert a space before each capital
            # letter, then pull the element symbol and optional count per token.
            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
            neutral_formula = MolecularFormula(
                dict(
                    zip(
                        [x[0] for x in elements],
                        # A token with no digits means a count of 1.
                        [int(x[0]) if x else 1 for x in counts],
                    )
                ),
                ion_charge=0,
            )
        neutral_formula_dict = neutral_formula.to_dict().copy()

        # Add the adduct atoms to the neutral formula.
        adduct_add_dict = ion_type_dict[ion_type][0]
        for key in adduct_add_dict:
            if key in neutral_formula_dict.keys():
                neutral_formula_dict[key] += adduct_add_dict[key]
            else:
                neutral_formula_dict[key] = adduct_add_dict[key]

        # Subtract the atoms lost on ionization (e.g. H for de-protonation).
        adduct_subtract = ion_type_dict[ion_type][1]
        for key in adduct_subtract:
            neutral_formula_dict[key] -= adduct_subtract[key]

        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

    @staticmethod
    def get_isotope_type(ion_formula):
        """From an ion formula, return the 13C isotope type of the ion.

        Notes
        -----
        This is a static method.
        If the ion_formula is not a string, this method will return None.
        This is currently only functional for 13C isotopes.

        Parameters
        ----------
        ion_formula : str
            The formula of the ion, expected to be a string like 'C2 H4 O2'.

        Returns
        -------
        str
            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.

        Raises
        ------
        ValueError
            If the ion_formula is not a space-separated formula string.
        """
        if not isinstance(ion_formula, str):
            return None

        if re.search(r"\s", ion_formula):
            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
        else:
            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
        ion_formula_dict = ion_formula.to_dict().copy()

        # A '13C' key (with its count) is present only for 13C isotopologues.
        try:
            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
        except KeyError:
            iso_class = None

        return iso_class

    def report_to_csv(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations and save it as a CSV file.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.
        """
        report = self.to_report(molecular_metadata=molecular_metadata)
        out_file = self.output_file.with_suffix(".csv")
        report.to_csv(out_file, index=False)

    def clean_ms1_report(self, ms1_summary_full):
        """Clean the MS1 report.

        Keeps the scoring columns, derives the ion formula and 13C isotopologue
        type from the molecular formula / ion type, and re-indexes by mf_id.

        Parameters
        ----------
        ms1_summary_full : DataFrame
            The full MS1 summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned MS1 summary DataFrame, indexed by mf_id.
        """
        ms1_summary_full = ms1_summary_full.reset_index()
        cols_to_keep = [
            "mf_id",
            "Molecular Formula",
            "Ion Type",
            "Calculated m/z",
            "m/z Error (ppm)",
            "m/z Error Score",
            "Is Isotopologue",
            "Isotopologue Similarity",
            "Confidence Score",
        ]
        ms1_summary = ms1_summary_full[cols_to_keep].copy()
        ms1_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
        ]
        ms1_summary["isotopologue_type"] = [
            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
        ]

        # Reorder columns; 'Molecular Formula', 'Ion Type', and 'Is Isotopologue'
        # are dropped here in favor of the derived ion_formula / isotopologue_type.
        ms1_summary = ms1_summary[
            [
                "mf_id",
                "ion_formula",
                "isotopologue_type",
                "Calculated m/z",
                "m/z Error (ppm)",
                "m/z Error Score",
                "Isotopologue Similarity",
                "Confidence Score",
            ]
        ]

        # Set the index to mf_id
        ms1_summary = ms1_summary.set_index("mf_id")

        return ms1_summary

    def summarize_ms2_report(self, ms2_annot_report):
        """
        Summarize the MS2 report.

        Notes
        -----
        Placeholder: the body is empty, so this currently returns None.

        Parameters
        ----------
        ms2_annot_report : DataFrame
            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.

        Returns
        -------
        None
        """

    def summarize_metabolomics_report(self, ms2_annot_report):
        """Summarize the MS2 hits for a metabolomics report

        Keeps, per (mf_id, ref_mol_id) pair, the single hit with the highest
        entropy similarity, and records how many query spectra contributed.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized metabolomics report.
        """
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "cas",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        # Drop only the columns that are actually present.
        ms2_annot = ms2_annot_report.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
        )

        # Prepare information about the search results, pulling out the best hit for the single report
        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
        ms2_annot = ms2_annot.reset_index()
        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
        ms2_annot["n_spectra_contributing"] = (
            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
            .transform("nunique")
        )
        # Sort by entropy similarity (descending within each pair) so that
        # drop_duplicates keeps the best-scoring hit.
        ms2_annot = ms2_annot.sort_values(
            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
        )
        best_entropy = ms2_annot.drop_duplicates(
            subset=["mf_id", "ref_mol_id"], keep="first"
        )

        return best_entropy

    def clean_ms2_report(self, metabolite_summary):
        """Clean the MS2 report.

        Derives the ion formula, reorders columns to the report layout, and
        re-indexes by mf_id.

        Parameters
        ----------
        metabolite_summary : DataFrame
            The full metabolomics summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned metabolomics summary DataFrame, indexed by mf_id.
        """
        metabolite_summary = metabolite_summary.reset_index()
        metabolite_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
        ]

        col_order = [
            "mf_id",
            "ion_formula",
            "ref_ion_type",
            "formula",
            "inchikey",
            "name",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "cas",
            "database_name",
            "ref_ms_id",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "n_spectra_contributing",
        ]

        # Reorder columns (only those present in the input survive)
        metabolite_summary = metabolite_summary[
            [col for col in col_order if col in metabolite_summary.columns]
        ]

        # Convert chebi (if present) to int:
        metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
            "Int64", errors="ignore"
        ) if "chebi" in metabolite_summary.columns else metabolite_summary.get("chebi")
        if "chebi" in metabolite_summary.columns:
            pass  # conversion applied above

        # Set the index to mf_id
        metabolite_summary = metabolite_summary.set_index("mf_id")

        return metabolite_summary

    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
        """Combine the mass feature report with the MS1 and MS2 reports.

        Parameters
        ----------
        mf_report : DataFrame
            The mass feature report DataFrame.
        ms1_annot_report : DataFrame
            The MS1 annotation report DataFrame.
        ms2_annot_report : DataFrame
            The MS2 annotation report DataFrame.
1614 """ 1615 # If there is an ms1_annot_report, merge it with the mf_report 1616 if not ms1_annot_report.empty: 1617 # MS1 has been run and has molecular formula information 1618 mf_report = pd.merge( 1619 mf_report, 1620 ms1_annot_report, 1621 how="left", 1622 on=["mf_id", "isotopologue_type"], 1623 ) 1624 if ms2_annot_report is not None: 1625 # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly) 1626 mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()] 1627 mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"]) 1628 mf_no_ion_formula = pd.merge( 1629 mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"] 1630 ) 1631 1632 # pull out the records with ion_formula 1633 mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()] 1634 mf_with_ion_formula = pd.merge( 1635 mf_with_ion_formula, 1636 ms2_annot_report, 1637 how="left", 1638 on=["mf_id", "ion_formula"], 1639 ) 1640 1641 # put back together 1642 mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula]) 1643 1644 # Rename colums 1645 rename_dict = { 1646 "mf_id": "Mass Feature ID", 1647 "scan_time": "Retention Time (min)", 1648 "mz": "m/z", 1649 "apex_scan": "Apex Scan Number", 1650 "intensity": "Intensity", 1651 "persistence": "Persistence", 1652 "area": "Area", 1653 "half_height_width": "Half Height Width (min)", 1654 "tailing_factor": "Tailing Factor", 1655 "dispersity_index": "Dispersity Index", 1656 "ms2_spectrum": "MS2 Spectrum", 1657 "monoisotopic_mf_id": "Monoisotopic Mass Feature ID", 1658 "isotopologue_type": "Isotopologue Type", 1659 "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution", 1660 "associated_mass_features": "Associated Mass Features after Deconvolution", 1661 "ion_formula": "Ion Formula", 1662 "formula": "Molecular Formula", 1663 "ref_ion_type": "Ion Type", 1664 "annot_level": "Lipid Annotation Level", 1665 
"lipid_molecular_species_id": "Lipid Molecular Species", 1666 "lipid_summed_name": "Lipid Species", 1667 "lipid_subclass": "Lipid Subclass", 1668 "lipid_class": "Lipid Class", 1669 "lipid_category": "Lipid Category", 1670 "entropy_similarity": "Entropy Similarity", 1671 "ref_mz_in_query_fract": "Library mzs in Query (fraction)", 1672 "n_spectra_contributing": "Spectra with Annotation (n)", 1673 } 1674 mf_report = mf_report.rename(columns=rename_dict) 1675 mf_report["Sample Name"] = self.mass_spectra.sample_name 1676 mf_report["Polarity"] = self.mass_spectra.polarity 1677 mf_report = mf_report[ 1678 ["Mass Feature ID", "Sample Name", "Polarity"] 1679 + [ 1680 col 1681 for col in mf_report.columns 1682 if col not in ["Mass Feature ID", "Sample Name", "Polarity"] 1683 ] 1684 ] 1685 1686 # Reorder rows by "Mass Feature ID", then "Entropy Similarity" (descending), then "Confidence Score" (descending) 1687 if "Entropy Similarity" in mf_report.columns: 1688 mf_report = mf_report.sort_values( 1689 by=["Mass Feature ID", "Entropy Similarity", "Confidence Score"], 1690 ascending=[True, False, False], 1691 ) 1692 elif "Confidence Score" in mf_report.columns: 1693 mf_report = mf_report.sort_values( 1694 by=["Mass Feature ID", "Confidence Score"], 1695 ascending=[True, False], 1696 ) 1697 # If neither "Entropy Similarity" nor "Confidence Score" are in the columns, just sort by "Mass Feature ID" 1698 else: 1699 mf_report = mf_report.sort_values("Mass Feature ID") 1700 1701 # Reset index 1702 mf_report = mf_report.reset_index(drop=True) 1703 1704 return mf_report 1705 1706 def to_report(self, molecular_metadata=None): 1707 """Create a report of the mass features and their annotations. 1708 1709 Parameters 1710 ---------- 1711 molecular_metadata : dict, optional 1712 The molecular metadata. Default is None. 1713 1714 Returns 1715 ------- 1716 DataFrame 1717 The report as a Pandas DataFrame. 
1718 """ 1719 # Get mass feature dataframe 1720 mf_report = self.mass_spectra.mass_features_to_df() 1721 mf_report = mf_report.reset_index(drop=False) 1722 1723 # Get and clean ms1 annotation dataframe 1724 ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy() 1725 ms1_annot_report = self.clean_ms1_report(ms1_annot_report) 1726 ms1_annot_report = ms1_annot_report.reset_index(drop=False) 1727 1728 # Get, summarize, and clean ms2 annotation dataframe 1729 ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df( 1730 molecular_metadata=molecular_metadata 1731 ) 1732 if ms2_annot_report is not None and molecular_metadata is not None: 1733 ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report) 1734 ms2_annot_report = self.clean_ms2_report(ms2_annot_report) 1735 ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all") 1736 ms2_annot_report = ms2_annot_report.reset_index(drop=False) 1737 else: 1738 ms2_annot_report = None 1739 1740 report = self.combine_reports( 1741 mf_report=mf_report, 1742 ms1_annot_report=ms1_annot_report, 1743 ms2_annot_report=ms2_annot_report 1744 ) 1745 1746 return report
A class to export LCMS metabolite data.
This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.
Parameters
- out_file_path (str | Path): The output file path, do not include the file extension.
- mass_spectra (object): The high resolution mass spectra object.
def __init__(self, out_file_path, mass_spectra):
    """Initialize the metabolomics exporter.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """
    super().__init__(out_file_path, mass_spectra)
    # Expose the module-level adduct lookup on the instance for convenience.
    self.ion_type_dict = ion_type_dict
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
1325 @staticmethod 1326 def get_ion_formula(neutral_formula, ion_type): 1327 """From a neutral formula and an ion type, return the formula of the ion. 1328 1329 Notes 1330 ----- 1331 This is a static method. 1332 If the neutral_formula is not a string, this method will return None. 1333 1334 Parameters 1335 ---------- 1336 neutral_formula : str 1337 The neutral formula, this should be a string form from the MolecularFormula class 1338 (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). 1339 In the case of a simple string, the atoms are parsed based on the presence of capital letters, 1340 e.g. MgCl2 is parsed as 'Mg Cl2. 1341 ion_type : str 1342 The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. 1343 See the self.ion_type_dict for the available ion types. 1344 1345 Returns 1346 ------- 1347 str 1348 The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string. 1349 """ 1350 # If neutral_formula is not a string, return None 1351 if not isinstance(neutral_formula, str): 1352 return None 1353 1354 # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class) 1355 if re.search(r"\s", neutral_formula): 1356 neutral_formula = MolecularFormula(neutral_formula, ion_charge=0) 1357 else: 1358 form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:] 1359 elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()] 1360 counts = [re.findall(r"\d+", x) for x in form_pre.split()] 1361 neutral_formula = MolecularFormula( 1362 dict( 1363 zip( 1364 [x[0] for x in elements], 1365 [int(x[0]) if x else 1 for x in counts], 1366 ) 1367 ), 1368 ion_charge=0, 1369 ) 1370 neutral_formula_dict = neutral_formula.to_dict().copy() 1371 1372 adduct_add_dict = ion_type_dict[ion_type][0] 1373 for key in adduct_add_dict: 1374 if key in neutral_formula_dict.keys(): 1375 neutral_formula_dict[key] += 
adduct_add_dict[key] 1376 else: 1377 neutral_formula_dict[key] = adduct_add_dict[key] 1378 1379 adduct_subtract = ion_type_dict[ion_type][1] 1380 for key in adduct_subtract: 1381 neutral_formula_dict[key] -= adduct_subtract[key] 1382 1383 return MolecularFormula(neutral_formula_dict, ion_charge=0).string
From a neutral formula and an ion type, return the formula of the ion.
Notes
This is a static method. If the neutral_formula is not a string, this method will return None.
Parameters
- neutral_formula (str): The neutral formula, this should be a string form from the MolecularFormula class (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). In the case of a simple string, the atoms are parsed based on the presence of capital letters, e.g. MgCl2 is parsed as 'Mg Cl2'.
- ion_type (str): The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. See the self.ion_type_dict for the available ion types.
Returns
- str: The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1385 @staticmethod 1386 def get_isotope_type(ion_formula): 1387 """From an ion formula, return the 13C isotope type of the ion. 1388 1389 Notes 1390 ----- 1391 This is a static method. 1392 If the ion_formula is not a string, this method will return None. 1393 This is currently only functional for 13C isotopes. 1394 1395 Parameters 1396 ---------- 1397 ion_formula : str 1398 The formula of the ion, expected to be a string like 'C2 H4 O2'. 1399 1400 Returns 1401 ------- 1402 str 1403 The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope. 1404 1405 Raises 1406 ------ 1407 ValueError 1408 If the ion_formula is not a string. 1409 """ 1410 if not isinstance(ion_formula, str): 1411 return None 1412 1413 if re.search(r"\s", ion_formula): 1414 ion_formula = MolecularFormula(ion_formula, ion_charge=0) 1415 else: 1416 raise ValueError('ion_formula should be a string like "C2 H4 O2"') 1417 ion_formula_dict = ion_formula.to_dict().copy() 1418 1419 try: 1420 iso_class = "13C" + str(ion_formula_dict.pop("13C")) 1421 except KeyError: 1422 iso_class = None 1423 1424 return iso_class
From an ion formula, return the 13C isotope type of the ion.
Notes
This is a static method. If the ion_formula is not a string, this method will return None. This is currently only functional for 13C isotopes.
Parameters
- ion_formula (str): The formula of the ion, expected to be a string like 'C2 H4 O2'.
Returns
- str: The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
Raises
- ValueError: If the ion_formula is a string without spaces, i.e. not in the 'C2 H4 O2' format produced by the MolecularFormula class (non-string input returns None instead of raising).
def report_to_csv(self, molecular_metadata=None):
    """Create a report of the mass features and their annotations and save it as a CSV file.

    Parameters
    ----------
    molecular_metadata : dict, optional
        The molecular metadata. Default is None.
    """
    # Build the combined report, then write it next to the configured
    # output path with a .csv extension.
    csv_path = self.output_file.with_suffix(".csv")
    report = self.to_report(molecular_metadata=molecular_metadata)
    report.to_csv(csv_path, index=False)
Create a report of the mass features and their annotations and save it as a CSV file.
Parameters
- molecular_metadata (dict, optional): The molecular metadata. Default is None.
def clean_ms1_report(self, ms1_summary_full):
    """Clean the MS1 report.

    Parameters
    ----------
    ms1_summary_full : DataFrame
        The full MS1 summary DataFrame.

    Returns
    -------
    DataFrame
        The cleaned MS1 summary DataFrame, indexed by mf_id.
    """
    # Flatten the index and keep only the columns the report needs.
    summary = ms1_summary_full.reset_index()[
        [
            "mf_id",
            "Molecular Formula",
            "Ion Type",
            "Calculated m/z",
            "m/z Error (ppm)",
            "m/z Error Score",
            "Is Isotopologue",
            "Isotopologue Similarity",
            "Confidence Score",
        ]
    ].copy()

    # Derive the ion formula from the neutral formula + adduct, then the
    # 13C isotopologue label from that ion formula.
    summary["ion_formula"] = [
        self.get_ion_formula(formula, adduct)
        for formula, adduct in zip(summary["Molecular Formula"], summary["Ion Type"])
    ]
    summary["isotopologue_type"] = [
        self.get_isotope_type(formula) for formula in summary["ion_formula"].tolist()
    ]

    # Final column order for the report, keyed by mf_id.
    final_cols = [
        "mf_id",
        "ion_formula",
        "isotopologue_type",
        "Calculated m/z",
        "m/z Error (ppm)",
        "m/z Error Score",
        "Isotopologue Similarity",
        "Confidence Score",
    ]
    return summary[final_cols].set_index("mf_id")
Clean the MS1 report.
Parameters
- ms1_summary_full (DataFrame): The full MS1 summary DataFrame.
Returns
- DataFrame: The cleaned MS1 summary DataFrame.
def summarize_ms2_report(self, ms2_annot_report):
    """Summarize the MS2 report.

    Notes
    -----
    Unimplemented placeholder: the body performs no work and the method
    implicitly returns None. See summarize_metabolomics_report for the
    implemented metabolomics summarization.

    Parameters
    ----------
    ms2_annot_report : DataFrame
        The MS2 annotation DataFrame with all annotations, output of
        mass_features_ms2_annot_to_df.

    Returns
    -------
    None
        This placeholder does not return a value.
    """
Summarize the MS2 report.
Parameters
- ms2_annot_report (DataFrame): The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
Returns
- None: this method is an unimplemented placeholder and returns None.
def summarize_metabolomics_report(self, ms2_annot_report):
    """Summarize the MS2 hits for a metabolomics report.

    For each (mf_id, ref_mol_id) pair, keep only the single best hit
    (highest entropy similarity) and record how many distinct query
    spectra contributed to that annotation.

    Parameters
    ----------
    ms2_annot_report : DataFrame
        The MS2 annotation DataFrame with all annotations.

    Returns
    -------
    DataFrame
        The summarized metabolomics report.
    """
    # Strip identifier/metadata columns that are not needed in the summary.
    drop_candidates = (
        "precursor_mz",
        "precursor_mz_error_ppm",
        "cas",
        "data_id",
        "iupac_name",
        "traditional_name",
        "common_name",
        "casno",
    )
    summary = ms2_annot_report.drop(
        columns=[c for c in drop_candidates if c in ms2_annot_report.columns]
    ).reset_index()

    # Count distinct query spectra backing each (mf_id, ref_mol_id) pair.
    summary["n_spectra_contributing"] = summary.groupby(["mf_id", "ref_mol_id"])[
        "query_spectrum_id"
    ].transform("nunique")

    # After sorting descending on entropy similarity within each pair, the
    # first row per pair is the best hit.
    summary = summary.sort_values(
        by=["mf_id", "ref_mol_id", "entropy_similarity"],
        ascending=[True, True, False],
    )
    return summary.drop_duplicates(subset=["mf_id", "ref_mol_id"], keep="first")
Summarize the MS2 hits for a metabolomics report
Parameters
- ms2_annot_report (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
- DataFrame: The summarized metabolomics report.
def clean_ms2_report(self, metabolite_summary):
    """Clean the MS2 report.

    Parameters
    ----------
    metabolite_summary : DataFrame
        The full metabolomics summary DataFrame.

    Returns
    -------
    DataFrame
        The cleaned metabolomics summary DataFrame, indexed by mf_id.
    """
    metabolite_summary = metabolite_summary.reset_index()

    # Derive the ion formula from the neutral formula + reference adduct.
    metabolite_summary["ion_formula"] = [
        self.get_ion_formula(f, a)
        for f, a in zip(
            metabolite_summary["formula"], metabolite_summary["ref_ion_type"]
        )
    ]

    col_order = [
        "mf_id",
        "ion_formula",
        "ref_ion_type",
        "formula",
        "inchikey",
        "name",
        "inchi",
        "chebi",
        "smiles",
        "kegg",
        "cas",
        "database_name",
        "ref_ms_id",
        "entropy_similarity",
        "ref_mz_in_query_fract",
        "n_spectra_contributing",
    ]

    # Reorder columns, keeping only those actually present.
    metabolite_summary = metabolite_summary[
        [col for col in col_order if col in metabolite_summary.columns]
    ]

    # Convert chebi (if present) to a nullable integer column.
    # NOTE: astype(..., errors="ignore") is deprecated in pandas >= 2.2
    # (and removed later); replicate its best-effort behavior explicitly.
    if "chebi" in metabolite_summary.columns:
        try:
            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype("Int64")
        except (TypeError, ValueError):
            # Leave the column unchanged, exactly as errors="ignore" did.
            pass

    # Set the index to mf_id
    return metabolite_summary.set_index("mf_id")
Clean the MS2 report.
Parameters
- metabolite_summary (DataFrame): The full metabolomics summary DataFrame.
Returns
- DataFrame: The cleaned metabolomics summary DataFrame.
def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
    """Combine the mass feature report with the MS1 and MS2 reports.

    Parameters
    ----------
    mf_report : DataFrame
        The mass feature report DataFrame.
    ms1_annot_report : DataFrame
        The MS1 annotation report DataFrame.
    ms2_annot_report : DataFrame or None
        The MS2 annotation report DataFrame, or None when no MS2
        annotations are available.

    Returns
    -------
    DataFrame
        The combined, renamed, and sorted report.
    """
    # If there is an ms1_annot_report, merge it with the mf_report
    if not ms1_annot_report.empty:
        # MS1 has been run and has molecular formula information
        mf_report = pd.merge(
            mf_report,
            ms1_annot_report,
            how="left",
            on=["mf_id", "isotopologue_type"],
        )
    if ms2_annot_report is not None:
        # Records WITHOUT an ion_formula (these should be empty if MS1
        # molecular formula assignment is working correctly) are merged
        # with the MS2 annotations on mf_id only.
        mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
        mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
        mf_no_ion_formula = pd.merge(
            mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
        )

        # Records WITH an ion_formula are merged on (mf_id, ion_formula).
        mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
        mf_with_ion_formula = pd.merge(
            mf_with_ion_formula,
            ms2_annot_report,
            how="left",
            on=["mf_id", "ion_formula"],
        )

        # put back together
        mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])

    # Rename columns to their human-readable report names
    rename_dict = {
        "mf_id": "Mass Feature ID",
        "scan_time": "Retention Time (min)",
        "mz": "m/z",
        "apex_scan": "Apex Scan Number",
        "intensity": "Intensity",
        "persistence": "Persistence",
        "area": "Area",
        "half_height_width": "Half Height Width (min)",
        "tailing_factor": "Tailing Factor",
        "dispersity_index": "Dispersity Index",
        "ms2_spectrum": "MS2 Spectrum",
        "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
        "isotopologue_type": "Isotopologue Type",
        "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
        "associated_mass_features": "Associated Mass Features after Deconvolution",
        "ion_formula": "Ion Formula",
        "formula": "Molecular Formula",
        "ref_ion_type": "Ion Type",
        "annot_level": "Lipid Annotation Level",
        "lipid_molecular_species_id": "Lipid Molecular Species",
        "lipid_summed_name": "Lipid Species",
        "lipid_subclass": "Lipid Subclass",
        "lipid_class": "Lipid Class",
        "lipid_category": "Lipid Category",
        "entropy_similarity": "Entropy Similarity",
        "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
        "n_spectra_contributing": "Spectra with Annotation (n)",
    }
    mf_report = mf_report.rename(columns=rename_dict)
    mf_report["Sample Name"] = self.mass_spectra.sample_name
    mf_report["Polarity"] = self.mass_spectra.polarity
    mf_report = mf_report[
        ["Mass Feature ID", "Sample Name", "Polarity"]
        + [
            col
            for col in mf_report.columns
            if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
        ]
    ]

    # Sort rows by "Mass Feature ID", then by whichever score columns are
    # present (descending). Building the column list dynamically fixes a
    # KeyError in the original code when "Entropy Similarity" was present
    # but "Confidence Score" was not (e.g. MS2 annotations without MS1).
    sort_cols = ["Mass Feature ID"] + [
        col
        for col in ("Entropy Similarity", "Confidence Score")
        if col in mf_report.columns
    ]
    mf_report = mf_report.sort_values(
        by=sort_cols, ascending=[True] + [False] * (len(sort_cols) - 1)
    )

    # Reset index
    return mf_report.reset_index(drop=True)
Combine the mass feature report with the MS1 and MS2 reports.
Parameters
- mf_report (DataFrame): The mass feature report DataFrame.
- ms1_annot_report (DataFrame): The MS1 annotation report DataFrame.
- ms2_annot_report (DataFrame): The MS2 annotation report DataFrame.
def to_report(self, molecular_metadata=None):
    """Create a report of the mass features and their annotations.

    Parameters
    ----------
    molecular_metadata : dict, optional
        The molecular metadata. Default is None.

    Returns
    -------
    DataFrame
        The report as a Pandas DataFrame.
    """
    # The mass feature table forms the backbone of the report.
    mf_report = self.mass_spectra.mass_features_to_df().reset_index(drop=False)

    # MS1 annotations: clean, then flatten the mf_id index back to a column.
    ms1_annot_report = self.clean_ms1_report(
        self.mass_spectra.mass_features_ms1_annot_to_df().copy()
    ).reset_index(drop=False)

    # MS2 annotations are only usable when molecular metadata is supplied.
    ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
        molecular_metadata=molecular_metadata
    )
    if ms2_annot_report is None or molecular_metadata is None:
        ms2_annot_report = None
    else:
        ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
        ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
        # Drop all-empty columns before flattening the index back out.
        ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all").reset_index(
            drop=False
        )

    return self.combine_reports(
        mf_report=mf_report,
        ms1_annot_report=ms1_annot_report,
        ms2_annot_report=ms2_annot_report,
    )
Create a report of the mass features and their annotations.
Parameters
- molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
- DataFrame: The report as a Pandas DataFrame.
Inherited Members
- HighResMassSpectraExport
- dir_loc
- output_file
- mass_spectra
- atoms_order_list
- get_pandas_df
- to_pandas
- to_excel
- to_csv
- get_mass_spectra_attrs
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
1747class LipidomicsExport(LCMSMetabolomicsExport): 1748 """A class to export lipidomics data. 1749 1750 This class provides methods to export lipidomics data to various formats and summarize the lipid report. 1751 1752 Parameters 1753 ---------- 1754 out_file_path : str | Path 1755 The output file path, do not include the file extension. 1756 mass_spectra : object 1757 The high resolution mass spectra object. 1758 """ 1759 1760 def __init__(self, out_file_path, mass_spectra): 1761 super().__init__(out_file_path, mass_spectra) 1762 1763 def summarize_lipid_report(self, ms2_annot): 1764 """Summarize the lipid report. 1765 1766 Parameters 1767 ---------- 1768 ms2_annot : DataFrame 1769 The MS2 annotation DataFrame with all annotations. 1770 1771 Returns 1772 ------- 1773 DataFrame 1774 The summarized lipid report. 1775 """ 1776 # Drop unnecessary columns for easier viewing 1777 columns_to_drop = [ 1778 "precursor_mz", 1779 "precursor_mz_error_ppm", 1780 "metabref_mol_id", 1781 "metabref_precursor_mz", 1782 "cas", 1783 "inchikey", 1784 "inchi", 1785 "chebi", 1786 "smiles", 1787 "kegg", 1788 "data_id", 1789 "iupac_name", 1790 "traditional_name", 1791 "common_name", 1792 "casno", 1793 ] 1794 ms2_annot = ms2_annot.drop( 1795 columns=[col for col in columns_to_drop if col in ms2_annot.columns] 1796 ) 1797 1798 # If ion_types_excluded is not empty, remove those ion types 1799 ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[ 1800 "ms2" 1801 ].molecular_search.ion_types_excluded 1802 if len(ion_types_excluded) > 0: 1803 ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)] 1804 1805 # If mf_id is not present, check that the index name is mf_id and reset the index 1806 if "mf_id" not in ms2_annot.columns: 1807 if ms2_annot.index.name == "mf_id": 1808 ms2_annot = ms2_annot.reset_index() 1809 else: 1810 raise ValueError("mf_id is not present in the dataframe") 1811 1812 # Attempt to get consensus annotations to the MLF level 1813 
mlf_results_all = [] 1814 for mf_id in ms2_annot["mf_id"].unique(): 1815 mlf_results_perid = [] 1816 ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy() 1817 ms2_annot_mf["n_spectra_contributing"] = ms2_annot_mf.query_spectrum_id.nunique() 1818 1819 for query_scan in ms2_annot["query_spectrum_id"].unique(): 1820 ms2_annot_sub = ms2_annot_mf[ 1821 ms2_annot_mf["query_spectrum_id"] == query_scan 1822 ].copy() 1823 1824 if ms2_annot_sub["lipid_summed_name"].nunique() == 1: 1825 # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation 1826 if ms2_annot_sub["lipid_summed_name"].nunique() == 1: 1827 ms2_annot_sub["entropy_max"] = ( 1828 ms2_annot_sub["entropy_similarity"] 1829 == ms2_annot_sub["entropy_similarity"].max() 1830 ) 1831 ms2_annot_sub["ref_match_fract_max"] = ( 1832 ms2_annot_sub["ref_mz_in_query_fract"] 1833 == ms2_annot_sub["ref_mz_in_query_fract"].max() 1834 ) 1835 ms2_annot_sub["frag_max"] = ms2_annot_sub[ 1836 "query_frag_types" 1837 ].apply(lambda x: True if "MLF" in x else False) 1838 1839 # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks) 1840 ms2_annot_sub["consensus"] = ms2_annot_sub[ 1841 ["entropy_max", "ref_match_fract_max", "frag_max"] 1842 ].all(axis=1) 1843 1844 # If there is a consensus, take the row with the highest entropy_similarity 1845 if ms2_annot_sub["consensus"].any(): 1846 ms2_annot_sub = ms2_annot_sub[ 1847 ms2_annot_sub["entropy_similarity"] 1848 == ms2_annot_sub["entropy_similarity"].max() 1849 ].head(1) 1850 mlf_results_perid.append(ms2_annot_sub) 1851 if len(mlf_results_perid) == 0: 1852 mlf_results_perid = pd.DataFrame() 1853 else: 1854 mlf_results_perid = pd.concat(mlf_results_perid) 1855 if mlf_results_perid["name"].nunique() == 1: 1856 mlf_results_perid = mlf_results_perid[ 1857 mlf_results_perid["entropy_similarity"] 1858 == mlf_results_perid["entropy_similarity"].max() 1859 ].head(1) 1860 else: 1861 mlf_results_perid = 
pd.DataFrame() 1862 mlf_results_all.append(mlf_results_perid) 1863 1864 # These are the consensus annotations to the MLF level 1865 if len(mlf_results_all) > 0: 1866 mlf_results_all = pd.concat(mlf_results_all) 1867 mlf_results_all["annot_level"] = mlf_results_all["structure_level"] 1868 else: 1869 # Make an empty dataframe 1870 mlf_results_all = ms2_annot.head(0) 1871 1872 # For remaining mf_ids, try to get a consensus annotation to the species level 1873 species_results_all = [] 1874 # Remove mf_ids that have consensus annotations to the MLF level 1875 ms2_annot_spec = ms2_annot[ 1876 ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique()) 1877 ] 1878 for mf_id in ms2_annot_spec["mf_id"].unique(): 1879 # Do all the hits have the same lipid_summed_name? 1880 ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy() 1881 ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub) 1882 1883 if ms2_annot_sub["lipid_summed_name"].nunique() == 1: 1884 # Grab the highest entropy_similarity result 1885 ms2_annot_sub = ms2_annot_sub[ 1886 ms2_annot_sub["entropy_similarity"] 1887 == ms2_annot_sub["entropy_similarity"].max() 1888 ].head(1) 1889 species_results_all.append(ms2_annot_sub) 1890 1891 # These are the consensus annotations to the species level 1892 if len(species_results_all) > 0: 1893 species_results_all = pd.concat(species_results_all) 1894 species_results_all["annot_level"] = "species" 1895 else: 1896 # Make an empty dataframe 1897 species_results_all = ms2_annot.head(0) 1898 1899 # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level 1900 # Remove mf_ids that have consensus annotations to the species level 1901 ms2_annot_remaining = ms2_annot_spec[ 1902 ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique()) 1903 ] 1904 no_consensus = [] 1905 for mf_id in ms2_annot_remaining["mf_id"].unique(): 1906 id_sub = [] 1907 id_no_con = [] 1908 ms2_annot_sub_mf = ms2_annot_remaining[ 1909 
ms2_annot_remaining["mf_id"] == mf_id 1910 ].copy() 1911 for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique(): 1912 ms2_annot_sub = ms2_annot_sub_mf[ 1913 ms2_annot_sub_mf["query_spectrum_id"] == query_scan 1914 ].copy() 1915 1916 # New columns for ranking [HIGHER RANK = BETTER] 1917 ms2_annot_sub["entropy_max"] = ( 1918 ms2_annot_sub["entropy_similarity"] 1919 == ms2_annot_sub["entropy_similarity"].max() 1920 ) 1921 ms2_annot_sub["ref_match_fract_max"] = ( 1922 ms2_annot_sub["ref_mz_in_query_fract"] 1923 == ms2_annot_sub["ref_mz_in_query_fract"].max() 1924 ) 1925 ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply( 1926 lambda x: True if "MLF" in x else False 1927 ) 1928 1929 # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks) 1930 ms2_annot_sub["consensus"] = ms2_annot_sub[ 1931 ["entropy_max", "ref_match_fract_max", "frag_max"] 1932 ].all(axis=1) 1933 ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]] 1934 id_sub.append(ms2_annot_sub_con) 1935 id_no_con.append(ms2_annot_sub) 1936 id_sub = pd.concat(id_sub) 1937 id_no_con = pd.concat(id_no_con) 1938 1939 # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level] 1940 if ( 1941 id_sub["query_frag_types"] 1942 .apply(lambda x: True if "MLF" in x else False) 1943 .all() 1944 and len(id_sub) > 0 1945 ): 1946 idx = id_sub.groupby("name")["entropy_similarity"].idxmax() 1947 id_sub = id_sub.loc[idx] 1948 # Reorder so highest entropy_similarity is first 1949 id_sub = id_sub.sort_values("entropy_similarity", ascending=False) 1950 id_sub["annot_level"] = id_sub["structure_level"] 1951 no_consensus.append(id_sub) 1952 1953 # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level 1954 elif len(id_sub) == 0: 1955 for lipid_summed_name in id_no_con["lipid_summed_name"].unique(): 1956 summed_sub = 
id_no_con[ 1957 id_no_con["lipid_summed_name"] == lipid_summed_name 1958 ] 1959 # Any consensus to MLF? 1960 if summed_sub["consensus"].any(): 1961 summed_sub = summed_sub[summed_sub["consensus"]] 1962 summed_sub["annot_level"] = summed_sub["structure_level"] 1963 no_consensus.append(summed_sub) 1964 else: 1965 # Grab the highest entropy_similarity, if there are multiple, grab the first one 1966 summed_sub = summed_sub[ 1967 summed_sub["entropy_similarity"] 1968 == summed_sub["entropy_similarity"].max() 1969 ].head(1) 1970 # get first row 1971 summed_sub["annot_level"] = "species" 1972 summed_sub["name"] = "" 1973 no_consensus.append(summed_sub) 1974 else: 1975 raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id) 1976 1977 if len(no_consensus) > 0: 1978 no_consensus = pd.concat(no_consensus) 1979 else: 1980 no_consensus = ms2_annot.head(0) 1981 1982 # Combine all the consensus annotations and reformat the dataframe for output 1983 species_results_all = species_results_all.drop(columns=["name"]) 1984 species_results_all["lipid_molecular_species_id"] = "" 1985 mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"] 1986 no_consensus["lipid_molecular_species_id"] = no_consensus["name"] 1987 consensus_annotations = pd.concat( 1988 [mlf_results_all, species_results_all, no_consensus] 1989 ) 1990 consensus_annotations = consensus_annotations.sort_values( 1991 "mf_id", ascending=True 1992 ) 1993 cols_to_keep = [ 1994 "mf_id", 1995 "ref_ion_type", 1996 "entropy_similarity", 1997 "ref_mz_in_query_fract", 1998 "lipid_molecular_species_id", 1999 "lipid_summed_name", 2000 "lipid_subclass", 2001 "lipid_class", 2002 "lipid_category", 2003 "formula", 2004 "annot_level", 2005 "n_spectra_contributing", 2006 ] 2007 consensus_annotations = consensus_annotations[cols_to_keep] 2008 consensus_annotations = consensus_annotations.set_index("mf_id") 2009 2010 return consensus_annotations 2011 2012 def clean_ms2_report(self, lipid_summary): 2013 """Clean 
the MS2 report. 2014 2015 Parameters 2016 ---------- 2017 lipid_summary : DataFrame 2018 The full lipid summary DataFrame. 2019 2020 Returns 2021 ------- 2022 DataFrame 2023 The cleaned lipid summary DataFrame. 2024 """ 2025 lipid_summary = lipid_summary.reset_index() 2026 lipid_summary["ion_formula"] = [ 2027 self.get_ion_formula(f, a) 2028 for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"]) 2029 ] 2030 2031 # Reorder columns 2032 lipid_summary = lipid_summary[ 2033 [ 2034 "mf_id", 2035 "ion_formula", 2036 "ref_ion_type", 2037 "formula", 2038 "annot_level", 2039 "lipid_molecular_species_id", 2040 "lipid_summed_name", 2041 "lipid_subclass", 2042 "lipid_class", 2043 "lipid_category", 2044 "entropy_similarity", 2045 "ref_mz_in_query_fract", 2046 "n_spectra_contributing", 2047 ] 2048 ] 2049 2050 # Set the index to mf_id 2051 lipid_summary = lipid_summary.set_index("mf_id") 2052 2053 return lipid_summary 2054 2055 def to_report(self, molecular_metadata=None): 2056 """Create a report of the mass features and their annotations. 2057 2058 Parameters 2059 ---------- 2060 molecular_metadata : dict, optional 2061 The molecular metadata. Default is None. 2062 2063 Returns 2064 ------- 2065 DataFrame 2066 The report of the mass features and their annotations. 2067 2068 Notes 2069 ----- 2070 The report will contain the mass features and their annotations from MS1 and MS2 (if available). 
2071 """ 2072 # Get mass feature dataframe 2073 mf_report = self.mass_spectra.mass_features_to_df() 2074 mf_report = mf_report.reset_index(drop=False) 2075 2076 # Get and clean ms1 annotation dataframe 2077 ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy() 2078 ms1_annot_report = self.clean_ms1_report(ms1_annot_report) 2079 ms1_annot_report = ms1_annot_report.reset_index(drop=False) 2080 2081 # Get, summarize, and clean ms2 annotation dataframe 2082 ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df( 2083 molecular_metadata=molecular_metadata 2084 ) 2085 if ms2_annot_report is not None and molecular_metadata is not None: 2086 ms2_annot_report = self.summarize_lipid_report(ms2_annot_report) 2087 ms2_annot_report = self.clean_ms2_report(ms2_annot_report) 2088 ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all") 2089 ms2_annot_report = ms2_annot_report.reset_index(drop=False) 2090 report = self.combine_reports( 2091 mf_report=mf_report, 2092 ms1_annot_report=ms1_annot_report, 2093 ms2_annot_report=ms2_annot_report 2094 ) 2095 return report
A class to export lipidomics data.
This class provides methods to export lipidomics data to various formats and summarize the lipid report.
Parameters
- out_file_path (str | Path): The output file path; do not include the file extension.
- mass_spectra (object): The high resolution mass spectra object.
def __init__(self, out_file_path, mass_spectra):
    """Initialize the lipidomics exporter.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, without the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """
    # Delegate storage of the output path and spectra object to the
    # parent exporter (HighResMassSpectraExport per the inherited-members
    # listing); no additional state is kept here.
    super().__init__(out_file_path, mass_spectra)
Note (review): the constructor documentation below is inherited from `threading.Thread` (listed under Inherited Members via `corems.mass_spectrum.output.export.HighResMassSpecExport`) and describes the base thread constructor, not the `out_file_path` / `mass_spectra` parameters above.
This constructor should always be called with keyword arguments. Arguments are:
group should be None; reserved for future extension when a ThreadGroup class is implemented.
target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.
name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.
args is the argument tuple for the target invocation. Defaults to ().
kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.
If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.
def summarize_lipid_report(self, ms2_annot):
    """Summarize the lipid report.

    Collapses all MS2 annotations per mass feature (``mf_id``) into a single
    consensus annotation, attempting, in order: (1) a consensus at the
    molecular lipid fragment (MLF) level, (2) a consensus at the species
    level (all hits share one summed name), and (3) per-scenario handling
    for features with conflicting annotations.

    Parameters
    ----------
    ms2_annot : DataFrame
        The MS2 annotation DataFrame with all annotations.

    Returns
    -------
    DataFrame
        The summarized lipid report, indexed by ``mf_id``.

    Raises
    ------
    ValueError
        If ``mf_id`` is neither a column nor the index name, or if a mass
        feature cannot be classified into any known consensus scenario.
    """
    # Drop unnecessary columns for easier viewing
    columns_to_drop = [
        "precursor_mz",
        "precursor_mz_error_ppm",
        "metabref_mol_id",
        "metabref_precursor_mz",
        "cas",
        "inchikey",
        "inchi",
        "chebi",
        "smiles",
        "kegg",
        "data_id",
        "iupac_name",
        "traditional_name",
        "common_name",
        "casno",
    ]
    ms2_annot = ms2_annot.drop(
        columns=[col for col in columns_to_drop if col in ms2_annot.columns]
    )

    # If ion_types_excluded is not empty, remove those ion types
    ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
        "ms2"
    ].molecular_search.ion_types_excluded
    if len(ion_types_excluded) > 0:
        ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

    # If mf_id is not a column, accept it as the index and reset it
    if "mf_id" not in ms2_annot.columns:
        if ms2_annot.index.name == "mf_id":
            ms2_annot = ms2_annot.reset_index()
        else:
            raise ValueError("mf_id is not present in the dataframe")

    # Attempt to get consensus annotations to the MLF level
    mlf_results_all = []
    for mf_id in ms2_annot["mf_id"].unique():
        mlf_results_perid = []
        ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
        ms2_annot_mf["n_spectra_contributing"] = (
            ms2_annot_mf.query_spectrum_id.nunique()
        )

        # BUGFIX: iterate only this feature's spectra; the original scanned
        # every query_spectrum_id in the whole table, producing an empty
        # subset for each spectrum belonging to unrelated features.
        for query_scan in ms2_annot_mf["query_spectrum_id"].unique():
            ms2_annot_sub = ms2_annot_mf[
                ms2_annot_mf["query_spectrum_id"] == query_scan
            ].copy()

            # Only attempt an MLF consensus when every hit in this scan
            # agrees on the summed lipid name.
            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # New columns for ranking [HIGHER RANK = BETTER]
                ms2_annot_sub["entropy_max"] = (
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                )
                ms2_annot_sub["ref_match_fract_max"] = (
                    ms2_annot_sub["ref_mz_in_query_fract"]
                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
                )
                ms2_annot_sub["frag_max"] = ms2_annot_sub[
                    "query_frag_types"
                ].apply(lambda x: "MLF" in x)

                # True when a single row is highest in all ranks at once
                ms2_annot_sub["consensus"] = ms2_annot_sub[
                    ["entropy_max", "ref_match_fract_max", "frag_max"]
                ].all(axis=1)

                # If there is a consensus, take the row with the highest
                # entropy_similarity
                if ms2_annot_sub["consensus"].any():
                    ms2_annot_sub = ms2_annot_sub[
                        ms2_annot_sub["entropy_similarity"]
                        == ms2_annot_sub["entropy_similarity"].max()
                    ].head(1)
                    mlf_results_perid.append(ms2_annot_sub)

        # BUGFIX: only record non-empty per-feature results. The original
        # appended empty pd.DataFrame() placeholders, so when no feature
        # reached an MLF consensus, pd.concat produced a column-less frame
        # and the 'structure_level' lookup below raised KeyError.
        if len(mlf_results_perid) > 0:
            mlf_results_perid = pd.concat(mlf_results_perid)
            # A single surviving molecular-species name means consensus
            if mlf_results_perid["name"].nunique() == 1:
                mlf_results_perid = mlf_results_perid[
                    mlf_results_perid["entropy_similarity"]
                    == mlf_results_perid["entropy_similarity"].max()
                ].head(1)
                mlf_results_all.append(mlf_results_perid)

    # These are the consensus annotations to the MLF level
    if len(mlf_results_all) > 0:
        mlf_results_all = pd.concat(mlf_results_all)
        mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
    else:
        # Make an empty dataframe that keeps the input's columns
        mlf_results_all = ms2_annot.head(0)

    # For remaining mf_ids, try a consensus annotation at the species level
    species_results_all = []
    # Remove mf_ids that already have MLF-level consensus annotations
    ms2_annot_spec = ms2_annot[
        ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
    ]
    for mf_id in ms2_annot_spec["mf_id"].unique():
        # Do all the hits have the same lipid_summed_name?
        ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
        ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

        if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
            # Grab the highest entropy_similarity result
            ms2_annot_sub = ms2_annot_sub[
                ms2_annot_sub["entropy_similarity"]
                == ms2_annot_sub["entropy_similarity"].max()
            ].head(1)
            species_results_all.append(ms2_annot_sub)

    # These are the consensus annotations to the species level
    if len(species_results_all) > 0:
        species_results_all = pd.concat(species_results_all)
        species_results_all["annot_level"] = "species"
    else:
        # Make an empty dataframe that keeps the input's columns
        species_results_all = ms2_annot.head(0)

    # Deal with the remaining mf_ids that have no consensus at either level
    ms2_annot_remaining = ms2_annot_spec[
        ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
    ]
    no_consensus = []
    for mf_id in ms2_annot_remaining["mf_id"].unique():
        id_sub = []
        id_no_con = []
        ms2_annot_sub_mf = ms2_annot_remaining[
            ms2_annot_remaining["mf_id"] == mf_id
        ].copy()
        for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
            ms2_annot_sub = ms2_annot_sub_mf[
                ms2_annot_sub_mf["query_spectrum_id"] == query_scan
            ].copy()

            # New columns for ranking [HIGHER RANK = BETTER]
            ms2_annot_sub["entropy_max"] = (
                ms2_annot_sub["entropy_similarity"]
                == ms2_annot_sub["entropy_similarity"].max()
            )
            ms2_annot_sub["ref_match_fract_max"] = (
                ms2_annot_sub["ref_mz_in_query_fract"]
                == ms2_annot_sub["ref_mz_in_query_fract"].max()
            )
            ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                lambda x: "MLF" in x
            )

            # True when a single row is highest in all ranks at once
            ms2_annot_sub["consensus"] = ms2_annot_sub[
                ["entropy_max", "ref_match_fract_max", "frag_max"]
            ].all(axis=1)
            id_sub.append(ms2_annot_sub[ms2_annot_sub["consensus"]])
            id_no_con.append(ms2_annot_sub)
        id_sub = pd.concat(id_sub)
        id_no_con = pd.concat(id_no_con)

        # Scenario 1: multiple scans resolve to different MLFs [possible
        # coelutions; keep all and annotate at the MLF structure level]
        if (
            id_sub["query_frag_types"].apply(lambda x: "MLF" in x).all()
            and len(id_sub) > 0
        ):
            idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
            id_sub = id_sub.loc[idx]
            # Reorder so highest entropy_similarity is first
            id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
            id_sub["annot_level"] = id_sub["structure_level"]
            no_consensus.append(id_sub)

        # Scenario 2: multiple scans resolve to different species; keep each
        # and annotate to the appropriate level
        elif len(id_sub) == 0:
            for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                summed_sub = id_no_con[
                    id_no_con["lipid_summed_name"] == lipid_summed_name
                ]
                # Any consensus to MLF?
                if summed_sub["consensus"].any():
                    summed_sub = summed_sub[summed_sub["consensus"]]
                    summed_sub["annot_level"] = summed_sub["structure_level"]
                    no_consensus.append(summed_sub)
                else:
                    # Grab the highest entropy_similarity; on ties the first
                    # row wins
                    summed_sub = summed_sub[
                        summed_sub["entropy_similarity"]
                        == summed_sub["entropy_similarity"].max()
                    ].head(1)
                    summed_sub["annot_level"] = "species"
                    summed_sub["name"] = ""
                    no_consensus.append(summed_sub)
        else:
            # BUGFIX: format a single message string (the original passed a
            # ("msg", mf_id) tuple to ValueError)
            raise ValueError(
                f"Unexpected scenario for summarizing mf_id: {mf_id}"
            )

    if len(no_consensus) > 0:
        no_consensus = pd.concat(no_consensus)
    else:
        no_consensus = ms2_annot.head(0)

    # Combine all the consensus annotations and reformat for output
    species_results_all = species_results_all.drop(columns=["name"])
    species_results_all["lipid_molecular_species_id"] = ""
    mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
    no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
    consensus_annotations = pd.concat(
        [mlf_results_all, species_results_all, no_consensus]
    )
    consensus_annotations = consensus_annotations.sort_values(
        "mf_id", ascending=True
    )
    cols_to_keep = [
        "mf_id",
        "ref_ion_type",
        "entropy_similarity",
        "ref_mz_in_query_fract",
        "lipid_molecular_species_id",
        "lipid_summed_name",
        "lipid_subclass",
        "lipid_class",
        "lipid_category",
        "formula",
        "annot_level",
        "n_spectra_contributing",
    ]
    consensus_annotations = consensus_annotations[cols_to_keep]
    consensus_annotations = consensus_annotations.set_index("mf_id")

    return consensus_annotations
Summarize the lipid report.
Parameters
- ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
- DataFrame: The summarized lipid report.
def clean_ms2_report(self, lipid_summary):
    """Clean the MS2 report.

    Adds the ion formula for each annotation, reorders the columns for
    readability, and re-keys the table on ``mf_id``.

    Parameters
    ----------
    lipid_summary : DataFrame
        The full lipid summary DataFrame.

    Returns
    -------
    DataFrame
        The cleaned lipid summary DataFrame.
    """
    column_order = [
        "mf_id",
        "ion_formula",
        "ref_ion_type",
        "formula",
        "annot_level",
        "lipid_molecular_species_id",
        "lipid_summed_name",
        "lipid_subclass",
        "lipid_class",
        "lipid_category",
        "entropy_similarity",
        "ref_mz_in_query_fract",
        "n_spectra_contributing",
    ]

    cleaned = lipid_summary.reset_index()
    # Derive the ion formula from the neutral formula and the adduct type
    ion_formulas = [
        self.get_ion_formula(formula, adduct)
        for formula, adduct in zip(cleaned["formula"], cleaned["ref_ion_type"])
    ]
    cleaned["ion_formula"] = ion_formulas

    # Reorder columns, then key the table on the mass feature id
    return cleaned[column_order].set_index("mf_id")
Clean the MS2 report.
Parameters
- lipid_summary (DataFrame): The full lipid summary DataFrame.
Returns
- DataFrame: The cleaned lipid summary DataFrame.
def to_report(self, molecular_metadata=None):
    """Create a report of the mass features and their annotations.

    Parameters
    ----------
    molecular_metadata : dict, optional
        The molecular metadata. Default is None.

    Returns
    -------
    DataFrame
        The report of the mass features and their annotations.

    Notes
    -----
    The report will contain the mass features and their annotations from
    MS1 and MS2 (if available).
    """
    # Mass feature table, with the index promoted to a column
    mf_report = self.mass_spectra.mass_features_to_df().reset_index(drop=False)

    # MS1 annotations: fetch, clean, and promote the index to a column
    ms1_annot_report = self.clean_ms1_report(
        self.mass_spectra.mass_features_ms1_annot_to_df().copy()
    ).reset_index(drop=False)

    # MS2 annotations: summarize and clean only when both the annotations
    # and the molecular metadata are available
    ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
        molecular_metadata=molecular_metadata
    )
    if ms2_annot_report is not None and molecular_metadata is not None:
        ms2_annot_report = self.clean_ms2_report(
            self.summarize_lipid_report(ms2_annot_report)
        )
        # Drop all-NaN columns before exposing the table in the report
        ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
        ms2_annot_report = ms2_annot_report.reset_index(drop=False)

    return self.combine_reports(
        mf_report=mf_report,
        ms1_annot_report=ms1_annot_report,
        ms2_annot_report=ms2_annot_report,
    )
Create a report of the mass features and their annotations.
Parameters
- molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
- DataFrame: The report of the mass features and their annotations.
Notes
The report will contain the mass features and their annotations from MS1 and MS2 (if available).
Inherited Members
- LCMSMetabolomicsExport
- ion_type_dict
- get_ion_formula
- get_isotope_type
- report_to_csv
- clean_ms1_report
- summarize_ms2_report
- summarize_metabolomics_report
- combine_reports
- HighResMassSpectraExport
- dir_loc
- output_file
- mass_spectra
- atoms_order_list
- get_pandas_df
- to_pandas
- to_excel
- to_csv
- get_mass_spectra_attrs
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id