# corems/mass_spectra/output/export.py
1__author__ = "Yuri E. Corilo" 2__date__ = "Dec 14, 2010" 3 4 5import csv 6import json 7import re 8import uuid 9import warnings 10from datetime import datetime, timezone 11from pathlib import Path 12 13import h5py 14import numpy as np 15import pandas as pd 16from openpyxl import load_workbook 17from pandas import DataFrame, ExcelWriter, read_excel 18 19from corems import __version__, corems_md5 20from corems.encapsulation.output import parameter_to_dict 21from corems.encapsulation.output.parameter_to_json import ( 22 dump_lcms_settings_json, 23 dump_lcms_settings_toml, 24) 25from corems.mass_spectrum.output.export import HighResMassSpecExport 26from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula 27from corems.molecular_id.calc.SpectralSimilarity import methods_name 28 29ion_type_dict = { 30 # adduct : [atoms to add, atoms to subtract when calculating formula of ion 31 "M+": [{}, {}], 32 "protonated": [{"H": 1}, {}], 33 "[M+H]+": [{"H": 1}, {}], 34 "[M+NH4]+": [{"N": 1, "H": 4}, {}], # ammonium 35 "[M+Na]+": [{"Na": 1}, {}], 36 "[M+K]+": [{"K": 1}, {}], 37 "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}], 38 "[M+2Na-H]+": [{"Na": 2}, {"H": 1}], 39 "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}], 40 "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}], 41 "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}], 42 "[M+H-H2O]+": [{}, {"H": 1, "O": 1}], 43 "de-protonated": [{}, {"H": 1}], 44 "[M-H]-": [{}, {"H": 1}], 45 "[M+Cl]-": [{"Cl": 1}, {}], 46 "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}], # formate 47 "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}], # acetate 48 "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}], 49 "[M+K-2H]-": [{"K": 1}, {"H": 2}], 50 "[M+Na-2H]-": [{"Na": 1}, {"H": 2}], 51} 52 53 54class LowResGCMSExport: 55 """A class to export low resolution GC-MS data. 56 57 This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame. 
58 59 Parameters: 60 ---------- 61 out_file_path : str 62 The output file path. 63 gcms : object 64 The low resolution GCMS object. 65 66 Attributes: 67 ---------- 68 output_file : Path 69 The output file path as a Path object. 70 gcms : object 71 The low resolution GCMS object. 72 73 Methods: 74 ------- 75 * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame. 76 * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string. 77 * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file. 78 * to_excel(write_mode='a', write_metadata=True, id_label="corems:"), 79 Export the data to an Excel file. 80 * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). 81 Export the data to a CSV file. 82 * to_hdf(id_label="corems:"). 83 Export the data to an HDF5 file. 84 * get_data_stats(gcms). 85 Get statistics about the GCMS data. 86 87 """ 88 89 def __init__(self, out_file_path, gcms): 90 self.output_file = Path(out_file_path) 91 92 self.gcms = gcms 93 94 self._init_columns() 95 96 def _init_columns(self): 97 """Initialize the column names for the exported data. 98 99 Returns: 100 ------- 101 list 102 The list of column names. 
103 """ 104 105 columns = [ 106 "Sample name", 107 "Peak Index", 108 "Retention Time", 109 "Retention Time Ref", 110 "Peak Height", 111 "Peak Area", 112 "Retention index", 113 "Retention index Ref", 114 "Retention Index Score", 115 "Similarity Score", 116 "Spectral Similarity Score", 117 "Compound Name", 118 "Chebi ID", 119 "Kegg Compound ID", 120 "Inchi", 121 "Inchi Key", 122 "Smiles", 123 "Molecular Formula", 124 "IUPAC Name", 125 "Traditional Name", 126 "Common Name", 127 "Derivatization", 128 ] 129 130 if self.gcms.molecular_search_settings.exploratory_mode: 131 columns.extend( 132 [ 133 "Weighted Cosine Correlation", 134 "Cosine Correlation", 135 "Stein Scott Similarity", 136 "Pearson Correlation", 137 "Spearman Correlation", 138 "Kendall Tau Correlation", 139 "Euclidean Distance", 140 "Manhattan Distance", 141 "Jaccard Distance", 142 "DWT Correlation", 143 "DFT Correlation", 144 ] 145 ) 146 147 columns.extend(list(methods_name.values())) 148 149 return columns 150 151 def get_pandas_df(self, id_label="corems:"): 152 """Get the exported data as a Pandas DataFrame. 153 154 Parameters: 155 ---------- 156 id_label : str, optional 157 The ID label for the data. Default is "corems:". 158 159 Returns: 160 ------- 161 DataFrame 162 The exported data as a Pandas DataFrame. 163 """ 164 165 columns = self._init_columns() 166 167 dict_data_list = self.get_list_dict_data(self.gcms) 168 169 df = DataFrame(dict_data_list, columns=columns) 170 171 df.name = self.gcms.sample_name 172 173 return df 174 175 def get_json(self, nan=False, id_label="corems:"): 176 """Get the exported data as a JSON string. 177 178 Parameters: 179 ---------- 180 nan : bool, optional 181 Whether to include NaN values in the JSON string. Default is False. 182 id_label : str, optional 183 The ID label for the data. Default is "corems:". 
184 185 """ 186 187 import json 188 189 dict_data_list = self.get_list_dict_data(self.gcms) 190 191 return json.dumps( 192 dict_data_list, sort_keys=False, indent=4, separators=(",", ": ") 193 ) 194 195 def to_pandas(self, write_metadata=True, id_label="corems:"): 196 """Export the data to a Pandas DataFrame and save it as a pickle file. 197 198 Parameters: 199 ---------- 200 write_metadata : bool, optional 201 Whether to write metadata to the output file. 202 id_label : str, optional 203 The ID label for the data. 204 """ 205 206 columns = self._init_columns() 207 208 dict_data_list = self.get_list_dict_data(self.gcms) 209 210 df = DataFrame(dict_data_list, columns=columns) 211 212 df.to_pickle(self.output_file.with_suffix(".pkl")) 213 214 if write_metadata: 215 self.write_settings( 216 self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:" 217 ) 218 219 def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"): 220 """Export the data to an Excel file. 221 222 Parameters: 223 ---------- 224 write_mode : str, optional 225 The write mode for the Excel file. Default is 'a' (append). 226 write_metadata : bool, optional 227 Whether to write metadata to the output file. Default is True. 228 id_label : str, optional 229 The ID label for the data. Default is "corems:". 
230 """ 231 232 out_put_path = self.output_file.with_suffix(".xlsx") 233 234 columns = self._init_columns() 235 236 dict_data_list = self.get_list_dict_data(self.gcms) 237 238 df = DataFrame(dict_data_list, columns=columns) 239 240 if write_mode == "a" and out_put_path.exists(): 241 writer = ExcelWriter(out_put_path, engine="openpyxl") 242 # try to open an existing workbook 243 writer.book = load_workbook(out_put_path) 244 # copy existing sheets 245 writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets) 246 # read existing file 247 reader = read_excel(out_put_path) 248 # write out the new sheet 249 df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1) 250 251 writer.close() 252 else: 253 df.to_excel( 254 self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl" 255 ) 256 257 if write_metadata: 258 self.write_settings(out_put_path, self.gcms, id_label=id_label) 259 260 def to_csv( 261 self, 262 separate_output=False, 263 write_mode="w", 264 write_metadata=True, 265 id_label="corems:", 266 ): 267 """Export the data to a CSV file. 268 269 Parameters: 270 ---------- 271 separate_output : bool, optional 272 Whether to separate the output into multiple files. Default is False. 273 write_mode : str, optional 274 The write mode for the CSV file. Default is 'w' (write). 275 write_metadata : bool, optional 276 Whether to write metadata to the output file. Default is True. 277 id_label : str, optional 278 The ID label for the data. Default is "corems:". 
279 """ 280 281 if separate_output: 282 # set write mode to write 283 # this mode will overwrite the file without warning 284 write_mode = "w" 285 else: 286 # set write mode to append 287 write_mode = "a" 288 289 columns = self._init_columns() 290 291 dict_data_list = self.get_list_dict_data(self.gcms) 292 293 out_put_path = self.output_file.with_suffix(".csv") 294 295 write_header = not out_put_path.exists() 296 297 try: 298 with open(out_put_path, write_mode, newline="") as csvfile: 299 writer = csv.DictWriter(csvfile, fieldnames=columns) 300 if write_header: 301 writer.writeheader() 302 for data in dict_data_list: 303 writer.writerow(data) 304 305 if write_metadata: 306 self.write_settings(out_put_path, self.gcms, id_label=id_label) 307 308 except IOError as ioerror: 309 print(ioerror) 310 311 def to_hdf(self, id_label="corems:"): 312 """Export the data to an HDF5 file. 313 314 Parameters: 315 ---------- 316 id_label : str, optional 317 The ID label for the data. Default is "corems:". 
318 """ 319 320 # save sample at a time 321 def add_compound(gc_peak, compound_obj): 322 modifier = compound_obj.classify if compound_obj.classify else "" 323 compound_group = compound_obj.name.replace("/", "") + " " + modifier 324 325 if compound_group not in peak_group: 326 compound_group = peak_group.create_group(compound_group) 327 328 # compound_group.attrs["retention_time"] = compound_obj.retention_time 329 compound_group.attrs["retention_index"] = compound_obj.ri 330 compound_group.attrs["retention_index_score"] = compound_obj.ri_score 331 compound_group.attrs["spectral_similarity_score"] = ( 332 compound_obj.spectral_similarity_score 333 ) 334 compound_group.attrs["similarity_score"] = compound_obj.similarity_score 335 336 compond_mz = compound_group.create_dataset( 337 "mz", data=np.array(compound_obj.mz), dtype="f8" 338 ) 339 compond_abundance = compound_group.create_dataset( 340 "abundance", data=np.array(compound_obj.abundance), dtype="f8" 341 ) 342 343 if self.gcms.molecular_search_settings.exploratory_mode: 344 compound_group.attrs["Spectral Similarities"] = json.dumps( 345 compound_obj.spectral_similarity_scores, 346 sort_keys=False, 347 indent=4, 348 separators=(",", ":"), 349 ) 350 else: 351 warnings.warn("Skipping duplicate reference compound.") 352 353 import json 354 from datetime import datetime, timezone 355 356 import h5py 357 import numpy as np 358 359 output_path = self.output_file.with_suffix(".hdf5") 360 361 with h5py.File(output_path, "w") as hdf_handle: 362 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 363 hdf_handle.attrs["time_stamp"] = timenow 364 hdf_handle.attrs["data_structure"] = "gcms" 365 hdf_handle.attrs["analyzer"] = self.gcms.analyzer 366 hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label 367 368 hdf_handle.attrs["sample_id"] = "self.gcms.id" 369 hdf_handle.attrs["sample_name"] = self.gcms.sample_name 370 hdf_handle.attrs["input_data"] = str(self.gcms.file_location) 371 
hdf_handle.attrs["output_data"] = str(output_path) 372 hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex 373 hdf_handle.attrs["corems_version"] = __version__ 374 375 hdf_handle.attrs["Stats"] = json.dumps( 376 self.get_data_stats(self.gcms), 377 sort_keys=False, 378 indent=4, 379 separators=(",", ": "), 380 ) 381 hdf_handle.attrs["Calibration"] = json.dumps( 382 self.get_calibration_stats(self.gcms, id_label), 383 sort_keys=False, 384 indent=4, 385 separators=(",", ": "), 386 ) 387 hdf_handle.attrs["Blank"] = json.dumps( 388 self.get_blank_stats(self.gcms), 389 sort_keys=False, 390 indent=4, 391 separators=(",", ": "), 392 ) 393 394 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms) 395 hdf_handle.attrs["CoreMSParameters"] = json.dumps( 396 corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ") 397 ) 398 399 scans_dataset = hdf_handle.create_dataset( 400 "scans", data=np.array(self.gcms.scans_number), dtype="f8" 401 ) 402 rt_dataset = hdf_handle.create_dataset( 403 "rt", data=np.array(self.gcms.retention_time), dtype="f8" 404 ) 405 tic_dataset = hdf_handle.create_dataset( 406 "tic", data=np.array(self.gcms.tic), dtype="f8" 407 ) 408 processed_tic_dataset = hdf_handle.create_dataset( 409 "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8" 410 ) 411 412 output_score_method = ( 413 self.gcms.molecular_search_settings.output_score_method 414 ) 415 416 for gc_peak in self.gcms: 417 # print(gc_peak.retention_time) 418 # print(gc_peak.tic) 419 420 # check if there is a compound candidate 421 peak_group = hdf_handle.create_group(str(gc_peak.retention_time)) 422 peak_group.attrs["deconvolution"] = int( 423 self.gcms.chromatogram_settings.use_deconvolution 424 ) 425 426 peak_group.attrs["start_scan"] = gc_peak.start_scan 427 peak_group.attrs["apex_scan"] = gc_peak.apex_scan 428 peak_group.attrs["final_scan"] = gc_peak.final_scan 429 430 peak_group.attrs["retention_index"] = gc_peak.ri 431 
peak_group.attrs["retention_time"] = gc_peak.retention_time 432 peak_group.attrs["area"] = gc_peak.area 433 434 mz = peak_group.create_dataset( 435 "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8" 436 ) 437 abundance = peak_group.create_dataset( 438 "abundance", 439 data=np.array(gc_peak.mass_spectrum.abundance), 440 dtype="f8", 441 ) 442 443 if gc_peak: 444 if output_score_method == "highest_sim_score": 445 compound_obj = gc_peak.highest_score_compound 446 add_compound(gc_peak, compound_obj) 447 448 elif output_score_method == "highest_ss": 449 compound_obj = gc_peak.highest_ss_compound 450 add_compound(gc_peak, compound_obj) 451 452 else: 453 for compound_obj in gc_peak: 454 add_compound(gc_peak, compound_obj) 455 456 def get_data_stats(self, gcms): 457 """Get statistics about the GCMS data. 458 459 Parameters: 460 ---------- 461 gcms : object 462 The low resolution GCMS object. 463 464 Returns: 465 ------- 466 dict 467 A dictionary containing the data statistics. 468 """ 469 470 matched_peaks = gcms.matched_peaks 471 no_matched_peaks = gcms.no_matched_peaks 472 unique_metabolites = gcms.unique_metabolites 473 474 peak_matchs_above_0p85 = 0 475 unique_peak_match_above_0p85 = 0 476 for match_peak in matched_peaks: 477 gc_peak_above_85 = 0 478 matches_above_85 = list( 479 filter(lambda m: m.similarity_score >= 0.85, match_peak) 480 ) 481 if matches_above_85: 482 peak_matchs_above_0p85 += 1 483 if len(matches_above_85) == 1: 484 unique_peak_match_above_0p85 += 1 485 486 data_stats = {} 487 data_stats["average_signal_noise"] = "ni" 488 data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range 489 data_stats["total_number_peaks"] = len(gcms) 490 data_stats["total_peaks_matched"] = len(matched_peaks) 491 data_stats["total_peaks_without_matches"] = len(no_matched_peaks) 492 data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85 493 data_stats["single_matches_above_similarity_score_0.85"] = ( 494 unique_peak_match_above_0p85 495 
) 496 data_stats["unique_metabolites"] = len(unique_metabolites) 497 498 return data_stats 499 500 def get_calibration_stats(self, gcms, id_label): 501 """Get statistics about the GC-MS calibration. 502 503 Parameters: 504 ---------- 505 """ 506 calibration_parameters = {} 507 508 calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref 509 calibration_parameters["data_url"] = str(gcms.cal_file_path) 510 calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path) 511 calibration_parameters["data_name"] = str(gcms.cal_file_path.stem) 512 calibration_parameters["calibration_method"] = "" 513 514 return calibration_parameters 515 516 def get_blank_stats(self, gcms): 517 """Get statistics about the GC-MS blank.""" 518 blank_parameters = {} 519 520 blank_parameters["data_name"] = "ni" 521 blank_parameters["blank_id"] = "ni" 522 blank_parameters["data_url"] = "ni" 523 blank_parameters["has_input"] = "ni" 524 blank_parameters["common_features_to_blank"] = "ni" 525 526 return blank_parameters 527 528 def get_instrument_metadata(self, gcms): 529 """Get metadata about the GC-MS instrument.""" 530 instrument_metadata = {} 531 532 instrument_metadata["analyzer"] = gcms.analyzer 533 instrument_metadata["instrument_label"] = gcms.instrument_label 534 instrument_metadata["instrument_id"] = uuid.uuid4().hex 535 536 return instrument_metadata 537 538 def get_data_metadata(self, gcms, id_label, output_path): 539 """Get metadata about the GC-MS data. 540 541 Parameters: 542 ---------- 543 gcms : object 544 The low resolution GCMS object. 545 id_label : str 546 The ID label for the data. 547 output_path : str 548 The output file path. 549 550 Returns: 551 ------- 552 dict 553 A dictionary containing the data metadata. 
554 """ 555 if isinstance(output_path, str): 556 output_path = Path(output_path) 557 558 paramaters_path = output_path.with_suffix(".json") 559 560 if paramaters_path.exists(): 561 with paramaters_path.open() as current_param: 562 metadata = json.load(current_param) 563 data_metadata = metadata.get("Data") 564 else: 565 data_metadata = {} 566 data_metadata["data_name"] = [] 567 data_metadata["input_data_url"] = [] 568 data_metadata["has_input"] = [] 569 570 data_metadata["data_name"].append(gcms.sample_name) 571 data_metadata["input_data_url"].append(str(gcms.file_location)) 572 data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location)) 573 574 data_metadata["output_data_name"] = str(output_path.stem) 575 data_metadata["output_data_url"] = str(output_path) 576 data_metadata["has_output"] = id_label + corems_md5(output_path) 577 578 return data_metadata 579 580 def get_parameters_json(self, gcms, id_label, output_path): 581 """Get the parameters as a JSON string. 582 583 Parameters: 584 ---------- 585 gcms : GCMS object 586 The low resolution GCMS object. 587 id_label : str 588 The ID label for the data. 589 output_path : str 590 The output file path. 591 592 Returns: 593 ------- 594 str 595 The parameters as a JSON string. 
596 """ 597 598 output_parameters_dict = {} 599 output_parameters_dict["Data"] = self.get_data_metadata( 600 gcms, id_label, output_path 601 ) 602 output_parameters_dict["Stats"] = self.get_data_stats(gcms) 603 output_parameters_dict["Calibration"] = self.get_calibration_stats( 604 gcms, id_label 605 ) 606 output_parameters_dict["Blank"] = self.get_blank_stats(gcms) 607 output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms) 608 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms) 609 corems_dict_setting["corems_version"] = __version__ 610 output_parameters_dict["CoreMSParameters"] = corems_dict_setting 611 output_parameters_dict["has_metabolite"] = gcms.metabolites_data 612 output = json.dumps( 613 output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ") 614 ) 615 616 return output 617 618 def write_settings(self, output_path, gcms, id_label="emsl:"): 619 """Write the settings to a JSON file. 620 621 Parameters: 622 ---------- 623 output_path : str 624 The output file path. 625 gcms : GCMS object 626 The low resolution GCMS object. 627 id_label : str 628 The ID label for the data. Default is "emsl:". 629 630 """ 631 632 output = self.get_parameters_json(gcms, id_label, output_path) 633 634 with open( 635 output_path.with_suffix(".json"), 636 "w", 637 encoding="utf8", 638 ) as outfile: 639 outfile.write(output) 640 641 def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False): 642 """Get the exported data as a list of dictionaries. 643 644 Parameters: 645 ---------- 646 gcms : object 647 The low resolution GCMS object. 648 include_no_match : bool, optional 649 Whether to include no match data. Default is True. 650 no_match_inline : bool, optional 651 Whether to include no match data inline. Default is False. 652 653 Returns: 654 ------- 655 list 656 The exported data as a list of dictionaries. 
657 """ 658 659 output_score_method = gcms.molecular_search_settings.output_score_method 660 661 dict_data_list = [] 662 663 def add_match_dict_data(): 664 derivatization = "{}:{}:{}".format( 665 compound_obj.classify, 666 compound_obj.derivativenum, 667 compound_obj.derivatization, 668 ) 669 out_dict = { 670 "Sample name": gcms.sample_name, 671 "Peak Index": gcpeak_index, 672 "Retention Time": gc_peak.retention_time, 673 "Retention Time Ref": compound_obj.retention_time, 674 "Peak Height": gc_peak.tic, 675 "Peak Area": gc_peak.area, 676 "Retention index": gc_peak.ri, 677 "Retention index Ref": compound_obj.ri, 678 "Retention Index Score": compound_obj.ri_score, 679 "Spectral Similarity Score": compound_obj.spectral_similarity_score, 680 "Similarity Score": compound_obj.similarity_score, 681 "Compound Name": compound_obj.name, 682 "Chebi ID": compound_obj.metadata.chebi, 683 "Kegg Compound ID": compound_obj.metadata.kegg, 684 "Inchi": compound_obj.metadata.inchi, 685 "Inchi Key": compound_obj.metadata.inchikey, 686 "Smiles": compound_obj.metadata.smiles, 687 "Molecular Formula": compound_obj.formula, 688 "IUPAC Name": compound_obj.metadata.iupac_name, 689 "Traditional Name": compound_obj.metadata.traditional_name, 690 "Common Name": compound_obj.metadata.common_name, 691 "Derivatization": derivatization, 692 } 693 694 if self.gcms.molecular_search_settings.exploratory_mode: 695 out_dict.update( 696 { 697 "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get( 698 "weighted_cosine_correlation" 699 ), 700 "Cosine Correlation": compound_obj.spectral_similarity_scores.get( 701 "cosine_correlation" 702 ), 703 "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get( 704 "stein_scott_similarity" 705 ), 706 "Pearson Correlation": compound_obj.spectral_similarity_scores.get( 707 "pearson_correlation" 708 ), 709 "Spearman Correlation": compound_obj.spectral_similarity_scores.get( 710 "spearman_correlation" 711 ), 712 "Kendall Tau 
Correlation": compound_obj.spectral_similarity_scores.get( 713 "kendall_tau_correlation" 714 ), 715 "DFT Correlation": compound_obj.spectral_similarity_scores.get( 716 "dft_correlation" 717 ), 718 "DWT Correlation": compound_obj.spectral_similarity_scores.get( 719 "dwt_correlation" 720 ), 721 "Euclidean Distance": compound_obj.spectral_similarity_scores.get( 722 "euclidean_distance" 723 ), 724 "Manhattan Distance": compound_obj.spectral_similarity_scores.get( 725 "manhattan_distance" 726 ), 727 "Jaccard Distance": compound_obj.spectral_similarity_scores.get( 728 "jaccard_distance" 729 ), 730 } 731 ) 732 for method in methods_name: 733 out_dict[methods_name.get(method)] = ( 734 compound_obj.spectral_similarity_scores.get(method) 735 ) 736 737 dict_data_list.append(out_dict) 738 739 def add_no_match_dict_data(): 740 dict_data_list.append( 741 { 742 "Sample name": gcms.sample_name, 743 "Peak Index": gcpeak_index, 744 "Retention Time": gc_peak.retention_time, 745 "Peak Height": gc_peak.tic, 746 "Peak Area": gc_peak.area, 747 "Retention index": gc_peak.ri, 748 } 749 ) 750 751 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 752 # check if there is a compound candidate 753 if gc_peak: 754 if output_score_method == "highest_sim_score": 755 compound_obj = gc_peak.highest_score_compound 756 add_match_dict_data() 757 758 elif output_score_method == "highest_ss": 759 compound_obj = gc_peak.highest_ss_compound 760 add_match_dict_data() 761 762 else: 763 for compound_obj in gc_peak: 764 add_match_dict_data() # add monoisotopic peak 765 766 else: 767 # include not_match 768 if include_no_match and no_match_inline: 769 add_no_match_dict_data() 770 771 if include_no_match and not no_match_inline: 772 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 773 if not gc_peak: 774 add_no_match_dict_data() 775 776 return dict_data_list 777 778 779class HighResMassSpectraExport(HighResMassSpecExport): 780 """A class to export high resolution mass spectra data. 
781 782 This class provides methods to export high resolution mass spectra data to various formats 783 such as Excel, CSV, HDF5, and Pandas DataFrame. 784 785 Parameters 786 ---------- 787 out_file_path : str | Path 788 The output file path. 789 mass_spectra : object 790 The high resolution mass spectra object. 791 output_type : str, optional 792 The output type. Default is 'excel'. 793 794 Attributes 795 ---------- 796 output_file : Path 797 The output file path without suffix 798 dir_loc : Path 799 The directory location for the output file, 800 by default this will be the output_file + ".corems" and all output files will be 801 written into this location 802 mass_spectra : MassSpectraBase 803 The high resolution mass spectra object. 804 """ 805 806 def __init__(self, out_file_path, mass_spectra, output_type="excel"): 807 super().__init__( 808 out_file_path=out_file_path, mass_spectrum=None, output_type=output_type 809 ) 810 811 self.dir_loc = Path(out_file_path + ".corems") 812 self.dir_loc.mkdir(exist_ok=True) 813 # Place the output file in the directory 814 self.output_file = self.dir_loc / Path(out_file_path).name 815 self._output_type = output_type # 'excel', 'csv', 'pandas' or 'hdf5' 816 self.mass_spectra = mass_spectra 817 self.atoms_order_list = None 818 self._init_columns() 819 820 def get_pandas_df(self): 821 """Get the mass spectra as a list of Pandas DataFrames.""" 822 823 list_df = [] 824 825 for mass_spectrum in self.mass_spectra: 826 columns = self.columns_label + self.get_all_used_atoms_in_order( 827 mass_spectrum 828 ) 829 830 dict_data_list = self.get_list_dict_data(mass_spectrum) 831 832 df = DataFrame(dict_data_list, columns=columns) 833 834 scan_number = mass_spectrum.scan_number 835 836 df.name = str(self.output_file) + "_" + str(scan_number) 837 838 list_df.append(df) 839 840 return list_df 841 842 def to_pandas(self, write_metadata=True): 843 """Export the data to a Pandas DataFrame and save it as a pickle file. 
844 845 Parameters: 846 ---------- 847 write_metadata : bool, optional 848 Whether to write metadata to the output file. Default is True. 849 """ 850 851 for mass_spectrum in self.mass_spectra: 852 columns = self.columns_label + self.get_all_used_atoms_in_order( 853 mass_spectrum 854 ) 855 856 dict_data_list = self.get_list_dict_data(mass_spectrum) 857 858 df = DataFrame(dict_data_list, columns=columns) 859 860 scan_number = mass_spectrum.scan_number 861 862 out_filename = Path( 863 "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl") 864 ) 865 866 df.to_pickle(self.dir_loc / out_filename) 867 868 if write_metadata: 869 self.write_settings( 870 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 871 ) 872 873 def to_excel(self, write_metadata=True): 874 """Export the data to an Excel file. 875 876 Parameters: 877 ---------- 878 write_metadata : bool, optional 879 Whether to write metadata to the output file. Default is True. 880 """ 881 for mass_spectrum in self.mass_spectra: 882 columns = self.columns_label + self.get_all_used_atoms_in_order( 883 mass_spectrum 884 ) 885 886 dict_data_list = self.get_list_dict_data(mass_spectrum) 887 888 df = DataFrame(dict_data_list, columns=columns) 889 890 scan_number = mass_spectrum.scan_number 891 892 out_filename = Path( 893 "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx") 894 ) 895 896 df.to_excel(self.dir_loc / out_filename) 897 898 if write_metadata: 899 self.write_settings( 900 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 901 ) 902 903 def to_csv(self, write_metadata=True): 904 """Export the data to a CSV file. 905 906 Parameters: 907 ---------- 908 write_metadata : bool, optional 909 Whether to write metadata to the output file. Default is True. 
910 """ 911 import csv 912 913 for mass_spectrum in self.mass_spectra: 914 columns = self.columns_label + self.get_all_used_atoms_in_order( 915 mass_spectrum 916 ) 917 918 scan_number = mass_spectrum.scan_number 919 920 dict_data_list = self.get_list_dict_data(mass_spectrum) 921 922 out_filename = Path( 923 "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv") 924 ) 925 926 with open(self.dir_loc / out_filename, "w", newline="") as csvfile: 927 writer = csv.DictWriter(csvfile, fieldnames=columns) 928 writer.writeheader() 929 for data in dict_data_list: 930 writer.writerow(data) 931 932 if write_metadata: 933 self.write_settings( 934 self.dir_loc / out_filename.with_suffix(""), mass_spectrum 935 ) 936 937 def get_mass_spectra_attrs(self): 938 """Get the mass spectra attributes as a JSON string. 939 940 Parameters: 941 ---------- 942 mass_spectra : object 943 The high resolution mass spectra object. 944 945 Returns: 946 ------- 947 str 948 The mass spectra attributes as a JSON string. 949 """ 950 dict_ms_attrs = {} 951 dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer 952 dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label 953 dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name 954 955 return json.dumps( 956 dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ") 957 ) 958 959 def to_hdf(self, overwrite=False, export_raw=True): 960 """Export the data to an HDF5 file. 961 962 Parameters 963 ---------- 964 overwrite : bool, optional 965 Whether to overwrite the output file. Default is False. 966 export_raw : bool, optional 967 Whether to export the raw mass spectra data. Default is True. 
968 """ 969 if overwrite: 970 if self.output_file.with_suffix(".hdf5").exists(): 971 self.output_file.with_suffix(".hdf5").unlink() 972 973 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 974 if not hdf_handle.attrs.get("date_utc"): 975 # Set metadata for all mass spectra 976 timenow = str( 977 datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z") 978 ) 979 hdf_handle.attrs["date_utc"] = timenow 980 hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name 981 hdf_handle.attrs["data_structure"] = "mass_spectra" 982 hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer 983 hdf_handle.attrs["instrument_label"] = ( 984 self.mass_spectra.instrument_label 985 ) 986 hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name 987 hdf_handle.attrs["polarity"] = self.mass_spectra.polarity 988 hdf_handle.attrs["parser_type"] = ( 989 self.mass_spectra.spectra_parser_class.__name__ 990 ) 991 hdf_handle.attrs["original_file_location"] = ( 992 self.mass_spectra.file_location._str 993 ) 994 995 if "mass_spectra" not in hdf_handle: 996 mass_spectra_group = hdf_handle.create_group("mass_spectra") 997 else: 998 mass_spectra_group = hdf_handle.get("mass_spectra") 999 1000 for mass_spectrum in self.mass_spectra: 1001 group_key = str(int(mass_spectrum.scan_number)) 1002 1003 self.add_mass_spectrum_to_hdf5( 1004 hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw 1005 ) 1006 1007 1008class LCMSExport(HighResMassSpectraExport): 1009 """A class to export high resolution LC-MS data. 1010 1011 This class provides methods to export high resolution LC-MS data to HDF5. 1012 1013 Parameters 1014 ---------- 1015 out_file_path : str | Path 1016 The output file path, do not include the file extension. 1017 lcms_object : LCMSBase 1018 The high resolution lc-ms object. 
1019 """ 1020 1021 def __init__(self, out_file_path, mass_spectra): 1022 super().__init__(out_file_path, mass_spectra, output_type="hdf5") 1023 1024 def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"): 1025 """Export the data to an HDF5. 1026 1027 Parameters 1028 ---------- 1029 overwrite : bool, optional 1030 Whether to overwrite the output file. Default is False. 1031 save_parameters : bool, optional 1032 Whether to save the parameters as a separate json or toml file. Default is True. 1033 parameter_format : str, optional 1034 The format to save the parameters in. Default is 'toml'. 1035 1036 Raises 1037 ------ 1038 ValueError 1039 If parameter_format is not 'json' or 'toml'. 1040 """ 1041 export_profile_spectra = ( 1042 self.mass_spectra.parameters.lc_ms.export_profile_spectra 1043 ) 1044 1045 # Write the mass spectra data to the hdf5 file 1046 super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra) 1047 1048 # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file 1049 with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle: 1050 # Add scan_info to hdf5 file 1051 if "scan_info" not in hdf_handle: 1052 scan_info_group = hdf_handle.create_group("scan_info") 1053 for k, v in self.mass_spectra._scan_info.items(): 1054 array = np.array(list(v.values())) 1055 if array.dtype.str[0:2] == "<U": 1056 array = array.astype("S") 1057 scan_info_group.create_dataset(k, data=array) 1058 1059 # Add ms_unprocessed to hdf5 file 1060 export_unprocessed_ms1 = ( 1061 self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1 1062 ) 1063 if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1: 1064 if "ms_unprocessed" not in hdf_handle: 1065 ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed") 1066 else: 1067 ms_unprocessed_group = hdf_handle.get("ms_unprocessed") 1068 for k, v in self.mass_spectra._ms_unprocessed.items(): 1069 array = np.array(v) 1070 
                    ms_unprocessed_group.create_dataset(str(k), data=array)

            # Add LCMS mass features to hdf5 file
            if len(self.mass_spectra.mass_features) > 0:
                if "mass_features" not in hdf_handle:
                    mass_features_group = hdf_handle.create_group("mass_features")
                else:
                    mass_features_group = hdf_handle.get("mass_features")

                # Create group for each mass feature, with key as the mass feature id
                for k, v in self.mass_spectra.mass_features.items():
                    mass_features_group.create_group(str(k))
                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            # Object-valued attributes (parent chromatogram,
                            # spectra, EIC data, similarity results) are not
                            # serializable here and are skipped.
                            if k2 not in [
                                "chromatogram_parent",
                                "ms2_mass_spectra",
                                "mass_spectrum",
                                "_eic_data",
                                "ms2_similarity_results",
                            ]:
                                # Known array-like attributes become datasets;
                                # scalar values become HDF5 attributes.
                                if k2 == "ms2_scan_numbers":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "_half_height_width":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "_ms_deconvoluted_idx":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "associated_mass_features_deconvoluted":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif (
                                    isinstance(v2, int)
                                    or isinstance(v2, float)
                                    or isinstance(v2, str)
                                    or isinstance(v2, np.integer)
                                    or isinstance(v2, np.bool_)
                                ):
                                    mass_features_group[str(k)].attrs[str(k2)] = v2
                                else:
                                    # Any other type is an unexpected shape for
                                    # a mass feature attribute — fail loudly.
                                    raise TypeError(
                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                    )

            # Add EIC data to hdf5 file
            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
            if len(self.mass_spectra.eics) > 0 and export_eics:
                if "eics" not in hdf_handle:
                    eic_group = hdf_handle.create_group("eics")
                else:
                    eic_group = hdf_handle.get("eics")

                # Create group for each eic; the dict key is the EIC's m/z
                for k, v in self.mass_spectra.eics.items():
                    eic_group.create_group(str(k))
                    eic_group[str(k)].attrs["mz"] = k
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            array = np.array(v2)
                            eic_group[str(k)].create_dataset(str(k2), data=array)

            # Add ms2_search results to hdf5 file
            if len(self.mass_spectra.spectral_search_results) > 0:
                if "spectral_search_results" not in hdf_handle:
                    spectral_search_results = hdf_handle.create_group(
                        "spectral_search_results"
                    )
                else:
                    spectral_search_results = hdf_handle.get("spectral_search_results")
                # Create group for each search result by ms2_scan / precursor_mz
                for k, v in self.mass_spectra.spectral_search_results.items():
                    spectral_search_results.create_group(str(k))
                    for k2, v2 in v.items():
                        spectral_search_results[str(k)].create_group(str(k2))
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "precursor_mz"
                        ] = v2.precursor_mz
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "query_spectrum_id"
                        ] = v2.query_spectrum_id
                        # Loop through each of the attributes and add them as datasets (if array)
                        for k3, v3 in v2.__dict__.items():
                            if v3 is not None and k3 not in [
                                "query_spectrum",
                                "precursor_mz",
                                "query_spectrum_id",
                            ]:
                                # Fragment-type entries are lists of lists;
                                # join each inner list into one string so the
                                # result is a flat string array.
                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                    v3 = [", ".join(x) for x in v3]
                                array = np.array(v3)
                                if array.dtype.str[0:2] == "<U":
                                    array = array.astype("S")
                                spectral_search_results[str(k)][str(k2)].create_dataset(
                                    str(k3), data=array
                                )

        # Save parameters as a separate json or toml file
        if save_parameters:
            # Check if parameter_format is valid
            if parameter_format not in ["json", "toml"]:
                raise ValueError("parameter_format must be 'json' or 'toml'")

            if parameter_format == "json":
                dump_lcms_settings_json(
                    filename=self.output_file.with_suffix(".json"),
                    lcms_obj=self.mass_spectra,
                )
            elif parameter_format == "toml":
                dump_lcms_settings_toml(
                    filename=self.output_file.with_suffix(".toml"),
                    lcms_obj=self.mass_spectra,
                )


class LipidomicsExport(LCMSExport):
    """A class to export lipidomics data.

    This class provides methods to export lipidomics data to various formats and summarize the lipid report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra)
        # Keep a reference to the module-level adduct table on the instance
        # so callers can inspect the available ion types.
        self.ion_type_dict = ion_type_dict

    @staticmethod
    def get_ion_formula(neutral_formula, ion_type):
        """From a neutral formula and an ion type, return the formula of the ion.

        Notes
        -----
        This is a static method.
        If the neutral_formula is not a string, this method will return None.

        Parameters
        ----------
        neutral_formula : str
            The neutral formula, this should be a string form from the MolecularFormula class
            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
            e.g. MgCl2 is parsed as 'Mg Cl2'.
        ion_type : str
            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
            See the self.ion_type_dict for the available ion types.

        Returns
        -------
        str
            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
        """
        # If neutral_formula is not a string, return None
        if not isinstance(neutral_formula, str):
            return None

        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
        if re.search(r"\s", neutral_formula):
            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
        else:
            # Split a compact formula like "C2H4O2" on capital letters, then
            # separate each token into an element symbol and optional count
            # (a missing count means 1).
            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
            neutral_formula = MolecularFormula(
                dict(
                    zip(
                        [x[0] for x in elements],
                        [int(x[0]) if x else 1 for x in counts],
                    )
                ),
                ion_charge=0,
            )
        neutral_formula_dict = neutral_formula.to_dict().copy()

        # Add the adduct atoms (first element of the ion_type_dict entry).
        adduct_add_dict = ion_type_dict[ion_type][0]
        for key in adduct_add_dict:
            if key in neutral_formula_dict.keys():
                neutral_formula_dict[key] += adduct_add_dict[key]
            else:
                neutral_formula_dict[key] = adduct_add_dict[key]

        # Subtract the adduct atoms (second element of the entry).
        # NOTE(review): subtracting an element the formula does not contain
        # raises KeyError, and counts can reach zero or go negative — confirm
        # that MolecularFormula tolerates those dictionaries.
        adduct_subtract = ion_type_dict[ion_type][1]
        for key in adduct_subtract:
            neutral_formula_dict[key] -= adduct_subtract[key]

        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

    @staticmethod
    def get_isotope_type(ion_formula):
        """From an ion formula, return the 13C isotope type of the ion.

        Notes
        -----
        This is a static method.
        If the ion_formula is not a string, this method will return None.
        This is currently only functional for 13C isotopes.

        Parameters
        ----------
        ion_formula : str
            The formula of the ion, expected to be a string like 'C2 H4 O2'.

        Returns
        -------
        str
            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.

        Raises
        ------
        ValueError
            If the ion_formula is not a string.
        """
        if not isinstance(ion_formula, str):
            return None

        # Only space-separated MolecularFormula-style strings are supported
        # here (unlike get_ion_formula, there is no compact-string fallback).
        if re.search(r"\s", ion_formula):
            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
        else:
            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
        ion_formula_dict = ion_formula.to_dict().copy()

        # A "13C" key marks a 13C isotopologue; its count gives the class.
        try:
            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
        except KeyError:
            iso_class = None

        return iso_class

    def clean_ms1_report(self, ms1_summary_full):
        """Clean the MS1 report.

        Parameters
        ----------
        ms1_summary_full : DataFrame
            The full MS1 summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned MS1 summary DataFrame.
        """
        ms1_summary_full = ms1_summary_full.reset_index()
        cols_to_keep = [
            "mf_id",
            "Molecular Formula",
            "Ion Type",
            "Calculated m/z",
            "m/z Error (ppm)",
            "m/z Error Score",
            "Is Isotopologue",
            "Isotopologue Similarity",
            "Confidence Score",
        ]
        ms1_summary = ms1_summary_full[cols_to_keep].copy()
        # Derive the ion formula (adduct applied) and its 13C isotopologue
        # class for each row; both helpers return None for non-string input.
        ms1_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
        ]
        ms1_summary["isotopologue_type"] = [
            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
        ]

        # Reorder columns
        ms1_summary = ms1_summary[
            [
                "mf_id",
                "ion_formula",
                "isotopologue_type",
                "Calculated m/z",
                "m/z Error (ppm)",
                "m/z Error Score",
                "Isotopologue Similarity",
                "Confidence Score",
            ]
        ]

        # Set the index to mf_id
        ms1_summary = ms1_summary.set_index("mf_id")

        return ms1_summary

    def summarize_lipid_report(self, ms2_annot):
        """Summarize the lipid report.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized lipid report.
        """
        # Drop unnecessary columns for easier viewing
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "metabref_mol_id",
            "metabref_precursor_mz",
            "cas",
            "inchikey",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        ms2_annot = ms2_annot.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
        )

        # If ion_types_excluded is not empty, remove those ion types
        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
            "ms2"
        ].molecular_search.ion_types_excluded
        if len(ion_types_excluded) > 0:
            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

        # If mf_id is not present, check that the index name is mf_id and reset the index
        if "mf_id" not in ms2_annot.columns:
            if ms2_annot.index.name == "mf_id":
                ms2_annot = ms2_annot.reset_index()
            else:
                raise ValueError("mf_id is not present in the dataframe")

        # Attempt to get consensus annotations to the MLF level
        mlf_results_all = []
        for mf_id in ms2_annot["mf_id"].unique():
            mlf_results_perid = []
            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)

            # NOTE(review): this iterates the scan ids of the FULL table, not
            # only this mf_id's scans; foreign scans just yield empty subsets
            # (nunique() == 0) and are skipped — confirm this is intentional.
            for query_scan in ms2_annot["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_mf[
                    ms2_annot_mf["query_spectrum_id"] == query_scan
                ].copy()

                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
                    # NOTE(review): this inner check duplicates the enclosing
                    # condition and is always True here.
                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                        # Rank columns [HIGHER = BETTER]: best entropy score,
                        # best reference-fragment coverage, MLF fragments seen.
                        ms2_annot_sub["entropy_max"] = (
                            ms2_annot_sub["entropy_similarity"]
                            == ms2_annot_sub["entropy_similarity"].max()
                        )
                        ms2_annot_sub["ref_match_fract_max"] = (
                            ms2_annot_sub["ref_mz_in_query_fract"]
                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
                        )
                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
                            "query_frag_types"
                        ].apply(lambda x: True if "MLF" in x else False)

                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                        ms2_annot_sub["consensus"] = ms2_annot_sub[
                            ["entropy_max", "ref_match_fract_max", "frag_max"]
                        ].all(axis=1)

                        # If there is a consensus, take the row with the highest entropy_similarity
                        if ms2_annot_sub["consensus"].any():
                            ms2_annot_sub = ms2_annot_sub[
                                ms2_annot_sub["entropy_similarity"]
                                == ms2_annot_sub["entropy_similarity"].max()
                            ].head(1)
                            mlf_results_perid.append(ms2_annot_sub)
            if len(mlf_results_perid) == 0:
                mlf_results_perid = pd.DataFrame()
            else:
                mlf_results_perid = pd.concat(mlf_results_perid)
                # Keep a per-mf_id result only if all contributing scans agree
                # on the same molecular species name.
                if mlf_results_perid["name"].nunique() == 1:
                    mlf_results_perid = mlf_results_perid[
                        mlf_results_perid["entropy_similarity"]
                        == mlf_results_perid["entropy_similarity"].max()
                    ].head(1)
                else:
                    mlf_results_perid = pd.DataFrame()
            mlf_results_all.append(mlf_results_perid)

        # These are the consensus annotations to the MLF level
        if len(mlf_results_all) > 0:
            mlf_results_all = pd.concat(mlf_results_all)
            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
        else:
            # Make an empty dataframe
            mlf_results_all = ms2_annot.head(0)

        # For remaining mf_ids, try to get a consensus annotation to the species level
        species_results_all = []
        # Remove mf_ids that have consensus annotations to the MLF level
        ms2_annot_spec = ms2_annot[
            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
        ]
        for mf_id in ms2_annot_spec["mf_id"].unique():
            # Do all the hits have the same lipid_summed_name?
            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # Grab the highest entropy_similarity result
                ms2_annot_sub = ms2_annot_sub[
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                ].head(1)
                species_results_all.append(ms2_annot_sub)

        # These are the consensus annotations to the species level
        if len(species_results_all) > 0:
            species_results_all = pd.concat(species_results_all)
            species_results_all["annot_level"] = "species"
        else:
            # Make an empty dataframe
            species_results_all = ms2_annot.head(0)

        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
        # Remove mf_ids that have consensus annotations to the species level
        ms2_annot_remaining = ms2_annot_spec[
            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
        ]
        no_consensus = []
        for mf_id in ms2_annot_remaining["mf_id"].unique():
            id_sub = []
            id_no_con = []
            ms2_annot_sub_mf = ms2_annot_remaining[
                ms2_annot_remaining["mf_id"] == mf_id
            ].copy()
            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_sub_mf[
                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
                ].copy()

                # New columns for ranking [HIGHER RANK = BETTER]
                ms2_annot_sub["entropy_max"] = (
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                )
                ms2_annot_sub["ref_match_fract_max"] = (
                    ms2_annot_sub["ref_mz_in_query_fract"]
                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
                )
                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                    lambda x: True if "MLF" in x else False
                )

                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                ms2_annot_sub["consensus"] = ms2_annot_sub[
                    ["entropy_max", "ref_match_fract_max", "frag_max"]
                ].all(axis=1)
                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
                id_sub.append(ms2_annot_sub_con)
                id_no_con.append(ms2_annot_sub)
            id_sub = pd.concat(id_sub)
            id_no_con = pd.concat(id_no_con)

            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
            if (
                id_sub["query_frag_types"]
                .apply(lambda x: True if "MLF" in x else False)
                .all()
                and len(id_sub) > 0
            ):
                # Keep the best-scoring row per molecular species name.
                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
                id_sub = id_sub.loc[idx]
                # Reorder so highest entropy_similarity is first
                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
                id_sub["annot_level"] = id_sub["structure_level"]
                no_consensus.append(id_sub)

            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
            elif len(id_sub) == 0:
                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                    summed_sub = id_no_con[
                        id_no_con["lipid_summed_name"] == lipid_summed_name
                    ]
                    # Any consensus to MLF?
                    if summed_sub["consensus"].any():
                        summed_sub = summed_sub[summed_sub["consensus"]]
                        summed_sub["annot_level"] = summed_sub["structure_level"]
                        no_consensus.append(summed_sub)
                    else:
                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
                        summed_sub = summed_sub[
                            summed_sub["entropy_similarity"]
                            == summed_sub["entropy_similarity"].max()
                        ].head(1)
                        # get first row
                        summed_sub["annot_level"] = "species"
                        summed_sub["name"] = ""
                        no_consensus.append(summed_sub)
            else:
                # NOTE(review): ValueError is given two positional args here
                # (message, mf_id), so str(exc) renders as a tuple.
                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)

        if len(no_consensus) > 0:
            no_consensus = pd.concat(no_consensus)
        else:
            no_consensus = ms2_annot.head(0)

        # Combine all the consensus annotations and reformat the dataframe for output
        species_results_all = species_results_all.drop(columns=["name"])
        species_results_all["lipid_molecular_species_id"] = ""
        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
        consensus_annotations = pd.concat(
            [mlf_results_all, species_results_all, no_consensus]
        )
        consensus_annotations = consensus_annotations.sort_values(
            "mf_id", ascending=True
        )
        cols_to_keep = [
            "mf_id",
            "ref_ion_type",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "lipid_molecular_species_id",
            "lipid_summed_name",
            "lipid_subclass",
            "lipid_class",
            "lipid_category",
            "formula",
            "annot_level",
            "n_spectra_contributing",
        ]
        consensus_annotations = consensus_annotations[cols_to_keep]
        consensus_annotations = consensus_annotations.set_index("mf_id")

        return consensus_annotations

    def clean_ms2_report(self, lipid_summary):
        """Clean the MS2 report.

        Parameters
        ----------
        lipid_summary : DataFrame
            The full lipid summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned lipid summary DataFrame.
        """
        lipid_summary = lipid_summary.reset_index()
        # Compute the adduct-adjusted ion formula for each annotation so the
        # MS2 report can be merged with the MS1 report on "ion_formula".
        lipid_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
        ]

        # Reorder columns
        lipid_summary = lipid_summary[
            [
                "mf_id",
                "ion_formula",
                "ref_ion_type",
                "formula",
                "annot_level",
                "lipid_molecular_species_id",
                "lipid_summed_name",
                "lipid_subclass",
                "lipid_class",
                "lipid_category",
                "entropy_similarity",
                "ref_mz_in_query_fract",
                "n_spectra_contributing",
            ]
        ]

        # Set the index to mf_id
        lipid_summary = lipid_summary.set_index("mf_id")

        return lipid_summary

    def to_report(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.

        Returns
        -------
        DataFrame
            The report of the mass features and their annotations.

        Notes
        -----
        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
        """
        # Get mass feature dataframe
        mf_report = self.mass_spectra.mass_features_to_df()
        mf_report = mf_report.reset_index(drop=False)

        # Get and clean ms1 annotation dataframe
        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
        ms1_annot_report = ms1_annot_report.reset_index(drop=False)

        # Get, summarize, and clean ms2 annotation dataframe
        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
            molecular_metadata=molecular_metadata
        )
        if ms2_annot_report is not None:
            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
            ms2_annot_report = ms2_annot_report.reset_index(drop=False)

        # Combine the reports
        if not ms1_annot_report.empty:
            # MS1 has been run and has molecular formula information
            mf_report = pd.merge(
                mf_report,
                ms1_annot_report,
                how="left",
                on=["mf_id", "isotopologue_type"],
            )
        if ms2_annot_report is not None:
            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
            mf_no_ion_formula = pd.merge(
                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
            )

            # pull out the records with ion_formula
            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
            mf_with_ion_formula = pd.merge(
                mf_with_ion_formula,
                ms2_annot_report,
                how="left",
                on=["mf_id", "ion_formula"],
            )

            # put back together
            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])

        # Rename columns
        # Map internal column names to the human-readable report headers.
        rename_dict = {
            "mf_id": "Mass Feature ID",
            "scan_time": "Retention Time (min)",
            "mz": "m/z",
            "apex_scan": "Apex Scan Number",
            "intensity": "Intensity",
            "persistence": "Persistence",
            "area": "Area",
            "half_height_width": "Half Height Width (min)",
            "tailing_factor": "Tailing Factor",
            "dispersity_index": "Dispersity Index",
            "ms2_spectrum": "MS2 Spectrum",
            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
            "isotopologue_type": "Isotopologue Type",
            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
            "associated_mass_features": "Associated Mass Features after Deconvolution",
            "ion_formula": "Ion Formula",
            "formula": "Molecular Formula",
            "ref_ion_type": "Ion Type",
            "annot_level": "Lipid Annotation Level",
            "lipid_molecular_species_id": "Lipid Molecular Species",
            "lipid_summed_name": "Lipid Species",
            "lipid_subclass": "Lipid Subclass",
            "lipid_class": "Lipid Class",
            "lipid_category": "Lipid Category",
            "entropy_similarity": "Entropy Similarity",
            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
            "n_spectra_contributing": "Spectra with Annotation (n)",
        }
        mf_report = mf_report.rename(columns=rename_dict)
        mf_report["Sample Name"] = self.mass_spectra.sample_name
        mf_report["Polarity"] = self.mass_spectra.polarity
        # Put the identifying columns first, keeping the rest in order.
        mf_report = mf_report[
            ["Mass Feature ID", "Sample Name", "Polarity"]
            + [
                col
                for col in mf_report.columns
                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
            ]
        ]

        # Reorder rows by "Mass Feature ID"
        mf_report = mf_report.sort_values("Mass Feature ID")

        # Reset index
        mf_report = mf_report.reset_index(drop=True)

        return mf_report

    def report_to_csv(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations and save it as a CSV file.
1777 1778 Parameters 1779 ---------- 1780 molecular_metadata : dict, optional 1781 The molecular metadata. Default is None. 1782 """ 1783 report = self.to_report(molecular_metadata=molecular_metadata) 1784 out_file = self.output_file.with_suffix(".csv") 1785 report.to_csv(out_file, index=False)
class LowResGCMSExport:
    """A class to export low resolution GC-MS data.

    This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

    Parameters:
    ----------
    out_file_path : str
        The output file path.
    gcms : object
        The low resolution GCMS object.

    Attributes:
    ----------
    output_file : Path
        The output file path as a Path object.
    gcms : object
        The low resolution GCMS object.

    Methods:
    -------
    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
    * to_excel(write_mode='a', write_metadata=True, id_label="corems:"),
        Export the data to an Excel file.
    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:").
        Export the data to a CSV file.
    * to_hdf(id_label="corems:").
        Export the data to an HDF5 file.
    * get_data_stats(gcms).
        Get statistics about the GCMS data.

    """

    def __init__(self, out_file_path, gcms):
        # Normalize the output path; file-format suffixes are swapped in by
        # the individual to_* methods.
        self.output_file = Path(out_file_path)

        self.gcms = gcms

        self._init_columns()

    def _init_columns(self):
        """Initialize the column names for the exported data.

        Returns:
        -------
        list
            The list of column names.
        """

        # Core columns shared by every export format.
        columns = [
            "Sample name",
            "Peak Index",
            "Retention Time",
            "Retention Time Ref",
            "Peak Height",
            "Peak Area",
            "Retention index",
            "Retention index Ref",
            "Retention Index Score",
            "Similarity Score",
            "Spectral Similarity Score",
            "Compound Name",
            "Chebi ID",
            "Kegg Compound ID",
            "Inchi",
            "Inchi Key",
            "Smiles",
            "Molecular Formula",
            "IUPAC Name",
            "Traditional Name",
            "Common Name",
            "Derivatization",
        ]

        # Exploratory mode adds one column per extra spectral-similarity
        # metric (the named set below plus all methods_name entries).
        if self.gcms.molecular_search_settings.exploratory_mode:
            columns.extend(
                [
                    "Weighted Cosine Correlation",
                    "Cosine Correlation",
                    "Stein Scott Similarity",
                    "Pearson Correlation",
                    "Spearman Correlation",
                    "Kendall Tau Correlation",
                    "Euclidean Distance",
                    "Manhattan Distance",
                    "Jaccard Distance",
                    "DWT Correlation",
                    "DFT Correlation",
                ]
            )

            columns.extend(list(methods_name.values()))

        return columns

    def get_pandas_df(self, id_label="corems:"):
        """Get the exported data as a Pandas DataFrame.

        Parameters:
        ----------
        id_label : str, optional
            The ID label for the data. Default is "corems:".

        Returns:
        -------
        DataFrame
            The exported data as a Pandas DataFrame.
        """

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        df = DataFrame(dict_data_list, columns=columns)

        # Tag the frame with the sample name for downstream consumers.
        df.name = self.gcms.sample_name

        return df

    def get_json(self, nan=False, id_label="corems:"):
        """Get the exported data as a JSON string.

        Parameters:
        ----------
        nan : bool, optional
            Whether to include NaN values in the JSON string. Default is False.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
185 186 """ 187 188 import json 189 190 dict_data_list = self.get_list_dict_data(self.gcms) 191 192 return json.dumps( 193 dict_data_list, sort_keys=False, indent=4, separators=(",", ": ") 194 ) 195 196 def to_pandas(self, write_metadata=True, id_label="corems:"): 197 """Export the data to a Pandas DataFrame and save it as a pickle file. 198 199 Parameters: 200 ---------- 201 write_metadata : bool, optional 202 Whether to write metadata to the output file. 203 id_label : str, optional 204 The ID label for the data. 205 """ 206 207 columns = self._init_columns() 208 209 dict_data_list = self.get_list_dict_data(self.gcms) 210 211 df = DataFrame(dict_data_list, columns=columns) 212 213 df.to_pickle(self.output_file.with_suffix(".pkl")) 214 215 if write_metadata: 216 self.write_settings( 217 self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:" 218 ) 219 220 def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"): 221 """Export the data to an Excel file. 222 223 Parameters: 224 ---------- 225 write_mode : str, optional 226 The write mode for the Excel file. Default is 'a' (append). 227 write_metadata : bool, optional 228 Whether to write metadata to the output file. Default is True. 229 id_label : str, optional 230 The ID label for the data. Default is "corems:". 
        """

        out_put_path = self.output_file.with_suffix(".xlsx")

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        df = DataFrame(dict_data_list, columns=columns)

        # Append mode: splice the new rows (no header) under the existing
        # sheet contents.
        if write_mode == "a" and out_put_path.exists():
            writer = ExcelWriter(out_put_path, engine="openpyxl")
            # try to open an existing workbook
            # NOTE(review): assigning writer.book / writer.sheets is rejected
            # by pandas >= 1.5 (ExcelWriter.book became read-only); this path
            # likely needs ExcelWriter(..., mode="a") on modern pandas —
            # confirm the pinned pandas version.
            writer.book = load_workbook(out_put_path)
            # copy existing sheets
            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
            # read existing file
            reader = read_excel(out_put_path)
            # write out the new sheet
            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)

            writer.close()
        else:
            df.to_excel(
                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
            )

        if write_metadata:
            self.write_settings(out_put_path, self.gcms, id_label=id_label)

    def to_csv(
        self,
        separate_output=False,
        write_mode="w",
        write_metadata=True,
        id_label="corems:",
    ):
        """Export the data to a CSV file.

        Parameters:
        ----------
        separate_output : bool, optional
            Whether to separate the output into multiple files. Default is False.
        write_mode : str, optional
            The write mode for the CSV file. Default is 'w' (write).
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
280 """ 281 282 if separate_output: 283 # set write mode to write 284 # this mode will overwrite the file without warning 285 write_mode = "w" 286 else: 287 # set write mode to append 288 write_mode = "a" 289 290 columns = self._init_columns() 291 292 dict_data_list = self.get_list_dict_data(self.gcms) 293 294 out_put_path = self.output_file.with_suffix(".csv") 295 296 write_header = not out_put_path.exists() 297 298 try: 299 with open(out_put_path, write_mode, newline="") as csvfile: 300 writer = csv.DictWriter(csvfile, fieldnames=columns) 301 if write_header: 302 writer.writeheader() 303 for data in dict_data_list: 304 writer.writerow(data) 305 306 if write_metadata: 307 self.write_settings(out_put_path, self.gcms, id_label=id_label) 308 309 except IOError as ioerror: 310 print(ioerror) 311 312 def to_hdf(self, id_label="corems:"): 313 """Export the data to an HDF5 file. 314 315 Parameters: 316 ---------- 317 id_label : str, optional 318 The ID label for the data. Default is "corems:". 
319 """ 320 321 # save sample at a time 322 def add_compound(gc_peak, compound_obj): 323 modifier = compound_obj.classify if compound_obj.classify else "" 324 compound_group = compound_obj.name.replace("/", "") + " " + modifier 325 326 if compound_group not in peak_group: 327 compound_group = peak_group.create_group(compound_group) 328 329 # compound_group.attrs["retention_time"] = compound_obj.retention_time 330 compound_group.attrs["retention_index"] = compound_obj.ri 331 compound_group.attrs["retention_index_score"] = compound_obj.ri_score 332 compound_group.attrs["spectral_similarity_score"] = ( 333 compound_obj.spectral_similarity_score 334 ) 335 compound_group.attrs["similarity_score"] = compound_obj.similarity_score 336 337 compond_mz = compound_group.create_dataset( 338 "mz", data=np.array(compound_obj.mz), dtype="f8" 339 ) 340 compond_abundance = compound_group.create_dataset( 341 "abundance", data=np.array(compound_obj.abundance), dtype="f8" 342 ) 343 344 if self.gcms.molecular_search_settings.exploratory_mode: 345 compound_group.attrs["Spectral Similarities"] = json.dumps( 346 compound_obj.spectral_similarity_scores, 347 sort_keys=False, 348 indent=4, 349 separators=(",", ":"), 350 ) 351 else: 352 warnings.warn("Skipping duplicate reference compound.") 353 354 import json 355 from datetime import datetime, timezone 356 357 import h5py 358 import numpy as np 359 360 output_path = self.output_file.with_suffix(".hdf5") 361 362 with h5py.File(output_path, "w") as hdf_handle: 363 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 364 hdf_handle.attrs["time_stamp"] = timenow 365 hdf_handle.attrs["data_structure"] = "gcms" 366 hdf_handle.attrs["analyzer"] = self.gcms.analyzer 367 hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label 368 369 hdf_handle.attrs["sample_id"] = "self.gcms.id" 370 hdf_handle.attrs["sample_name"] = self.gcms.sample_name 371 hdf_handle.attrs["input_data"] = str(self.gcms.file_location) 372 
hdf_handle.attrs["output_data"] = str(output_path) 373 hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex 374 hdf_handle.attrs["corems_version"] = __version__ 375 376 hdf_handle.attrs["Stats"] = json.dumps( 377 self.get_data_stats(self.gcms), 378 sort_keys=False, 379 indent=4, 380 separators=(",", ": "), 381 ) 382 hdf_handle.attrs["Calibration"] = json.dumps( 383 self.get_calibration_stats(self.gcms, id_label), 384 sort_keys=False, 385 indent=4, 386 separators=(",", ": "), 387 ) 388 hdf_handle.attrs["Blank"] = json.dumps( 389 self.get_blank_stats(self.gcms), 390 sort_keys=False, 391 indent=4, 392 separators=(",", ": "), 393 ) 394 395 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms) 396 hdf_handle.attrs["CoreMSParameters"] = json.dumps( 397 corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ") 398 ) 399 400 scans_dataset = hdf_handle.create_dataset( 401 "scans", data=np.array(self.gcms.scans_number), dtype="f8" 402 ) 403 rt_dataset = hdf_handle.create_dataset( 404 "rt", data=np.array(self.gcms.retention_time), dtype="f8" 405 ) 406 tic_dataset = hdf_handle.create_dataset( 407 "tic", data=np.array(self.gcms.tic), dtype="f8" 408 ) 409 processed_tic_dataset = hdf_handle.create_dataset( 410 "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8" 411 ) 412 413 output_score_method = ( 414 self.gcms.molecular_search_settings.output_score_method 415 ) 416 417 for gc_peak in self.gcms: 418 # print(gc_peak.retention_time) 419 # print(gc_peak.tic) 420 421 # check if there is a compound candidate 422 peak_group = hdf_handle.create_group(str(gc_peak.retention_time)) 423 peak_group.attrs["deconvolution"] = int( 424 self.gcms.chromatogram_settings.use_deconvolution 425 ) 426 427 peak_group.attrs["start_scan"] = gc_peak.start_scan 428 peak_group.attrs["apex_scan"] = gc_peak.apex_scan 429 peak_group.attrs["final_scan"] = gc_peak.final_scan 430 431 peak_group.attrs["retention_index"] = gc_peak.ri 432 
peak_group.attrs["retention_time"] = gc_peak.retention_time 433 peak_group.attrs["area"] = gc_peak.area 434 435 mz = peak_group.create_dataset( 436 "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8" 437 ) 438 abundance = peak_group.create_dataset( 439 "abundance", 440 data=np.array(gc_peak.mass_spectrum.abundance), 441 dtype="f8", 442 ) 443 444 if gc_peak: 445 if output_score_method == "highest_sim_score": 446 compound_obj = gc_peak.highest_score_compound 447 add_compound(gc_peak, compound_obj) 448 449 elif output_score_method == "highest_ss": 450 compound_obj = gc_peak.highest_ss_compound 451 add_compound(gc_peak, compound_obj) 452 453 else: 454 for compound_obj in gc_peak: 455 add_compound(gc_peak, compound_obj) 456 457 def get_data_stats(self, gcms): 458 """Get statistics about the GCMS data. 459 460 Parameters: 461 ---------- 462 gcms : object 463 The low resolution GCMS object. 464 465 Returns: 466 ------- 467 dict 468 A dictionary containing the data statistics. 469 """ 470 471 matched_peaks = gcms.matched_peaks 472 no_matched_peaks = gcms.no_matched_peaks 473 unique_metabolites = gcms.unique_metabolites 474 475 peak_matchs_above_0p85 = 0 476 unique_peak_match_above_0p85 = 0 477 for match_peak in matched_peaks: 478 gc_peak_above_85 = 0 479 matches_above_85 = list( 480 filter(lambda m: m.similarity_score >= 0.85, match_peak) 481 ) 482 if matches_above_85: 483 peak_matchs_above_0p85 += 1 484 if len(matches_above_85) == 1: 485 unique_peak_match_above_0p85 += 1 486 487 data_stats = {} 488 data_stats["average_signal_noise"] = "ni" 489 data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range 490 data_stats["total_number_peaks"] = len(gcms) 491 data_stats["total_peaks_matched"] = len(matched_peaks) 492 data_stats["total_peaks_without_matches"] = len(no_matched_peaks) 493 data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85 494 data_stats["single_matches_above_similarity_score_0.85"] = ( 495 unique_peak_match_above_0p85 496 
) 497 data_stats["unique_metabolites"] = len(unique_metabolites) 498 499 return data_stats 500 501 def get_calibration_stats(self, gcms, id_label): 502 """Get statistics about the GC-MS calibration. 503 504 Parameters: 505 ---------- 506 """ 507 calibration_parameters = {} 508 509 calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref 510 calibration_parameters["data_url"] = str(gcms.cal_file_path) 511 calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path) 512 calibration_parameters["data_name"] = str(gcms.cal_file_path.stem) 513 calibration_parameters["calibration_method"] = "" 514 515 return calibration_parameters 516 517 def get_blank_stats(self, gcms): 518 """Get statistics about the GC-MS blank.""" 519 blank_parameters = {} 520 521 blank_parameters["data_name"] = "ni" 522 blank_parameters["blank_id"] = "ni" 523 blank_parameters["data_url"] = "ni" 524 blank_parameters["has_input"] = "ni" 525 blank_parameters["common_features_to_blank"] = "ni" 526 527 return blank_parameters 528 529 def get_instrument_metadata(self, gcms): 530 """Get metadata about the GC-MS instrument.""" 531 instrument_metadata = {} 532 533 instrument_metadata["analyzer"] = gcms.analyzer 534 instrument_metadata["instrument_label"] = gcms.instrument_label 535 instrument_metadata["instrument_id"] = uuid.uuid4().hex 536 537 return instrument_metadata 538 539 def get_data_metadata(self, gcms, id_label, output_path): 540 """Get metadata about the GC-MS data. 541 542 Parameters: 543 ---------- 544 gcms : object 545 The low resolution GCMS object. 546 id_label : str 547 The ID label for the data. 548 output_path : str 549 The output file path. 550 551 Returns: 552 ------- 553 dict 554 A dictionary containing the data metadata. 
555 """ 556 if isinstance(output_path, str): 557 output_path = Path(output_path) 558 559 paramaters_path = output_path.with_suffix(".json") 560 561 if paramaters_path.exists(): 562 with paramaters_path.open() as current_param: 563 metadata = json.load(current_param) 564 data_metadata = metadata.get("Data") 565 else: 566 data_metadata = {} 567 data_metadata["data_name"] = [] 568 data_metadata["input_data_url"] = [] 569 data_metadata["has_input"] = [] 570 571 data_metadata["data_name"].append(gcms.sample_name) 572 data_metadata["input_data_url"].append(str(gcms.file_location)) 573 data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location)) 574 575 data_metadata["output_data_name"] = str(output_path.stem) 576 data_metadata["output_data_url"] = str(output_path) 577 data_metadata["has_output"] = id_label + corems_md5(output_path) 578 579 return data_metadata 580 581 def get_parameters_json(self, gcms, id_label, output_path): 582 """Get the parameters as a JSON string. 583 584 Parameters: 585 ---------- 586 gcms : GCMS object 587 The low resolution GCMS object. 588 id_label : str 589 The ID label for the data. 590 output_path : str 591 The output file path. 592 593 Returns: 594 ------- 595 str 596 The parameters as a JSON string. 
597 """ 598 599 output_parameters_dict = {} 600 output_parameters_dict["Data"] = self.get_data_metadata( 601 gcms, id_label, output_path 602 ) 603 output_parameters_dict["Stats"] = self.get_data_stats(gcms) 604 output_parameters_dict["Calibration"] = self.get_calibration_stats( 605 gcms, id_label 606 ) 607 output_parameters_dict["Blank"] = self.get_blank_stats(gcms) 608 output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms) 609 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms) 610 corems_dict_setting["corems_version"] = __version__ 611 output_parameters_dict["CoreMSParameters"] = corems_dict_setting 612 output_parameters_dict["has_metabolite"] = gcms.metabolites_data 613 output = json.dumps( 614 output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ") 615 ) 616 617 return output 618 619 def write_settings(self, output_path, gcms, id_label="emsl:"): 620 """Write the settings to a JSON file. 621 622 Parameters: 623 ---------- 624 output_path : str 625 The output file path. 626 gcms : GCMS object 627 The low resolution GCMS object. 628 id_label : str 629 The ID label for the data. Default is "emsl:". 630 631 """ 632 633 output = self.get_parameters_json(gcms, id_label, output_path) 634 635 with open( 636 output_path.with_suffix(".json"), 637 "w", 638 encoding="utf8", 639 ) as outfile: 640 outfile.write(output) 641 642 def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False): 643 """Get the exported data as a list of dictionaries. 644 645 Parameters: 646 ---------- 647 gcms : object 648 The low resolution GCMS object. 649 include_no_match : bool, optional 650 Whether to include no match data. Default is True. 651 no_match_inline : bool, optional 652 Whether to include no match data inline. Default is False. 653 654 Returns: 655 ------- 656 list 657 The exported data as a list of dictionaries. 
658 """ 659 660 output_score_method = gcms.molecular_search_settings.output_score_method 661 662 dict_data_list = [] 663 664 def add_match_dict_data(): 665 derivatization = "{}:{}:{}".format( 666 compound_obj.classify, 667 compound_obj.derivativenum, 668 compound_obj.derivatization, 669 ) 670 out_dict = { 671 "Sample name": gcms.sample_name, 672 "Peak Index": gcpeak_index, 673 "Retention Time": gc_peak.retention_time, 674 "Retention Time Ref": compound_obj.retention_time, 675 "Peak Height": gc_peak.tic, 676 "Peak Area": gc_peak.area, 677 "Retention index": gc_peak.ri, 678 "Retention index Ref": compound_obj.ri, 679 "Retention Index Score": compound_obj.ri_score, 680 "Spectral Similarity Score": compound_obj.spectral_similarity_score, 681 "Similarity Score": compound_obj.similarity_score, 682 "Compound Name": compound_obj.name, 683 "Chebi ID": compound_obj.metadata.chebi, 684 "Kegg Compound ID": compound_obj.metadata.kegg, 685 "Inchi": compound_obj.metadata.inchi, 686 "Inchi Key": compound_obj.metadata.inchikey, 687 "Smiles": compound_obj.metadata.smiles, 688 "Molecular Formula": compound_obj.formula, 689 "IUPAC Name": compound_obj.metadata.iupac_name, 690 "Traditional Name": compound_obj.metadata.traditional_name, 691 "Common Name": compound_obj.metadata.common_name, 692 "Derivatization": derivatization, 693 } 694 695 if self.gcms.molecular_search_settings.exploratory_mode: 696 out_dict.update( 697 { 698 "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get( 699 "weighted_cosine_correlation" 700 ), 701 "Cosine Correlation": compound_obj.spectral_similarity_scores.get( 702 "cosine_correlation" 703 ), 704 "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get( 705 "stein_scott_similarity" 706 ), 707 "Pearson Correlation": compound_obj.spectral_similarity_scores.get( 708 "pearson_correlation" 709 ), 710 "Spearman Correlation": compound_obj.spectral_similarity_scores.get( 711 "spearman_correlation" 712 ), 713 "Kendall Tau 
Correlation": compound_obj.spectral_similarity_scores.get( 714 "kendall_tau_correlation" 715 ), 716 "DFT Correlation": compound_obj.spectral_similarity_scores.get( 717 "dft_correlation" 718 ), 719 "DWT Correlation": compound_obj.spectral_similarity_scores.get( 720 "dwt_correlation" 721 ), 722 "Euclidean Distance": compound_obj.spectral_similarity_scores.get( 723 "euclidean_distance" 724 ), 725 "Manhattan Distance": compound_obj.spectral_similarity_scores.get( 726 "manhattan_distance" 727 ), 728 "Jaccard Distance": compound_obj.spectral_similarity_scores.get( 729 "jaccard_distance" 730 ), 731 } 732 ) 733 for method in methods_name: 734 out_dict[methods_name.get(method)] = ( 735 compound_obj.spectral_similarity_scores.get(method) 736 ) 737 738 dict_data_list.append(out_dict) 739 740 def add_no_match_dict_data(): 741 dict_data_list.append( 742 { 743 "Sample name": gcms.sample_name, 744 "Peak Index": gcpeak_index, 745 "Retention Time": gc_peak.retention_time, 746 "Peak Height": gc_peak.tic, 747 "Peak Area": gc_peak.area, 748 "Retention index": gc_peak.ri, 749 } 750 ) 751 752 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 753 # check if there is a compound candidate 754 if gc_peak: 755 if output_score_method == "highest_sim_score": 756 compound_obj = gc_peak.highest_score_compound 757 add_match_dict_data() 758 759 elif output_score_method == "highest_ss": 760 compound_obj = gc_peak.highest_ss_compound 761 add_match_dict_data() 762 763 else: 764 for compound_obj in gc_peak: 765 add_match_dict_data() # add monoisotopic peak 766 767 else: 768 # include not_match 769 if include_no_match and no_match_inline: 770 add_no_match_dict_data() 771 772 if include_no_match and not no_match_inline: 773 for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks): 774 if not gc_peak: 775 add_no_match_dict_data() 776 777 return dict_data_list
A class to export low resolution GC-MS data.
This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
Parameters:
out_file_path : str The output file path. gcms : object The low resolution GCMS object.
Attributes:
output_file : Path The output file path as a Path object. gcms : object The low resolution GCMS object.
Methods:
- get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
- get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
- to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
- to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
- to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
- to_hdf(id_label="corems:"). Export the data to an HDF5 file.
- get_data_stats(gcms). Get statistics about the GCMS data.
152 def get_pandas_df(self, id_label="corems:"): 153 """Get the exported data as a Pandas DataFrame. 154 155 Parameters: 156 ---------- 157 id_label : str, optional 158 The ID label for the data. Default is "corems:". 159 160 Returns: 161 ------- 162 DataFrame 163 The exported data as a Pandas DataFrame. 164 """ 165 166 columns = self._init_columns() 167 168 dict_data_list = self.get_list_dict_data(self.gcms) 169 170 df = DataFrame(dict_data_list, columns=columns) 171 172 df.name = self.gcms.sample_name 173 174 return df
Get the exported data as a Pandas DataFrame.
Parameters:
id_label : str, optional The ID label for the data. Default is "corems:".
Returns:
DataFrame The exported data as a Pandas DataFrame.
176 def get_json(self, nan=False, id_label="corems:"): 177 """Get the exported data as a JSON string. 178 179 Parameters: 180 ---------- 181 nan : bool, optional 182 Whether to include NaN values in the JSON string. Default is False. 183 id_label : str, optional 184 The ID label for the data. Default is "corems:". 185 186 """ 187 188 import json 189 190 dict_data_list = self.get_list_dict_data(self.gcms) 191 192 return json.dumps( 193 dict_data_list, sort_keys=False, indent=4, separators=(",", ": ") 194 )
Get the exported data as a JSON string.
Parameters:
nan : bool, optional Whether to include NaN values in the JSON string. Default is False. id_label : str, optional The ID label for the data. Default is "corems:".
196 def to_pandas(self, write_metadata=True, id_label="corems:"): 197 """Export the data to a Pandas DataFrame and save it as a pickle file. 198 199 Parameters: 200 ---------- 201 write_metadata : bool, optional 202 Whether to write metadata to the output file. 203 id_label : str, optional 204 The ID label for the data. 205 """ 206 207 columns = self._init_columns() 208 209 dict_data_list = self.get_list_dict_data(self.gcms) 210 211 df = DataFrame(dict_data_list, columns=columns) 212 213 df.to_pickle(self.output_file.with_suffix(".pkl")) 214 215 if write_metadata: 216 self.write_settings( 217 self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:" 218 )
Export the data to a Pandas DataFrame and save it as a pickle file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. id_label : str, optional The ID label for the data.
220 def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"): 221 """Export the data to an Excel file. 222 223 Parameters: 224 ---------- 225 write_mode : str, optional 226 The write mode for the Excel file. Default is 'a' (append). 227 write_metadata : bool, optional 228 Whether to write metadata to the output file. Default is True. 229 id_label : str, optional 230 The ID label for the data. Default is "corems:". 231 """ 232 233 out_put_path = self.output_file.with_suffix(".xlsx") 234 235 columns = self._init_columns() 236 237 dict_data_list = self.get_list_dict_data(self.gcms) 238 239 df = DataFrame(dict_data_list, columns=columns) 240 241 if write_mode == "a" and out_put_path.exists(): 242 writer = ExcelWriter(out_put_path, engine="openpyxl") 243 # try to open an existing workbook 244 writer.book = load_workbook(out_put_path) 245 # copy existing sheets 246 writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets) 247 # read existing file 248 reader = read_excel(out_put_path) 249 # write out the new sheet 250 df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1) 251 252 writer.close() 253 else: 254 df.to_excel( 255 self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl" 256 ) 257 258 if write_metadata: 259 self.write_settings(out_put_path, self.gcms, id_label=id_label)
Export the data to an Excel file.
Parameters:
write_mode : str, optional The write mode for the Excel file. Default is 'a' (append). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".
261 def to_csv( 262 self, 263 separate_output=False, 264 write_mode="w", 265 write_metadata=True, 266 id_label="corems:", 267 ): 268 """Export the data to a CSV file. 269 270 Parameters: 271 ---------- 272 separate_output : bool, optional 273 Whether to separate the output into multiple files. Default is False. 274 write_mode : str, optional 275 The write mode for the CSV file. Default is 'w' (write). 276 write_metadata : bool, optional 277 Whether to write metadata to the output file. Default is True. 278 id_label : str, optional 279 The ID label for the data. Default is "corems:". 280 """ 281 282 if separate_output: 283 # set write mode to write 284 # this mode will overwrite the file without warning 285 write_mode = "w" 286 else: 287 # set write mode to append 288 write_mode = "a" 289 290 columns = self._init_columns() 291 292 dict_data_list = self.get_list_dict_data(self.gcms) 293 294 out_put_path = self.output_file.with_suffix(".csv") 295 296 write_header = not out_put_path.exists() 297 298 try: 299 with open(out_put_path, write_mode, newline="") as csvfile: 300 writer = csv.DictWriter(csvfile, fieldnames=columns) 301 if write_header: 302 writer.writeheader() 303 for data in dict_data_list: 304 writer.writerow(data) 305 306 if write_metadata: 307 self.write_settings(out_put_path, self.gcms, id_label=id_label) 308 309 except IOError as ioerror: 310 print(ioerror)
Export the data to a CSV file.
Parameters:
separate_output : bool, optional Whether to separate the output into multiple files. Default is False. write_mode : str, optional The write mode for the CSV file. Default is 'w' (write). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".
312 def to_hdf(self, id_label="corems:"): 313 """Export the data to an HDF5 file. 314 315 Parameters: 316 ---------- 317 id_label : str, optional 318 The ID label for the data. Default is "corems:". 319 """ 320 321 # save sample at a time 322 def add_compound(gc_peak, compound_obj): 323 modifier = compound_obj.classify if compound_obj.classify else "" 324 compound_group = compound_obj.name.replace("/", "") + " " + modifier 325 326 if compound_group not in peak_group: 327 compound_group = peak_group.create_group(compound_group) 328 329 # compound_group.attrs["retention_time"] = compound_obj.retention_time 330 compound_group.attrs["retention_index"] = compound_obj.ri 331 compound_group.attrs["retention_index_score"] = compound_obj.ri_score 332 compound_group.attrs["spectral_similarity_score"] = ( 333 compound_obj.spectral_similarity_score 334 ) 335 compound_group.attrs["similarity_score"] = compound_obj.similarity_score 336 337 compond_mz = compound_group.create_dataset( 338 "mz", data=np.array(compound_obj.mz), dtype="f8" 339 ) 340 compond_abundance = compound_group.create_dataset( 341 "abundance", data=np.array(compound_obj.abundance), dtype="f8" 342 ) 343 344 if self.gcms.molecular_search_settings.exploratory_mode: 345 compound_group.attrs["Spectral Similarities"] = json.dumps( 346 compound_obj.spectral_similarity_scores, 347 sort_keys=False, 348 indent=4, 349 separators=(",", ":"), 350 ) 351 else: 352 warnings.warn("Skipping duplicate reference compound.") 353 354 import json 355 from datetime import datetime, timezone 356 357 import h5py 358 import numpy as np 359 360 output_path = self.output_file.with_suffix(".hdf5") 361 362 with h5py.File(output_path, "w") as hdf_handle: 363 timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")) 364 hdf_handle.attrs["time_stamp"] = timenow 365 hdf_handle.attrs["data_structure"] = "gcms" 366 hdf_handle.attrs["analyzer"] = self.gcms.analyzer 367 hdf_handle.attrs["instrument_label"] = 
self.gcms.instrument_label 368 369 hdf_handle.attrs["sample_id"] = "self.gcms.id" 370 hdf_handle.attrs["sample_name"] = self.gcms.sample_name 371 hdf_handle.attrs["input_data"] = str(self.gcms.file_location) 372 hdf_handle.attrs["output_data"] = str(output_path) 373 hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex 374 hdf_handle.attrs["corems_version"] = __version__ 375 376 hdf_handle.attrs["Stats"] = json.dumps( 377 self.get_data_stats(self.gcms), 378 sort_keys=False, 379 indent=4, 380 separators=(",", ": "), 381 ) 382 hdf_handle.attrs["Calibration"] = json.dumps( 383 self.get_calibration_stats(self.gcms, id_label), 384 sort_keys=False, 385 indent=4, 386 separators=(",", ": "), 387 ) 388 hdf_handle.attrs["Blank"] = json.dumps( 389 self.get_blank_stats(self.gcms), 390 sort_keys=False, 391 indent=4, 392 separators=(",", ": "), 393 ) 394 395 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms) 396 hdf_handle.attrs["CoreMSParameters"] = json.dumps( 397 corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ") 398 ) 399 400 scans_dataset = hdf_handle.create_dataset( 401 "scans", data=np.array(self.gcms.scans_number), dtype="f8" 402 ) 403 rt_dataset = hdf_handle.create_dataset( 404 "rt", data=np.array(self.gcms.retention_time), dtype="f8" 405 ) 406 tic_dataset = hdf_handle.create_dataset( 407 "tic", data=np.array(self.gcms.tic), dtype="f8" 408 ) 409 processed_tic_dataset = hdf_handle.create_dataset( 410 "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8" 411 ) 412 413 output_score_method = ( 414 self.gcms.molecular_search_settings.output_score_method 415 ) 416 417 for gc_peak in self.gcms: 418 # print(gc_peak.retention_time) 419 # print(gc_peak.tic) 420 421 # check if there is a compound candidate 422 peak_group = hdf_handle.create_group(str(gc_peak.retention_time)) 423 peak_group.attrs["deconvolution"] = int( 424 self.gcms.chromatogram_settings.use_deconvolution 425 ) 426 427 peak_group.attrs["start_scan"] = 
gc_peak.start_scan 428 peak_group.attrs["apex_scan"] = gc_peak.apex_scan 429 peak_group.attrs["final_scan"] = gc_peak.final_scan 430 431 peak_group.attrs["retention_index"] = gc_peak.ri 432 peak_group.attrs["retention_time"] = gc_peak.retention_time 433 peak_group.attrs["area"] = gc_peak.area 434 435 mz = peak_group.create_dataset( 436 "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8" 437 ) 438 abundance = peak_group.create_dataset( 439 "abundance", 440 data=np.array(gc_peak.mass_spectrum.abundance), 441 dtype="f8", 442 ) 443 444 if gc_peak: 445 if output_score_method == "highest_sim_score": 446 compound_obj = gc_peak.highest_score_compound 447 add_compound(gc_peak, compound_obj) 448 449 elif output_score_method == "highest_ss": 450 compound_obj = gc_peak.highest_ss_compound 451 add_compound(gc_peak, compound_obj) 452 453 else: 454 for compound_obj in gc_peak: 455 add_compound(gc_peak, compound_obj)
Export the data to an HDF5 file.
Parameters:
id_label : str, optional The ID label for the data. Default is "corems:".
457 def get_data_stats(self, gcms): 458 """Get statistics about the GCMS data. 459 460 Parameters: 461 ---------- 462 gcms : object 463 The low resolution GCMS object. 464 465 Returns: 466 ------- 467 dict 468 A dictionary containing the data statistics. 469 """ 470 471 matched_peaks = gcms.matched_peaks 472 no_matched_peaks = gcms.no_matched_peaks 473 unique_metabolites = gcms.unique_metabolites 474 475 peak_matchs_above_0p85 = 0 476 unique_peak_match_above_0p85 = 0 477 for match_peak in matched_peaks: 478 gc_peak_above_85 = 0 479 matches_above_85 = list( 480 filter(lambda m: m.similarity_score >= 0.85, match_peak) 481 ) 482 if matches_above_85: 483 peak_matchs_above_0p85 += 1 484 if len(matches_above_85) == 1: 485 unique_peak_match_above_0p85 += 1 486 487 data_stats = {} 488 data_stats["average_signal_noise"] = "ni" 489 data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range 490 data_stats["total_number_peaks"] = len(gcms) 491 data_stats["total_peaks_matched"] = len(matched_peaks) 492 data_stats["total_peaks_without_matches"] = len(no_matched_peaks) 493 data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85 494 data_stats["single_matches_above_similarity_score_0.85"] = ( 495 unique_peak_match_above_0p85 496 ) 497 data_stats["unique_metabolites"] = len(unique_metabolites) 498 499 return data_stats
Get statistics about the GCMS data.
Parameters:
gcms : object The low resolution GCMS object.
Returns:
dict A dictionary containing the data statistics.
501 def get_calibration_stats(self, gcms, id_label): 502 """Get statistics about the GC-MS calibration. 503 504 Parameters: 505 ---------- 506 """ 507 calibration_parameters = {} 508 509 calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref 510 calibration_parameters["data_url"] = str(gcms.cal_file_path) 511 calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path) 512 calibration_parameters["data_name"] = str(gcms.cal_file_path.stem) 513 calibration_parameters["calibration_method"] = "" 514 515 return calibration_parameters
Get statistics about the GC-MS calibration.
Parameters:
517 def get_blank_stats(self, gcms): 518 """Get statistics about the GC-MS blank.""" 519 blank_parameters = {} 520 521 blank_parameters["data_name"] = "ni" 522 blank_parameters["blank_id"] = "ni" 523 blank_parameters["data_url"] = "ni" 524 blank_parameters["has_input"] = "ni" 525 blank_parameters["common_features_to_blank"] = "ni" 526 527 return blank_parameters
Get statistics about the GC-MS blank.
529 def get_instrument_metadata(self, gcms): 530 """Get metadata about the GC-MS instrument.""" 531 instrument_metadata = {} 532 533 instrument_metadata["analyzer"] = gcms.analyzer 534 instrument_metadata["instrument_label"] = gcms.instrument_label 535 instrument_metadata["instrument_id"] = uuid.uuid4().hex 536 537 return instrument_metadata
Get metadata about the GC-MS instrument.
539 def get_data_metadata(self, gcms, id_label, output_path): 540 """Get metadata about the GC-MS data. 541 542 Parameters: 543 ---------- 544 gcms : object 545 The low resolution GCMS object. 546 id_label : str 547 The ID label for the data. 548 output_path : str 549 The output file path. 550 551 Returns: 552 ------- 553 dict 554 A dictionary containing the data metadata. 555 """ 556 if isinstance(output_path, str): 557 output_path = Path(output_path) 558 559 paramaters_path = output_path.with_suffix(".json") 560 561 if paramaters_path.exists(): 562 with paramaters_path.open() as current_param: 563 metadata = json.load(current_param) 564 data_metadata = metadata.get("Data") 565 else: 566 data_metadata = {} 567 data_metadata["data_name"] = [] 568 data_metadata["input_data_url"] = [] 569 data_metadata["has_input"] = [] 570 571 data_metadata["data_name"].append(gcms.sample_name) 572 data_metadata["input_data_url"].append(str(gcms.file_location)) 573 data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location)) 574 575 data_metadata["output_data_name"] = str(output_path.stem) 576 data_metadata["output_data_url"] = str(output_path) 577 data_metadata["has_output"] = id_label + corems_md5(output_path) 578 579 return data_metadata
Get metadata about the GC-MS data.
Parameters:
gcms : object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.
Returns:
dict A dictionary containing the data metadata.
581 def get_parameters_json(self, gcms, id_label, output_path): 582 """Get the parameters as a JSON string. 583 584 Parameters: 585 ---------- 586 gcms : GCMS object 587 The low resolution GCMS object. 588 id_label : str 589 The ID label for the data. 590 output_path : str 591 The output file path. 592 593 Returns: 594 ------- 595 str 596 The parameters as a JSON string. 597 """ 598 599 output_parameters_dict = {} 600 output_parameters_dict["Data"] = self.get_data_metadata( 601 gcms, id_label, output_path 602 ) 603 output_parameters_dict["Stats"] = self.get_data_stats(gcms) 604 output_parameters_dict["Calibration"] = self.get_calibration_stats( 605 gcms, id_label 606 ) 607 output_parameters_dict["Blank"] = self.get_blank_stats(gcms) 608 output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms) 609 corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms) 610 corems_dict_setting["corems_version"] = __version__ 611 output_parameters_dict["CoreMSParameters"] = corems_dict_setting 612 output_parameters_dict["has_metabolite"] = gcms.metabolites_data 613 output = json.dumps( 614 output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ") 615 ) 616 617 return output
Get the parameters as a JSON string.
Parameters:
gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.
Returns:
str The parameters as a JSON string.
def write_settings(self, output_path, gcms, id_label="emsl:"):
    """Write the export parameters and metadata to a ``.json`` sidecar file.

    Parameters
    ----------
    output_path : str or Path
        The output file path; its extension is replaced with ``.json``.
    gcms : GCMS object
        The low resolution GCMS object.
    id_label : str, optional
        The ID label for the data. Default is "emsl:".
    """
    # Accept plain strings for parity with get_data_metadata, which also
    # normalizes its output_path argument; the original code assumed Path.
    if isinstance(output_path, str):
        output_path = Path(output_path)

    output = self.get_parameters_json(gcms, id_label, output_path)

    output_path.with_suffix(".json").write_text(output, encoding="utf8")
Write the settings to a JSON file.
Parameters:
output_path : str The output file path. gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. Default is "emsl:".
def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
    """Get the exported data as a list of dictionaries.

    Produces one row per (peak, compound candidate) pair; peaks without any
    candidate optionally get a minimal "no match" row.

    Parameters
    ----------
    gcms : object
        The low resolution GCMS object.
    include_no_match : bool, optional
        Whether to include rows for peaks without a compound match. Default is True.
    no_match_inline : bool, optional
        Whether to emit no-match rows inline (in peak order) instead of
        appending them after all matched rows. Default is False.

    Returns
    -------
    list
        The exported data as a list of dictionaries.
    """

    output_score_method = gcms.molecular_search_settings.output_score_method

    dict_data_list = []

    def add_match_dict_data():
        # Builds one output row for the compound candidate currently bound to
        # the enclosing loop variables (gcpeak_index, gc_peak, compound_obj).
        derivatization = "{}:{}:{}".format(
            compound_obj.classify,
            compound_obj.derivativenum,
            compound_obj.derivatization,
        )
        out_dict = {
            "Sample name": gcms.sample_name,
            "Peak Index": gcpeak_index,
            "Retention Time": gc_peak.retention_time,
            "Retention Time Ref": compound_obj.retention_time,
            "Peak Height": gc_peak.tic,
            "Peak Area": gc_peak.area,
            "Retention index": gc_peak.ri,
            "Retention index Ref": compound_obj.ri,
            "Retention Index Score": compound_obj.ri_score,
            "Spectral Similarity Score": compound_obj.spectral_similarity_score,
            "Similarity Score": compound_obj.similarity_score,
            "Compound Name": compound_obj.name,
            "Chebi ID": compound_obj.metadata.chebi,
            "Kegg Compound ID": compound_obj.metadata.kegg,
            "Inchi": compound_obj.metadata.inchi,
            "Inchi Key": compound_obj.metadata.inchikey,
            "Smiles": compound_obj.metadata.smiles,
            "Molecular Formula": compound_obj.formula,
            "IUPAC Name": compound_obj.metadata.iupac_name,
            "Traditional Name": compound_obj.metadata.traditional_name,
            "Common Name": compound_obj.metadata.common_name,
            "Derivatization": derivatization,
        }

        # NOTE(review): this reads self.gcms while the rest of the method
        # uses the gcms argument — presumably the same object; confirm.
        if self.gcms.molecular_search_settings.exploratory_mode:
            # In exploratory mode, export every individual similarity metric.
            out_dict.update(
                {
                    "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                        "weighted_cosine_correlation"
                    ),
                    "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                        "cosine_correlation"
                    ),
                    "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
                        "stein_scott_similarity"
                    ),
                    "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
                        "pearson_correlation"
                    ),
                    "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
                        "spearman_correlation"
                    ),
                    "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
                        "kendall_tau_correlation"
                    ),
                    "DFT Correlation": compound_obj.spectral_similarity_scores.get(
                        "dft_correlation"
                    ),
                    "DWT Correlation": compound_obj.spectral_similarity_scores.get(
                        "dwt_correlation"
                    ),
                    "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
                        "euclidean_distance"
                    ),
                    "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
                        "manhattan_distance"
                    ),
                    "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
                        "jaccard_distance"
                    ),
                }
            )
            # Also add every registered similarity method under its display
            # name (may overlap with the explicitly-listed keys above).
            for method in methods_name:
                out_dict[methods_name.get(method)] = (
                    compound_obj.spectral_similarity_scores.get(method)
                )

        dict_data_list.append(out_dict)

    def add_no_match_dict_data():
        # Minimal row for a peak that has no compound candidates.
        dict_data_list.append(
            {
                "Sample name": gcms.sample_name,
                "Peak Index": gcpeak_index,
                "Retention Time": gc_peak.retention_time,
                "Peak Height": gc_peak.tic,
                "Peak Area": gc_peak.area,
                "Retention index": gc_peak.ri,
            }
        )

    for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
        # check if there is a compound candidate
        if gc_peak:
            if output_score_method == "highest_sim_score":
                compound_obj = gc_peak.highest_score_compound
                add_match_dict_data()

            elif output_score_method == "highest_ss":
                compound_obj = gc_peak.highest_ss_compound
                add_match_dict_data()

            else:
                # Default: one row per candidate compound.
                for compound_obj in gc_peak:
                    add_match_dict_data()  # add monoisotopic peak

        else:
            # include not_match
            if include_no_match and no_match_inline:
                add_no_match_dict_data()

    # Non-inline mode: append all no-match rows after the matched rows.
    if include_no_match and not no_match_inline:
        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
            if not gc_peak:
                add_no_match_dict_data()

    return dict_data_list
Get the exported data as a list of dictionaries.
Parameters:
gcms : object The low resolution GCMS object. include_no_match : bool, optional Whether to include no match data. Default is True. no_match_inline : bool, optional Whether to include no match data inline. Default is False.
Returns:
list The exported data as a list of dictionaries.
class HighResMassSpectraExport(HighResMassSpecExport):
    """A class to export high resolution mass spectra data.

    This class provides methods to export high resolution mass spectra data to various formats
    such as Excel, CSV, HDF5, and Pandas DataFrame.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path.
    mass_spectra : object
        The high resolution mass spectra object.
    output_type : str, optional
        The output type. Default is 'excel'.

    Attributes
    ----------
    output_file : Path
        The output file path without suffix
    dir_loc : Path
        The directory location for the output file,
        by default this will be the output_file + ".corems" and all output files will be
        written into this location
    mass_spectra : MassSpectraBase
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
        super().__init__(
            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
        )

        # str() accepts Path inputs, which the documented interface permits;
        # plain string concatenation would raise TypeError for a Path.
        self.dir_loc = Path(str(out_file_path) + ".corems")
        self.dir_loc.mkdir(exist_ok=True)
        # Place the output file in the directory
        self.output_file = self.dir_loc / Path(out_file_path).name
        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
        self.mass_spectra = mass_spectra
        self.atoms_order_list = None
        self._init_columns()

    def _spectrum_to_dataframe(self, mass_spectrum):
        """Build the assigned-peak DataFrame for a single mass spectrum."""
        columns = self.columns_label + self.get_all_used_atoms_in_order(
            mass_spectrum
        )
        return DataFrame(self.get_list_dict_data(mass_spectrum), columns=columns)

    def get_pandas_df(self):
        """Get the mass spectra as a list of Pandas DataFrames."""

        list_df = []

        for mass_spectrum in self.mass_spectra:
            df = self._spectrum_to_dataframe(mass_spectrum)
            # Tag each frame with the output file stem and its scan number.
            df.name = str(self.output_file) + "_" + str(mass_spectrum.scan_number)
            list_df.append(df)

        return list_df

    def to_pandas(self, write_metadata=True):
        """Export the data to a Pandas DataFrame and save it as a pickle file.

        One pickle file is written per mass spectrum, named
        ``<output_file>_scan<scan_number>.pkl``.

        Parameters
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """

        for mass_spectrum in self.mass_spectra:
            df = self._spectrum_to_dataframe(mass_spectrum)

            out_filename = Path(
                f"{self.output_file}_scan{mass_spectrum.scan_number}.pkl"
            )

            df.to_pickle(self.dir_loc / out_filename)

            if write_metadata:
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def to_excel(self, write_metadata=True):
        """Export the data to an Excel file.

        One .xlsx file is written per mass spectrum, named
        ``<output_file>_scan<scan_number>.xlsx``.

        Parameters
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """
        for mass_spectrum in self.mass_spectra:
            df = self._spectrum_to_dataframe(mass_spectrum)

            out_filename = Path(
                f"{self.output_file}_scan{mass_spectrum.scan_number}.xlsx"
            )

            df.to_excel(self.dir_loc / out_filename)

            if write_metadata:
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def to_csv(self, write_metadata=True):
        """Export the data to a CSV file.

        One .csv file is written per mass spectrum, named
        ``<output_file>_scan<scan_number>.csv``.

        Parameters
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        """
        # csv is imported at module level; the redundant local import was removed.
        for mass_spectrum in self.mass_spectra:
            columns = self.columns_label + self.get_all_used_atoms_in_order(
                mass_spectrum
            )

            dict_data_list = self.get_list_dict_data(mass_spectrum)

            out_filename = Path(
                f"{self.output_file}_scan{mass_spectrum.scan_number}.csv"
            )

            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=columns)
                writer.writeheader()
                writer.writerows(dict_data_list)

            if write_metadata:
                self.write_settings(
                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
                )

    def get_mass_spectra_attrs(self):
        """Serialize the analyzer, instrument label and sample name as JSON.

        Returns
        -------
        str
            The mass spectra attributes as a JSON string.
        """
        dict_ms_attrs = {
            "analyzer": self.mass_spectra.analyzer,
            "instrument_label": self.mass_spectra.instrument_label,
            "sample_name": self.mass_spectra.sample_name,
        }

        return json.dumps(
            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
        )

    def to_hdf(self, overwrite=False, export_raw=True):
        """Export the data to an HDF5 file.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        export_raw : bool, optional
            Whether to export the raw mass spectra data. Default is True.
        """
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            if not hdf_handle.attrs.get("date_utc"):
                # Set metadata for all mass spectra
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                # Use str() rather than Path's private ``_str`` attribute,
                # which is a pathlib implementation detail.
                hdf_handle.attrs["original_file_location"] = str(
                    self.mass_spectra.file_location
                )

            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            for mass_spectrum in self.mass_spectra:
                group_key = str(int(mass_spectrum.scan_number))

                self.add_mass_spectrum_to_hdf5(
                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
                )
A class to export high resolution mass spectra data.
This class provides methods to export high resolution mass spectra data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
Parameters
- out_file_path (str | Path): The output file path.
- mass_spectra (object): The high resolution mass spectra object.
- output_type (str, optional): The output type. Default is 'excel'.
Attributes
- output_file (Path): The output file path without suffix
- dir_loc (Path): The directory location for the output file, by default this will be the output_file + ".corems" and all output files will be written into this location
- mass_spectra (MassSpectraBase): The high resolution mass spectra object.
def __init__(self, out_file_path, mass_spectra, output_type="excel"):
    """Initialize the mass-spectra exporter.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path.
    mass_spectra : MassSpectraBase
        The high resolution mass spectra object.
    output_type : str, optional
        One of 'excel', 'csv', 'pandas' or 'hdf5'. Default is 'excel'.
    """
    super().__init__(
        out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
    )

    # str() accepts Path inputs, which the documented interface permits;
    # plain string concatenation would raise TypeError for a Path.
    self.dir_loc = Path(str(out_file_path) + ".corems")
    self.dir_loc.mkdir(exist_ok=True)
    # Place the output file in the directory
    self.output_file = self.dir_loc / Path(out_file_path).name
    self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
    self.mass_spectra = mass_spectra
    self.atoms_order_list = None
    self._init_columns()
Initializes the exporter. Arguments are: out_file_path — the output file path; a directory named "<out_file_path>.corems" is created and all output files are written into it. mass_spectra — the high resolution mass spectra object. output_type — one of 'excel', 'csv', 'pandas' or 'hdf5'; defaults to 'excel'. (The threading.Thread constructor documentation previously rendered here was inherited from a base class and does not describe this method.)
def get_pandas_df(self):
    """Return one Pandas DataFrame of assigned peaks per mass spectrum."""

    dataframes = []

    for spectrum in self.mass_spectra:
        header = self.columns_label + self.get_all_used_atoms_in_order(spectrum)
        frame = DataFrame(self.get_list_dict_data(spectrum), columns=header)
        # Tag each frame with the output file stem and its scan number.
        frame.name = str(self.output_file) + "_" + str(spectrum.scan_number)
        dataframes.append(frame)

    return dataframes
Get the mass spectra as a list of Pandas DataFrames.
def to_pandas(self, write_metadata=True):
    """Save each mass spectrum's assigned peaks as a per-scan pickle file.

    Files are named ``<output_file>_scan<scan_number>.pkl`` inside ``dir_loc``.

    Parameters
    ----------
    write_metadata : bool, optional
        Whether to write metadata to the output file. Default is True.
    """

    for spectrum in self.mass_spectra:
        header = self.columns_label + self.get_all_used_atoms_in_order(spectrum)
        frame = DataFrame(self.get_list_dict_data(spectrum), columns=header)

        out_filename = Path(
            "%s_scan%s%s" % (self.output_file, str(spectrum.scan_number), ".pkl")
        )
        frame.to_pickle(self.dir_loc / out_filename)

        if write_metadata:
            # The settings sidecar shares the pickle's name, minus suffix.
            self.write_settings(
                self.dir_loc / out_filename.with_suffix(""), spectrum
            )
Export the data to a Pandas DataFrame and save it as a pickle file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
def to_excel(self, write_metadata=True):
    """Save each mass spectrum's assigned peaks as a per-scan Excel file.

    Files are named ``<output_file>_scan<scan_number>.xlsx`` inside ``dir_loc``.

    Parameters
    ----------
    write_metadata : bool, optional
        Whether to write metadata to the output file. Default is True.
    """
    for spectrum in self.mass_spectra:
        header = self.columns_label + self.get_all_used_atoms_in_order(spectrum)
        frame = DataFrame(self.get_list_dict_data(spectrum), columns=header)

        out_filename = Path(
            "%s_scan%s%s" % (self.output_file, str(spectrum.scan_number), ".xlsx")
        )
        frame.to_excel(self.dir_loc / out_filename)

        if write_metadata:
            # The settings sidecar shares the workbook's name, minus suffix.
            self.write_settings(
                self.dir_loc / out_filename.with_suffix(""), spectrum
            )
Export the data to an Excel file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
def to_csv(self, write_metadata=True):
    """Save each mass spectrum's assigned peaks as a per-scan CSV file.

    Files are named ``<output_file>_scan<scan_number>.csv`` inside ``dir_loc``.

    Parameters
    ----------
    write_metadata : bool, optional
        Whether to write metadata to the output file. Default is True.
    """
    # csv is imported at module level; the redundant local import was removed.
    for mass_spectrum in self.mass_spectra:
        columns = self.columns_label + self.get_all_used_atoms_in_order(
            mass_spectrum
        )

        dict_data_list = self.get_list_dict_data(mass_spectrum)

        out_filename = Path(
            "%s_scan%s%s" % (self.output_file, str(mass_spectrum.scan_number), ".csv")
        )

        with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            writer.writerows(dict_data_list)

        if write_metadata:
            self.write_settings(
                self.dir_loc / out_filename.with_suffix(""), mass_spectrum
            )
Export the data to a CSV file.
Parameters:
write_metadata : bool, optional Whether to write metadata to the output file. Default is True.
def get_mass_spectra_attrs(self):
    """Serialize the analyzer, instrument label and sample name as JSON.

    The values are read from ``self.mass_spectra``; no parameters are taken.

    Returns
    -------
    str
        The mass spectra attributes as a JSON string.
    """
    attrs = {
        "analyzer": self.mass_spectra.analyzer,
        "instrument_label": self.mass_spectra.instrument_label,
        "sample_name": self.mass_spectra.sample_name,
    }

    return json.dumps(attrs, sort_keys=False, indent=4, separators=(",", ": "))
Get the mass spectra attributes as a JSON string.
Takes no parameters; the analyzer, instrument label, and sample name are read from the instance's mass spectra object.
Returns:
str The mass spectra attributes as a JSON string.
def to_hdf(self, overwrite=False, export_raw=True):
    """Export the data to an HDF5 file.

    Parameters
    ----------
    overwrite : bool, optional
        Whether to overwrite the output file. Default is False.
    export_raw : bool, optional
        Whether to export the raw mass spectra data. Default is True.
    """
    if overwrite:
        if self.output_file.with_suffix(".hdf5").exists():
            self.output_file.with_suffix(".hdf5").unlink()

    with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
        # Only stamp file-level metadata once (first writer wins).
        if not hdf_handle.attrs.get("date_utc"):
            # Set metadata for all mass spectra
            timenow = str(
                datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
            )
            hdf_handle.attrs["date_utc"] = timenow
            hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
            hdf_handle.attrs["data_structure"] = "mass_spectra"
            hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
            hdf_handle.attrs["instrument_label"] = (
                self.mass_spectra.instrument_label
            )
            hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
            hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
            hdf_handle.attrs["parser_type"] = (
                self.mass_spectra.spectra_parser_class.__name__
            )
            # Use str() rather than Path's private ``_str`` attribute, which
            # is a pathlib implementation detail and not a public API.
            hdf_handle.attrs["original_file_location"] = str(
                self.mass_spectra.file_location
            )

        if "mass_spectra" not in hdf_handle:
            mass_spectra_group = hdf_handle.create_group("mass_spectra")
        else:
            mass_spectra_group = hdf_handle.get("mass_spectra")

        for mass_spectrum in self.mass_spectra:
            # Groups are keyed by integer scan number.
            group_key = str(int(mass_spectrum.scan_number))

            self.add_mass_spectrum_to_hdf5(
                hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
            )
Export the data to an HDF5 file.
Parameters
- overwrite (bool, optional): Whether to overwrite the output file. Default is False.
- export_raw (bool, optional): Whether to export the raw mass spectra data. Default is True.
Inherited Members
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
class LCMSExport(HighResMassSpectraExport):
    """A class to export high resolution LC-MS data.

    This class provides methods to export high resolution LC-MS data to HDF5.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    lcms_object : LCMSBase
        The high resolution lc-ms object.
    """

    def __init__(self, out_file_path, mass_spectra):
        # LC-MS export only supports HDF5 output.
        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
        """Export the data to an HDF5.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        save_parameters : bool, optional
            Whether to save the parameters as a separate json or toml file. Default is True.
        parameter_format : str, optional
            The format to save the parameters in. Default is 'toml'.

        Raises
        ------
        ValueError
            If parameter_format is not 'json' or 'toml'.
        """
        export_profile_spectra = (
            self.mass_spectra.parameters.lc_ms.export_profile_spectra
        )

        # Write the mass spectra data to the hdf5 file
        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)

        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Add scan_info to hdf5 file
            if "scan_info" not in hdf_handle:
                scan_info_group = hdf_handle.create_group("scan_info")
                for k, v in self.mass_spectra._scan_info.items():
                    array = np.array(list(v.values()))
                    # h5py cannot store numpy unicode arrays; convert to bytes.
                    if array.dtype.str[0:2] == "<U":
                        array = array.astype("S")
                    scan_info_group.create_dataset(k, data=array)

            # Add ms_unprocessed to hdf5 file
            export_unprocessed_ms1 = (
                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
            )
            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
                if "ms_unprocessed" not in hdf_handle:
                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
                else:
                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
                for k, v in self.mass_spectra._ms_unprocessed.items():
                    array = np.array(v)
                    ms_unprocessed_group.create_dataset(str(k), data=array)

            # Add LCMS mass features to hdf5 file
            if len(self.mass_spectra.mass_features) > 0:
                if "mass_features" not in hdf_handle:
                    mass_features_group = hdf_handle.create_group("mass_features")
                else:
                    mass_features_group = hdf_handle.get("mass_features")

                # Create group for each mass feature, with key as the mass feature id
                for k, v in self.mass_spectra.mass_features.items():
                    mass_features_group.create_group(str(k))
                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
                            if k2 not in [
                                "chromatogram_parent",
                                "ms2_mass_spectra",
                                "mass_spectrum",
                                "_eic_data",
                                "ms2_similarity_results",
                            ]:
                                # These attributes are array-like and are stored
                                # as datasets; scalars become group attributes.
                                if k2 == "ms2_scan_numbers":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "_half_height_width":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "_ms_deconvoluted_idx":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif k2 == "associated_mass_features_deconvoluted":
                                    array = np.array(v2)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array
                                    )
                                elif (
                                    isinstance(v2, int)
                                    or isinstance(v2, float)
                                    or isinstance(v2, str)
                                    or isinstance(v2, np.integer)
                                    or isinstance(v2, np.bool_)
                                ):
                                    mass_features_group[str(k)].attrs[str(k2)] = v2
                                else:
                                    raise TypeError(
                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                    )

            # Add EIC data to hdf5 file
            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
            if len(self.mass_spectra.eics) > 0 and export_eics:
                if "eics" not in hdf_handle:
                    eic_group = hdf_handle.create_group("eics")
                else:
                    eic_group = hdf_handle.get("eics")

                # Create group for each eic
                for k, v in self.mass_spectra.eics.items():
                    eic_group.create_group(str(k))
                    eic_group[str(k)].attrs["mz"] = k
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            array = np.array(v2)
                            eic_group[str(k)].create_dataset(str(k2), data=array)

            # Add ms2_search results to hdf5 file
            if len(self.mass_spectra.spectral_search_results) > 0:
                if "spectral_search_results" not in hdf_handle:
                    spectral_search_results = hdf_handle.create_group(
                        "spectral_search_results"
                    )
                else:
                    spectral_search_results = hdf_handle.get("spectral_search_results")
                # Create group for each search result by ms2_scan / precursor_mz
                for k, v in self.mass_spectra.spectral_search_results.items():
                    spectral_search_results.create_group(str(k))
                    for k2, v2 in v.items():
                        spectral_search_results[str(k)].create_group(str(k2))
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "precursor_mz"
                        ] = v2.precursor_mz
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "query_spectrum_id"
                        ] = v2.query_spectrum_id
                        # Loop through each of the attributes and add them as datasets (if array)
                        for k3, v3 in v2.__dict__.items():
                            if v3 is not None and k3 not in [
                                "query_spectrum",
                                "precursor_mz",
                                "query_spectrum_id",
                            ]:
                                # Fragment-type lists are joined into strings
                                # before conversion so they store cleanly.
                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                    v3 = [", ".join(x) for x in v3]
                                array = np.array(v3)
                                if array.dtype.str[0:2] == "<U":
                                    array = array.astype("S")
                                spectral_search_results[str(k)][str(k2)].create_dataset(
                                    str(k3), data=array
                                )

        # Save parameters as separate json
        if save_parameters:
            # Check if parameter_format is valid
            if parameter_format not in ["json", "toml"]:
                raise ValueError("parameter_format must be 'json' or 'toml'")

            if parameter_format == "json":
                dump_lcms_settings_json(
                    filename=self.output_file.with_suffix(".json"),
                    lcms_obj=self.mass_spectra,
                )
            elif parameter_format == "toml":
                dump_lcms_settings_toml(
                    filename=self.output_file.with_suffix(".toml"),
                    lcms_obj=self.mass_spectra,
                )
A class to export high resolution LC-MS data.
This class provides methods to export high resolution LC-MS data to HDF5.
Parameters
- out_file_path (str | Path): The output file path, do not include the file extension.
- lcms_object (LCMSBase): The high resolution lc-ms object.
def __init__(self, out_file_path, mass_spectra):
    """Initialize the LC-MS exporter.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : LCMSBase
        The high resolution lc-ms object.
    """
    # LC-MS export only supports HDF5 output.
    super().__init__(out_file_path, mass_spectra, output_type="hdf5")
Initializes the LC-MS exporter by delegating to HighResMassSpectraExport with output_type fixed to 'hdf5'. Arguments are: out_file_path — the output file path, without the file extension. mass_spectra — the high resolution LC-MS object. (The threading.Thread constructor documentation previously rendered here was inherited from a base class and does not describe this method.)
def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
    """Export the data to an HDF5.

    Parameters
    ----------
    overwrite : bool, optional
        Whether to overwrite the output file. Default is False.
    save_parameters : bool, optional
        Whether to save the parameters as a separate json or toml file. Default is True.
    parameter_format : str, optional
        The format to save the parameters in. Default is 'toml'.

    Raises
    ------
    ValueError
        If parameter_format is not 'json' or 'toml'.
    """
    export_profile_spectra = (
        self.mass_spectra.parameters.lc_ms.export_profile_spectra
    )

    # Write the mass spectra data to the hdf5 file
    super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)

    # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
    with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
        # Add scan_info to hdf5 file
        if "scan_info" not in hdf_handle:
            scan_info_group = hdf_handle.create_group("scan_info")
            for k, v in self.mass_spectra._scan_info.items():
                array = np.array(list(v.values()))
                # h5py cannot store numpy unicode arrays; convert to bytes.
                if array.dtype.str[0:2] == "<U":
                    array = array.astype("S")
                scan_info_group.create_dataset(k, data=array)

        # Add ms_unprocessed to hdf5 file
        export_unprocessed_ms1 = (
            self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
        )
        if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
            if "ms_unprocessed" not in hdf_handle:
                ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
            else:
                ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
            for k, v in self.mass_spectra._ms_unprocessed.items():
                array = np.array(v)
                ms_unprocessed_group.create_dataset(str(k), data=array)

        # Add LCMS mass features to hdf5 file
        if len(self.mass_spectra.mass_features) > 0:
            if "mass_features" not in hdf_handle:
                mass_features_group = hdf_handle.create_group("mass_features")
            else:
                mass_features_group = hdf_handle.get("mass_features")

            # Create group for each mass feature, with key as the mass feature id
            for k, v in self.mass_spectra.mass_features.items():
                mass_features_group.create_group(str(k))
                # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                for k2, v2 in v.__dict__.items():
                    if v2 is not None:
                        # Check if the attribute is an integer or float and set as an attribute in the mass feature group
                        if k2 not in [
                            "chromatogram_parent",
                            "ms2_mass_spectra",
                            "mass_spectrum",
                            "_eic_data",
                            "ms2_similarity_results",
                        ]:
                            # These attributes are array-like and are stored
                            # as datasets; scalars become group attributes.
                            if k2 == "ms2_scan_numbers":
                                array = np.array(v2)
                                mass_features_group[str(k)].create_dataset(
                                    str(k2), data=array
                                )
                            elif k2 == "_half_height_width":
                                array = np.array(v2)
                                mass_features_group[str(k)].create_dataset(
                                    str(k2), data=array
                                )
                            elif k2 == "_ms_deconvoluted_idx":
                                array = np.array(v2)
                                mass_features_group[str(k)].create_dataset(
                                    str(k2), data=array
                                )
                            elif k2 == "associated_mass_features_deconvoluted":
                                array = np.array(v2)
                                mass_features_group[str(k)].create_dataset(
                                    str(k2), data=array
                                )
                            elif (
                                isinstance(v2, int)
                                or isinstance(v2, float)
                                or isinstance(v2, str)
                                or isinstance(v2, np.integer)
                                or isinstance(v2, np.bool_)
                            ):
                                mass_features_group[str(k)].attrs[str(k2)] = v2
                            else:
                                raise TypeError(
                                    f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                )

        # Add EIC data to hdf5 file
        export_eics = self.mass_spectra.parameters.lc_ms.export_eics
        if len(self.mass_spectra.eics) > 0 and export_eics:
            if "eics" not in hdf_handle:
                eic_group = hdf_handle.create_group("eics")
            else:
                eic_group = hdf_handle.get("eics")

            # Create group for each eic
            for k, v in self.mass_spectra.eics.items():
                eic_group.create_group(str(k))
                eic_group[str(k)].attrs["mz"] = k
                # Loop through each of the attributes and add them as datasets (if array)
                for k2, v2 in v.__dict__.items():
                    if v2 is not None:
                        array = np.array(v2)
                        eic_group[str(k)].create_dataset(str(k2), data=array)

        # Add ms2_search results to hdf5 file
        if len(self.mass_spectra.spectral_search_results) > 0:
            if "spectral_search_results" not in hdf_handle:
                spectral_search_results = hdf_handle.create_group(
                    "spectral_search_results"
                )
            else:
                spectral_search_results = hdf_handle.get("spectral_search_results")
            # Create group for each search result by ms2_scan / precursor_mz
            for k, v in self.mass_spectra.spectral_search_results.items():
                spectral_search_results.create_group(str(k))
                for k2, v2 in v.items():
                    spectral_search_results[str(k)].create_group(str(k2))
                    spectral_search_results[str(k)][str(k2)].attrs[
                        "precursor_mz"
                    ] = v2.precursor_mz
                    spectral_search_results[str(k)][str(k2)].attrs[
                        "query_spectrum_id"
                    ] = v2.query_spectrum_id
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k3, v3 in v2.__dict__.items():
                        if v3 is not None and k3 not in [
                            "query_spectrum",
                            "precursor_mz",
                            "query_spectrum_id",
                        ]:
                            # Fragment-type lists are joined into strings
                            # before conversion so they store cleanly.
                            if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                v3 = [", ".join(x) for x in v3]
                            array = np.array(v3)
                            if array.dtype.str[0:2] == "<U":
                                array = array.astype("S")
                            spectral_search_results[str(k)][str(k2)].create_dataset(
                                str(k3), data=array
                            )

    # Save parameters as separate json
    if save_parameters:
        # Check if parameter_format is valid
        if parameter_format not in ["json", "toml"]:
            raise ValueError("parameter_format must be 'json' or 'toml'")

        if parameter_format == "json":
            dump_lcms_settings_json(
                filename=self.output_file.with_suffix(".json"),
                lcms_obj=self.mass_spectra,
            )
        elif parameter_format == "toml":
            dump_lcms_settings_toml(
                filename=self.output_file.with_suffix(".toml"),
                lcms_obj=self.mass_spectra,
            )
Export the data to an HDF5 file.
Parameters
- overwrite (bool, optional): Whether to overwrite the output file. Default is False.
- save_parameters (bool, optional): Whether to save the parameters as a separate json or toml file. Default is True.
- parameter_format (str, optional): The format to save the parameters in. Default is 'toml'.
Raises
- ValueError: If parameter_format is not 'json' or 'toml'.
Inherited Members
- HighResMassSpectraExport
- dir_loc
- output_file
- mass_spectra
- atoms_order_list
- get_pandas_df
- to_pandas
- to_excel
- to_csv
- get_mass_spectra_attrs
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id
class LipidomicsExport(LCMSExport):
    """A class to export lipidomics data.

    This class provides methods to export lipidomics data to various formats and summarize the lipid report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra)
        # Expose the module-level adduct table on the instance for convenience
        self.ion_type_dict = ion_type_dict

    @staticmethod
    def get_ion_formula(neutral_formula, ion_type):
        """From a neutral formula and an ion type, return the formula of the ion.

        Notes
        -----
        This is a static method.
        If the neutral_formula is not a string, this method will return None.

        Parameters
        ----------
        neutral_formula : str
            The neutral formula, this should be a string form from the MolecularFormula class
            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
            e.g. MgCl2 is parsed as 'Mg Cl2'.
        ion_type : str
            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
            See the self.ion_type_dict for the available ion types.

        Returns
        -------
        str
            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
        """
        # If neutral_formula is not a string, return None
        if not isinstance(neutral_formula, str):
            return None

        # Formulas containing whitespace are already in MolecularFormula string
        # form and can be passed straight through
        if re.search(r"\s", neutral_formula):
            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
        else:
            # Split a condensed formula (e.g. 'C2H4O2') on capital letters,
            # then parse element symbols and their optional counts
            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
            neutral_formula = MolecularFormula(
                dict(
                    zip(
                        [x[0] for x in elements],
                        [int(x[0]) if x else 1 for x in counts],
                    )
                ),
                ion_charge=0,
            )
        neutral_formula_dict = neutral_formula.to_dict().copy()

        # Add the adduct atoms to the neutral formula
        adduct_add_dict = ion_type_dict[ion_type][0]
        for key in adduct_add_dict:
            if key in neutral_formula_dict.keys():
                neutral_formula_dict[key] += adduct_add_dict[key]
            else:
                neutral_formula_dict[key] = adduct_add_dict[key]

        # Subtract the adduct atoms from the neutral formula
        adduct_subtract = ion_type_dict[ion_type][1]
        for key in adduct_subtract:
            neutral_formula_dict[key] -= adduct_subtract[key]

        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

    @staticmethod
    def get_isotope_type(ion_formula):
        """From an ion formula, return the 13C isotope type of the ion.

        Notes
        -----
        This is a static method.
        If the ion_formula is not a string, this method will return None.
        This is currently only functional for 13C isotopes.

        Parameters
        ----------
        ion_formula : str
            The formula of the ion, expected to be a string like 'C2 H4 O2'.

        Returns
        -------
        str
            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.

        Raises
        ------
        ValueError
            If the ion_formula is a string without spaces, i.e. not in the
            'C2 H4 O2' form produced by the MolecularFormula class.
        """
        if not isinstance(ion_formula, str):
            return None

        if re.search(r"\s", ion_formula):
            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
        else:
            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
        ion_formula_dict = ion_formula.to_dict().copy()

        # The '13C' key, when present, holds the number of 13C atoms
        try:
            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
        except KeyError:
            iso_class = None

        return iso_class

    def clean_ms1_report(self, ms1_summary_full):
        """Clean the MS1 report.

        Adds ion_formula and isotopologue_type columns and keeps only the
        columns relevant for the final report.

        Parameters
        ----------
        ms1_summary_full : DataFrame
            The full MS1 summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned MS1 summary DataFrame, indexed by mf_id.
        """
        ms1_summary_full = ms1_summary_full.reset_index()
        cols_to_keep = [
            "mf_id",
            "Molecular Formula",
            "Ion Type",
            "Calculated m/z",
            "m/z Error (ppm)",
            "m/z Error Score",
            "Is Isotopologue",
            "Isotopologue Similarity",
            "Confidence Score",
        ]
        ms1_summary = ms1_summary_full[cols_to_keep].copy()
        ms1_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
        ]
        ms1_summary["isotopologue_type"] = [
            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
        ]

        # Reorder columns
        ms1_summary = ms1_summary[
            [
                "mf_id",
                "ion_formula",
                "isotopologue_type",
                "Calculated m/z",
                "m/z Error (ppm)",
                "m/z Error Score",
                "Isotopologue Similarity",
                "Confidence Score",
            ]
        ]

        # Set the index to mf_id
        ms1_summary = ms1_summary.set_index("mf_id")

        return ms1_summary

    def summarize_lipid_report(self, ms2_annot):
        """Summarize the lipid report.

        Attempts to collapse all MS2 annotations per mass feature to a single
        consensus annotation, first at the molecular lipid (MLF) level, then
        at the species level; remaining features are reported per scenario.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized lipid report, indexed by mf_id.

        Raises
        ------
        ValueError
            If mf_id is not present as a column or index, or if an unexpected
            annotation scenario is encountered for a mass feature.
        """
        # Drop unnecessary columns for easier viewing
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "metabref_mol_id",
            "metabref_precursor_mz",
            "cas",
            "inchikey",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        ms2_annot = ms2_annot.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
        )

        # If ion_types_excluded is not empty, remove those ion types
        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
            "ms2"
        ].molecular_search.ion_types_excluded
        if len(ion_types_excluded) > 0:
            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

        # If mf_id is not present, check that the index name is mf_id and reset the index
        if "mf_id" not in ms2_annot.columns:
            if ms2_annot.index.name == "mf_id":
                ms2_annot = ms2_annot.reset_index()
            else:
                raise ValueError("mf_id is not present in the dataframe")

        # Attempt to get consensus annotations to the MLF level
        mlf_results_all = []
        for mf_id in ms2_annot["mf_id"].unique():
            mlf_results_perid = []
            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)

            # Iterate only the scans belonging to this mass feature (previously
            # iterated every scan in the full frame, yielding empty subsets)
            for query_scan in ms2_annot_mf["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_mf[
                    ms2_annot_mf["query_spectrum_id"] == query_scan
                ].copy()

                # If there is only one lipid_summed_name, try to get a consensus
                # molecular species annotation
                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                    # New columns for ranking [HIGHER RANK = BETTER]
                    ms2_annot_sub["entropy_max"] = (
                        ms2_annot_sub["entropy_similarity"]
                        == ms2_annot_sub["entropy_similarity"].max()
                    )
                    ms2_annot_sub["ref_match_fract_max"] = (
                        ms2_annot_sub["ref_mz_in_query_fract"]
                        == ms2_annot_sub["ref_mz_in_query_fract"].max()
                    )
                    ms2_annot_sub["frag_max"] = ms2_annot_sub[
                        "query_frag_types"
                    ].apply(lambda x: "MLF" in x)

                    # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                    ms2_annot_sub["consensus"] = ms2_annot_sub[
                        ["entropy_max", "ref_match_fract_max", "frag_max"]
                    ].all(axis=1)

                    # If there is a consensus, take the row with the highest entropy_similarity
                    if ms2_annot_sub["consensus"].any():
                        ms2_annot_sub = ms2_annot_sub[
                            ms2_annot_sub["entropy_similarity"]
                            == ms2_annot_sub["entropy_similarity"].max()
                        ].head(1)
                        mlf_results_perid.append(ms2_annot_sub)
            if len(mlf_results_perid) == 0:
                mlf_results_perid = pd.DataFrame()
            else:
                mlf_results_perid = pd.concat(mlf_results_perid)
                if mlf_results_perid["name"].nunique() == 1:
                    mlf_results_perid = mlf_results_perid[
                        mlf_results_perid["entropy_similarity"]
                        == mlf_results_perid["entropy_similarity"].max()
                    ].head(1)
                else:
                    mlf_results_perid = pd.DataFrame()
            mlf_results_all.append(mlf_results_perid)

        # These are the consensus annotations to the MLF level
        if len(mlf_results_all) > 0:
            mlf_results_all = pd.concat(mlf_results_all)
            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
        else:
            # Make an empty dataframe
            mlf_results_all = ms2_annot.head(0)

        # For remaining mf_ids, try to get a consensus annotation to the species level
        species_results_all = []
        # Remove mf_ids that have consensus annotations to the MLF level
        ms2_annot_spec = ms2_annot[
            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
        ]
        for mf_id in ms2_annot_spec["mf_id"].unique():
            # Do all the hits have the same lipid_summed_name?
            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # Grab the highest entropy_similarity result
                ms2_annot_sub = ms2_annot_sub[
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                ].head(1)
                species_results_all.append(ms2_annot_sub)

        # These are the consensus annotations to the species level
        if len(species_results_all) > 0:
            species_results_all = pd.concat(species_results_all)
            species_results_all["annot_level"] = "species"
        else:
            # Make an empty dataframe
            species_results_all = ms2_annot.head(0)

        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
        # Remove mf_ids that have consensus annotations to the species level
        ms2_annot_remaining = ms2_annot_spec[
            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
        ]
        no_consensus = []
        for mf_id in ms2_annot_remaining["mf_id"].unique():
            id_sub = []
            id_no_con = []
            ms2_annot_sub_mf = ms2_annot_remaining[
                ms2_annot_remaining["mf_id"] == mf_id
            ].copy()
            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_sub_mf[
                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
                ].copy()

                # New columns for ranking [HIGHER RANK = BETTER]
                ms2_annot_sub["entropy_max"] = (
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                )
                ms2_annot_sub["ref_match_fract_max"] = (
                    ms2_annot_sub["ref_mz_in_query_fract"]
                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
                )
                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                    lambda x: "MLF" in x
                )

                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                ms2_annot_sub["consensus"] = ms2_annot_sub[
                    ["entropy_max", "ref_match_fract_max", "frag_max"]
                ].all(axis=1)
                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
                id_sub.append(ms2_annot_sub_con)
                id_no_con.append(ms2_annot_sub)
            id_sub = pd.concat(id_sub)
            id_no_con = pd.concat(id_no_con)

            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
            if (
                id_sub["query_frag_types"].apply(lambda x: "MLF" in x).all()
                and len(id_sub) > 0
            ):
                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
                id_sub = id_sub.loc[idx]
                # Reorder so highest entropy_similarity is first
                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
                id_sub["annot_level"] = id_sub["structure_level"]
                no_consensus.append(id_sub)

            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
            elif len(id_sub) == 0:
                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                    summed_sub = id_no_con[
                        id_no_con["lipid_summed_name"] == lipid_summed_name
                    ]
                    # Any consensus to MLF?
                    if summed_sub["consensus"].any():
                        summed_sub = summed_sub[summed_sub["consensus"]]
                        summed_sub["annot_level"] = summed_sub["structure_level"]
                        no_consensus.append(summed_sub)
                    else:
                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
                        summed_sub = summed_sub[
                            summed_sub["entropy_similarity"]
                            == summed_sub["entropy_similarity"].max()
                        ].head(1)
                        # get first row
                        summed_sub["annot_level"] = "species"
                        summed_sub["name"] = ""
                        no_consensus.append(summed_sub)
            else:
                # Format the offending mf_id into the message instead of passing
                # it as a second exception argument
                raise ValueError(
                    f"Unexpected scenario for summarizing mf_id: {mf_id}"
                )

        if len(no_consensus) > 0:
            no_consensus = pd.concat(no_consensus)
        else:
            no_consensus = ms2_annot.head(0)

        # Combine all the consensus annotations and reformat the dataframe for output
        species_results_all = species_results_all.drop(columns=["name"])
        species_results_all["lipid_molecular_species_id"] = ""
        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
        consensus_annotations = pd.concat(
            [mlf_results_all, species_results_all, no_consensus]
        )
        consensus_annotations = consensus_annotations.sort_values(
            "mf_id", ascending=True
        )
        cols_to_keep = [
            "mf_id",
            "ref_ion_type",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "lipid_molecular_species_id",
            "lipid_summed_name",
            "lipid_subclass",
            "lipid_class",
            "lipid_category",
            "formula",
            "annot_level",
            "n_spectra_contributing",
        ]
        consensus_annotations = consensus_annotations[cols_to_keep]
        consensus_annotations = consensus_annotations.set_index("mf_id")

        return consensus_annotations

    def clean_ms2_report(self, lipid_summary):
        """Clean the MS2 report.

        Adds an ion_formula column and reorders the columns for output.

        Parameters
        ----------
        lipid_summary : DataFrame
            The full lipid summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned lipid summary DataFrame, indexed by mf_id.
        """
        lipid_summary = lipid_summary.reset_index()
        lipid_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
        ]

        # Reorder columns
        lipid_summary = lipid_summary[
            [
                "mf_id",
                "ion_formula",
                "ref_ion_type",
                "formula",
                "annot_level",
                "lipid_molecular_species_id",
                "lipid_summed_name",
                "lipid_subclass",
                "lipid_class",
                "lipid_category",
                "entropy_similarity",
                "ref_mz_in_query_fract",
                "n_spectra_contributing",
            ]
        ]

        # Set the index to mf_id
        lipid_summary = lipid_summary.set_index("mf_id")

        return lipid_summary

    def to_report(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.

        Returns
        -------
        DataFrame
            The report of the mass features and their annotations.

        Notes
        -----
        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
        """
        # Get mass feature dataframe
        mf_report = self.mass_spectra.mass_features_to_df()
        mf_report = mf_report.reset_index(drop=False)

        # Get and clean ms1 annotation dataframe
        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
        ms1_annot_report = ms1_annot_report.reset_index(drop=False)

        # Get, summarize, and clean ms2 annotation dataframe
        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
            molecular_metadata=molecular_metadata
        )
        if ms2_annot_report is not None:
            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
            ms2_annot_report = ms2_annot_report.reset_index(drop=False)

        # Combine the reports
        if not ms1_annot_report.empty:
            # MS1 has been run and has molecular formula information
            mf_report = pd.merge(
                mf_report,
                ms1_annot_report,
                how="left",
                on=["mf_id", "isotopologue_type"],
            )
            if ms2_annot_report is not None:
                # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
                mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
                mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
                mf_no_ion_formula = pd.merge(
                    mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
                )

                # pull out the records with ion_formula
                mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
                mf_with_ion_formula = pd.merge(
                    mf_with_ion_formula,
                    ms2_annot_report,
                    how="left",
                    on=["mf_id", "ion_formula"],
                )

                # put back together
                mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])

        # Rename columns
        rename_dict = {
            "mf_id": "Mass Feature ID",
            "scan_time": "Retention Time (min)",
            "mz": "m/z",
            "apex_scan": "Apex Scan Number",
            "intensity": "Intensity",
            "persistence": "Persistence",
            "area": "Area",
            "half_height_width": "Half Height Width (min)",
            "tailing_factor": "Tailing Factor",
            "dispersity_index": "Dispersity Index",
            "ms2_spectrum": "MS2 Spectrum",
            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
            "isotopologue_type": "Isotopologue Type",
            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
            "associated_mass_features": "Associated Mass Features after Deconvolution",
            "ion_formula": "Ion Formula",
            "formula": "Molecular Formula",
            "ref_ion_type": "Ion Type",
            "annot_level": "Lipid Annotation Level",
            "lipid_molecular_species_id": "Lipid Molecular Species",
            "lipid_summed_name": "Lipid Species",
            "lipid_subclass": "Lipid Subclass",
            "lipid_class": "Lipid Class",
            "lipid_category": "Lipid Category",
            "entropy_similarity": "Entropy Similarity",
            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
            "n_spectra_contributing": "Spectra with Annotation (n)",
        }
        mf_report = mf_report.rename(columns=rename_dict)
        mf_report["Sample Name"] = self.mass_spectra.sample_name
        mf_report["Polarity"] = self.mass_spectra.polarity
        mf_report = mf_report[
            ["Mass Feature ID", "Sample Name", "Polarity"]
            + [
                col
                for col in mf_report.columns
                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
            ]
        ]

        # Reorder rows by "Mass Feature ID"
        mf_report = mf_report.sort_values("Mass Feature ID")

        # Reset index
        mf_report = mf_report.reset_index(drop=True)

        return mf_report

    def report_to_csv(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations and save it as a CSV file.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.
        """
        report = self.to_report(molecular_metadata=molecular_metadata)
        out_file = self.output_file.with_suffix(".csv")
        report.to_csv(out_file, index=False)
A class to export lipidomics data.
This class provides methods to export lipidomics data to various formats and summarize the lipid report.
Parameters
- out_file_path (str | Path): The output file path, do not include the file extension.
- mass_spectra (object): The high resolution mass spectra object.
1211 def __init__(self, out_file_path, mass_spectra): 1212 super().__init__(out_file_path, mass_spectra) 1213 self.ion_type_dict = ion_type_dict
Initialize the LipidomicsExport.
Arguments are:
- out_file_path is the output file path, without the file extension.
- mass_spectra is the high resolution mass spectra object.
The constructor invokes the LCMSExport base initializer and stores the module-level ion_type_dict on the instance as self.ion_type_dict.
1215 @staticmethod 1216 def get_ion_formula(neutral_formula, ion_type): 1217 """From a neutral formula and an ion type, return the formula of the ion. 1218 1219 Notes 1220 ----- 1221 This is a static method. 1222 If the neutral_formula is not a string, this method will return None. 1223 1224 Parameters 1225 ---------- 1226 neutral_formula : str 1227 The neutral formula, this should be a string form from the MolecularFormula class 1228 (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). 1229 In the case of a simple string, the atoms are parsed based on the presence of capital letters, 1230 e.g. MgCl2 is parsed as 'Mg Cl2. 1231 ion_type : str 1232 The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. 1233 See the self.ion_type_dict for the available ion types. 1234 1235 Returns 1236 ------- 1237 str 1238 The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string. 1239 """ 1240 # If neutral_formula is not a string, return None 1241 if not isinstance(neutral_formula, str): 1242 return None 1243 1244 # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class) 1245 if re.search(r"\s", neutral_formula): 1246 neutral_formula = MolecularFormula(neutral_formula, ion_charge=0) 1247 else: 1248 form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:] 1249 elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()] 1250 counts = [re.findall(r"\d+", x) for x in form_pre.split()] 1251 neutral_formula = MolecularFormula( 1252 dict( 1253 zip( 1254 [x[0] for x in elements], 1255 [int(x[0]) if x else 1 for x in counts], 1256 ) 1257 ), 1258 ion_charge=0, 1259 ) 1260 neutral_formula_dict = neutral_formula.to_dict().copy() 1261 1262 adduct_add_dict = ion_type_dict[ion_type][0] 1263 for key in adduct_add_dict: 1264 if key in neutral_formula_dict.keys(): 1265 neutral_formula_dict[key] += 
adduct_add_dict[key] 1266 else: 1267 neutral_formula_dict[key] = adduct_add_dict[key] 1268 1269 adduct_subtract = ion_type_dict[ion_type][1] 1270 for key in adduct_subtract: 1271 neutral_formula_dict[key] -= adduct_subtract[key] 1272 1273 return MolecularFormula(neutral_formula_dict, ion_charge=0).string
From a neutral formula and an ion type, return the formula of the ion.
Notes
This is a static method. If the neutral_formula is not a string, this method will return None.
Parameters
- neutral_formula (str): The neutral formula, this should be a string form from the MolecularFormula class (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). In the case of a simple string, the atoms are parsed based on the presence of capital letters, e.g. MgCl2 is parsed as 'Mg Cl2'.
- ion_type (str): The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. See the self.ion_type_dict for the available ion types.
Returns
- str: The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1275 @staticmethod 1276 def get_isotope_type(ion_formula): 1277 """From an ion formula, return the 13C isotope type of the ion. 1278 1279 Notes 1280 ----- 1281 This is a static method. 1282 If the ion_formula is not a string, this method will return None. 1283 This is currently only functional for 13C isotopes. 1284 1285 Parameters 1286 ---------- 1287 ion_formula : str 1288 The formula of the ion, expected to be a string like 'C2 H4 O2'. 1289 1290 Returns 1291 ------- 1292 str 1293 The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope. 1294 1295 Raises 1296 ------ 1297 ValueError 1298 If the ion_formula is not a string. 1299 """ 1300 if not isinstance(ion_formula, str): 1301 return None 1302 1303 if re.search(r"\s", ion_formula): 1304 ion_formula = MolecularFormula(ion_formula, ion_charge=0) 1305 else: 1306 raise ValueError('ion_formula should be a string like "C2 H4 O2"') 1307 ion_formula_dict = ion_formula.to_dict().copy() 1308 1309 try: 1310 iso_class = "13C" + str(ion_formula_dict.pop("13C")) 1311 except KeyError: 1312 iso_class = None 1313 1314 return iso_class
From an ion formula, return the 13C isotope type of the ion.
Notes
This is a static method. If the ion_formula is not a string, this method will return None. This is currently only functional for 13C isotopes.
Parameters
- ion_formula (str): The formula of the ion, expected to be a string like 'C2 H4 O2'.
Returns
- str: The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
Raises
- ValueError: If the ion_formula is a string without spaces (i.e. not in the 'C2 H4 O2' form); non-string input returns None instead of raising.
1316 def clean_ms1_report(self, ms1_summary_full): 1317 """Clean the MS1 report. 1318 1319 Parameters 1320 ---------- 1321 ms1_summary_full : DataFrame 1322 The full MS1 summary DataFrame. 1323 1324 Returns 1325 ------- 1326 DataFrame 1327 The cleaned MS1 summary DataFrame. 1328 """ 1329 ms1_summary_full = ms1_summary_full.reset_index() 1330 cols_to_keep = [ 1331 "mf_id", 1332 "Molecular Formula", 1333 "Ion Type", 1334 "Calculated m/z", 1335 "m/z Error (ppm)", 1336 "m/z Error Score", 1337 "Is Isotopologue", 1338 "Isotopologue Similarity", 1339 "Confidence Score", 1340 ] 1341 ms1_summary = ms1_summary_full[cols_to_keep].copy() 1342 ms1_summary["ion_formula"] = [ 1343 self.get_ion_formula(f, a) 1344 for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"]) 1345 ] 1346 ms1_summary["isotopologue_type"] = [ 1347 self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist() 1348 ] 1349 1350 # Reorder columns 1351 ms1_summary = ms1_summary[ 1352 [ 1353 "mf_id", 1354 "ion_formula", 1355 "isotopologue_type", 1356 "Calculated m/z", 1357 "m/z Error (ppm)", 1358 "m/z Error Score", 1359 "Isotopologue Similarity", 1360 "Confidence Score", 1361 ] 1362 ] 1363 1364 # Set the index to mf_id 1365 ms1_summary = ms1_summary.set_index("mf_id") 1366 1367 return ms1_summary
Clean the MS1 report.
Parameters
- ms1_summary_full (DataFrame): The full MS1 summary DataFrame.
Returns
- DataFrame: The cleaned MS1 summary DataFrame.
def summarize_lipid_report(self, ms2_annot):
    """Summarize the lipid report.

    Collapses the full MS2 annotation table to (at most) one consensus row per
    mass feature, resolved in three tiers:

    1. MLF level: per mass feature and per query scan, keep the hit that is
       simultaneously best by entropy similarity, by reference-m/z match
       fraction, and carries an "MLF" fragment type.
    2. Species level: for remaining mass features whose hits all share one
       ``lipid_summed_name``, keep the single highest-entropy hit.
    3. No consensus: remaining mass features are resolved per summed name,
       possibly keeping multiple rows (e.g. suspected coelutions).

    Parameters
    ----------
    ms2_annot : DataFrame
        The MS2 annotation DataFrame with all annotations. Must carry an
        ``mf_id`` column or an ``mf_id`` index.

    Returns
    -------
    DataFrame
        The summarized lipid report, indexed by ``mf_id``.

    Raises
    ------
    ValueError
        If ``mf_id`` is neither a column nor the index name, or if a mass
        feature reaches an unexpected state in the no-consensus branch.
    """
    # Drop unnecessary columns for easier viewing
    columns_to_drop = [
        "precursor_mz",
        "precursor_mz_error_ppm",
        "metabref_mol_id",
        "metabref_precursor_mz",
        "cas",
        "inchikey",
        "inchi",
        "chebi",
        "smiles",
        "kegg",
        "data_id",
        "iupac_name",
        "traditional_name",
        "common_name",
        "casno",
    ]
    ms2_annot = ms2_annot.drop(
        columns=[col for col in columns_to_drop if col in ms2_annot.columns]
    )

    # If ion_types_excluded is not empty, remove those ion types
    ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
        "ms2"
    ].molecular_search.ion_types_excluded
    if len(ion_types_excluded) > 0:
        ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

    # If mf_id is not present, check that the index name is mf_id and reset the index
    if "mf_id" not in ms2_annot.columns:
        if ms2_annot.index.name == "mf_id":
            ms2_annot = ms2_annot.reset_index()
        else:
            raise ValueError("mf_id is not present in the dataframe")

    # --- Tier 1: attempt to get consensus annotations to the MLF level ---
    mlf_results_all = []
    for mf_id in ms2_annot["mf_id"].unique():
        mlf_results_perid = []
        ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
        ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)

        # NOTE(review): iterates over scan ids of the WHOLE table, not just
        # this mass feature's scans (ms2_annot_mf). Scans absent from this
        # feature yield empty subsets whose nunique() is 0, so they are
        # skipped and the result is unchanged — but this looks unintended
        # and wastes work; confirm whether ms2_annot_mf was meant.
        for query_scan in ms2_annot["query_spectrum_id"].unique():
            ms2_annot_sub = ms2_annot_mf[
                ms2_annot_mf["query_spectrum_id"] == query_scan
            ].copy()

            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # If there is only one lipid_summed_name, let's try to get
                # consensus molecular species annotation.
                # NOTE(review): this inner check duplicates the outer one and
                # is always true here — likely a leftover from refactoring.
                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                    # Rank flags [HIGHER = BETTER]: is this row the best of
                    # its scan by each criterion?
                    ms2_annot_sub["entropy_max"] = (
                        ms2_annot_sub["entropy_similarity"]
                        == ms2_annot_sub["entropy_similarity"].max()
                    )
                    ms2_annot_sub["ref_match_fract_max"] = (
                        ms2_annot_sub["ref_mz_in_query_fract"]
                        == ms2_annot_sub["ref_mz_in_query_fract"].max()
                    )
                    ms2_annot_sub["frag_max"] = ms2_annot_sub[
                        "query_frag_types"
                    ].apply(lambda x: True if "MLF" in x else False)

                    # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                    ms2_annot_sub["consensus"] = ms2_annot_sub[
                        ["entropy_max", "ref_match_fract_max", "frag_max"]
                    ].all(axis=1)

                    # If there is a consensus, take the row with the highest entropy_similarity
                    if ms2_annot_sub["consensus"].any():
                        ms2_annot_sub = ms2_annot_sub[
                            ms2_annot_sub["entropy_similarity"]
                            == ms2_annot_sub["entropy_similarity"].max()
                        ].head(1)
                        mlf_results_perid.append(ms2_annot_sub)
        if len(mlf_results_perid) == 0:
            mlf_results_perid = pd.DataFrame()
        else:
            mlf_results_perid = pd.concat(mlf_results_perid)
            # Keep an MLF consensus only if every contributing scan agreed on
            # the same molecular species name.
            if mlf_results_perid["name"].nunique() == 1:
                mlf_results_perid = mlf_results_perid[
                    mlf_results_perid["entropy_similarity"]
                    == mlf_results_perid["entropy_similarity"].max()
                ].head(1)
            else:
                mlf_results_perid = pd.DataFrame()
        mlf_results_all.append(mlf_results_perid)

    # These are the consensus annotations to the MLF level
    if len(mlf_results_all) > 0:
        mlf_results_all = pd.concat(mlf_results_all)
        mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
    else:
        # Make an empty dataframe (keeps the original columns for concat below)
        mlf_results_all = ms2_annot.head(0)

    # --- Tier 2: for remaining mf_ids, try a consensus at the species level ---
    species_results_all = []
    # Remove mf_ids that have consensus annotations to the MLF level
    ms2_annot_spec = ms2_annot[
        ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
    ]
    for mf_id in ms2_annot_spec["mf_id"].unique():
        # Do all the hits have the same lipid_summed_name?
        ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
        ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

        if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
            # Grab the highest entropy_similarity result
            ms2_annot_sub = ms2_annot_sub[
                ms2_annot_sub["entropy_similarity"]
                == ms2_annot_sub["entropy_similarity"].max()
            ].head(1)
            species_results_all.append(ms2_annot_sub)

    # These are the consensus annotations to the species level
    if len(species_results_all) > 0:
        species_results_all = pd.concat(species_results_all)
        species_results_all["annot_level"] = "species"
    else:
        # Make an empty dataframe
        species_results_all = ms2_annot.head(0)

    # --- Tier 3: mf_ids with no consensus at the species or MLF level ---
    # Remove mf_ids that have consensus annotations to the species level
    ms2_annot_remaining = ms2_annot_spec[
        ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
    ]
    no_consensus = []
    for mf_id in ms2_annot_remaining["mf_id"].unique():
        id_sub = []  # per-scan rows that reach full rank consensus
        id_no_con = []  # all per-scan rows (with rank columns attached)
        ms2_annot_sub_mf = ms2_annot_remaining[
            ms2_annot_remaining["mf_id"] == mf_id
        ].copy()
        for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
            ms2_annot_sub = ms2_annot_sub_mf[
                ms2_annot_sub_mf["query_spectrum_id"] == query_scan
            ].copy()

            # New columns for ranking [HIGHER RANK = BETTER]
            ms2_annot_sub["entropy_max"] = (
                ms2_annot_sub["entropy_similarity"]
                == ms2_annot_sub["entropy_similarity"].max()
            )
            ms2_annot_sub["ref_match_fract_max"] = (
                ms2_annot_sub["ref_mz_in_query_fract"]
                == ms2_annot_sub["ref_mz_in_query_fract"].max()
            )
            ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                lambda x: True if "MLF" in x else False
            )

            # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
            ms2_annot_sub["consensus"] = ms2_annot_sub[
                ["entropy_max", "ref_match_fract_max", "frag_max"]
            ].all(axis=1)
            ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
            id_sub.append(ms2_annot_sub_con)
            id_no_con.append(ms2_annot_sub)
        id_sub = pd.concat(id_sub)
        id_no_con = pd.concat(id_no_con)

        # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
        # NOTE(review): `.all()` on an empty Series is True, so the
        # `len(id_sub) > 0` guard is what keeps empties out of this branch.
        if (
            id_sub["query_frag_types"]
            .apply(lambda x: True if "MLF" in x else False)
            .all()
            and len(id_sub) > 0
        ):
            idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
            id_sub = id_sub.loc[idx]
            # Reorder so highest entropy_similarity is first
            id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
            id_sub["annot_level"] = id_sub["structure_level"]
            no_consensus.append(id_sub)

        # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
        elif len(id_sub) == 0:
            for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                summed_sub = id_no_con[
                    id_no_con["lipid_summed_name"] == lipid_summed_name
                ]
                # Any consensus to MLF?
                if summed_sub["consensus"].any():
                    summed_sub = summed_sub[summed_sub["consensus"]]
                    summed_sub["annot_level"] = summed_sub["structure_level"]
                    no_consensus.append(summed_sub)
                else:
                    # Grab the highest entropy_similarity, if there are multiple, grab the first one
                    summed_sub = summed_sub[
                        summed_sub["entropy_similarity"]
                        == summed_sub["entropy_similarity"].max()
                    ].head(1)
                    # get first row
                    summed_sub["annot_level"] = "species"
                    summed_sub["name"] = ""
                    no_consensus.append(summed_sub)
        else:
            # NOTE(review): passes two positional args to ValueError, so the
            # message renders as a tuple — probably meant a single f-string.
            raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)

    if len(no_consensus) > 0:
        no_consensus = pd.concat(no_consensus)
    else:
        no_consensus = ms2_annot.head(0)

    # Combine all the consensus annotations and reformat the dataframe for output
    species_results_all = species_results_all.drop(columns=["name"])
    species_results_all["lipid_molecular_species_id"] = ""
    mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
    no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
    consensus_annotations = pd.concat(
        [mlf_results_all, species_results_all, no_consensus]
    )
    consensus_annotations = consensus_annotations.sort_values(
        "mf_id", ascending=True
    )
    cols_to_keep = [
        "mf_id",
        "ref_ion_type",
        "entropy_similarity",
        "ref_mz_in_query_fract",
        "lipid_molecular_species_id",
        "lipid_summed_name",
        "lipid_subclass",
        "lipid_class",
        "lipid_category",
        "formula",
        "annot_level",
        "n_spectra_contributing",
    ]
    consensus_annotations = consensus_annotations[cols_to_keep]
    consensus_annotations = consensus_annotations.set_index("mf_id")

    return consensus_annotations
Summarize the lipid report.
Parameters
- ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
- DataFrame: The summarized lipid report.
def clean_ms2_report(self, lipid_summary):
    """Clean the MS2 report.

    Adds the ``ion_formula`` column (derived from each row's neutral formula
    and reference ion type), reorders the columns for presentation, and
    re-indexes the table by ``mf_id``.

    Parameters
    ----------
    lipid_summary : DataFrame
        The full lipid summary DataFrame, indexed by ``mf_id``.

    Returns
    -------
    DataFrame
        The cleaned lipid summary DataFrame, indexed by ``mf_id``.
    """
    cleaned = lipid_summary.reset_index()

    # Derive the ion formula for each annotation row.
    ion_formulas = []
    for neutral_formula, ion_type in zip(
        cleaned["formula"], cleaned["ref_ion_type"]
    ):
        ion_formulas.append(self.get_ion_formula(neutral_formula, ion_type))
    cleaned["ion_formula"] = ion_formulas

    # Presentation order for the report columns.
    ordered_columns = [
        "mf_id",
        "ion_formula",
        "ref_ion_type",
        "formula",
        "annot_level",
        "lipid_molecular_species_id",
        "lipid_summed_name",
        "lipid_subclass",
        "lipid_class",
        "lipid_category",
        "entropy_similarity",
        "ref_mz_in_query_fract",
        "n_spectra_contributing",
    ]
    return cleaned[ordered_columns].set_index("mf_id")
Clean the MS2 report.
Parameters
- lipid_summary (DataFrame): The full lipid summary DataFrame.
Returns
- DataFrame: The cleaned lipid summary DataFrame.
def to_report(self, molecular_metadata=None):
    """Create a report of the mass features and their annotations.

    Parameters
    ----------
    molecular_metadata : dict, optional
        The molecular metadata. Default is None.

    Returns
    -------
    DataFrame
        The report of the mass features and their annotations, one block of
        columns per source (mass features, MS1, MS2), sorted by mass
        feature id.

    Notes
    -----
    The report will contain the mass features and their annotations from
    MS1 and MS2 (if available).
    """
    # Mass feature table is the backbone of the report.
    report = self.mass_spectra.mass_features_to_df().reset_index(drop=False)

    # MS1 annotations, cleaned for reporting.
    ms1_annot = self.clean_ms1_report(
        self.mass_spectra.mass_features_ms1_annot_to_df().copy()
    ).reset_index(drop=False)

    # MS2 annotations, summarized and cleaned (may be None if MS2 was not run).
    ms2_annot = self.mass_spectra.mass_features_ms2_annot_to_df(
        molecular_metadata=molecular_metadata
    )
    if ms2_annot is not None:
        ms2_annot = self.clean_ms2_report(self.summarize_lipid_report(ms2_annot))
        ms2_annot = ms2_annot.dropna(axis=1, how="all").reset_index(drop=False)

    if not ms1_annot.empty:
        # MS1 has been run and has molecular formula information
        report = pd.merge(
            report,
            ms1_annot,
            how="left",
            on=["mf_id", "isotopologue_type"],
        )
        if ms2_annot is not None:
            # Rows WITHOUT an MS1 ion formula are matched to MS2 results on
            # mf_id only (these should be empty if MS1 molecular formula
            # assignment is working correctly).
            missing_mask = report["ion_formula"].isna()
            without_formula = report[missing_mask].drop(columns=["ion_formula"])
            without_formula = pd.merge(
                without_formula, ms2_annot, how="left", on=["mf_id"]
            )

            # Rows WITH an MS1 ion formula are matched on mf_id + ion_formula.
            with_formula = pd.merge(
                report[~missing_mask],
                ms2_annot,
                how="left",
                on=["mf_id", "ion_formula"],
            )

            # put back together
            report = pd.concat([without_formula, with_formula])

    # Human-readable column names for the final report.
    rename_dict = {
        "mf_id": "Mass Feature ID",
        "scan_time": "Retention Time (min)",
        "mz": "m/z",
        "apex_scan": "Apex Scan Number",
        "intensity": "Intensity",
        "persistence": "Persistence",
        "area": "Area",
        "half_height_width": "Half Height Width (min)",
        "tailing_factor": "Tailing Factor",
        "dispersity_index": "Dispersity Index",
        "ms2_spectrum": "MS2 Spectrum",
        "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
        "isotopologue_type": "Isotopologue Type",
        "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
        "associated_mass_features": "Associated Mass Features after Deconvolution",
        "ion_formula": "Ion Formula",
        "formula": "Molecular Formula",
        "ref_ion_type": "Ion Type",
        "annot_level": "Lipid Annotation Level",
        "lipid_molecular_species_id": "Lipid Molecular Species",
        "lipid_summed_name": "Lipid Species",
        "lipid_subclass": "Lipid Subclass",
        "lipid_class": "Lipid Class",
        "lipid_category": "Lipid Category",
        "entropy_similarity": "Entropy Similarity",
        "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
        "n_spectra_contributing": "Spectra with Annotation (n)",
    }
    report = report.rename(columns=rename_dict)
    report["Sample Name"] = self.mass_spectra.sample_name
    report["Polarity"] = self.mass_spectra.polarity

    # Put the identifying columns first, keep the rest in their current order.
    leading = ["Mass Feature ID", "Sample Name", "Polarity"]
    report = report[leading + [c for c in report.columns if c not in leading]]

    # Order rows by mass feature id and reset the index.
    return report.sort_values("Mass Feature ID").reset_index(drop=True)
Create a report of the mass features and their annotations.
Parameters
- molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
- DataFrame: The report of the mass features and their annotations.
Notes
The report will contain the mass features and their annotations from MS1 and MS2 (if available).
def report_to_csv(self, molecular_metadata=None):
    """Create a report of the mass features and their annotations and save it as a CSV file.

    The CSV is written alongside ``self.output_file``, with its suffix
    replaced by ``.csv``.

    Parameters
    ----------
    molecular_metadata : dict, optional
        The molecular metadata. Default is None.
    """
    csv_path = self.output_file.with_suffix(".csv")
    self.to_report(molecular_metadata=molecular_metadata).to_csv(
        csv_path, index=False
    )
Create a report of the mass features and their annotations and save it as a CSV file.
Parameters
- molecular_metadata (dict, optional): The molecular metadata. Default is None.
Inherited Members
- HighResMassSpectraExport
- dir_loc
- output_file
- mass_spectra
- atoms_order_list
- get_pandas_df
- to_pandas
- to_excel
- to_csv
- get_mass_spectra_attrs
- corems.mass_spectrum.output.export.HighResMassSpecExport
- output_type
- mass_spectrum
- save
- run
- write_settings
- to_json
- add_mass_spectrum_to_hdf5
- parameters_to_toml
- parameters_to_json
- get_mass_spec_attrs
- get_all_used_atoms_in_order
- list_dict_to_list
- get_list_dict_data
- threading.Thread
- start
- join
- name
- ident
- is_alive
- daemon
- isDaemon
- setDaemon
- getName
- setName
- native_id