corems.mass_spectra.output.export

   1__author__ = "Yuri E. Corilo"
   2__date__ = "Dec 14, 2010"
   3
   4
   5import csv
   6import json
   7import re
   8import uuid
   9import warnings
  10from datetime import datetime, timezone
  11from pathlib import Path
  12
  13import h5py
  14import numpy as np
  15import pandas as pd
  16from openpyxl import load_workbook
  17from pandas import DataFrame, ExcelWriter, read_excel
  18
  19from corems import __version__, corems_md5
  20from corems.encapsulation.output import parameter_to_dict
  21from corems.encapsulation.output.parameter_to_json import (
  22    dump_lcms_settings_json,
  23    dump_lcms_settings_toml,
  24)
  25from corems.mass_spectrum.output.export import HighResMassSpecExport
  26from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula
  27from corems.molecular_id.calc.SpectralSimilarity import methods_name
  28
  29ion_type_dict = {
  30    # adduct : [atoms to add, atoms to subtract] when calculating the formula of the ion
  31    "M+": [{}, {}],
  32    "protonated": [{"H": 1}, {}],
  33    "[M+H]+": [{"H": 1}, {}],
  34    "[M+NH4]+": [{"N": 1, "H": 4}, {}],  # ammonium
  35    "[M+Na]+": [{"Na": 1}, {}],
  36    "[M+K]+": [{"K": 1}, {}],
  37    "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}],
  38    "[M+2Na-H]+": [{"Na": 2}, {"H": 1}],
  39    "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}],
  40    "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}],
  41    "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}],
  42    "[M+H-H2O]+": [{}, {"H": 1, "O": 1}],
  43    "de-protonated": [{}, {"H": 1}],
  44    "[M-H]-": [{}, {"H": 1}],
  45    "[M+Cl]-": [{"Cl": 1}, {}],
  46    "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}],  # formate
  47    "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}],  # acetate
  48    "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}],
  49    "[M+K-2H]-": [{"K": 1}, {"H": 2}],
  50    "[M+Na-2H]-": [{"Na": 1}, {"H": 2}],
  51}
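
# Worked example of the add/subtract convention above (illustrative sketch, not part
# of the module): for a neutral formula C6 H12 O6 and ion type "[M+Na-2H]-", the
# first dict adds Na: 1 and the second dict subtracts H: 2, giving an ion formula of
# C6 H10 Na O6. LipidomicsExport.get_ion_formula (below) applies exactly this arithmetic.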
  52
  53
  54class LowResGCMSExport:
  55    """A class to export low resolution GC-MS data.
  56
  57    This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
  58
  59    Parameters:
  60    ----------
  61    out_file_path : str
  62        The output file path.
  63    gcms : object
  64        The low resolution GCMS object.
  65
  66    Attributes:
  67    ----------
  68    output_file : Path
  69        The output file path as a Path object.
  70    gcms : object
  71        The low resolution GCMS object.
  72
  73    Methods:
  74    -------
  75    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  76    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  77    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
   78    * to_excel(write_mode='a', write_metadata=True, id_label="corems:").
  79        Export the data to an Excel file.
  80    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:").
  81        Export the data to a CSV file.
  82    * to_hdf(id_label="corems:").
  83        Export the data to an HDF5 file.
  84    * get_data_stats(gcms).
  85        Get statistics about the GCMS data.
  86
  87    """
  88
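    # Hedged usage sketch (illustrative names: `gcms` is assumed to be an already
    # processed low resolution GC-MS object, and the output path is arbitrary):
    #
    #     exporter = LowResGCMSExport("output/my_sample", gcms)
    #     exporter.to_csv(write_metadata=True)   # writes output/my_sample.csv (+ my_sample.json settings)
    #     df = exporter.get_pandas_df()          # same rows as an in-memory DataFrame
    #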
  89    def __init__(self, out_file_path, gcms):
  90        self.output_file = Path(out_file_path)
  91
  92        self.gcms = gcms
  93
  94        self._init_columns()
  95
  96    def _init_columns(self):
  97        """Initialize the column names for the exported data.
  98
  99        Returns:
 100        -------
 101        list
 102            The list of column names.
 103        """
 104
 105        columns = [
 106            "Sample name",
 107            "Peak Index",
 108            "Retention Time",
 109            "Retention Time Ref",
 110            "Peak Height",
 111            "Peak Area",
 112            "Retention index",
 113            "Retention index Ref",
 114            "Retention Index Score",
 115            "Similarity Score",
 116            "Spectral Similarity Score",
 117            "Compound Name",
 118            "Chebi ID",
 119            "Kegg Compound ID",
 120            "Inchi",
 121            "Inchi Key",
 122            "Smiles",
 123            "Molecular Formula",
 124            "IUPAC Name",
 125            "Traditional Name",
 126            "Common Name",
 127            "Derivatization",
 128        ]
 129
 130        if self.gcms.molecular_search_settings.exploratory_mode:
 131            columns.extend(
 132                [
 133                    "Weighted Cosine Correlation",
 134                    "Cosine Correlation",
 135                    "Stein Scott Similarity",
 136                    "Pearson Correlation",
 137                    "Spearman Correlation",
 138                    "Kendall Tau Correlation",
 139                    "Euclidean Distance",
 140                    "Manhattan Distance",
 141                    "Jaccard Distance",
 142                    "DWT Correlation",
 143                    "DFT Correlation",
 144                ]
 145            )
 146
 147            columns.extend(list(methods_name.values()))
 148
 149        return columns
 150
 151    def get_pandas_df(self, id_label="corems:"):
 152        """Get the exported data as a Pandas DataFrame.
 153
 154        Parameters:
 155        ----------
 156        id_label : str, optional
 157            The ID label for the data. Default is "corems:".
 158
 159        Returns:
 160        -------
 161        DataFrame
 162            The exported data as a Pandas DataFrame.
 163        """
 164
 165        columns = self._init_columns()
 166
 167        dict_data_list = self.get_list_dict_data(self.gcms)
 168
 169        df = DataFrame(dict_data_list, columns=columns)
 170
 171        df.name = self.gcms.sample_name
 172
 173        return df
 174
 175    def get_json(self, nan=False, id_label="corems:"):
 176        """Get the exported data as a JSON string.
 177
 178        Parameters:
 179        ----------
 180        nan : bool, optional
 181            Whether to include NaN values in the JSON string. Default is False.
 182        id_label : str, optional
 183            The ID label for the data. Default is "corems:".
 184
 185        """
 186
 187        import json
 188
 189        dict_data_list = self.get_list_dict_data(self.gcms)
 190
 191        return json.dumps(
 192            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
 193        )
 194
 195    def to_pandas(self, write_metadata=True, id_label="corems:"):
 196        """Export the data to a Pandas DataFrame and save it as a pickle file.
 197
 198        Parameters:
 199        ----------
 200        write_metadata : bool, optional
 201            Whether to write metadata to the output file.
 202        id_label : str, optional
 203            The ID label for the data.
 204        """
 205
 206        columns = self._init_columns()
 207
 208        dict_data_list = self.get_list_dict_data(self.gcms)
 209
 210        df = DataFrame(dict_data_list, columns=columns)
 211
 212        df.to_pickle(self.output_file.with_suffix(".pkl"))
 213
 214        if write_metadata:
 215            self.write_settings(
 216                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
 217            )
 218
 219    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
 220        """Export the data to an Excel file.
 221
 222        Parameters:
 223        ----------
 224        write_mode : str, optional
 225            The write mode for the Excel file. Default is 'a' (append).
 226        write_metadata : bool, optional
 227            Whether to write metadata to the output file. Default is True.
 228        id_label : str, optional
 229            The ID label for the data. Default is "corems:".
 230        """
 231
 232        out_put_path = self.output_file.with_suffix(".xlsx")
 233
 234        columns = self._init_columns()
 235
 236        dict_data_list = self.get_list_dict_data(self.gcms)
 237
 238        df = DataFrame(dict_data_list, columns=columns)
 239
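        # Note: the append branch below assigns to `writer.book` and `writer.sheets`,
        # which relies on older pandas/openpyxl behavior; pandas >= 1.5 exposes these
        # as read-only, and appending there would instead use
        # ExcelWriter(path, mode="a", engine="openpyxl", if_sheet_exists=...).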
 240        if write_mode == "a" and out_put_path.exists():
 241            writer = ExcelWriter(out_put_path, engine="openpyxl")
 242            # try to open an existing workbook
 243            writer.book = load_workbook(out_put_path)
 244            # copy existing sheets
 245            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
 246            # read existing file
 247            reader = read_excel(out_put_path)
 248            # write out the new sheet
 249            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
 250
 251            writer.close()
 252        else:
 253            df.to_excel(
 254                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
 255            )
 256
 257        if write_metadata:
 258            self.write_settings(out_put_path, self.gcms, id_label=id_label)
 259
 260    def to_csv(
 261        self,
 262        separate_output=False,
 263        write_mode="w",
 264        write_metadata=True,
 265        id_label="corems:",
 266    ):
 267        """Export the data to a CSV file.
 268
 269        Parameters:
 270        ----------
 271        separate_output : bool, optional
 272            Whether to separate the output into multiple files. Default is False.
 273        write_mode : str, optional
 274            The write mode for the CSV file. Default is 'w' (write).
 275        write_metadata : bool, optional
 276            Whether to write metadata to the output file. Default is True.
 277        id_label : str, optional
 278            The ID label for the data. Default is "corems:".
 279        """
 280
 281        if separate_output:
 282            # set write mode to write
 283            # this mode will overwrite the file without warning
 284            write_mode = "w"
 285        else:
 286            # set write mode to append
 287            write_mode = "a"
 288
 289        columns = self._init_columns()
 290
 291        dict_data_list = self.get_list_dict_data(self.gcms)
 292
 293        out_put_path = self.output_file.with_suffix(".csv")
 294
 295        write_header = not out_put_path.exists()
 296
 297        try:
 298            with open(out_put_path, write_mode, newline="") as csvfile:
 299                writer = csv.DictWriter(csvfile, fieldnames=columns)
 300                if write_header:
 301                    writer.writeheader()
 302                for data in dict_data_list:
 303                    writer.writerow(data)
 304
 305            if write_metadata:
 306                self.write_settings(out_put_path, self.gcms, id_label=id_label)
 307
 308        except IOError as ioerror:
 309            print(ioerror)
 310
 311    def to_hdf(self, id_label="corems:"):
 312        """Export the data to an HDF5 file.
 313
 314        Parameters:
 315        ----------
 316        id_label : str, optional
 317            The ID label for the data. Default is "corems:".
 318        """
 319
 320        # save sample at a time
 321        def add_compound(gc_peak, compound_obj):
 322            modifier = compound_obj.classify if compound_obj.classify else ""
 323            compound_group = compound_obj.name.replace("/", "") + " " + modifier
 324
 325            if compound_group not in peak_group:
 326                compound_group = peak_group.create_group(compound_group)
 327
 328                # compound_group.attrs["retention_time"] = compound_obj.retention_time
 329                compound_group.attrs["retention_index"] = compound_obj.ri
 330                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
 331                compound_group.attrs["spectral_similarity_score"] = (
 332                    compound_obj.spectral_similarity_score
 333                )
 334                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
 335
 336                compound_mz = compound_group.create_dataset(
 337                    "mz", data=np.array(compound_obj.mz), dtype="f8"
 338                )
 339                compound_abundance = compound_group.create_dataset(
 340                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
 341                )
 342
 343                if self.gcms.molecular_search_settings.exploratory_mode:
 344                    compound_group.attrs["Spectral Similarities"] = json.dumps(
 345                        compound_obj.spectral_similarity_scores,
 346                        sort_keys=False,
 347                        indent=4,
 348                        separators=(",", ":"),
 349                    )
 350            else:
 351                warnings.warn("Skipping duplicate reference compound.")
 352
 353        import json
 354        from datetime import datetime, timezone
 355
 356        import h5py
 357        import numpy as np
 358
 359        output_path = self.output_file.with_suffix(".hdf5")
 360
 361        with h5py.File(output_path, "w") as hdf_handle:
 362            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
 363            hdf_handle.attrs["time_stamp"] = timenow
 364            hdf_handle.attrs["data_structure"] = "gcms"
 365            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
 366            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
 367
 368            hdf_handle.attrs["sample_id"] = "self.gcms.id"
 369            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
 370            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
 371            hdf_handle.attrs["output_data"] = str(output_path)
 372            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
 373            hdf_handle.attrs["corems_version"] = __version__
 374
 375            hdf_handle.attrs["Stats"] = json.dumps(
 376                self.get_data_stats(self.gcms),
 377                sort_keys=False,
 378                indent=4,
 379                separators=(",", ": "),
 380            )
 381            hdf_handle.attrs["Calibration"] = json.dumps(
 382                self.get_calibration_stats(self.gcms, id_label),
 383                sort_keys=False,
 384                indent=4,
 385                separators=(",", ": "),
 386            )
 387            hdf_handle.attrs["Blank"] = json.dumps(
 388                self.get_blank_stats(self.gcms),
 389                sort_keys=False,
 390                indent=4,
 391                separators=(",", ": "),
 392            )
 393
 394            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
 395            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
 396                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
 397            )
 398
 399            scans_dataset = hdf_handle.create_dataset(
 400                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
 401            )
 402            rt_dataset = hdf_handle.create_dataset(
 403                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
 404            )
 405            tic_dataset = hdf_handle.create_dataset(
 406                "tic", data=np.array(self.gcms.tic), dtype="f8"
 407            )
 408            processed_tic_dataset = hdf_handle.create_dataset(
 409                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
 410            )
 411
 412            output_score_method = (
 413                self.gcms.molecular_search_settings.output_score_method
 414            )
 415
 416            for gc_peak in self.gcms:
 417                # print(gc_peak.retention_time)
 418                # print(gc_peak.tic)
 419
 420                # check if there is a compound candidate
 421                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
 422                peak_group.attrs["deconvolution"] = int(
 423                    self.gcms.chromatogram_settings.use_deconvolution
 424                )
 425
 426                peak_group.attrs["start_scan"] = gc_peak.start_scan
 427                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
 428                peak_group.attrs["final_scan"] = gc_peak.final_scan
 429
 430                peak_group.attrs["retention_index"] = gc_peak.ri
 431                peak_group.attrs["retention_time"] = gc_peak.retention_time
 432                peak_group.attrs["area"] = gc_peak.area
 433
 434                mz = peak_group.create_dataset(
 435                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
 436                )
 437                abundance = peak_group.create_dataset(
 438                    "abundance",
 439                    data=np.array(gc_peak.mass_spectrum.abundance),
 440                    dtype="f8",
 441                )
 442
 443                if gc_peak:
 444                    if output_score_method == "highest_sim_score":
 445                        compound_obj = gc_peak.highest_score_compound
 446                        add_compound(gc_peak, compound_obj)
 447
 448                    elif output_score_method == "highest_ss":
 449                        compound_obj = gc_peak.highest_ss_compound
 450                        add_compound(gc_peak, compound_obj)
 451
 452                    else:
 453                        for compound_obj in gc_peak:
 454                            add_compound(gc_peak, compound_obj)
 455
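    # Hedged read-back sketch for the HDF5 layout written by `to_hdf` above (the file
    # name is illustrative): the root attributes hold JSON-encoded Stats, Calibration,
    # Blank and CoreMSParameters, and each GC peak becomes a group keyed by its
    # retention time with "mz"/"abundance" datasets and one sub-group per matched compound.
    #
    #     import json
    #     import h5py
    #     with h5py.File("my_sample.hdf5", "r") as h5:
    #         stats = json.loads(h5.attrs["Stats"])
    #         for key, item in h5.items():
    #             if isinstance(item, h5py.Group):   # skip the root-level scans/rt/tic datasets
    #                 mz = item["mz"][()]
    #                 abundance = item["abundance"][()]
    #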
 456    def get_data_stats(self, gcms):
 457        """Get statistics about the GCMS data.
 458
 459        Parameters:
 460        ----------
 461        gcms : object
 462            The low resolution GCMS object.
 463
 464        Returns:
 465        -------
 466        dict
 467            A dictionary containing the data statistics.
 468        """
 469
 470        matched_peaks = gcms.matched_peaks
 471        no_matched_peaks = gcms.no_matched_peaks
 472        unique_metabolites = gcms.unique_metabolites
 473
 474        peak_matches_above_0p85 = 0
 475        unique_peak_match_above_0p85 = 0
 476        for match_peak in matched_peaks:
 477            gc_peak_above_85 = 0
 478            matches_above_85 = list(
 479                filter(lambda m: m.similarity_score >= 0.85, match_peak)
 480            )
 481            if matches_above_85:
 482                peak_matches_above_0p85 += 1
 483            if len(matches_above_85) == 1:
 484                unique_peak_match_above_0p85 += 1
 485
 486        data_stats = {}
 487        data_stats["average_signal_noise"] = "ni"
 488        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
 489        data_stats["total_number_peaks"] = len(gcms)
 490        data_stats["total_peaks_matched"] = len(matched_peaks)
 491        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
 492        data_stats["total_matches_above_similarity_score_0.85"] = peak_matches_above_0p85
 493        data_stats["single_matches_above_similarity_score_0.85"] = (
 494            unique_peak_match_above_0p85
 495        )
 496        data_stats["unique_metabolites"] = len(unique_metabolites)
 497
 498        return data_stats
 499
 500    def get_calibration_stats(self, gcms, id_label):
 501        """Get statistics about the GC-MS calibration.
 502
 503        Parameters:
 504        ----------
 505        """
 506        calibration_parameters = {}
 507
 508        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
 509        calibration_parameters["data_url"] = str(gcms.cal_file_path)
 510        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
 511        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
 512        calibration_parameters["calibration_method"] = ""
 513
 514        return calibration_parameters
 515
 516    def get_blank_stats(self, gcms):
 517        """Get statistics about the GC-MS blank."""
 518        blank_parameters = {}
 519
 520        blank_parameters["data_name"] = "ni"
 521        blank_parameters["blank_id"] = "ni"
 522        blank_parameters["data_url"] = "ni"
 523        blank_parameters["has_input"] = "ni"
 524        blank_parameters["common_features_to_blank"] = "ni"
 525
 526        return blank_parameters
 527
 528    def get_instrument_metadata(self, gcms):
 529        """Get metadata about the GC-MS instrument."""
 530        instrument_metadata = {}
 531
 532        instrument_metadata["analyzer"] = gcms.analyzer
 533        instrument_metadata["instrument_label"] = gcms.instrument_label
 534        instrument_metadata["instrument_id"] = uuid.uuid4().hex
 535
 536        return instrument_metadata
 537
 538    def get_data_metadata(self, gcms, id_label, output_path):
 539        """Get metadata about the GC-MS data.
 540
 541        Parameters:
 542        ----------
 543        gcms : object
 544            The low resolution GCMS object.
 545        id_label : str
 546            The ID label for the data.
 547        output_path : str
 548            The output file path.
 549
 550        Returns:
 551        -------
 552        dict
 553            A dictionary containing the data metadata.
 554        """
 555        if isinstance(output_path, str):
 556            output_path = Path(output_path)
 557
 558        parameters_path = output_path.with_suffix(".json")
 559
 560        if parameters_path.exists():
 561            with parameters_path.open() as current_param:
 562                metadata = json.load(current_param)
 563                data_metadata = metadata.get("Data")
 564        else:
 565            data_metadata = {}
 566            data_metadata["data_name"] = []
 567            data_metadata["input_data_url"] = []
 568            data_metadata["has_input"] = []
 569
 570        data_metadata["data_name"].append(gcms.sample_name)
 571        data_metadata["input_data_url"].append(str(gcms.file_location))
 572        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
 573
 574        data_metadata["output_data_name"] = str(output_path.stem)
 575        data_metadata["output_data_url"] = str(output_path)
 576        data_metadata["has_output"] = id_label + corems_md5(output_path)
 577
 578        return data_metadata
 579
 580    def get_parameters_json(self, gcms, id_label, output_path):
 581        """Get the parameters as a JSON string.
 582
 583        Parameters:
 584        ----------
 585        gcms : GCMS object
 586            The low resolution GCMS object.
 587        id_label : str
 588            The ID label for the data.
 589        output_path : str
 590            The output file path.
 591
 592        Returns:
 593        -------
 594        str
 595            The parameters as a JSON string.
 596        """
 597
 598        output_parameters_dict = {}
 599        output_parameters_dict["Data"] = self.get_data_metadata(
 600            gcms, id_label, output_path
 601        )
 602        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
 603        output_parameters_dict["Calibration"] = self.get_calibration_stats(
 604            gcms, id_label
 605        )
 606        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
 607        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
 608        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
 609        corems_dict_setting["corems_version"] = __version__
 610        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
 611        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
 612        output = json.dumps(
 613            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
 614        )
 615
 616        return output
 617
 618    def write_settings(self, output_path, gcms, id_label="emsl:"):
 619        """Write the settings to a JSON file.
 620
 621        Parameters:
 622        ----------
 623        output_path : str
 624            The output file path.
 625        gcms : GCMS object
 626            The low resolution GCMS object.
 627        id_label : str
 628            The ID label for the data. Default is "emsl:".
 629
 630        """
 631
 632        output = self.get_parameters_json(gcms, id_label, output_path)
 633
 634        with open(
 635            output_path.with_suffix(".json"),
 636            "w",
 637            encoding="utf8",
 638        ) as outfile:
 639            outfile.write(output)
 640
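    # Hedged sketch of the settings file written by `write_settings` (path is
    # illustrative); the top-level keys mirror `get_parameters_json` above:
    #
    #     import json
    #     with open("my_sample.json") as f:
    #         meta = json.load(f)
    #     # meta has keys: Data, Stats, Calibration, Blank, Instrument,
    #     # CoreMSParameters and has_metabolite
    #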
 641    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
 642        """Get the exported data as a list of dictionaries.
 643
 644        Parameters:
 645        ----------
 646        gcms : object
 647            The low resolution GCMS object.
 648        include_no_match : bool, optional
 649            Whether to include no match data. Default is True.
 650        no_match_inline : bool, optional
 651            Whether to include no match data inline. Default is False.
 652
 653        Returns:
 654        -------
 655        list
 656            The exported data as a list of dictionaries.
 657        """
 658
 659        output_score_method = gcms.molecular_search_settings.output_score_method
 660
 661        dict_data_list = []
 662
 663        def add_match_dict_data():
 664            derivatization = "{}:{}:{}".format(
 665                compound_obj.classify,
 666                compound_obj.derivativenum,
 667                compound_obj.derivatization,
 668            )
 669            out_dict = {
 670                "Sample name": gcms.sample_name,
 671                "Peak Index": gcpeak_index,
 672                "Retention Time": gc_peak.retention_time,
 673                "Retention Time Ref": compound_obj.retention_time,
 674                "Peak Height": gc_peak.tic,
 675                "Peak Area": gc_peak.area,
 676                "Retention index": gc_peak.ri,
 677                "Retention index Ref": compound_obj.ri,
 678                "Retention Index Score": compound_obj.ri_score,
 679                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
 680                "Similarity Score": compound_obj.similarity_score,
 681                "Compound Name": compound_obj.name,
 682                "Chebi ID": compound_obj.metadata.chebi,
 683                "Kegg Compound ID": compound_obj.metadata.kegg,
 684                "Inchi": compound_obj.metadata.inchi,
 685                "Inchi Key": compound_obj.metadata.inchikey,
 686                "Smiles": compound_obj.metadata.smiles,
 687                "Molecular Formula": compound_obj.formula,
 688                "IUPAC Name": compound_obj.metadata.iupac_name,
 689                "Traditional Name": compound_obj.metadata.traditional_name,
 690                "Common Name": compound_obj.metadata.common_name,
 691                "Derivatization": derivatization,
 692            }
 693
 694            if self.gcms.molecular_search_settings.exploratory_mode:
 695                out_dict.update(
 696                    {
 697                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
 698                            "weighted_cosine_correlation"
 699                        ),
 700                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
 701                            "cosine_correlation"
 702                        ),
 703                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
 704                            "stein_scott_similarity"
 705                        ),
 706                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
 707                            "pearson_correlation"
 708                        ),
 709                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
 710                            "spearman_correlation"
 711                        ),
 712                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
 713                            "kendall_tau_correlation"
 714                        ),
 715                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
 716                            "dft_correlation"
 717                        ),
 718                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
 719                            "dwt_correlation"
 720                        ),
 721                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
 722                            "euclidean_distance"
 723                        ),
 724                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
 725                            "manhattan_distance"
 726                        ),
 727                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
 728                            "jaccard_distance"
 729                        ),
 730                    }
 731                )
 732                for method in methods_name:
 733                    out_dict[methods_name.get(method)] = (
 734                        compound_obj.spectral_similarity_scores.get(method)
 735                    )
 736
 737            dict_data_list.append(out_dict)
 738
 739        def add_no_match_dict_data():
 740            dict_data_list.append(
 741                {
 742                    "Sample name": gcms.sample_name,
 743                    "Peak Index": gcpeak_index,
 744                    "Retention Time": gc_peak.retention_time,
 745                    "Peak Height": gc_peak.tic,
 746                    "Peak Area": gc_peak.area,
 747                    "Retention index": gc_peak.ri,
 748                }
 749            )
 750
 751        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
 752            # check if there is a compound candidate
 753            if gc_peak:
 754                if output_score_method == "highest_sim_score":
 755                    compound_obj = gc_peak.highest_score_compound
 756                    add_match_dict_data()
 757
 758                elif output_score_method == "highest_ss":
 759                    compound_obj = gc_peak.highest_ss_compound
 760                    add_match_dict_data()
 761
 762                else:
 763                    for compound_obj in gc_peak:
 764                        add_match_dict_data()  # add monoisotopic peak
 765
 766            else:
 767                # include not_match
 768                if include_no_match and no_match_inline:
 769                    add_no_match_dict_data()
 770
 771        if include_no_match and not no_match_inline:
 772            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
 773                if not gc_peak:
 774                    add_no_match_dict_data()
 775
 776        return dict_data_list
 777
 778
 779class HighResMassSpectraExport(HighResMassSpecExport):
 780    """A class to export high resolution mass spectra data.
 781
 782    This class provides methods to export high resolution mass spectra data to various formats
 783    such as Excel, CSV, HDF5, and Pandas DataFrame.
 784
 785    Parameters
 786    ----------
 787    out_file_path : str | Path
 788        The output file path.
 789    mass_spectra : object
 790        The high resolution mass spectra object.
 791    output_type : str, optional
 792        The output type. Default is 'excel'.
 793
 794    Attributes
 795    ----------
 796    output_file : Path
 797        The output file path without suffix.
 798    dir_loc : Path
 799        The directory location for the output files. By default this is the
 800        out_file_path with ".corems" appended, and all output files are
 801        written into this directory.
 802    mass_spectra : MassSpectraBase
 803        The high resolution mass spectra object.
 804    """
 805
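    # Hedged usage sketch (illustrative names: `mass_spectra` is assumed to be a
    # processed high resolution mass spectra collection). All outputs land inside
    # "<out_file_path>.corems/":
    #
    #     exporter = HighResMassSpectraExport("output/my_run", mass_spectra, output_type="csv")
    #     exporter.to_csv()   # one CSV per spectrum in output/my_run.corems/, suffixed with the scan number
    #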
 806    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 807        super().__init__(
 808            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 809        )
 810
 811        self.dir_loc = Path(out_file_path + ".corems")
 812        self.dir_loc.mkdir(exist_ok=True)
 813        # Place the output file in the directory
 814        self.output_file = self.dir_loc / Path(out_file_path).name
 815        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 816        self.mass_spectra = mass_spectra
 817        self.atoms_order_list = None
 818        self._init_columns()
 819
 820    def get_pandas_df(self):
 821        """Get the mass spectra as a list of Pandas DataFrames."""
 822
 823        list_df = []
 824
 825        for mass_spectrum in self.mass_spectra:
 826            columns = self.columns_label + self.get_all_used_atoms_in_order(
 827                mass_spectrum
 828            )
 829
 830            dict_data_list = self.get_list_dict_data(mass_spectrum)
 831
 832            df = DataFrame(dict_data_list, columns=columns)
 833
 834            scan_number = mass_spectrum.scan_number
 835
 836            df.name = str(self.output_file) + "_" + str(scan_number)
 837
 838            list_df.append(df)
 839
 840        return list_df
 841
 842    def to_pandas(self, write_metadata=True):
 843        """Export the data to a Pandas DataFrame and save it as a pickle file.
 844
 845        Parameters:
 846        ----------
 847        write_metadata : bool, optional
 848            Whether to write metadata to the output file. Default is True.
 849        """
 850
 851        for mass_spectrum in self.mass_spectra:
 852            columns = self.columns_label + self.get_all_used_atoms_in_order(
 853                mass_spectrum
 854            )
 855
 856            dict_data_list = self.get_list_dict_data(mass_spectrum)
 857
 858            df = DataFrame(dict_data_list, columns=columns)
 859
 860            scan_number = mass_spectrum.scan_number
 861
 862            out_filename = Path(
 863                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 864            )
 865
 866            df.to_pickle(self.dir_loc / out_filename)
 867
 868            if write_metadata:
 869                self.write_settings(
 870                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 871                )
 872
 873    def to_excel(self, write_metadata=True):
 874        """Export the data to an Excel file.
 875
 876        Parameters:
 877        ----------
 878        write_metadata : bool, optional
 879            Whether to write metadata to the output file. Default is True.
 880        """
 881        for mass_spectrum in self.mass_spectra:
 882            columns = self.columns_label + self.get_all_used_atoms_in_order(
 883                mass_spectrum
 884            )
 885
 886            dict_data_list = self.get_list_dict_data(mass_spectrum)
 887
 888            df = DataFrame(dict_data_list, columns=columns)
 889
 890            scan_number = mass_spectrum.scan_number
 891
 892            out_filename = Path(
 893                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 894            )
 895
 896            df.to_excel(self.dir_loc / out_filename)
 897
 898            if write_metadata:
 899                self.write_settings(
 900                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 901                )
 902
 903    def to_csv(self, write_metadata=True):
 904        """Export the data to a CSV file.
 905
 906        Parameters:
 907        ----------
 908        write_metadata : bool, optional
 909            Whether to write metadata to the output file. Default is True.
 910        """
 911        import csv
 912
 913        for mass_spectrum in self.mass_spectra:
 914            columns = self.columns_label + self.get_all_used_atoms_in_order(
 915                mass_spectrum
 916            )
 917
 918            scan_number = mass_spectrum.scan_number
 919
 920            dict_data_list = self.get_list_dict_data(mass_spectrum)
 921
 922            out_filename = Path(
 923                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 924            )
 925
 926            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 927                writer = csv.DictWriter(csvfile, fieldnames=columns)
 928                writer.writeheader()
 929                for data in dict_data_list:
 930                    writer.writerow(data)
 931
 932            if write_metadata:
 933                self.write_settings(
 934                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 935                )
 936
 937    def get_mass_spectra_attrs(self):
 938        """Get the mass spectra attributes as a JSON string.
 939
 940        Notes:
 941        -----
 942        This method takes no parameters; it reads the attributes from the
 943        instance's `mass_spectra` object.
 944
 945        Returns:
 946        -------
 947        str
 948            The mass spectra attributes as a JSON string.
 949        """
 950        dict_ms_attrs = {}
 951        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 952        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 953        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 954
 955        return json.dumps(
 956            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 957        )
 958
 959    def to_hdf(self, overwrite=False, export_raw=True):
 960        """Export the data to an HDF5 file.
 961
 962        Parameters
 963        ----------
 964        overwrite : bool, optional
 965            Whether to overwrite the output file. Default is False.
 966        export_raw : bool, optional
 967            Whether to export the raw mass spectra data. Default is True.
 968        """
 969        if overwrite:
 970            if self.output_file.with_suffix(".hdf5").exists():
 971                self.output_file.with_suffix(".hdf5").unlink()
 972
 973        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 974            if not hdf_handle.attrs.get("date_utc"):
 975                # Set metadata for all mass spectra
 976                timenow = str(
 977                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 978                )
 979                hdf_handle.attrs["date_utc"] = timenow
 980                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 981                hdf_handle.attrs["data_structure"] = "mass_spectra"
 982                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 983                hdf_handle.attrs["instrument_label"] = (
 984                    self.mass_spectra.instrument_label
 985                )
 986                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 987                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 988                hdf_handle.attrs["parser_type"] = (
 989                    self.mass_spectra.spectra_parser_class.__name__
 990                )
 991                hdf_handle.attrs["original_file_location"] = (
 992                    self.mass_spectra.file_location._str
 993                )
 994
 995            if "mass_spectra" not in hdf_handle:
 996                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 997            else:
 998                mass_spectra_group = hdf_handle.get("mass_spectra")
 999
1000            for mass_spectrum in self.mass_spectra:
1001                group_key = str(int(mass_spectrum.scan_number))
1002
1003                self.add_mass_spectrum_to_hdf5(
1004                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1005                )
1006
1007
1008class LCMSExport(HighResMassSpectraExport):
1009    """A class to export high resolution LC-MS data.
1010
1011    This class provides methods to export high resolution LC-MS data to HDF5.
1012
1013    Parameters
1014    ----------
1015    out_file_path : str | Path
1016        The output file path; do not include the file extension.
1017    mass_spectra : LCMSBase
1018        The high resolution LC-MS object.
1019    """
1020
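    # Hedged usage sketch (illustrative names: `lcms_obj` is assumed to be a
    # processed LCMSBase object):
    #
    #     exporter = LCMSExport("output/my_run", lcms_obj)
    #     exporter.to_hdf(overwrite=True, save_parameters=True, parameter_format="toml")
    #     # -> output/my_run.corems/my_run.hdf5 plus my_run.toml (LC-MS settings) in the same folder
    #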
1021    def __init__(self, out_file_path, mass_spectra):
1022        super().__init__(out_file_path, mass_spectra, output_type="hdf5")
1023
1024    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1025        """Export the data to an HDF5.
1026
1027        Parameters
1028        ----------
1029        overwrite : bool, optional
1030            Whether to overwrite the output file. Default is False.
1031        save_parameters : bool, optional
1032            Whether to save the parameters as a separate json or toml file. Default is True.
1033        parameter_format : str, optional
1034            The format to save the parameters in. Default is 'toml'.
1035
1036        Raises
1037        ------
1038        ValueError
1039            If parameter_format is not 'json' or 'toml'.
1040        """
1041        export_profile_spectra = (
1042            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1043        )
1044
1045        # Write the mass spectra data to the hdf5 file
1046        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1047
1048        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1049        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1050            # Add scan_info to hdf5 file
1051            if "scan_info" not in hdf_handle:
1052                scan_info_group = hdf_handle.create_group("scan_info")
1053                for k, v in self.mass_spectra._scan_info.items():
1054                    array = np.array(list(v.values()))
1055                    if array.dtype.str[0:2] == "<U":
1056                        array = array.astype("S")
1057                    scan_info_group.create_dataset(k, data=array)
1058
1059            # Add ms_unprocessed to hdf5 file
1060            export_unprocessed_ms1 = (
1061                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1062            )
1063            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1064                if "ms_unprocessed" not in hdf_handle:
1065                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1066                else:
1067                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1068                for k, v in self.mass_spectra._ms_unprocessed.items():
1069                    array = np.array(v)
1070                    ms_unprocessed_group.create_dataset(str(k), data=array)
1071
1072            # Add LCMS mass features to hdf5 file
1073            if len(self.mass_spectra.mass_features) > 0:
1074                if "mass_features" not in hdf_handle:
1075                    mass_features_group = hdf_handle.create_group("mass_features")
1076                else:
1077                    mass_features_group = hdf_handle.get("mass_features")
1078
1079                # Create group for each mass feature, with key as the mass feature id
1080                for k, v in self.mass_spectra.mass_features.items():
1081                    mass_features_group.create_group(str(k))
1082                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1083                    for k2, v2 in v.__dict__.items():
1084                        if v2 is not None:
1085                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1086                            if k2 not in [
1087                                "chromatogram_parent",
1088                                "ms2_mass_spectra",
1089                                "mass_spectrum",
1090                                "_eic_data",
1091                                "ms2_similarity_results",
1092                            ]:
1093                                if k2 == "ms2_scan_numbers":
1094                                    array = np.array(v2)
1095                                    mass_features_group[str(k)].create_dataset(
1096                                        str(k2), data=array
1097                                    )
1098                                elif k2 == "_half_height_width":
1099                                    array = np.array(v2)
1100                                    mass_features_group[str(k)].create_dataset(
1101                                        str(k2), data=array
1102                                    )
1103                                elif k2 == "_ms_deconvoluted_idx":
1104                                    array = np.array(v2)
1105                                    mass_features_group[str(k)].create_dataset(
1106                                        str(k2), data=array
1107                                    )
1108                                elif k2 == "associated_mass_features_deconvoluted":
1109                                    array = np.array(v2)
1110                                    mass_features_group[str(k)].create_dataset(
1111                                        str(k2), data=array
1112                                    )
1113                                elif (
1114                                    isinstance(v2, int)
1115                                    or isinstance(v2, float)
1116                                    or isinstance(v2, str)
1117                                    or isinstance(v2, np.integer)
1118                                    or isinstance(v2, np.bool_)
1119                                ):
1120                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1121                                else:
1122                                    raise TypeError(
1123                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1124                                    )
1125
1126            # Add EIC data to hdf5 file
1127            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1128            if len(self.mass_spectra.eics) > 0 and export_eics:
1129                if "eics" not in hdf_handle:
1130                    eic_group = hdf_handle.create_group("eics")
1131                else:
1132                    eic_group = hdf_handle.get("eics")
1133
1134                # Create group for each eic
1135                for k, v in self.mass_spectra.eics.items():
1136                    eic_group.create_group(str(k))
1137                    eic_group[str(k)].attrs["mz"] = k
1138                    # Loop through each of the attributes and add them as datasets (if array)
1139                    for k2, v2 in v.__dict__.items():
1140                        if v2 is not None:
1141                            array = np.array(v2)
1142                            eic_group[str(k)].create_dataset(str(k2), data=array)
1143
1144            # Add ms2_search results to hdf5 file
1145            if len(self.mass_spectra.spectral_search_results) > 0:
1146                if "spectral_search_results" not in hdf_handle:
1147                    spectral_search_results = hdf_handle.create_group(
1148                        "spectral_search_results"
1149                    )
1150                else:
1151                    spectral_search_results = hdf_handle.get("spectral_search_results")
1152                # Create group for each search result by ms2_scan / precursor_mz
1153                for k, v in self.mass_spectra.spectral_search_results.items():
1154                    spectral_search_results.create_group(str(k))
1155                    for k2, v2 in v.items():
1156                        spectral_search_results[str(k)].create_group(str(k2))
1157                        spectral_search_results[str(k)][str(k2)].attrs[
1158                            "precursor_mz"
1159                        ] = v2.precursor_mz
1160                        spectral_search_results[str(k)][str(k2)].attrs[
1161                            "query_spectrum_id"
1162                        ] = v2.query_spectrum_id
1163                        # Loop through each of the attributes and add them as datasets (if array)
1164                        for k3, v3 in v2.__dict__.items():
1165                            if v3 is not None and k3 not in [
1166                                "query_spectrum",
1167                                "precursor_mz",
1168                                "query_spectrum_id",
1169                            ]:
1170                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1171                                    v3 = [", ".join(x) for x in v3]
1172                                array = np.array(v3)
1173                                if array.dtype.str[0:2] == "<U":
1174                                    array = array.astype("S")
1175                                spectral_search_results[str(k)][str(k2)].create_dataset(
1176                                    str(k3), data=array
1177                                )
1178
1179        # Save parameters as separate json
1180        if save_parameters:
1181            # Check if parameter_format is valid
1182            if parameter_format not in ["json", "toml"]:
1183                raise ValueError("parameter_format must be 'json' or 'toml'")
1184
1185            if parameter_format == "json":
1186                dump_lcms_settings_json(
1187                    filename=self.output_file.with_suffix(".json"),
1188                    lcms_obj=self.mass_spectra,
1189                )
1190            elif parameter_format == "toml":
1191                dump_lcms_settings_toml(
1192                    filename=self.output_file.with_suffix(".toml"),
1193                    lcms_obj=self.mass_spectra,
1194                )
1195
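
# Hedged read-back sketch for the HDF5 written by LCMSExport.to_hdf (path is
# illustrative). Depending on the export parameters, the file holds the
# "mass_spectra", "scan_info", "ms_unprocessed", "mass_features", "eics" and
# "spectral_search_results" groups created above:
#
#     import h5py
#     with h5py.File("output/my_run.corems/my_run.hdf5", "r") as h5:
#         print(list(h5.keys()))
#         scan_info = {k: v[()] for k, v in h5["scan_info"].items()}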
1196
1197class LipidomicsExport(LCMSExport):
1198    """A class to export lipidomics data.
1199
1200    This class provides methods to export lipidomics data to various formats and summarize the lipid report.
1201
1202    Parameters
1203    ----------
1204    out_file_path : str | Path
1205        The output file path; do not include the file extension.
1206    mass_spectra : object
1207        The high resolution mass spectra object.
1208    """
1209
1210    def __init__(self, out_file_path, mass_spectra):
1211        super().__init__(out_file_path, mass_spectra)
1212        self.ion_type_dict = ion_type_dict
1213
1214    @staticmethod
1215    def get_ion_formula(neutral_formula, ion_type):
1216        """From a neutral formula and an ion type, return the formula of the ion.
1217
1218        Notes
1219        -----
1220        This is a static method.
1221        If the neutral_formula is not a string, this method will return None.
1222
1223        Parameters
1224        ----------
1225        neutral_formula : str
1226            The neutral formula, this should be a string form from the MolecularFormula class
1227            (e.g. 'C2 H4 O2', isotopes OK), or a simple string (e.g. 'C2H4O2', no isotope handling in this case).
1228            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1229            e.g. MgCl2 is parsed as 'Mg Cl2'.
1230        ion_type : str
1231            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1232            See the self.ion_type_dict for the available ion types.
1233
1234        Returns
1235        -------
1236        str
1237            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1238        """
1239        # If neutral_formula is not a string, return None
1240        if not isinstance(neutral_formula, str):
1241            return None
1242
1243        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1244        if re.search(r"\s", neutral_formula):
1245            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1246        else:
1247            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1248            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1249            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1250            neutral_formula = MolecularFormula(
1251                dict(
1252                    zip(
1253                        [x[0] for x in elements],
1254                        [int(x[0]) if x else 1 for x in counts],
1255                    )
1256                ),
1257                ion_charge=0,
1258            )
1259        neutral_formula_dict = neutral_formula.to_dict().copy()
1260
1261        adduct_add_dict = ion_type_dict[ion_type][0]
1262        for key in adduct_add_dict:
1263            if key in neutral_formula_dict.keys():
1264                neutral_formula_dict[key] += adduct_add_dict[key]
1265            else:
1266                neutral_formula_dict[key] = adduct_add_dict[key]
1267
1268        adduct_subtract = ion_type_dict[ion_type][1]
1269        for key in adduct_subtract:
1270            neutral_formula_dict[key] -= adduct_subtract[key]
1271
1272        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1273
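    # Hedged examples for `get_ion_formula` (the exact atom ordering of the returned
    # string depends on MolecularFormula, so outputs are indicative):
    #
    #     LipidomicsExport.get_ion_formula("C2 H4 O2", "[M+Na]+")   # ~ 'C2 H4 Na O2'
    #     LipidomicsExport.get_ion_formula("C2H4O2", "[M-H]-")      # ~ 'C2 H3 O2' (simple string parsed on capital letters)
    #     LipidomicsExport.get_ion_formula(float("nan"), "[M+H]+")  # None (non-string input)
    #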
1274    @staticmethod
1275    def get_isotope_type(ion_formula):
1276        """From an ion formula, return the 13C isotope type of the ion.
1277
1278        Notes
1279        -----
1280        This is a static method.
1281        If the ion_formula is not a string, this method will return None.
1282        This is currently only functional for 13C isotopes.
1283
1284        Parameters
1285        ----------
1286        ion_formula : str
1287            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1288
1289        Returns
1290        -------
1291        str
1292            The isotope type of the ion, e.g. '13C1', '13C2', etc.; or None if the ion_formula does not contain a 13C isotope.
1293
1294        Raises
1295        ------
1296        ValueError
1297            If the ion_formula is a string without spaces (i.e. not in the 'C2 H4 O2' form).
1298        """
1299        if not isinstance(ion_formula, str):
1300            return None
1301
1302        if re.search(r"\s", ion_formula):
1303            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1304        else:
1305            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1306        ion_formula_dict = ion_formula.to_dict().copy()
1307
1308        try:
1309            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1310        except KeyError:
1311            iso_class = None
1312
1313        return iso_class
1314
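    # Hedged examples for `get_isotope_type` (13C isotopologues only; assumes the
    # MolecularFormula string form writes the carbon-13 count as '13C<n>'):
    #
    #     LipidomicsExport.get_isotope_type("C15 13C1 H32 O2")   # -> '13C1'
    #     LipidomicsExport.get_isotope_type("C16 H32 O2")        # -> None (no 13C present)
    #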
1315    def clean_ms1_report(self, ms1_summary_full):
1316        """Clean the MS1 report.
1317
1318        Parameters
1319        ----------
1320        ms1_summary_full : DataFrame
1321            The full MS1 summary DataFrame.
1322
1323        Returns
1324        -------
1325        DataFrame
1326            The cleaned MS1 summary DataFrame.
1327        """
1328        ms1_summary_full = ms1_summary_full.reset_index()
1329        cols_to_keep = [
1330            "mf_id",
1331            "Molecular Formula",
1332            "Ion Type",
1333            "Calculated m/z",
1334            "m/z Error (ppm)",
1335            "m/z Error Score",
1336            "Is Isotopologue",
1337            "Isotopologue Similarity",
1338            "Confidence Score",
1339        ]
1340        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1341        ms1_summary["ion_formula"] = [
1342            self.get_ion_formula(f, a)
1343            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1344        ]
1345        ms1_summary["isotopologue_type"] = [
1346            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1347        ]
1348
1349        # Reorder columns
1350        ms1_summary = ms1_summary[
1351            [
1352                "mf_id",
1353                "ion_formula",
1354                "isotopologue_type",
1355                "Calculated m/z",
1356                "m/z Error (ppm)",
1357                "m/z Error Score",
1358                "Isotopologue Similarity",
1359                "Confidence Score",
1360            ]
1361        ]
1362
1363        # Set the index to mf_id
1364        ms1_summary = ms1_summary.set_index("mf_id")
1365
1366        return ms1_summary
1367
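    # Note: the ion_formula and isotopologue_type columns added here come from the two
    # static helpers above; to_report() (further down) relies on them when joining MS1
    # annotations onto mass features ("isotopologue_type") and MS2 annotations onto the
    # report ("ion_formula").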
1368    def summarize_lipid_report(self, ms2_annot):
1369        """Summarize the lipid report.
1370
1371        Parameters
1372        ----------
1373        ms2_annot : DataFrame
1374            The MS2 annotation DataFrame with all annotations.
1375
1376        Returns
1377        -------
1378        DataFrame
1379            The summarized lipid report.
1380        """
1381        # Drop unnecessary columns for easier viewing
1382        columns_to_drop = [
1383            "precursor_mz",
1384            "precursor_mz_error_ppm",
1385            "metabref_mol_id",
1386            "metabref_precursor_mz",
1387            "cas",
1388            "inchikey",
1389            "inchi",
1390            "chebi",
1391            "smiles",
1392            "kegg",
1393            "data_id",
1394            "iupac_name",
1395            "traditional_name",
1396            "common_name",
1397            "casno",
1398        ]
1399        ms2_annot = ms2_annot.drop(
1400            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1401        )
1402
1403        # If ion_types_excluded is not empty, remove those ion types
1404        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1405            "ms2"
1406        ].molecular_search.ion_types_excluded
1407        if len(ion_types_excluded) > 0:
1408            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1409
1410        # If mf_id is not present, check that the index name is mf_id and reset the index
1411        if "mf_id" not in ms2_annot.columns:
1412            if ms2_annot.index.name == "mf_id":
1413                ms2_annot = ms2_annot.reset_index()
1414            else:
1415                raise ValueError("mf_id is not present in the dataframe")
1416
1417        # Attempt to get consensus annotations to the MLF level
1418        mlf_results_all = []
1419        for mf_id in ms2_annot["mf_id"].unique():
1420            mlf_results_perid = []
1421            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1422            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)
1423
1424            for query_scan in ms2_annot_mf["query_spectrum_id"].unique():
1425                ms2_annot_sub = ms2_annot_mf[
1426                    ms2_annot_mf["query_spectrum_id"] == query_scan
1427                ].copy()
1428
1429                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1430                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1431                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1432                        ms2_annot_sub["entropy_max"] = (
1433                            ms2_annot_sub["entropy_similarity"]
1434                            == ms2_annot_sub["entropy_similarity"].max()
1435                        )
1436                        ms2_annot_sub["ref_match_fract_max"] = (
1437                            ms2_annot_sub["ref_mz_in_query_fract"]
1438                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1439                        )
1440                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1441                            "query_frag_types"
1442                        ].apply(lambda x: "MLF" in x)
1443
1444                        # New column indicating whether a single row ranks highest on all criteria (a consensus)
1445                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1446                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1447                        ].all(axis=1)
1448
1449                        # If there is a consensus, take the row with the highest entropy_similarity
1450                        if ms2_annot_sub["consensus"].any():
1451                            ms2_annot_sub = ms2_annot_sub[
1452                                ms2_annot_sub["entropy_similarity"]
1453                                == ms2_annot_sub["entropy_similarity"].max()
1454                            ].head(1)
1455                            mlf_results_perid.append(ms2_annot_sub)
1456            if len(mlf_results_perid) == 0:
1457                mlf_results_perid = pd.DataFrame()
1458            else:
1459                mlf_results_perid = pd.concat(mlf_results_perid)
1460                if mlf_results_perid["name"].nunique() == 1:
1461                    mlf_results_perid = mlf_results_perid[
1462                        mlf_results_perid["entropy_similarity"]
1463                        == mlf_results_perid["entropy_similarity"].max()
1464                    ].head(1)
1465                else:
1466                    mlf_results_perid = pd.DataFrame()
1467                mlf_results_all.append(mlf_results_perid)
1468
1469        # These are the consensus annotations to the MLF level
1470        if len(mlf_results_all) > 0:
1471            mlf_results_all = pd.concat(mlf_results_all)
1472            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1473        else:
1474            # Make an empty dataframe
1475            mlf_results_all = ms2_annot.head(0)
1476
1477        # For remaining mf_ids, try to get a consensus annotation to the species level
1478        species_results_all = []
1479        # Remove mf_ids that have consensus annotations to the MLF level
1480        ms2_annot_spec = ms2_annot[
1481            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1482        ]
1483        for mf_id in ms2_annot_spec["mf_id"].unique():
1484            # Do all the hits have the same lipid_summed_name?
1485            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1486            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1487
1488            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1489                # Grab the highest entropy_similarity result
1490                ms2_annot_sub = ms2_annot_sub[
1491                    ms2_annot_sub["entropy_similarity"]
1492                    == ms2_annot_sub["entropy_similarity"].max()
1493                ].head(1)
1494                species_results_all.append(ms2_annot_sub)
1495
1496        # These are the consensus annotations to the species level
1497        if len(species_results_all) > 0:
1498            species_results_all = pd.concat(species_results_all)
1499            species_results_all["annot_level"] = "species"
1500        else:
1501            # Make an empty dataframe
1502            species_results_all = ms2_annot.head(0)
1503
1504        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1505        # Remove mf_ids that have consensus annotations to the species level
1506        ms2_annot_remaining = ms2_annot_spec[
1507            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1508        ]
1509        no_consensus = []
1510        for mf_id in ms2_annot_remaining["mf_id"].unique():
1511            id_sub = []
1512            id_no_con = []
1513            ms2_annot_sub_mf = ms2_annot_remaining[
1514                ms2_annot_remaining["mf_id"] == mf_id
1515            ].copy()
1516            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1517                ms2_annot_sub = ms2_annot_sub_mf[
1518                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1519                ].copy()
1520
1521                # New columns for ranking [HIGHER RANK = BETTER]
1522                ms2_annot_sub["entropy_max"] = (
1523                    ms2_annot_sub["entropy_similarity"]
1524                    == ms2_annot_sub["entropy_similarity"].max()
1525                )
1526                ms2_annot_sub["ref_match_fract_max"] = (
1527                    ms2_annot_sub["ref_mz_in_query_fract"]
1528                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1529                )
1530                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1531                    lambda x: "MLF" in x
1532                )
1533
1534                # New column indicating whether a single row ranks highest on all criteria (a consensus)
1535                ms2_annot_sub["consensus"] = ms2_annot_sub[
1536                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1537                ].all(axis=1)
1538                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1539                id_sub.append(ms2_annot_sub_con)
1540                id_no_con.append(ms2_annot_sub)
1541            id_sub = pd.concat(id_sub)
1542            id_no_con = pd.concat(id_no_con)
1543
1544            # Scenario 1: multiple scans resolve to different MLFs [possible coelutions; keep each and annotate at the molecular species (MLF) level]
1545            if (
1546                id_sub["query_frag_types"]
1547                .apply(lambda x: "MLF" in x)
1548                .all()
1549                and len(id_sub) > 0
1550            ):
1551                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1552                id_sub = id_sub.loc[idx]
1553                # Reorder so highest entropy_similarity is first
1554                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1555                id_sub["annot_level"] = id_sub["structure_level"]
1556                no_consensus.append(id_sub)
1557
1558            # Scenario 2: multiple scans resolve to different species; keep each and annotate at the appropriate level
1559            elif len(id_sub) == 0:
1560                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1561                    summed_sub = id_no_con[
1562                        id_no_con["lipid_summed_name"] == lipid_summed_name
1563                    ]
1564                    # Any consensus to MLF?
1565                    if summed_sub["consensus"].any():
1566                        summed_sub = summed_sub[summed_sub["consensus"]]
1567                        summed_sub["annot_level"] = summed_sub["structure_level"]
1568                        no_consensus.append(summed_sub)
1569                    else:
1570                        # Grab the highest entropy_similarity; if there are ties, keep the first one
1571                        summed_sub = summed_sub[
1572                            summed_sub["entropy_similarity"]
1573                            == summed_sub["entropy_similarity"].max()
1574                        ].head(1)
1575                        # annotate only to the species level (no molecular species consensus)
1576                        summed_sub["annot_level"] = "species"
1577                        summed_sub["name"] = ""
1578                        no_consensus.append(summed_sub)
1579            else:
1580                raise ValueError(f"Unexpected scenario for summarizing mf_id: {mf_id}")
1581
1582        if len(no_consensus) > 0:
1583            no_consensus = pd.concat(no_consensus)
1584        else:
1585            no_consensus = ms2_annot.head(0)
1586
1587        # Combine all the consensus annotations and reformat the dataframe for output
1588        species_results_all = species_results_all.drop(columns=["name"])
1589        species_results_all["lipid_molecular_species_id"] = ""
1590        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1591        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1592        consensus_annotations = pd.concat(
1593            [mlf_results_all, species_results_all, no_consensus]
1594        )
1595        consensus_annotations = consensus_annotations.sort_values(
1596            "mf_id", ascending=True
1597        )
1598        cols_to_keep = [
1599            "mf_id",
1600            "ref_ion_type",
1601            "entropy_similarity",
1602            "ref_mz_in_query_fract",
1603            "lipid_molecular_species_id",
1604            "lipid_summed_name",
1605            "lipid_subclass",
1606            "lipid_class",
1607            "lipid_category",
1608            "formula",
1609            "annot_level",
1610            "n_spectra_contributing",
1611        ]
1612        consensus_annotations = consensus_annotations[cols_to_keep]
1613        consensus_annotations = consensus_annotations.set_index("mf_id")
1614
1615        return consensus_annotations
1616
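    # Summary of the consensus logic above (descriptive note):
    #   1. MLF consensus  - per mass feature and query spectrum, if a single hit ranks
    #      highest on entropy similarity, reference-m/z coverage, and MLF fragment
    #      evidence, and all contributing spectra agree on one "name", keep that hit at
    #      its molecular-species (structure_level) annotation.
    #   2. Species consensus - remaining mass features whose hits all share one
    #      lipid_summed_name keep their highest-entropy hit, annotated as "species".
    #   3. No consensus   - remaining mass features are resolved per lipid_summed_name,
    #      kept at either the molecular species or species level, or raise a ValueError
    #      for an unexpected combination of hits.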
1617    def clean_ms2_report(self, lipid_summary):
1618        """Clean the MS2 report.
1619
1620        Parameters
1621        ----------
1622        lipid_summary : DataFrame
1623            The full lipid summary DataFrame.
1624
1625        Returns
1626        -------
1627        DataFrame
1628            The cleaned lipid summary DataFrame.
1629        """
1630        lipid_summary = lipid_summary.reset_index()
1631        lipid_summary["ion_formula"] = [
1632            self.get_ion_formula(f, a)
1633            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
1634        ]
1635
1636        # Reorder columns
1637        lipid_summary = lipid_summary[
1638            [
1639                "mf_id",
1640                "ion_formula",
1641                "ref_ion_type",
1642                "formula",
1643                "annot_level",
1644                "lipid_molecular_species_id",
1645                "lipid_summed_name",
1646                "lipid_subclass",
1647                "lipid_class",
1648                "lipid_category",
1649                "entropy_similarity",
1650                "ref_mz_in_query_fract",
1651                "n_spectra_contributing",
1652            ]
1653        ]
1654
1655        # Set the index to mf_id
1656        lipid_summary = lipid_summary.set_index("mf_id")
1657
1658        return lipid_summary
1659
1660    def to_report(self, molecular_metadata=None):
1661        """Create a report of the mass features and their annotations.
1662
1663        Parameters
1664        ----------
1665        molecular_metadata : dict, optional
1666            The molecular metadata. Default is None.
1667
1668        Returns
1669        -------
1670        DataFrame
1671            The report of the mass features and their annotations.
1672
1673        Notes
1674        -----
1675        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
1676        """
1677        # Get mass feature dataframe
1678        mf_report = self.mass_spectra.mass_features_to_df()
1679        mf_report = mf_report.reset_index(drop=False)
1680
1681        # Get and clean ms1 annotation dataframe
1682        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1683        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1684        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1685
1686        # Get, summarize, and clean ms2 annotation dataframe
1687        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1688            molecular_metadata=molecular_metadata
1689        )
1690        if ms2_annot_report is not None:
1691            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
1692            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1693            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1694            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1695
1696        # Combine the reports
1697        if not ms1_annot_report.empty:
1698            # MS1 has been run and has molecular formula information
1699            mf_report = pd.merge(
1700                mf_report,
1701                ms1_annot_report,
1702                how="left",
1703                on=["mf_id", "isotopologue_type"],
1704            )
1705        if ms2_annot_report is not None:
1706            # pull out the records without an ion_formula and drop that (empty) column; this subset should be empty if MS1 molecular formula assignment is working correctly
1707            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1708            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1709            mf_no_ion_formula = pd.merge(
1710                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1711            )
1712
1713            # pull out the records with ion_formula
1714            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1715            mf_with_ion_formula = pd.merge(
1716                mf_with_ion_formula,
1717                ms2_annot_report,
1718                how="left",
1719                on=["mf_id", "ion_formula"],
1720            )
1721
1722            # put back together
1723            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1724
1725        # Rename columns
1726        rename_dict = {
1727            "mf_id": "Mass Feature ID",
1728            "scan_time": "Retention Time (min)",
1729            "mz": "m/z",
1730            "apex_scan": "Apex Scan Number",
1731            "intensity": "Intensity",
1732            "persistence": "Persistence",
1733            "area": "Area",
1734            "half_height_width": "Half Height Width (min)",
1735            "tailing_factor": "Tailing Factor",
1736            "dispersity_index": "Dispersity Index",
1737            "ms2_spectrum": "MS2 Spectrum",
1738            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1739            "isotopologue_type": "Isotopologue Type",
1740            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1741            "associated_mass_features": "Associated Mass Features after Deconvolution",
1742            "ion_formula": "Ion Formula",
1743            "formula": "Molecular Formula",
1744            "ref_ion_type": "Ion Type",
1745            "annot_level": "Lipid Annotation Level",
1746            "lipid_molecular_species_id": "Lipid Molecular Species",
1747            "lipid_summed_name": "Lipid Species",
1748            "lipid_subclass": "Lipid Subclass",
1749            "lipid_class": "Lipid Class",
1750            "lipid_category": "Lipid Category",
1751            "entropy_similarity": "Entropy Similarity",
1752            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1753            "n_spectra_contributing": "Spectra with Annotation (n)",
1754        }
1755        mf_report = mf_report.rename(columns=rename_dict)
1756        mf_report["Sample Name"] = self.mass_spectra.sample_name
1757        mf_report["Polarity"] = self.mass_spectra.polarity
1758        mf_report = mf_report[
1759            ["Mass Feature ID", "Sample Name", "Polarity"]
1760            + [
1761                col
1762                for col in mf_report.columns
1763                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1764            ]
1765        ]
1766
1767        # Reorder rows by "Mass Feature ID"
1768        mf_report = mf_report.sort_values("Mass Feature ID")
1769
1770        # Reset index
1771        mf_report = mf_report.reset_index(drop=True)
1772
1773        return mf_report
1774
1775    def report_to_csv(self, molecular_metadata=None):
1776        """Create a report of the mass features and their annotations and save it as a CSV file.
1777
1778        Parameters
1779        ----------
1780        molecular_metadata : dict, optional
1781            The molecular metadata. Default is None.
1782        """
1783        report = self.to_report(molecular_metadata=molecular_metadata)
1784        out_file = self.output_file.with_suffix(".csv")
1785        report.to_csv(out_file, index=False)
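
A minimal usage sketch (not from the original source): assuming `exporter` is an instance of
the LC-MS exporter class these methods belong to (its constructor appears earlier in this
module), the reporting workflow is simply:

    # Build the combined mass-feature / MS1 / MS2 annotation report as a DataFrame
    report_df = exporter.to_report(molecular_metadata=None)

    # Or write the same report directly to <output_file>.csv
    exporter.report_to_csv(molecular_metadata=None)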
A class to export low resolution GC-MS data.

This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters:

out_file_path : str The output file path. gcms : object The low resolution GCMS object.

Attributes:

output_file : Path The output file path as a Path object. gcms : object The low resolution GCMS object.

Methods:

  • get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  • get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  • to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
  • to_excel(write_mode='a', write_metadata=True, id_label="corems:"), Export the data to an Excel file.
  • to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
  • to_hdf(id_label="corems:"). Export the data to an HDF5 file.
  • get_data_stats(gcms). Get statistics about the GCMS data.
LowResGCMSExport(out_file_path, gcms)
90    def __init__(self, out_file_path, gcms):
91        self.output_file = Path(out_file_path)
92
93        self.gcms = gcms
94
95        self._init_columns()
output_file
gcms
def get_pandas_df(self, id_label='corems:'):
152    def get_pandas_df(self, id_label="corems:"):
153        """Get the exported data as a Pandas DataFrame.
154
155        Parameters:
156        ----------
157        id_label : str, optional
158            The ID label for the data. Default is "corems:".
159
160        Returns:
161        -------
162        DataFrame
163            The exported data as a Pandas DataFrame.
164        """
165
166        columns = self._init_columns()
167
168        dict_data_list = self.get_list_dict_data(self.gcms)
169
170        df = DataFrame(dict_data_list, columns=columns)
171
172        df.name = self.gcms.sample_name
173
174        return df

Get the exported data as a Pandas DataFrame.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".

Returns:

DataFrame The exported data as a Pandas DataFrame.

def get_json(self, nan=False, id_label='corems:'):
176    def get_json(self, nan=False, id_label="corems:"):
177        """Get the exported data as a JSON string.
178
179        Parameters:
180        ----------
181        nan : bool, optional
182            Whether to include NaN values in the JSON string. Default is False.
183        id_label : str, optional
184            The ID label for the data. Default is "corems:".
185
186        """
187
188        import json
189
190        dict_data_list = self.get_list_dict_data(self.gcms)
191
192        return json.dumps(
193            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
194        )

Get the exported data as a JSON string.

Parameters:

nan : bool, optional Whether to include NaN values in the JSON string. Default is False. id_label : str, optional The ID label for the data. Default is "corems:".

def to_pandas(self, write_metadata=True, id_label='corems:'):
196    def to_pandas(self, write_metadata=True, id_label="corems:"):
197        """Export the data to a Pandas DataFrame and save it as a pickle file.
198
199        Parameters:
200        ----------
201        write_metadata : bool, optional
202            Whether to write metadata to the output file.
203        id_label : str, optional
204            The ID label for the data.
205        """
206
207        columns = self._init_columns()
208
209        dict_data_list = self.get_list_dict_data(self.gcms)
210
211        df = DataFrame(dict_data_list, columns=columns)
212
213        df.to_pickle(self.output_file.with_suffix(".pkl"))
214
215        if write_metadata:
216            self.write_settings(
217                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
218            )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. id_label : str, optional The ID label for the data.

def to_excel(self, write_mode='a', write_metadata=True, id_label='corems:'):
220    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
221        """Export the data to an Excel file.
222
223        Parameters:
224        ----------
225        write_mode : str, optional
226            The write mode for the Excel file. Default is 'a' (append).
227        write_metadata : bool, optional
228            Whether to write metadata to the output file. Default is True.
229        id_label : str, optional
230            The ID label for the data. Default is "corems:".
231        """
232
233        out_put_path = self.output_file.with_suffix(".xlsx")
234
235        columns = self._init_columns()
236
237        dict_data_list = self.get_list_dict_data(self.gcms)
238
239        df = DataFrame(dict_data_list, columns=columns)
240
241        if write_mode == "a" and out_put_path.exists():
242            writer = ExcelWriter(out_put_path, engine="openpyxl")
243            # try to open an existing workbook
244            writer.book = load_workbook(out_put_path)
245            # copy existing sheets
246            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
247            # read existing file
248            reader = read_excel(out_put_path)
249            # write out the new sheet
250            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
251
252            writer.close()
253        else:
254            df.to_excel(
255                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
256            )
257
258        if write_metadata:
259            self.write_settings(out_put_path, self.gcms, id_label=id_label)

Export the data to an Excel file.

Parameters:

write_mode : str, optional The write mode for the Excel file. Default is 'a' (append). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".

def to_csv( self, separate_output=False, write_mode='w', write_metadata=True, id_label='corems:'):
261    def to_csv(
262        self,
263        separate_output=False,
264        write_mode="w",
265        write_metadata=True,
266        id_label="corems:",
267    ):
268        """Export the data to a CSV file.
269
270        Parameters:
271        ----------
272        separate_output : bool, optional
273            Whether to separate the output into multiple files. Default is False.
274        write_mode : str, optional
275            The write mode for the CSV file. Default is 'w' (write).
276        write_metadata : bool, optional
277            Whether to write metadata to the output file. Default is True.
278        id_label : str, optional
279            The ID label for the data. Default is "corems:".
280        """
281
282        if separate_output:
283            # set write mode to write
284            # this mode will overwrite the file without warning
285            write_mode = "w"
286        else:
287            # set write mode to append
288            write_mode = "a"
289
290        columns = self._init_columns()
291
292        dict_data_list = self.get_list_dict_data(self.gcms)
293
294        out_put_path = self.output_file.with_suffix(".csv")
295
296        write_header = not out_put_path.exists()
297
298        try:
299            with open(out_put_path, write_mode, newline="") as csvfile:
300                writer = csv.DictWriter(csvfile, fieldnames=columns)
301                if write_header:
302                    writer.writeheader()
303                for data in dict_data_list:
304                    writer.writerow(data)
305
306            if write_metadata:
307                self.write_settings(out_put_path, self.gcms, id_label=id_label)
308
309        except IOError as ioerror:
310            print(ioerror)

Export the data to a CSV file.

Parameters:

  • separate_output (bool, optional): Whether to separate the output into multiple files. Default is False.
  • write_mode (str, optional): The write mode for the CSV file. Default is 'w' (write).
  • write_metadata (bool, optional): Whether to write metadata to the output file. Default is True.
  • id_label (str, optional): The ID label for the data. Default is "corems:".
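
Continuing the sketch above (exporter wraps the hypothetical gcms_obj), note that the method overrides write_mode internally: it is forced to "w" when separate_output is True and to "a" otherwise:

# appends rows to my_sample_out.csv if it already exists
# (separate_output=False forces append mode)
exporter.to_csv(separate_output=False, write_metadata=True)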

def to_hdf(self, id_label='corems:'):
312    def to_hdf(self, id_label="corems:"):
313        """Export the data to an HDF5 file.
314
315        Parameters:
316        ----------
317        id_label : str, optional
318            The ID label for the data. Default is "corems:".
319        """
320
321        # save sample at a time
322        def add_compound(gc_peak, compound_obj):
323            modifier = compound_obj.classify if compound_obj.classify else ""
324            compound_group = compound_obj.name.replace("/", "") + " " + modifier
325
326            if compound_group not in peak_group:
327                compound_group = peak_group.create_group(compound_group)
328
329                # compound_group.attrs["retention_time"] = compound_obj.retention_time
330                compound_group.attrs["retention_index"] = compound_obj.ri
331                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
332                compound_group.attrs["spectral_similarity_score"] = (
333                    compound_obj.spectral_similarity_score
334                )
335                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
336
337                compond_mz = compound_group.create_dataset(
338                    "mz", data=np.array(compound_obj.mz), dtype="f8"
339                )
340                compond_abundance = compound_group.create_dataset(
341                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
342                )
343
344                if self.gcms.molecular_search_settings.exploratory_mode:
345                    compound_group.attrs["Spectral Similarities"] = json.dumps(
346                        compound_obj.spectral_similarity_scores,
347                        sort_keys=False,
348                        indent=4,
349                        separators=(",", ":"),
350                    )
351            else:
352                warnings.warn("Skipping duplicate reference compound.")
353
354        import json
355        from datetime import datetime, timezone
356
357        import h5py
358        import numpy as np
359
360        output_path = self.output_file.with_suffix(".hdf5")
361
362        with h5py.File(output_path, "w") as hdf_handle:
363            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
364            hdf_handle.attrs["time_stamp"] = timenow
365            hdf_handle.attrs["data_structure"] = "gcms"
366            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
367            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
368
369            hdf_handle.attrs["sample_id"] = "self.gcms.id"
370            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
371            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
372            hdf_handle.attrs["output_data"] = str(output_path)
373            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
374            hdf_handle.attrs["corems_version"] = __version__
375
376            hdf_handle.attrs["Stats"] = json.dumps(
377                self.get_data_stats(self.gcms),
378                sort_keys=False,
379                indent=4,
380                separators=(",", ": "),
381            )
382            hdf_handle.attrs["Calibration"] = json.dumps(
383                self.get_calibration_stats(self.gcms, id_label),
384                sort_keys=False,
385                indent=4,
386                separators=(",", ": "),
387            )
388            hdf_handle.attrs["Blank"] = json.dumps(
389                self.get_blank_stats(self.gcms),
390                sort_keys=False,
391                indent=4,
392                separators=(",", ": "),
393            )
394
395            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
396            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
397                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
398            )
399
400            scans_dataset = hdf_handle.create_dataset(
401                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
402            )
403            rt_dataset = hdf_handle.create_dataset(
404                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
405            )
406            tic_dataset = hdf_handle.create_dataset(
407                "tic", data=np.array(self.gcms.tic), dtype="f8"
408            )
409            processed_tic_dataset = hdf_handle.create_dataset(
410                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
411            )
412
413            output_score_method = (
414                self.gcms.molecular_search_settings.output_score_method
415            )
416
417            for gc_peak in self.gcms:
418                # print(gc_peak.retention_time)
419                # print(gc_peak.tic)
420
421                # check if there is a compound candidate
422                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
423                peak_group.attrs["deconvolution"] = int(
424                    self.gcms.chromatogram_settings.use_deconvolution
425                )
426
427                peak_group.attrs["start_scan"] = gc_peak.start_scan
428                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
429                peak_group.attrs["final_scan"] = gc_peak.final_scan
430
431                peak_group.attrs["retention_index"] = gc_peak.ri
432                peak_group.attrs["retention_time"] = gc_peak.retention_time
433                peak_group.attrs["area"] = gc_peak.area
434
435                mz = peak_group.create_dataset(
436                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
437                )
438                abundance = peak_group.create_dataset(
439                    "abundance",
440                    data=np.array(gc_peak.mass_spectrum.abundance),
441                    dtype="f8",
442                )
443
444                if gc_peak:
445                    if output_score_method == "highest_sim_score":
446                        compound_obj = gc_peak.highest_score_compound
447                        add_compound(gc_peak, compound_obj)
448
449                    elif output_score_method == "highest_ss":
450                        compound_obj = gc_peak.highest_ss_compound
451                        add_compound(gc_peak, compound_obj)
452
453                    else:
454                        for compound_obj in gc_peak:
455                            add_compound(gc_peak, compound_obj)

Export the data to an HDF5 file.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".
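
The resulting HDF5 file can be inspected with h5py; a minimal read-back sketch reusing the exporter from the earlier sketches (the file name is illustrative):

import json
import h5py

exporter.to_hdf()  # writes my_sample_out.hdf5

with h5py.File("my_sample_out.hdf5", "r") as hdf:
    stats = json.loads(hdf.attrs["Stats"])              # data statistics dictionary
    params = json.loads(hdf.attrs["CoreMSParameters"])  # processing parameters
    scans = hdf["scans"][:]                              # scan numbers dataset
    tic = hdf["tic"][:]                                  # total ion chromatogram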

def get_data_stats(self, gcms):
457    def get_data_stats(self, gcms):
458        """Get statistics about the GCMS data.
459
460        Parameters:
461        ----------
462        gcms : object
463            The low resolution GCMS object.
464
465        Returns:
466        -------
467        dict
468            A dictionary containing the data statistics.
469        """
470
471        matched_peaks = gcms.matched_peaks
472        no_matched_peaks = gcms.no_matched_peaks
473        unique_metabolites = gcms.unique_metabolites
474
475        peak_matchs_above_0p85 = 0
476        unique_peak_match_above_0p85 = 0
477        for match_peak in matched_peaks:
478            gc_peak_above_85 = 0
479            matches_above_85 = list(
480                filter(lambda m: m.similarity_score >= 0.85, match_peak)
481            )
482            if matches_above_85:
483                peak_matchs_above_0p85 += 1
484            if len(matches_above_85) == 1:
485                unique_peak_match_above_0p85 += 1
486
487        data_stats = {}
488        data_stats["average_signal_noise"] = "ni"
489        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
490        data_stats["total_number_peaks"] = len(gcms)
491        data_stats["total_peaks_matched"] = len(matched_peaks)
492        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
493        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
494        data_stats["single_matches_above_similarity_score_0.85"] = (
495            unique_peak_match_above_0p85
496        )
497        data_stats["unique_metabolites"] = len(unique_metabolites)
498
499        return data_stats

Get statistics about the GCMS data.

Parameters:

  • gcms (object): The low resolution GCMS object.

Returns:

  • dict: A dictionary containing the data statistics.
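
Continuing the earlier sketch (exporter wraps the hypothetical gcms_obj), the returned dictionary can be queried directly:

stats = exporter.get_data_stats(gcms_obj)
print(stats["total_number_peaks"], stats["total_peaks_matched"])
print(stats["unique_metabolites"])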

def get_calibration_stats(self, gcms, id_label):
501    def get_calibration_stats(self, gcms, id_label):
502        """Get statistics about the GC-MS calibration.
503
504        Parameters:
505        ----------
506        """
507        calibration_parameters = {}
508
509        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
510        calibration_parameters["data_url"] = str(gcms.cal_file_path)
511        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
512        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
513        calibration_parameters["calibration_method"] = ""
514
515        return calibration_parameters

Get statistics about the GC-MS calibration.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • id_label (str): The ID label for the data.

def get_blank_stats(self, gcms):
517    def get_blank_stats(self, gcms):
518        """Get statistics about the GC-MS blank."""
519        blank_parameters = {}
520
521        blank_parameters["data_name"] = "ni"
522        blank_parameters["blank_id"] = "ni"
523        blank_parameters["data_url"] = "ni"
524        blank_parameters["has_input"] = "ni"
525        blank_parameters["common_features_to_blank"] = "ni"
526
527        return blank_parameters

Get statistics about the GC-MS blank.

def get_instrument_metadata(self, gcms):
529    def get_instrument_metadata(self, gcms):
530        """Get metadata about the GC-MS instrument."""
531        instrument_metadata = {}
532
533        instrument_metadata["analyzer"] = gcms.analyzer
534        instrument_metadata["instrument_label"] = gcms.instrument_label
535        instrument_metadata["instrument_id"] = uuid.uuid4().hex
536
537        return instrument_metadata

Get metadata about the GC-MS instrument.

def get_data_metadata(self, gcms, id_label, output_path):
539    def get_data_metadata(self, gcms, id_label, output_path):
540        """Get metadata about the GC-MS data.
541
542        Parameters:
543        ----------
544        gcms : object
545            The low resolution GCMS object.
546        id_label : str
547            The ID label for the data.
548        output_path : str
549            The output file path.
550
551        Returns:
552        -------
553        dict
554            A dictionary containing the data metadata.
555        """
556        if isinstance(output_path, str):
557            output_path = Path(output_path)
558
559        paramaters_path = output_path.with_suffix(".json")
560
561        if paramaters_path.exists():
562            with paramaters_path.open() as current_param:
563                metadata = json.load(current_param)
564                data_metadata = metadata.get("Data")
565        else:
566            data_metadata = {}
567            data_metadata["data_name"] = []
568            data_metadata["input_data_url"] = []
569            data_metadata["has_input"] = []
570
571        data_metadata["data_name"].append(gcms.sample_name)
572        data_metadata["input_data_url"].append(str(gcms.file_location))
573        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
574
575        data_metadata["output_data_name"] = str(output_path.stem)
576        data_metadata["output_data_url"] = str(output_path)
577        data_metadata["has_output"] = id_label + corems_md5(output_path)
578
579        return data_metadata

Get metadata about the GC-MS data.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • id_label (str): The ID label for the data.
  • output_path (str): The output file path.

Returns:

  • dict: A dictionary containing the data metadata.

def get_parameters_json(self, gcms, id_label, output_path):
581    def get_parameters_json(self, gcms, id_label, output_path):
582        """Get the parameters as a JSON string.
583
584        Parameters:
585        ----------
586        gcms : GCMS object
587            The low resolution GCMS object.
588        id_label : str
589            The ID label for the data.
590        output_path : str
591            The output file path.
592
593        Returns:
594        -------
595        str
596            The parameters as a JSON string.
597        """
598
599        output_parameters_dict = {}
600        output_parameters_dict["Data"] = self.get_data_metadata(
601            gcms, id_label, output_path
602        )
603        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
604        output_parameters_dict["Calibration"] = self.get_calibration_stats(
605            gcms, id_label
606        )
607        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
608        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
609        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
610        corems_dict_setting["corems_version"] = __version__
611        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
612        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
613        output = json.dumps(
614            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
615        )
616
617        return output

Get the parameters as a JSON string.

Parameters:

  • gcms (GCMS object): The low resolution GCMS object.
  • id_label (str): The ID label for the data.
  • output_path (str): The output file path.

Returns:

  • str: The parameters as a JSON string.

def write_settings(self, output_path, gcms, id_label='emsl:'):
619    def write_settings(self, output_path, gcms, id_label="emsl:"):
620        """Write the settings to a JSON file.
621
622        Parameters:
623        ----------
624        output_path : str
625            The output file path.
626        gcms : GCMS object
627            The low resolution GCMS object.
628        id_label : str
629            The ID label for the data. Default is "emsl:".
630
631        """
632
633        output = self.get_parameters_json(gcms, id_label, output_path)
634
635        with open(
636            output_path.with_suffix(".json"),
637            "w",
638            encoding="utf8",
639        ) as outfile:
640            outfile.write(output)

Write the settings to a JSON file.

Parameters:

  • output_path (str): The output file path.
  • gcms (GCMS object): The low resolution GCMS object.
  • id_label (str, optional): The ID label for the data. Default is "emsl:".
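
Because corems_md5 is applied to the output path when building the metadata, write_settings is normally called right after an export so that the output file already exists; a brief sketch reusing the earlier exporter and hypothetical gcms_obj:

exporter.to_csv(write_metadata=False)

# writes my_sample_out.json with Data, Stats, Calibration, Blank, Instrument
# and CoreMSParameters sections
exporter.write_settings(exporter.output_file.with_suffix(".csv"), gcms_obj, id_label="corems:")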

def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
642    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
643        """Get the exported data as a list of dictionaries.
644
645        Parameters:
646        ----------
647        gcms : object
648            The low resolution GCMS object.
649        include_no_match : bool, optional
650            Whether to include no match data. Default is True.
651        no_match_inline : bool, optional
652            Whether to include no match data inline. Default is False.
653
654        Returns:
655        -------
656        list
657            The exported data as a list of dictionaries.
658        """
659
660        output_score_method = gcms.molecular_search_settings.output_score_method
661
662        dict_data_list = []
663
664        def add_match_dict_data():
665            derivatization = "{}:{}:{}".format(
666                compound_obj.classify,
667                compound_obj.derivativenum,
668                compound_obj.derivatization,
669            )
670            out_dict = {
671                "Sample name": gcms.sample_name,
672                "Peak Index": gcpeak_index,
673                "Retention Time": gc_peak.retention_time,
674                "Retention Time Ref": compound_obj.retention_time,
675                "Peak Height": gc_peak.tic,
676                "Peak Area": gc_peak.area,
677                "Retention index": gc_peak.ri,
678                "Retention index Ref": compound_obj.ri,
679                "Retention Index Score": compound_obj.ri_score,
680                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
681                "Similarity Score": compound_obj.similarity_score,
682                "Compound Name": compound_obj.name,
683                "Chebi ID": compound_obj.metadata.chebi,
684                "Kegg Compound ID": compound_obj.metadata.kegg,
685                "Inchi": compound_obj.metadata.inchi,
686                "Inchi Key": compound_obj.metadata.inchikey,
687                "Smiles": compound_obj.metadata.smiles,
688                "Molecular Formula": compound_obj.formula,
689                "IUPAC Name": compound_obj.metadata.iupac_name,
690                "Traditional Name": compound_obj.metadata.traditional_name,
691                "Common Name": compound_obj.metadata.common_name,
692                "Derivatization": derivatization,
693            }
694
695            if self.gcms.molecular_search_settings.exploratory_mode:
696                out_dict.update(
697                    {
698                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
699                            "weighted_cosine_correlation"
700                        ),
701                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
702                            "cosine_correlation"
703                        ),
704                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
705                            "stein_scott_similarity"
706                        ),
707                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
708                            "pearson_correlation"
709                        ),
710                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
711                            "spearman_correlation"
712                        ),
713                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
714                            "kendall_tau_correlation"
715                        ),
716                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
717                            "dft_correlation"
718                        ),
719                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
720                            "dwt_correlation"
721                        ),
722                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
723                            "euclidean_distance"
724                        ),
725                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
726                            "manhattan_distance"
727                        ),
728                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
729                            "jaccard_distance"
730                        ),
731                    }
732                )
733                for method in methods_name:
734                    out_dict[methods_name.get(method)] = (
735                        compound_obj.spectral_similarity_scores.get(method)
736                    )
737
738            dict_data_list.append(out_dict)
739
740        def add_no_match_dict_data():
741            dict_data_list.append(
742                {
743                    "Sample name": gcms.sample_name,
744                    "Peak Index": gcpeak_index,
745                    "Retention Time": gc_peak.retention_time,
746                    "Peak Height": gc_peak.tic,
747                    "Peak Area": gc_peak.area,
748                    "Retention index": gc_peak.ri,
749                }
750            )
751
752        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
753            # check if there is a compound candidate
754            if gc_peak:
755                if output_score_method == "highest_sim_score":
756                    compound_obj = gc_peak.highest_score_compound
757                    add_match_dict_data()
758
759                elif output_score_method == "highest_ss":
760                    compound_obj = gc_peak.highest_ss_compound
761                    add_match_dict_data()
762
763                else:
764                    for compound_obj in gc_peak:
765                        add_match_dict_data()  # add monoisotopic peak
766
767            else:
768                # include not_match
769                if include_no_match and no_match_inline:
770                    add_no_match_dict_data()
771
772        if include_no_match and not no_match_inline:
773            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
774                if not gc_peak:
775                    add_no_match_dict_data()
776
777        return dict_data_list

Get the exported data as a list of dictionaries.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • include_no_match (bool, optional): Whether to include no-match data. Default is True.
  • no_match_inline (bool, optional): Whether to include no-match data inline. Default is False.

Returns:

  • list: The exported data as a list of dictionaries.
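
The row dictionaries can also be consumed directly, for example to build a DataFrame without writing a file (reusing the earlier exporter and hypothetical gcms_obj):

from pandas import DataFrame

rows = exporter.get_list_dict_data(gcms_obj, include_no_match=True, no_match_inline=False)
df = DataFrame(rows)
print(df[["Sample name", "Retention Time", "Peak Area"]].head())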

class HighResMassSpectraExport(corems.mass_spectrum.output.export.HighResMassSpecExport):
 780 class HighResMassSpectraExport(HighResMassSpecExport):
 781    """A class to export high resolution mass spectra data.
 782
 783    This class provides methods to export high resolution mass spectra data to various formats
 784    such as Excel, CSV, HDF5, and Pandas DataFrame.
 785
 786    Parameters
 787    ----------
 788    out_file_path : str | Path
 789        The output file path.
 790    mass_spectra : object
 791        The high resolution mass spectra object.
 792    output_type : str, optional
 793        The output type. Default is 'excel'.
 794
 795    Attributes
 796    ----------
 797    output_file : Path
 798        The output file path without suffix
 799    dir_loc : Path
 800        The directory location for the output file,
 801        by default this will be the output_file + ".corems" and all output files will be
 802        written into this location
 803    mass_spectra : MassSpectraBase
 804        The high resolution mass spectra object.
 805    """
 806
 807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 808        super().__init__(
 809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 810        )
 811
 812        self.dir_loc = Path(out_file_path + ".corems")
 813        self.dir_loc.mkdir(exist_ok=True)
 814        # Place the output file in the directory
 815        self.output_file = self.dir_loc / Path(out_file_path).name
 816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 817        self.mass_spectra = mass_spectra
 818        self.atoms_order_list = None
 819        self._init_columns()
 820
 821    def get_pandas_df(self):
 822        """Get the mass spectra as a list of Pandas DataFrames."""
 823
 824        list_df = []
 825
 826        for mass_spectrum in self.mass_spectra:
 827            columns = self.columns_label + self.get_all_used_atoms_in_order(
 828                mass_spectrum
 829            )
 830
 831            dict_data_list = self.get_list_dict_data(mass_spectrum)
 832
 833            df = DataFrame(dict_data_list, columns=columns)
 834
 835            scan_number = mass_spectrum.scan_number
 836
 837            df.name = str(self.output_file) + "_" + str(scan_number)
 838
 839            list_df.append(df)
 840
 841        return list_df
 842
 843    def to_pandas(self, write_metadata=True):
 844        """Export the data to a Pandas DataFrame and save it as a pickle file.
 845
 846        Parameters:
 847        ----------
 848        write_metadata : bool, optional
 849            Whether to write metadata to the output file. Default is True.
 850        """
 851
 852        for mass_spectrum in self.mass_spectra:
 853            columns = self.columns_label + self.get_all_used_atoms_in_order(
 854                mass_spectrum
 855            )
 856
 857            dict_data_list = self.get_list_dict_data(mass_spectrum)
 858
 859            df = DataFrame(dict_data_list, columns=columns)
 860
 861            scan_number = mass_spectrum.scan_number
 862
 863            out_filename = Path(
 864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 865            )
 866
 867            df.to_pickle(self.dir_loc / out_filename)
 868
 869            if write_metadata:
 870                self.write_settings(
 871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 872                )
 873
 874    def to_excel(self, write_metadata=True):
 875        """Export the data to an Excel file.
 876
 877        Parameters:
 878        ----------
 879        write_metadata : bool, optional
 880            Whether to write metadata to the output file. Default is True.
 881        """
 882        for mass_spectrum in self.mass_spectra:
 883            columns = self.columns_label + self.get_all_used_atoms_in_order(
 884                mass_spectrum
 885            )
 886
 887            dict_data_list = self.get_list_dict_data(mass_spectrum)
 888
 889            df = DataFrame(dict_data_list, columns=columns)
 890
 891            scan_number = mass_spectrum.scan_number
 892
 893            out_filename = Path(
 894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 895            )
 896
 897            df.to_excel(self.dir_loc / out_filename)
 898
 899            if write_metadata:
 900                self.write_settings(
 901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 902                )
 903
 904    def to_csv(self, write_metadata=True):
 905        """Export the data to a CSV file.
 906
 907        Parameters:
 908        ----------
 909        write_metadata : bool, optional
 910            Whether to write metadata to the output file. Default is True.
 911        """
 912        import csv
 913
 914        for mass_spectrum in self.mass_spectra:
 915            columns = self.columns_label + self.get_all_used_atoms_in_order(
 916                mass_spectrum
 917            )
 918
 919            scan_number = mass_spectrum.scan_number
 920
 921            dict_data_list = self.get_list_dict_data(mass_spectrum)
 922
 923            out_filename = Path(
 924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 925            )
 926
 927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 928                writer = csv.DictWriter(csvfile, fieldnames=columns)
 929                writer.writeheader()
 930                for data in dict_data_list:
 931                    writer.writerow(data)
 932
 933            if write_metadata:
 934                self.write_settings(
 935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 936                )
 937
 938    def get_mass_spectra_attrs(self):
 939        """Get the mass spectra attributes as a JSON string.
 940
 941        Parameters:
 942        ----------
 943        mass_spectra : object
 944            The high resolution mass spectra object.
 945
 946        Returns:
 947        -------
 948        str
 949            The mass spectra attributes as a JSON string.
 950        """
 951        dict_ms_attrs = {}
 952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 955
 956        return json.dumps(
 957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 958        )
 959
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

A class to export high resolution mass spectra data.

This class provides methods to export high resolution mass spectra data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters
  • out_file_path (str | Path): The output file path.
  • mass_spectra (object): The high resolution mass spectra object.
  • output_type (str, optional): The output type. Default is 'excel'.
Attributes
  • output_file (Path): The output file path without suffix
  • dir_loc (Path): The directory location for the output file, by default this will be the output_file + ".corems" and all output files will be written into this location
  • mass_spectra (MassSpectraBase): The high resolution mass spectra object.
HighResMassSpectraExport(out_file_path, mass_spectra, output_type='excel')
807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
808        super().__init__(
809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
810        )
811
812        self.dir_loc = Path(out_file_path + ".corems")
813        self.dir_loc.mkdir(exist_ok=True)
814        # Place the output file in the directory
815        self.output_file = self.dir_loc / Path(out_file_path).name
816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
817        self.mass_spectra = mass_spectra
818        self.atoms_order_list = None
819        self._init_columns()

This constructor should be called with the same arguments as the class: out_file_path is the output file path (str), mass_spectra is the high resolution mass spectra object, and output_type selects the export format ('excel', 'csv', 'pandas' or 'hdf5'; default 'excel').

A directory named out_file_path + ".corems" is created and all output files are written into it; the instance attributes below are initialized accordingly.

dir_loc
output_file
mass_spectra
atoms_order_list
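
A minimal construction sketch (mass_spectra_obj stands for an existing high resolution mass spectra object; the path and variable names are illustrative):

from corems.mass_spectra.output.export import HighResMassSpectraExport

ms_exporter = HighResMassSpectraExport("my_spectra_out", mass_spectra_obj, output_type="csv")
# one CSV per scan is written into the my_spectra_out.corems/ directory
ms_exporter.to_csv(write_metadata=True)
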
def get_pandas_df(self):
821    def get_pandas_df(self):
822        """Get the mass spectra as a list of Pandas DataFrames."""
823
824        list_df = []
825
826        for mass_spectrum in self.mass_spectra:
827            columns = self.columns_label + self.get_all_used_atoms_in_order(
828                mass_spectrum
829            )
830
831            dict_data_list = self.get_list_dict_data(mass_spectrum)
832
833            df = DataFrame(dict_data_list, columns=columns)
834
835            scan_number = mass_spectrum.scan_number
836
837            df.name = str(self.output_file) + "_" + str(scan_number)
838
839            list_df.append(df)
840
841        return list_df

Get the mass spectra as a list of Pandas DataFrames.
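
Each DataFrame in the returned list corresponds to one scan; a brief sketch reusing ms_exporter from the construction sketch above:

for df in ms_exporter.get_pandas_df():
    # df.name encodes the output path and the scan number
    print(df.name, len(df))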

def to_pandas(self, write_metadata=True):
843    def to_pandas(self, write_metadata=True):
844        """Export the data to a Pandas DataFrame and save it as a pickle file.
845
846        Parameters:
847        ----------
848        write_metadata : bool, optional
849            Whether to write metadata to the output file. Default is True.
850        """
851
852        for mass_spectrum in self.mass_spectra:
853            columns = self.columns_label + self.get_all_used_atoms_in_order(
854                mass_spectrum
855            )
856
857            dict_data_list = self.get_list_dict_data(mass_spectrum)
858
859            df = DataFrame(dict_data_list, columns=columns)
860
861            scan_number = mass_spectrum.scan_number
862
863            out_filename = Path(
864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
865            )
866
867            df.to_pickle(self.dir_loc / out_filename)
868
869            if write_metadata:
870                self.write_settings(
871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
872                )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_excel(self, write_metadata=True):
874    def to_excel(self, write_metadata=True):
875        """Export the data to an Excel file.
876
877        Parameters:
878        ----------
879        write_metadata : bool, optional
880            Whether to write metadata to the output file. Default is True.
881        """
882        for mass_spectrum in self.mass_spectra:
883            columns = self.columns_label + self.get_all_used_atoms_in_order(
884                mass_spectrum
885            )
886
887            dict_data_list = self.get_list_dict_data(mass_spectrum)
888
889            df = DataFrame(dict_data_list, columns=columns)
890
891            scan_number = mass_spectrum.scan_number
892
893            out_filename = Path(
894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
895            )
896
897            df.to_excel(self.dir_loc / out_filename)
898
899            if write_metadata:
900                self.write_settings(
901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
902                )

Export the data to an Excel file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_csv(self, write_metadata=True):
904    def to_csv(self, write_metadata=True):
905        """Export the data to a CSV file.
906
907        Parameters:
908        ----------
909        write_metadata : bool, optional
910            Whether to write metadata to the output file. Default is True.
911        """
912        import csv
913
914        for mass_spectrum in self.mass_spectra:
915            columns = self.columns_label + self.get_all_used_atoms_in_order(
916                mass_spectrum
917            )
918
919            scan_number = mass_spectrum.scan_number
920
921            dict_data_list = self.get_list_dict_data(mass_spectrum)
922
923            out_filename = Path(
924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
925            )
926
927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
928                writer = csv.DictWriter(csvfile, fieldnames=columns)
929                writer.writeheader()
930                for data in dict_data_list:
931                    writer.writerow(data)
932
933            if write_metadata:
934                self.write_settings(
935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
936                )

Export the data to a CSV file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def get_mass_spectra_attrs(self):
938    def get_mass_spectra_attrs(self):
939        """Get the mass spectra attributes as a JSON string.
940
941        Parameters:
942        ----------
943        mass_spectra : object
944            The high resolution mass spectra object.
945
946        Returns:
947        -------
948        str
949            The mass spectra attributes as a JSON string.
950        """
951        dict_ms_attrs = {}
952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
955
956        return json.dumps(
957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
958        )

Get the mass spectra attributes as a JSON string.

Parameters:

  • mass_spectra (object): The high resolution mass spectra object.

Returns:

  • str: The mass spectra attributes as a JSON string.
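
The returned JSON string can be parsed back into a plain dictionary:

import json

attrs = json.loads(ms_exporter.get_mass_spectra_attrs())
print(attrs["analyzer"], attrs["sample_name"])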

def to_hdf(self, overwrite=False, export_raw=True):
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

Export the data to an HDF5 file.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • export_raw (bool, optional): Whether to export the raw mass spectra data. Default is True.
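
The resulting file stores one group per scan under "mass_spectra"; a read-back sketch with h5py, reusing ms_exporter from the earlier sketch:

import h5py

ms_exporter.to_hdf(overwrite=True)

with h5py.File(ms_exporter.output_file.with_suffix(".hdf5"), "r") as hdf:
    print(hdf.attrs["sample_name"], hdf.attrs["polarity"])
    print(list(hdf["mass_spectra"].keys()))  # one group key per scan number
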
class LCMSExport(HighResMassSpectraExport):
1009 class LCMSExport(HighResMassSpectraExport):
1010    """A class to export high resolution LC-MS data.
1011
1012    This class provides methods to export high resolution LC-MS data to HDF5.
1013
1014    Parameters
1015    ----------
1016    out_file_path : str | Path
1017        The output file path, do not include the file extension.
1018    lcms_object : LCMSBase
1019        The high resolution lc-ms object.
1020    """
1021
1022    def __init__(self, out_file_path, mass_spectra):
1023        super().__init__(out_file_path, mass_spectra, output_type="hdf5")
1024
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Write the mass spectra data to the hdf5 file
1047        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1048
1049        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1050        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1051            # Add scan_info to hdf5 file
1052            if "scan_info" not in hdf_handle:
1053                scan_info_group = hdf_handle.create_group("scan_info")
1054                for k, v in self.mass_spectra._scan_info.items():
1055                    array = np.array(list(v.values()))
1056                    if array.dtype.str[0:2] == "<U":
1057                        array = array.astype("S")
1058                    scan_info_group.create_dataset(k, data=array)
1059
1060            # Add ms_unprocessed to hdf5 file
1061            export_unprocessed_ms1 = (
1062                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1063            )
1064            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1065                if "ms_unprocessed" not in hdf_handle:
1066                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1067                else:
1068                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1069                for k, v in self.mass_spectra._ms_unprocessed.items():
1070                    array = np.array(v)
1071                    ms_unprocessed_group.create_dataset(str(k), data=array)
1072
1073            # Add LCMS mass features to hdf5 file
1074            if len(self.mass_spectra.mass_features) > 0:
1075                if "mass_features" not in hdf_handle:
1076                    mass_features_group = hdf_handle.create_group("mass_features")
1077                else:
1078                    mass_features_group = hdf_handle.get("mass_features")
1079
1080                # Create group for each mass feature, with key as the mass feature id
1081                for k, v in self.mass_spectra.mass_features.items():
1082                    mass_features_group.create_group(str(k))
1083                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1084                    for k2, v2 in v.__dict__.items():
1085                        if v2 is not None:
1086                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1087                            if k2 not in [
1088                                "chromatogram_parent",
1089                                "ms2_mass_spectra",
1090                                "mass_spectrum",
1091                                "_eic_data",
1092                                "ms2_similarity_results",
1093                            ]:
1094                                if k2 == "ms2_scan_numbers":
1095                                    array = np.array(v2)
1096                                    mass_features_group[str(k)].create_dataset(
1097                                        str(k2), data=array
1098                                    )
1099                                elif k2 == "_half_height_width":
1100                                    array = np.array(v2)
1101                                    mass_features_group[str(k)].create_dataset(
1102                                        str(k2), data=array
1103                                    )
1104                                elif k2 == "_ms_deconvoluted_idx":
1105                                    array = np.array(v2)
1106                                    mass_features_group[str(k)].create_dataset(
1107                                        str(k2), data=array
1108                                    )
1109                                elif k2 == "associated_mass_features_deconvoluted":
1110                                    array = np.array(v2)
1111                                    mass_features_group[str(k)].create_dataset(
1112                                        str(k2), data=array
1113                                    )
1114                                elif (
1115                                    isinstance(v2, int)
1116                                    or isinstance(v2, float)
1117                                    or isinstance(v2, str)
1118                                    or isinstance(v2, np.integer)
1119                                    or isinstance(v2, np.bool_)
1120                                ):
1121                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1122                                else:
1123                                    raise TypeError(
1124                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1125                                    )
1126
1127            # Add EIC data to hdf5 file
1128            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1129            if len(self.mass_spectra.eics) > 0 and export_eics:
1130                if "eics" not in hdf_handle:
1131                    eic_group = hdf_handle.create_group("eics")
1132                else:
1133                    eic_group = hdf_handle.get("eics")
1134
1135                # Create group for each eic
1136                for k, v in self.mass_spectra.eics.items():
1137                    eic_group.create_group(str(k))
1138                    eic_group[str(k)].attrs["mz"] = k
1139                    # Loop through each of the attributes and add them as datasets (if array)
1140                    for k2, v2 in v.__dict__.items():
1141                        if v2 is not None:
1142                            array = np.array(v2)
1143                            eic_group[str(k)].create_dataset(str(k2), data=array)
1144
1145            # Add ms2_search results to hdf5 file
1146            if len(self.mass_spectra.spectral_search_results) > 0:
1147                if "spectral_search_results" not in hdf_handle:
1148                    spectral_search_results = hdf_handle.create_group(
1149                        "spectral_search_results"
1150                    )
1151                else:
1152                    spectral_search_results = hdf_handle.get("spectral_search_results")
1153                # Create group for each search result by ms2_scan / precursor_mz
1154                for k, v in self.mass_spectra.spectral_search_results.items():
1155                    spectral_search_results.create_group(str(k))
1156                    for k2, v2 in v.items():
1157                        spectral_search_results[str(k)].create_group(str(k2))
1158                        spectral_search_results[str(k)][str(k2)].attrs[
1159                            "precursor_mz"
1160                        ] = v2.precursor_mz
1161                        spectral_search_results[str(k)][str(k2)].attrs[
1162                            "query_spectrum_id"
1163                        ] = v2.query_spectrum_id
1164                        # Loop through each of the attributes and add them as datasets (if array)
1165                        for k3, v3 in v2.__dict__.items():
1166                            if v3 is not None and k3 not in [
1167                                "query_spectrum",
1168                                "precursor_mz",
1169                                "query_spectrum_id",
1170                            ]:
1171                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1172                                    v3 = [", ".join(x) for x in v3]
1173                                array = np.array(v3)
1174                                if array.dtype.str[0:2] == "<U":
1175                                    array = array.astype("S")
1176                                spectral_search_results[str(k)][str(k2)].create_dataset(
1177                                    str(k3), data=array
1178                                )
1179
1180        # Save parameters as separate json
1181        if save_parameters:
1182            # Check if parameter_format is valid
1183            if parameter_format not in ["json", "toml"]:
1184                raise ValueError("parameter_format must be 'json' or 'toml'")
1185
1186            if parameter_format == "json":
1187                dump_lcms_settings_json(
1188                    filename=self.output_file.with_suffix(".json"),
1189                    lcms_obj=self.mass_spectra,
1190                )
1191            elif parameter_format == "toml":
1192                dump_lcms_settings_toml(
1193                    filename=self.output_file.with_suffix(".toml"),
1194                    lcms_obj=self.mass_spectra,
1195                )

A class to export high resolution LC-MS data.

This class provides methods to export high resolution LC-MS data to HDF5.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • lcms_object (LCMSBase): The high resolution lc-ms object.
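
A usage sketch (lcms_obj stands for a processed LCMSBase object; the path and variable names are illustrative):

from corems.mass_spectra.output.export import LCMSExport

lcms_exporter = LCMSExport("my_lcms_out", lcms_obj)
# writes my_lcms_out.corems/my_lcms_out.hdf5 and the settings as a TOML file
lcms_exporter.to_hdf(overwrite=True, save_parameters=True, parameter_format="toml")
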
LCMSExport(out_file_path, mass_spectra)
1022    def __init__(self, out_file_path, mass_spectra):
1023        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

This constructor should be called with out_file_path, the output file path without the file extension, and mass_spectra, the high resolution LC-MS object; the output type is fixed to 'hdf5'.

def to_hdf(self, overwrite=False, save_parameters=True, parameter_format='toml'):
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Write the mass spectra data to the hdf5 file
1047        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1048
1049        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1050        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1051            # Add scan_info to hdf5 file
1052            if "scan_info" not in hdf_handle:
1053                scan_info_group = hdf_handle.create_group("scan_info")
1054                for k, v in self.mass_spectra._scan_info.items():
1055                    array = np.array(list(v.values()))
1056                    if array.dtype.str[0:2] == "<U":
1057                        array = array.astype("S")
1058                    scan_info_group.create_dataset(k, data=array)
1059
1060            # Add ms_unprocessed to hdf5 file
1061            export_unprocessed_ms1 = (
1062                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1063            )
1064            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1065                if "ms_unprocessed" not in hdf_handle:
1066                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1067                else:
1068                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1069                for k, v in self.mass_spectra._ms_unprocessed.items():
1070                    array = np.array(v)
1071                    ms_unprocessed_group.create_dataset(str(k), data=array)
1072
1073            # Add LCMS mass features to hdf5 file
1074            if len(self.mass_spectra.mass_features) > 0:
1075                if "mass_features" not in hdf_handle:
1076                    mass_features_group = hdf_handle.create_group("mass_features")
1077                else:
1078                    mass_features_group = hdf_handle.get("mass_features")
1079
1080                # Create group for each mass feature, with key as the mass feature id
1081                for k, v in self.mass_spectra.mass_features.items():
1082                    mass_features_group.create_group(str(k))
1083                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1084                    for k2, v2 in v.__dict__.items():
1085                        if v2 is not None:
1086                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1087                            if k2 not in [
1088                                "chromatogram_parent",
1089                                "ms2_mass_spectra",
1090                                "mass_spectrum",
1091                                "_eic_data",
1092                                "ms2_similarity_results",
1093                            ]:
1094                                if k2 == "ms2_scan_numbers":
1095                                    array = np.array(v2)
1096                                    mass_features_group[str(k)].create_dataset(
1097                                        str(k2), data=array
1098                                    )
1099                                elif k2 == "_half_height_width":
1100                                    array = np.array(v2)
1101                                    mass_features_group[str(k)].create_dataset(
1102                                        str(k2), data=array
1103                                    )
1104                                elif k2 == "_ms_deconvoluted_idx":
1105                                    array = np.array(v2)
1106                                    mass_features_group[str(k)].create_dataset(
1107                                        str(k2), data=array
1108                                    )
1109                                elif k2 == "associated_mass_features_deconvoluted":
1110                                    array = np.array(v2)
1111                                    mass_features_group[str(k)].create_dataset(
1112                                        str(k2), data=array
1113                                    )
1114                                elif (
1115                                    isinstance(v2, int)
1116                                    or isinstance(v2, float)
1117                                    or isinstance(v2, str)
1118                                    or isinstance(v2, np.integer)
1119                                    or isinstance(v2, np.bool_)
1120                                ):
1121                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1122                                else:
1123                                    raise TypeError(
1124                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1125                                    )
1126
1127            # Add EIC data to hdf5 file
1128            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1129            if len(self.mass_spectra.eics) > 0 and export_eics:
1130                if "eics" not in hdf_handle:
1131                    eic_group = hdf_handle.create_group("eics")
1132                else:
1133                    eic_group = hdf_handle.get("eics")
1134
1135                # Create group for each eic
1136                for k, v in self.mass_spectra.eics.items():
1137                    eic_group.create_group(str(k))
1138                    eic_group[str(k)].attrs["mz"] = k
1139                    # Loop through each of the attributes and add them as datasets (if array)
1140                    for k2, v2 in v.__dict__.items():
1141                        if v2 is not None:
1142                            array = np.array(v2)
1143                            eic_group[str(k)].create_dataset(str(k2), data=array)
1144
1145            # Add ms2_search results to hdf5 file
1146            if len(self.mass_spectra.spectral_search_results) > 0:
1147                if "spectral_search_results" not in hdf_handle:
1148                    spectral_search_results = hdf_handle.create_group(
1149                        "spectral_search_results"
1150                    )
1151                else:
1152                    spectral_search_results = hdf_handle.get("spectral_search_results")
1153                # Create group for each search result by ms2_scan / precursor_mz
1154                for k, v in self.mass_spectra.spectral_search_results.items():
1155                    spectral_search_results.create_group(str(k))
1156                    for k2, v2 in v.items():
1157                        spectral_search_results[str(k)].create_group(str(k2))
1158                        spectral_search_results[str(k)][str(k2)].attrs[
1159                            "precursor_mz"
1160                        ] = v2.precursor_mz
1161                        spectral_search_results[str(k)][str(k2)].attrs[
1162                            "query_spectrum_id"
1163                        ] = v2.query_spectrum_id
1164                        # Loop through each of the attributes and add them as datasets (if array)
1165                        for k3, v3 in v2.__dict__.items():
1166                            if v3 is not None and k3 not in [
1167                                "query_spectrum",
1168                                "precursor_mz",
1169                                "query_spectrum_id",
1170                            ]:
1171                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1172                                    v3 = [", ".join(x) for x in v3]
1173                                array = np.array(v3)
1174                                if array.dtype.str[0:2] == "<U":
1175                                    array = array.astype("S")
1176                                spectral_search_results[str(k)][str(k2)].create_dataset(
1177                                    str(k3), data=array
1178                                )
1179
1180        # Save parameters as separate json
1181        if save_parameters:
1182            # Check if parameter_format is valid
1183            if parameter_format not in ["json", "toml"]:
1184                raise ValueError("parameter_format must be 'json' or 'toml'")
1185
1186            if parameter_format == "json":
1187                dump_lcms_settings_json(
1188                    filename=self.output_file.with_suffix(".json"),
1189                    lcms_obj=self.mass_spectra,
1190                )
1191            elif parameter_format == "toml":
1192                dump_lcms_settings_toml(
1193                    filename=self.output_file.with_suffix(".toml"),
1194                    lcms_obj=self.mass_spectra,
1195                )

Export the data to an HDF5 file.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • save_parameters (bool, optional): Whether to save the parameters as a separate json or toml file. Default is True.
  • parameter_format (str, optional): The format to save the parameters in. Default is 'toml'.
Raises
  • ValueError: If parameter_format is not 'json' or 'toml'.
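
A minimal usage sketch (illustrative, not part of the module source), assuming this method belongs to the LCMSExport class shown below and that lcms_obj is an already-processed LC-MS object accepted by its constructor:

    from corems.mass_spectra.output.export import LCMSExport

    # lcms_obj is a hypothetical, already-processed LC-MS object (construction not shown).
    exporter = LCMSExport("out/my_sample", lcms_obj)
    exporter.to_hdf(overwrite=True, save_parameters=True, parameter_format="toml")
    # Expected outputs: out/my_sample.hdf5 plus out/my_sample.toml with the LC-MS settings.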
class LipidomicsExport(LCMSExport):
1198class LipidomicsExport(LCMSExport):
1199    """A class to export lipidomics data.
1200
1201    This class provides methods to export lipidomics data to various formats and summarize the lipid report.
1202
1203    Parameters
1204    ----------
1205    out_file_path : str | Path
1206        The output file path; do not include the file extension.
1207    mass_spectra : object
1208        The high resolution mass spectra object.
1209    """
1210
1211    def __init__(self, out_file_path, mass_spectra):
1212        super().__init__(out_file_path, mass_spectra)
1213        self.ion_type_dict = ion_type_dict
1214
1215    @staticmethod
1216    def get_ion_formula(neutral_formula, ion_type):
1217        """From a neutral formula and an ion type, return the formula of the ion.
1218
1219        Notes
1220        -----
1221        This is a static method.
1222        If the neutral_formula is not a string, this method will return None.
1223
1224        Parameters
1225        ----------
1226        neutral_formula : str
1227            The neutral formula; this should be a string form from the MolecularFormula class
1228            (e.g. 'C2 H4 O2', isotopes OK), or a simple string (e.g. 'C2H4O2', no isotope handling in this case).
1229            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1230            e.g. MgCl2 is parsed as 'Mg Cl2'.
1231        ion_type : str
1232            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1233            See the self.ion_type_dict for the available ion types.
1234
1235        Returns
1236        -------
1237        str
1238            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1239        """
1240        # If neutral_formula is not a string, return None
1241        if not isinstance(neutral_formula, str):
1242            return None
1243
1244        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1245        if re.search(r"\s", neutral_formula):
1246            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1247        else:
1248            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1249            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1250            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1251            neutral_formula = MolecularFormula(
1252                dict(
1253                    zip(
1254                        [x[0] for x in elements],
1255                        [int(x[0]) if x else 1 for x in counts],
1256                    )
1257                ),
1258                ion_charge=0,
1259            )
1260        neutral_formula_dict = neutral_formula.to_dict().copy()
1261
1262        adduct_add_dict = ion_type_dict[ion_type][0]
1263        for key in adduct_add_dict:
1264            if key in neutral_formula_dict.keys():
1265                neutral_formula_dict[key] += adduct_add_dict[key]
1266            else:
1267                neutral_formula_dict[key] = adduct_add_dict[key]
1268
1269        adduct_subtract = ion_type_dict[ion_type][1]
1270        for key in adduct_subtract:
1271            neutral_formula_dict[key] -= adduct_subtract[key]
1272
1273        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1274
1275    @staticmethod
1276    def get_isotope_type(ion_formula):
1277        """From an ion formula, return the 13C isotope type of the ion.
1278
1279        Notes
1280        -----
1281        This is a static method.
1282        If the ion_formula is not a string, this method will return None.
1283        This is currently only functional for 13C isotopes.
1284
1285        Parameters
1286        ----------
1287        ion_formula : str
1288            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1289
1290        Returns
1291        -------
1292        str
1293            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1294
1295        Raises
1296        ------
1297        ValueError
1298            If the ion_formula is a string without spaces (not in the expected 'C2 H4 O2' format).
1299        """
1300        if not isinstance(ion_formula, str):
1301            return None
1302
1303        if re.search(r"\s", ion_formula):
1304            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1305        else:
1306            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1307        ion_formula_dict = ion_formula.to_dict().copy()
1308
1309        try:
1310            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1311        except KeyError:
1312            iso_class = None
1313
1314        return iso_class
1315
1316    def clean_ms1_report(self, ms1_summary_full):
1317        """Clean the MS1 report.
1318
1319        Parameters
1320        ----------
1321        ms1_summary_full : DataFrame
1322            The full MS1 summary DataFrame.
1323
1324        Returns
1325        -------
1326        DataFrame
1327            The cleaned MS1 summary DataFrame.
1328        """
1329        ms1_summary_full = ms1_summary_full.reset_index()
1330        cols_to_keep = [
1331            "mf_id",
1332            "Molecular Formula",
1333            "Ion Type",
1334            "Calculated m/z",
1335            "m/z Error (ppm)",
1336            "m/z Error Score",
1337            "Is Isotopologue",
1338            "Isotopologue Similarity",
1339            "Confidence Score",
1340        ]
1341        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1342        ms1_summary["ion_formula"] = [
1343            self.get_ion_formula(f, a)
1344            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1345        ]
1346        ms1_summary["isotopologue_type"] = [
1347            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1348        ]
1349
1350        # Reorder columns
1351        ms1_summary = ms1_summary[
1352            [
1353                "mf_id",
1354                "ion_formula",
1355                "isotopologue_type",
1356                "Calculated m/z",
1357                "m/z Error (ppm)",
1358                "m/z Error Score",
1359                "Isotopologue Similarity",
1360                "Confidence Score",
1361            ]
1362        ]
1363
1364        # Set the index to mf_id
1365        ms1_summary = ms1_summary.set_index("mf_id")
1366
1367        return ms1_summary
1368
1369    def summarize_lipid_report(self, ms2_annot):
1370        """Summarize the lipid report.
1371
1372        Parameters
1373        ----------
1374        ms2_annot : DataFrame
1375            The MS2 annotation DataFrame with all annotations.
1376
1377        Returns
1378        -------
1379        DataFrame
1380            The summarized lipid report.
1381        """
1382        # Drop unnecessary columns for easier viewing
1383        columns_to_drop = [
1384            "precursor_mz",
1385            "precursor_mz_error_ppm",
1386            "metabref_mol_id",
1387            "metabref_precursor_mz",
1388            "cas",
1389            "inchikey",
1390            "inchi",
1391            "chebi",
1392            "smiles",
1393            "kegg",
1394            "data_id",
1395            "iupac_name",
1396            "traditional_name",
1397            "common_name",
1398            "casno",
1399        ]
1400        ms2_annot = ms2_annot.drop(
1401            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1402        )
1403
1404        # If ion_types_excluded is not empty, remove those ion types
1405        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1406            "ms2"
1407        ].molecular_search.ion_types_excluded
1408        if len(ion_types_excluded) > 0:
1409            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1410
1411        # If mf_id is not present, check that the index name is mf_id and reset the index
1412        if "mf_id" not in ms2_annot.columns:
1413            if ms2_annot.index.name == "mf_id":
1414                ms2_annot = ms2_annot.reset_index()
1415            else:
1416                raise ValueError("mf_id is not present in the dataframe")
1417
1418        # Attempt to get consensus annotations to the MLF level
1419        mlf_results_all = []
1420        for mf_id in ms2_annot["mf_id"].unique():
1421            mlf_results_perid = []
1422            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1423            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)
1424
1425            for query_scan in ms2_annot["query_spectrum_id"].unique():
1426                ms2_annot_sub = ms2_annot_mf[
1427                    ms2_annot_mf["query_spectrum_id"] == query_scan
1428                ].copy()
1429
1430                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1431                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1432                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1433                        ms2_annot_sub["entropy_max"] = (
1434                            ms2_annot_sub["entropy_similarity"]
1435                            == ms2_annot_sub["entropy_similarity"].max()
1436                        )
1437                        ms2_annot_sub["ref_match_fract_max"] = (
1438                            ms2_annot_sub["ref_mz_in_query_fract"]
1439                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1440                        )
1441                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1442                            "query_frag_types"
1443                        ].apply(lambda x: True if "MLF" in x else False)
1444
1445                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1446                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1447                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1448                        ].all(axis=1)
1449
1450                        # If there is a consensus, take the row with the highest entropy_similarity
1451                        if ms2_annot_sub["consensus"].any():
1452                            ms2_annot_sub = ms2_annot_sub[
1453                                ms2_annot_sub["entropy_similarity"]
1454                                == ms2_annot_sub["entropy_similarity"].max()
1455                            ].head(1)
1456                            mlf_results_perid.append(ms2_annot_sub)
1457            if len(mlf_results_perid) == 0:
1458                mlf_results_perid = pd.DataFrame()
1459            else:
1460                mlf_results_perid = pd.concat(mlf_results_perid)
1461                if mlf_results_perid["name"].nunique() == 1:
1462                    mlf_results_perid = mlf_results_perid[
1463                        mlf_results_perid["entropy_similarity"]
1464                        == mlf_results_perid["entropy_similarity"].max()
1465                    ].head(1)
1466                else:
1467                    mlf_results_perid = pd.DataFrame()
1468                mlf_results_all.append(mlf_results_perid)
1469
1470        # These are the consensus annotations to the MLF level
1471        if len(mlf_results_all) > 0:
1472            mlf_results_all = pd.concat(mlf_results_all)
1473            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1474        else:
1475            # Make an empty dataframe
1476            mlf_results_all = ms2_annot.head(0)
1477
1478        # For remaining mf_ids, try to get a consensus annotation to the species level
1479        species_results_all = []
1480        # Remove mf_ids that have consensus annotations to the MLF level
1481        ms2_annot_spec = ms2_annot[
1482            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1483        ]
1484        for mf_id in ms2_annot_spec["mf_id"].unique():
1485            # Do all the hits have the same lipid_summed_name?
1486            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1487            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1488
1489            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1490                # Grab the highest entropy_similarity result
1491                ms2_annot_sub = ms2_annot_sub[
1492                    ms2_annot_sub["entropy_similarity"]
1493                    == ms2_annot_sub["entropy_similarity"].max()
1494                ].head(1)
1495                species_results_all.append(ms2_annot_sub)
1496
1497        # These are the consensus annotations to the species level
1498        if len(species_results_all) > 0:
1499            species_results_all = pd.concat(species_results_all)
1500            species_results_all["annot_level"] = "species"
1501        else:
1502            # Make an empty dataframe
1503            species_results_all = ms2_annot.head(0)
1504
1505        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1506        # Remove mf_ids that have consensus annotations to the species level
1507        ms2_annot_remaining = ms2_annot_spec[
1508            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1509        ]
1510        no_consensus = []
1511        for mf_id in ms2_annot_remaining["mf_id"].unique():
1512            id_sub = []
1513            id_no_con = []
1514            ms2_annot_sub_mf = ms2_annot_remaining[
1515                ms2_annot_remaining["mf_id"] == mf_id
1516            ].copy()
1517            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1518                ms2_annot_sub = ms2_annot_sub_mf[
1519                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1520                ].copy()
1521
1522                # New columns for ranking [HIGHER RANK = BETTER]
1523                ms2_annot_sub["entropy_max"] = (
1524                    ms2_annot_sub["entropy_similarity"]
1525                    == ms2_annot_sub["entropy_similarity"].max()
1526                )
1527                ms2_annot_sub["ref_match_fract_max"] = (
1528                    ms2_annot_sub["ref_mz_in_query_fract"]
1529                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1530                )
1531                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1532                    lambda x: True if "MLF" in x else False
1533                )
1534
1535                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1536                ms2_annot_sub["consensus"] = ms2_annot_sub[
1537                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1538                ].all(axis=1)
1539                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1540                id_sub.append(ms2_annot_sub_con)
1541                id_no_con.append(ms2_annot_sub)
1542            id_sub = pd.concat(id_sub)
1543            id_no_con = pd.concat(id_no_con)
1544
1545            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1546            if (
1547                id_sub["query_frag_types"]
1548                .apply(lambda x: True if "MLF" in x else False)
1549                .all()
1550                and len(id_sub) > 0
1551            ):
1552                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1553                id_sub = id_sub.loc[idx]
1554                # Reorder so highest entropy_similarity is first
1555                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1556                id_sub["annot_level"] = id_sub["structure_level"]
1557                no_consensus.append(id_sub)
1558
1559            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1560            elif len(id_sub) == 0:
1561                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1562                    summed_sub = id_no_con[
1563                        id_no_con["lipid_summed_name"] == lipid_summed_name
1564                    ]
1565                    # Any consensus to MLF?
1566                    if summed_sub["consensus"].any():
1567                        summed_sub = summed_sub[summed_sub["consensus"]]
1568                        summed_sub["annot_level"] = summed_sub["structure_level"]
1569                        no_consensus.append(summed_sub)
1570                    else:
1571                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
1572                        summed_sub = summed_sub[
1573                            summed_sub["entropy_similarity"]
1574                            == summed_sub["entropy_similarity"].max()
1575                        ].head(1)
1576                        # get first row
1577                        summed_sub["annot_level"] = "species"
1578                        summed_sub["name"] = ""
1579                        no_consensus.append(summed_sub)
1580            else:
1581                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)
1582
1583        if len(no_consensus) > 0:
1584            no_consensus = pd.concat(no_consensus)
1585        else:
1586            no_consensus = ms2_annot.head(0)
1587
1588        # Combine all the consensus annotations and reformat the dataframe for output
1589        species_results_all = species_results_all.drop(columns=["name"])
1590        species_results_all["lipid_molecular_species_id"] = ""
1591        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1592        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1593        consensus_annotations = pd.concat(
1594            [mlf_results_all, species_results_all, no_consensus]
1595        )
1596        consensus_annotations = consensus_annotations.sort_values(
1597            "mf_id", ascending=True
1598        )
1599        cols_to_keep = [
1600            "mf_id",
1601            "ref_ion_type",
1602            "entropy_similarity",
1603            "ref_mz_in_query_fract",
1604            "lipid_molecular_species_id",
1605            "lipid_summed_name",
1606            "lipid_subclass",
1607            "lipid_class",
1608            "lipid_category",
1609            "formula",
1610            "annot_level",
1611            "n_spectra_contributing",
1612        ]
1613        consensus_annotations = consensus_annotations[cols_to_keep]
1614        consensus_annotations = consensus_annotations.set_index("mf_id")
1615
1616        return consensus_annotations
1617
1618    def clean_ms2_report(self, lipid_summary):
1619        """Clean the MS2 report.
1620
1621        Parameters
1622        ----------
1623        lipid_summary : DataFrame
1624            The full lipid summary DataFrame.
1625
1626        Returns
1627        -------
1628        DataFrame
1629            The cleaned lipid summary DataFrame.
1630        """
1631        lipid_summary = lipid_summary.reset_index()
1632        lipid_summary["ion_formula"] = [
1633            self.get_ion_formula(f, a)
1634            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
1635        ]
1636
1637        # Reorder columns
1638        lipid_summary = lipid_summary[
1639            [
1640                "mf_id",
1641                "ion_formula",
1642                "ref_ion_type",
1643                "formula",
1644                "annot_level",
1645                "lipid_molecular_species_id",
1646                "lipid_summed_name",
1647                "lipid_subclass",
1648                "lipid_class",
1649                "lipid_category",
1650                "entropy_similarity",
1651                "ref_mz_in_query_fract",
1652                "n_spectra_contributing",
1653            ]
1654        ]
1655
1656        # Set the index to mf_id
1657        lipid_summary = lipid_summary.set_index("mf_id")
1658
1659        return lipid_summary
1660
1661    def to_report(self, molecular_metadata=None):
1662        """Create a report of the mass features and their annotations.
1663
1664        Parameters
1665        ----------
1666        molecular_metadata : dict, optional
1667            The molecular metadata. Default is None.
1668
1669        Returns
1670        -------
1671        DataFrame
1672            The report of the mass features and their annotations.
1673
1674        Notes
1675        -----
1676        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
1677        """
1678        # Get mass feature dataframe
1679        mf_report = self.mass_spectra.mass_features_to_df()
1680        mf_report = mf_report.reset_index(drop=False)
1681
1682        # Get and clean ms1 annotation dataframe
1683        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1684        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1685        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1686
1687        # Get, summarize, and clean ms2 annotation dataframe
1688        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1689            molecular_metadata=molecular_metadata
1690        )
1691        if ms2_annot_report is not None:
1692            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
1693            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1694            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1695            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1696
1697        # Combine the reports
1698        if not ms1_annot_report.empty:
1699            # MS1 has been run and has molecular formula information
1700            mf_report = pd.merge(
1701                mf_report,
1702                ms1_annot_report,
1703                how="left",
1704                on=["mf_id", "isotopologue_type"],
1705            )
1706        if ms2_annot_report is not None:
1707            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1708            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1709            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1710            mf_no_ion_formula = pd.merge(
1711                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1712            )
1713
1714            # pull out the records with ion_formula
1715            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1716            mf_with_ion_formula = pd.merge(
1717                mf_with_ion_formula,
1718                ms2_annot_report,
1719                how="left",
1720                on=["mf_id", "ion_formula"],
1721            )
1722
1723            # put back together
1724            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1725
1726        # Rename columns
1727        rename_dict = {
1728            "mf_id": "Mass Feature ID",
1729            "scan_time": "Retention Time (min)",
1730            "mz": "m/z",
1731            "apex_scan": "Apex Scan Number",
1732            "intensity": "Intensity",
1733            "persistence": "Persistence",
1734            "area": "Area",
1735            "half_height_width": "Half Height Width (min)",
1736            "tailing_factor": "Tailing Factor",
1737            "dispersity_index": "Dispersity Index",
1738            "ms2_spectrum": "MS2 Spectrum",
1739            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1740            "isotopologue_type": "Isotopologue Type",
1741            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1742            "associated_mass_features": "Associated Mass Features after Deconvolution",
1743            "ion_formula": "Ion Formula",
1744            "formula": "Molecular Formula",
1745            "ref_ion_type": "Ion Type",
1746            "annot_level": "Lipid Annotation Level",
1747            "lipid_molecular_species_id": "Lipid Molecular Species",
1748            "lipid_summed_name": "Lipid Species",
1749            "lipid_subclass": "Lipid Subclass",
1750            "lipid_class": "Lipid Class",
1751            "lipid_category": "Lipid Category",
1752            "entropy_similarity": "Entropy Similarity",
1753            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1754            "n_spectra_contributing": "Spectra with Annotation (n)",
1755        }
1756        mf_report = mf_report.rename(columns=rename_dict)
1757        mf_report["Sample Name"] = self.mass_spectra.sample_name
1758        mf_report["Polarity"] = self.mass_spectra.polarity
1759        mf_report = mf_report[
1760            ["Mass Feature ID", "Sample Name", "Polarity"]
1761            + [
1762                col
1763                for col in mf_report.columns
1764                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1765            ]
1766        ]
1767
1768        # Reorder rows by "Mass Feature ID"
1769        mf_report = mf_report.sort_values("Mass Feature ID")
1770
1771        # Reset index
1772        mf_report = mf_report.reset_index(drop=True)
1773
1774        return mf_report
1775
1776    def report_to_csv(self, molecular_metadata=None):
1777        """Create a report of the mass features and their annotations and save it as a CSV file.
1778
1779        Parameters
1780        ----------
1781        molecular_metadata : dict, optional
1782            The molecular metadata. Default is None.
1783        """
1784        report = self.to_report(molecular_metadata=molecular_metadata)
1785        out_file = self.output_file.with_suffix(".csv")
1786        report.to_csv(out_file, index=False)

A class to export lipidomics data.

This class provides methods to export lipidomics data to various formats and summarize the lipid report.

Parameters
  • out_file_path (str | Path): The output file path; do not include the file extension.
  • mass_spectra (object): The high resolution mass spectra object.
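
A brief end-to-end sketch (illustrative; lcms_obj is assumed to be a processed high-resolution mass spectra object and molecular_metadata the metadata dict from the MS2 spectral search, neither of which is constructed here):

    from corems.mass_spectra.output.export import LipidomicsExport

    exporter = LipidomicsExport("out/lipid_sample", lcms_obj)   # no file extension
    exporter.to_hdf(overwrite=True)                             # inherited from LCMSExport
    report = exporter.to_report(molecular_metadata=molecular_metadata)
    exporter.report_to_csv(molecular_metadata=molecular_metadata)  # writes out/lipid_sample.csv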
LipidomicsExport(out_file_path, mass_spectra)
1211    def __init__(self, out_file_path, mass_spectra):
1212        super().__init__(out_file_path, mass_spectra)
1213        self.ion_type_dict = ion_type_dict

ion_type_dict
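
The class-level ion_type_dict (shared with the module-level dictionary of the same name) maps each supported ion type to a pair of dictionaries: atoms to add and atoms to subtract when converting a neutral formula into an ion formula. A quick way to inspect the supported ion types (illustrative):

    from corems.mass_spectra.output.export import ion_type_dict

    # Each value is a two-element list: [atoms to add, atoms to subtract].
    for ion_type, (add_atoms, subtract_atoms) in ion_type_dict.items():
        print(ion_type, add_atoms, subtract_atoms)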
@staticmethod
def get_ion_formula(neutral_formula, ion_type):
1215    @staticmethod
1216    def get_ion_formula(neutral_formula, ion_type):
1217        """From a neutral formula and an ion type, return the formula of the ion.
1218
1219        Notes
1220        -----
1221        This is a static method.
1222        If the neutral_formula is not a string, this method will return None.
1223
1224        Parameters
1225        ----------
1226        neutral_formula : str
1227            The neutral formula; this should be a string form from the MolecularFormula class
1228            (e.g. 'C2 H4 O2', isotopes OK), or a simple string (e.g. 'C2H4O2', no isotope handling in this case).
1229            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1230            e.g. MgCl2 is parsed as 'Mg Cl2'.
1231        ion_type : str
1232            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1233            See the self.ion_type_dict for the available ion types.
1234
1235        Returns
1236        -------
1237        str
1238            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1239        """
1240        # If neutral_formula is not a string, return None
1241        if not isinstance(neutral_formula, str):
1242            return None
1243
1244        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1245        if re.search(r"\s", neutral_formula):
1246            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1247        else:
1248            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1249            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1250            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1251            neutral_formula = MolecularFormula(
1252                dict(
1253                    zip(
1254                        [x[0] for x in elements],
1255                        [int(x[0]) if x else 1 for x in counts],
1256                    )
1257                ),
1258                ion_charge=0,
1259            )
1260        neutral_formula_dict = neutral_formula.to_dict().copy()
1261
1262        adduct_add_dict = ion_type_dict[ion_type][0]
1263        for key in adduct_add_dict:
1264            if key in neutral_formula_dict.keys():
1265                neutral_formula_dict[key] += adduct_add_dict[key]
1266            else:
1267                neutral_formula_dict[key] = adduct_add_dict[key]
1268
1269        adduct_subtract = ion_type_dict[ion_type][1]
1270        for key in adduct_subtract:
1271            neutral_formula_dict[key] -= adduct_subtract[key]
1272
1273        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

From a neutral formula and an ion type, return the formula of the ion.

Notes

This is a static method. If the neutral_formula is not a string, this method will return None.

Parameters
  • neutral_formula (str): The neutral formula; this should be a string form from the MolecularFormula class (e.g. 'C2 H4 O2', isotopes OK), or a simple string (e.g. 'C2H4O2', no isotope handling in this case). In the case of a simple string, the atoms are parsed based on the presence of capital letters, e.g. MgCl2 is parsed as 'Mg Cl2'.
  • ion_type (str): The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. See the self.ion_type_dict for the available ion types.
Returns
  • str: The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
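
A short, hedged example of the two accepted input forms and the None fallback (the exact returned string follows MolecularFormula.string; the value in the first comment is what one would expect for that input, not verified output):

    from corems.mass_spectra.output.export import LipidomicsExport

    # Space-separated formula: passed directly to MolecularFormula (isotopes allowed).
    print(LipidomicsExport.get_ion_formula("C2 H4 O2", "[M+H]+"))   # expected: a protonated formula such as 'C2 H5 O2'

    # Simple formula string: split on capital letters before the adduct is applied.
    print(LipidomicsExport.get_ion_formula("C2H4O2", "protonated"))

    # Non-string input (e.g. NaN from a DataFrame column) returns None.
    print(LipidomicsExport.get_ion_formula(float("nan"), "[M+H]+"))  # None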
@staticmethod
def get_isotope_type(ion_formula):
1275    @staticmethod
1276    def get_isotope_type(ion_formula):
1277        """From an ion formula, return the 13C isotope type of the ion.
1278
1279        Notes
1280        -----
1281        This is a static method.
1282        If the ion_formula is not a string, this method will return None.
1283        This is currently only functional for 13C isotopes.
1284
1285        Parameters
1286        ----------
1287        ion_formula : str
1288            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1289
1290        Returns
1291        -------
1292        str
1293            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1294
1295        Raises
1296        ------
1297        ValueError
1298            If the ion_formula is a string without spaces (not in the expected 'C2 H4 O2' format).
1299        """
1300        if not isinstance(ion_formula, str):
1301            return None
1302
1303        if re.search(r"\s", ion_formula):
1304            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1305        else:
1306            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1307        ion_formula_dict = ion_formula.to_dict().copy()
1308
1309        try:
1310            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1311        except KeyError:
1312            iso_class = None
1313
1314        return iso_class

From an ion formula, return the 13C isotope type of the ion.

Notes

This is a static method. If the ion_formula is not a string, this method will return None. This is currently only functional for 13C isotopes.

Parameters
  • ion_formula (str): The formula of the ion, expected to be a string like 'C2 H4 O2'.
Returns
  • str: The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
Raises
  • ValueError: If the ion_formula is a string without spaces (not in the expected 'C2 H4 O2' format).
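
A hedged example (it assumes MolecularFormula accepts the '13C' isotope token in a space-separated formula, as the notes above indicate):

    from corems.mass_spectra.output.export import LipidomicsExport

    # A formula carrying one 13C is expected to return its isotope class, e.g. '13C1'.
    print(LipidomicsExport.get_isotope_type("C17 13C1 H34 O2"))

    # A formula without 13C returns None; non-string input also returns None.
    print(LipidomicsExport.get_isotope_type("C18 H34 O2"))      # None
    print(LipidomicsExport.get_isotope_type(float("nan")))      # None

    # A string without spaces raises ValueError (see Raises above).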
def clean_ms1_report(self, ms1_summary_full):
1316    def clean_ms1_report(self, ms1_summary_full):
1317        """Clean the MS1 report.
1318
1319        Parameters
1320        ----------
1321        ms1_summary_full : DataFrame
1322            The full MS1 summary DataFrame.
1323
1324        Returns
1325        -------
1326        DataFrame
1327            The cleaned MS1 summary DataFrame.
1328        """
1329        ms1_summary_full = ms1_summary_full.reset_index()
1330        cols_to_keep = [
1331            "mf_id",
1332            "Molecular Formula",
1333            "Ion Type",
1334            "Calculated m/z",
1335            "m/z Error (ppm)",
1336            "m/z Error Score",
1337            "Is Isotopologue",
1338            "Isotopologue Similarity",
1339            "Confidence Score",
1340        ]
1341        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1342        ms1_summary["ion_formula"] = [
1343            self.get_ion_formula(f, a)
1344            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1345        ]
1346        ms1_summary["isotopologue_type"] = [
1347            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1348        ]
1349
1350        # Reorder columns
1351        ms1_summary = ms1_summary[
1352            [
1353                "mf_id",
1354                "ion_formula",
1355                "isotopologue_type",
1356                "Calculated m/z",
1357                "m/z Error (ppm)",
1358                "m/z Error Score",
1359                "Isotopologue Similarity",
1360                "Confidence Score",
1361            ]
1362        ]
1363
1364        # Set the index to mf_id
1365        ms1_summary = ms1_summary.set_index("mf_id")
1366
1367        return ms1_summary

Clean the MS1 report.

Parameters
  • ms1_summary_full (DataFrame): The full MS1 summary DataFrame.
Returns
  • DataFrame: The cleaned MS1 summary DataFrame.
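
An illustrative call with a minimal single-row DataFrame containing the columns the method expects (values are made up for demonstration; exporter is assumed to be a LipidomicsExport instance as in the class example above):

    import pandas as pd

    ms1_summary_full = pd.DataFrame(
        {
            "mf_id": [1],
            "Molecular Formula": ["C2 H4 O2"],
            "Ion Type": ["[M+H]+"],
            "Calculated m/z": [61.0284],
            "m/z Error (ppm)": [0.2],
            "m/z Error Score": [0.99],
            "Is Isotopologue": [False],
            "Isotopologue Similarity": [0.95],
            "Confidence Score": [0.90],
        }
    )
    ms1_summary = exporter.clean_ms1_report(ms1_summary_full)
    # Result is indexed by mf_id; ion_formula and isotopologue_type are added,
    # while Molecular Formula, Ion Type, and Is Isotopologue are dropped.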
def summarize_lipid_report(self, ms2_annot):
1369    def summarize_lipid_report(self, ms2_annot):
1370        """Summarize the lipid report.
1371
1372        Parameters
1373        ----------
1374        ms2_annot : DataFrame
1375            The MS2 annotation DataFrame with all annotations.
1376
1377        Returns
1378        -------
1379        DataFrame
1380            The summarized lipid report.
1381        """
1382        # Drop unnecessary columns for easier viewing
1383        columns_to_drop = [
1384            "precursor_mz",
1385            "precursor_mz_error_ppm",
1386            "metabref_mol_id",
1387            "metabref_precursor_mz",
1388            "cas",
1389            "inchikey",
1390            "inchi",
1391            "chebi",
1392            "smiles",
1393            "kegg",
1394            "data_id",
1395            "iupac_name",
1396            "traditional_name",
1397            "common_name",
1398            "casno",
1399        ]
1400        ms2_annot = ms2_annot.drop(
1401            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1402        )
1403
1404        # If ion_types_excluded is not empty, remove those ion types
1405        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1406            "ms2"
1407        ].molecular_search.ion_types_excluded
1408        if len(ion_types_excluded) > 0:
1409            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1410
1411        # If mf_id is not present, check that the index name is mf_id and reset the index
1412        if "mf_id" not in ms2_annot.columns:
1413            if ms2_annot.index.name == "mf_id":
1414                ms2_annot = ms2_annot.reset_index()
1415            else:
1416                raise ValueError("mf_id is not present in the dataframe")
1417
1418        # Attempt to get consensus annotations to the MLF level
1419        mlf_results_all = []
1420        for mf_id in ms2_annot["mf_id"].unique():
1421            mlf_results_perid = []
1422            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1423            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)
1424
1425            for query_scan in ms2_annot["query_spectrum_id"].unique():
1426                ms2_annot_sub = ms2_annot_mf[
1427                    ms2_annot_mf["query_spectrum_id"] == query_scan
1428                ].copy()
1429
1430                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1431                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1432                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1433                        ms2_annot_sub["entropy_max"] = (
1434                            ms2_annot_sub["entropy_similarity"]
1435                            == ms2_annot_sub["entropy_similarity"].max()
1436                        )
1437                        ms2_annot_sub["ref_match_fract_max"] = (
1438                            ms2_annot_sub["ref_mz_in_query_fract"]
1439                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1440                        )
1441                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1442                            "query_frag_types"
1443                        ].apply(lambda x: True if "MLF" in x else False)
1444
1445                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1446                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1447                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1448                        ].all(axis=1)
1449
1450                        # If there is a consensus, take the row with the highest entropy_similarity
1451                        if ms2_annot_sub["consensus"].any():
1452                            ms2_annot_sub = ms2_annot_sub[
1453                                ms2_annot_sub["entropy_similarity"]
1454                                == ms2_annot_sub["entropy_similarity"].max()
1455                            ].head(1)
1456                            mlf_results_perid.append(ms2_annot_sub)
1457            if len(mlf_results_perid) == 0:
1458                mlf_results_perid = pd.DataFrame()
1459            else:
1460                mlf_results_perid = pd.concat(mlf_results_perid)
1461                if mlf_results_perid["name"].nunique() == 1:
1462                    mlf_results_perid = mlf_results_perid[
1463                        mlf_results_perid["entropy_similarity"]
1464                        == mlf_results_perid["entropy_similarity"].max()
1465                    ].head(1)
1466                else:
1467                    mlf_results_perid = pd.DataFrame()
1468                mlf_results_all.append(mlf_results_perid)
1469
1470        # These are the consensus annotations to the MLF level
1471        if len(mlf_results_all) > 0:
1472            mlf_results_all = pd.concat(mlf_results_all)
1473            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1474        else:
1475            # Make an empty dataframe
1476            mlf_results_all = ms2_annot.head(0)
1477
1478        # For remaining mf_ids, try to get a consensus annotation to the species level
1479        species_results_all = []
1480        # Remove mf_ids that have consensus annotations to the MLF level
1481        ms2_annot_spec = ms2_annot[
1482            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1483        ]
1484        for mf_id in ms2_annot_spec["mf_id"].unique():
1485            # Do all the hits have the same lipid_summed_name?
1486            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1487            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1488
1489            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1490                # Grab the highest entropy_similarity result
1491                ms2_annot_sub = ms2_annot_sub[
1492                    ms2_annot_sub["entropy_similarity"]
1493                    == ms2_annot_sub["entropy_similarity"].max()
1494                ].head(1)
1495                species_results_all.append(ms2_annot_sub)
1496
1497        # These are the consensus annotations to the species level
1498        if len(species_results_all) > 0:
1499            species_results_all = pd.concat(species_results_all)
1500            species_results_all["annot_level"] = "species"
1501        else:
1502            # Make an empty dataframe
1503            species_results_all = ms2_annot.head(0)
1504
1505        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1506        # Remove mf_ids that have consensus annotations to the species level
1507        ms2_annot_remaining = ms2_annot_spec[
1508            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1509        ]
1510        no_consensus = []
1511        for mf_id in ms2_annot_remaining["mf_id"].unique():
1512            id_sub = []
1513            id_no_con = []
1514            ms2_annot_sub_mf = ms2_annot_remaining[
1515                ms2_annot_remaining["mf_id"] == mf_id
1516            ].copy()
1517            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1518                ms2_annot_sub = ms2_annot_sub_mf[
1519                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1520                ].copy()
1521
1522                # New columns for ranking [HIGHER RANK = BETTER]
1523                ms2_annot_sub["entropy_max"] = (
1524                    ms2_annot_sub["entropy_similarity"]
1525                    == ms2_annot_sub["entropy_similarity"].max()
1526                )
1527                ms2_annot_sub["ref_match_fract_max"] = (
1528                    ms2_annot_sub["ref_mz_in_query_fract"]
1529                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1530                )
1531                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1532                    lambda x: True if "MLF" in x else False
1533                )
1534
1535                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1536                ms2_annot_sub["consensus"] = ms2_annot_sub[
1537                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1538                ].all(axis=1)
1539                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1540                id_sub.append(ms2_annot_sub_con)
1541                id_no_con.append(ms2_annot_sub)
1542            id_sub = pd.concat(id_sub)
1543            id_no_con = pd.concat(id_no_con)
1544
1545            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1546            if (
1547                id_sub["query_frag_types"]
1548                .apply(lambda x: True if "MLF" in x else False)
1549                .all()
1550                and len(id_sub) > 0
1551            ):
1552                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1553                id_sub = id_sub.loc[idx]
1554                # Reorder so highest entropy_similarity is first
1555                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1556                id_sub["annot_level"] = id_sub["structure_level"]
1557                no_consensus.append(id_sub)
1558
1559            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1560            elif len(id_sub) == 0:
1561                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1562                    summed_sub = id_no_con[
1563                        id_no_con["lipid_summed_name"] == lipid_summed_name
1564                    ]
1565                    # Any consensus to MLF?
1566                    if summed_sub["consensus"].any():
1567                        summed_sub = summed_sub[summed_sub["consensus"]]
1568                        summed_sub["annot_level"] = summed_sub["structure_level"]
1569                        no_consensus.append(summed_sub)
1570                    else:
1571                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
1572                        summed_sub = summed_sub[
1573                            summed_sub["entropy_similarity"]
1574                            == summed_sub["entropy_similarity"].max()
1575                        ].head(1)
1576                        # get first row
1577                        summed_sub["annot_level"] = "species"
1578                        summed_sub["name"] = ""
1579                        no_consensus.append(summed_sub)
1580            else:
1581                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)
1582
1583        if len(no_consensus) > 0:
1584            no_consensus = pd.concat(no_consensus)
1585        else:
1586            no_consensus = ms2_annot.head(0)
1587
1588        # Combine all the consensus annotations and reformat the dataframe for output
1589        species_results_all = species_results_all.drop(columns=["name"])
1590        species_results_all["lipid_molecular_species_id"] = ""
1591        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1592        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1593        consensus_annotations = pd.concat(
1594            [mlf_results_all, species_results_all, no_consensus]
1595        )
1596        consensus_annotations = consensus_annotations.sort_values(
1597            "mf_id", ascending=True
1598        )
1599        cols_to_keep = [
1600            "mf_id",
1601            "ref_ion_type",
1602            "entropy_similarity",
1603            "ref_mz_in_query_fract",
1604            "lipid_molecular_species_id",
1605            "lipid_summed_name",
1606            "lipid_subclass",
1607            "lipid_class",
1608            "lipid_category",
1609            "formula",
1610            "annot_level",
1611            "n_spectra_contributing",
1612        ]
1613        consensus_annotations = consensus_annotations[cols_to_keep]
1614        consensus_annotations = consensus_annotations.set_index("mf_id")
1615
1616        return consensus_annotations

Summarize the lipid report.

Parameters
  • ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
  • DataFrame: The summarized lipid report.
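
A hedged sketch mirroring how to_report() uses this method (it assumes exporter is a LipidomicsExport instance and molecular_metadata the MS2 search metadata, which may be None):

    ms2_annot = exporter.mass_spectra.mass_features_ms2_annot_to_df(
        molecular_metadata=molecular_metadata
    )
    if ms2_annot is not None:
        lipid_summary = exporter.summarize_lipid_report(ms2_annot)
        print(lipid_summary[["lipid_summed_name", "annot_level"]].head())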
def clean_ms2_report(self, lipid_summary):
1618    def clean_ms2_report(self, lipid_summary):
1619        """Clean the MS2 report.
1620
1621        Parameters
1622        ----------
1623        lipid_summary : DataFrame
1624            The full lipid summary DataFrame.
1625
1626        Returns
1627        -------
1628        DataFrame
1629            The cleaned lipid summary DataFrame.
1630        """
1631        lipid_summary = lipid_summary.reset_index()
1632        lipid_summary["ion_formula"] = [
1633            self.get_ion_formula(f, a)
1634            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
1635        ]
1636
1637        # Reorder columns
1638        lipid_summary = lipid_summary[
1639            [
1640                "mf_id",
1641                "ion_formula",
1642                "ref_ion_type",
1643                "formula",
1644                "annot_level",
1645                "lipid_molecular_species_id",
1646                "lipid_summed_name",
1647                "lipid_subclass",
1648                "lipid_class",
1649                "lipid_category",
1650                "entropy_similarity",
1651                "ref_mz_in_query_fract",
1652                "n_spectra_contributing",
1653            ]
1654        ]
1655
1656        # Set the index to mf_id
1657        lipid_summary = lipid_summary.set_index("mf_id")
1658
1659        return lipid_summary

Clean the MS2 report.

Parameters
  • lipid_summary (DataFrame): The full lipid summary DataFrame.
Returns
  • DataFrame: The cleaned lipid summary DataFrame.
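
Continuing the previous sketch, the summarized report is then reordered and trimmed for output, just as to_report() does after summarize_lipid_report():

    ms2_report = exporter.clean_ms2_report(lipid_summary)
    print(ms2_report.columns.tolist())   # ion_formula, ref_ion_type, formula, annot_level, ...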
def to_report(self, molecular_metadata=None):
1661    def to_report(self, molecular_metadata=None):
1662        """Create a report of the mass features and their annotations.
1663
1664        Parameters
1665        ----------
1666        molecular_metadata : dict, optional
1667            The molecular metadata. Default is None.
1668
1669        Returns
1670        -------
1671        DataFrame
1672            The report of the mass features and their annotations.
1673
1674        Notes
1675        -----
1676        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
1677        """
1678        # Get mass feature dataframe
1679        mf_report = self.mass_spectra.mass_features_to_df()
1680        mf_report = mf_report.reset_index(drop=False)
1681
1682        # Get and clean ms1 annotation dataframe
1683        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1684        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1685        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1686
1687        # Get, summarize, and clean ms2 annotation dataframe
1688        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1689            molecular_metadata=molecular_metadata
1690        )
1691        if ms2_annot_report is not None:
1692            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
1693            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1694            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1695            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1696
1697        # Combine the reports
1698        if not ms1_annot_report.empty:
1699            # MS1 has been run and has molecular formula information
1700            mf_report = pd.merge(
1701                mf_report,
1702                ms1_annot_report,
1703                how="left",
1704                on=["mf_id", "isotopologue_type"],
1705            )
1706        if ms2_annot_report is not None:
1707            # pull out the records without an ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1708            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1709            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1710            mf_no_ion_formula = pd.merge(
1711                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1712            )
1713
1714            # pull out the records with ion_formula
1715            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1716            mf_with_ion_formula = pd.merge(
1717                mf_with_ion_formula,
1718                ms2_annot_report,
1719                how="left",
1720                on=["mf_id", "ion_formula"],
1721            )
1722
1723            # put back together
1724            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1725
1726        # Rename columns
1727        rename_dict = {
1728            "mf_id": "Mass Feature ID",
1729            "scan_time": "Retention Time (min)",
1730            "mz": "m/z",
1731            "apex_scan": "Apex Scan Number",
1732            "intensity": "Intensity",
1733            "persistence": "Persistence",
1734            "area": "Area",
1735            "half_height_width": "Half Height Width (min)",
1736            "tailing_factor": "Tailing Factor",
1737            "dispersity_index": "Dispersity Index",
1738            "ms2_spectrum": "MS2 Spectrum",
1739            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1740            "isotopologue_type": "Isotopologue Type",
1741            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1742            "associated_mass_features": "Associated Mass Features after Deconvolution",
1743            "ion_formula": "Ion Formula",
1744            "formula": "Molecular Formula",
1745            "ref_ion_type": "Ion Type",
1746            "annot_level": "Lipid Annotation Level",
1747            "lipid_molecular_species_id": "Lipid Molecular Species",
1748            "lipid_summed_name": "Lipid Species",
1749            "lipid_subclass": "Lipid Subclass",
1750            "lipid_class": "Lipid Class",
1751            "lipid_category": "Lipid Category",
1752            "entropy_similarity": "Entropy Similarity",
1753            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1754            "n_spectra_contributing": "Spectra with Annotation (n)",
1755        }
1756        mf_report = mf_report.rename(columns=rename_dict)
1757        mf_report["Sample Name"] = self.mass_spectra.sample_name
1758        mf_report["Polarity"] = self.mass_spectra.polarity
1759        mf_report = mf_report[
1760            ["Mass Feature ID", "Sample Name", "Polarity"]
1761            + [
1762                col
1763                for col in mf_report.columns
1764                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1765            ]
1766        ]
1767
1768        # Reorder rows by "Mass Feature ID"
1769        mf_report = mf_report.sort_values("Mass Feature ID")
1770
1771        # Reset index
1772        mf_report = mf_report.reset_index(drop=True)
1773
1774        return mf_report
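
Because MS2 annotations can only be matched on ion_formula when MS1 assignment supplied one, to_report merges in two passes and then re-assembles the result. A simplified sketch of that split/merge/concat pattern on toy frames (only the key columns mirror the real report):

    import pandas as pd

    mf = pd.DataFrame({"mf_id": [1, 2], "ion_formula": ["C5H15NO4P", None]})
    ms2 = pd.DataFrame(
        {"mf_id": [1, 2], "ion_formula": ["C5H15NO4P", "C4H11NO3P"], "annot": ["a", "b"]}
    )

    # Rows without an MS1-derived ion formula are matched on mf_id alone.
    no_formula = mf[mf["ion_formula"].isna()].drop(columns=["ion_formula"])
    no_formula = pd.merge(no_formula, ms2, how="left", on=["mf_id"])

    # Rows with an ion formula must also agree with the annotation's ion formula.
    with_formula = mf[~mf["ion_formula"].isna()]
    with_formula = pd.merge(with_formula, ms2, how="left", on=["mf_id", "ion_formula"])

    report = pd.concat([no_formula, with_formula]).sort_values("mf_id")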

1776    def report_to_csv(self, molecular_metadata=None):
1777        """Create a report of the mass features and their annotations and save it as a CSV file.
1778
1779        Parameters
1780        ----------
1781        molecular_metadata : dict, optional
1782            The molecular metadata. Default is None.
1783        """
1784        report = self.to_report(molecular_metadata=molecular_metadata)
1785        out_file = self.output_file.with_suffix(".csv")
1786        report.to_csv(out_file, index=False)
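
Note that report_to_csv reuses the exporter's output_file and only swaps the suffix, so any extension on the path it was constructed with is replaced rather than appended. A small illustration of that pathlib behavior:

    from pathlib import Path

    # with_suffix replaces an existing extension, or adds one if none is present.
    Path("out/sample1.corems").with_suffix(".csv")  # Path('out/sample1.csv')
    Path("out/sample1").with_suffix(".csv")         # Path('out/sample1.csv')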
