corems.mass_spectra.output.export

   1__author__ = "Yuri E. Corilo"
   2__date__ = "Dec 14, 2010"
   3
   4
   5import csv
   6import json
   7import re
   8import uuid
   9import warnings
  10from datetime import datetime, timezone
  11from pathlib import Path
  12
  13import h5py
  14import numpy as np
  15import pandas as pd
  16from openpyxl import load_workbook
  17from pandas import DataFrame, ExcelWriter, read_excel
  18
  19from corems import __version__, corems_md5
  20from corems.encapsulation.output import parameter_to_dict
  21from corems.encapsulation.output.parameter_to_json import (
  22    dump_lcms_settings_json,
  23    dump_lcms_settings_toml,
  24)
  25from corems.mass_spectrum.output.export import HighResMassSpecExport
  26from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula
  27from corems.molecular_id.calc.SpectralSimilarity import methods_name
  28
  29ion_type_dict = {
   30    # adduct : [atoms to add, atoms to subtract] when calculating the formula of the ion
  31    "M+": [{}, {}],
  32    "protonated": [{"H": 1}, {}],
  33    "[M+H]+": [{"H": 1}, {}],
  34    "[M+NH4]+": [{"N": 1, "H": 4}, {}],  # ammonium
  35    "[M+Na]+": [{"Na": 1}, {}],
  36    "[M+K]+": [{"K": 1}, {}],
  37    "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}],
  38    "[M+2Na-H]+": [{"Na": 2}, {"H": 1}],
  39    "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}],
  40    "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}],
  41    "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}],
  42    "[M+H-H2O]+": [{}, {"H": 1, "O": 1}],
  43    "de-protonated": [{}, {"H": 1}],
  44    "[M-H]-": [{}, {"H": 1}],
  45    "[M+Cl]-": [{"Cl": 1}, {}],
  46    "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}],  # formate
  47    "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}],  # acetate
  48    "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}],
  49    "[M+K-2H]-": [{"K": 1}, {"H": 2}],
  50    "[M+Na-2H]-": [{"Na": 1}, {"H": 2}],
  51}
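     # Example of how the two dictionaries are applied (hedged illustration):
     # for "[M+Na-2H]-" and a neutral formula of C6 H12 O6, one Na is added and
     # two H are subtracted, giving C6 H10 Na O6; see
     # LCMSMetabolomicsExport.get_ion_formula below for the implementation.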
  52
  53
  54class LowResGCMSExport:
  55    """A class to export low resolution GC-MS data.
  56
  57    This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
  58
  59    Parameters:
  60    ----------
  61    out_file_path : str
  62        The output file path.
  63    gcms : object
  64        The low resolution GCMS object.
  65
  66    Attributes:
  67    ----------
  68    output_file : Path
  69        The output file path as a Path object.
  70    gcms : object
  71        The low resolution GCMS object.
  72
  73    Methods:
  74    -------
  75    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  76    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  77    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
   78    * to_excel(write_mode='a', write_metadata=True, id_label="corems:").
  79        Export the data to an Excel file.
  80    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:").
  81        Export the data to a CSV file.
  82    * to_hdf(id_label="corems:").
  83        Export the data to an HDF5 file.
  84    * get_data_stats(gcms).
  85        Get statistics about the GCMS data.
  86
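     Examples:
     --------
     A minimal usage sketch (assumes ``my_gcms`` is an already-processed low
     resolution GCMS object; the variable and file names are illustrative):

     >>> exporter = LowResGCMSExport("my_sample", my_gcms)
     >>> df = exporter.get_pandas_df()
     >>> exporter.to_csv(write_metadata=True)
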
  87    """
  88
  89    def __init__(self, out_file_path, gcms):
  90        self.output_file = Path(out_file_path)
  91
  92        self.gcms = gcms
  93
  94        self._init_columns()
  95
  96    def _init_columns(self):
  97        """Initialize the column names for the exported data.
  98
  99        Returns:
 100        -------
 101        list
 102            The list of column names.
 103        """
 104
 105        columns = [
 106            "Sample name",
 107            "Peak Index",
 108            "Retention Time",
 109            "Retention Time Ref",
 110            "Peak Height",
 111            "Peak Area",
 112            "Retention index",
 113            "Retention index Ref",
 114            "Retention Index Score",
 115            "Similarity Score",
 116            "Spectral Similarity Score",
 117            "Compound Name",
 118            "Chebi ID",
 119            "Kegg Compound ID",
 120            "Inchi",
 121            "Inchi Key",
 122            "Smiles",
 123            "Molecular Formula",
 124            "IUPAC Name",
 125            "Traditional Name",
 126            "Common Name",
 127            "Derivatization",
 128        ]
 129
 130        if self.gcms.molecular_search_settings.exploratory_mode:
 131            columns.extend(
 132                [
 133                    "Weighted Cosine Correlation",
 134                    "Cosine Correlation",
 135                    "Stein Scott Similarity",
 136                    "Pearson Correlation",
 137                    "Spearman Correlation",
 138                    "Kendall Tau Correlation",
 139                    "Euclidean Distance",
 140                    "Manhattan Distance",
 141                    "Jaccard Distance",
 142                    "DWT Correlation",
 143                    "DFT Correlation",
 144                ]
 145            )
 146
 147            columns.extend(list(methods_name.values()))
 148
 149        return columns
 150
 151    def get_pandas_df(self, id_label="corems:"):
 152        """Get the exported data as a Pandas DataFrame.
 153
 154        Parameters:
 155        ----------
 156        id_label : str, optional
 157            The ID label for the data. Default is "corems:".
 158
 159        Returns:
 160        -------
 161        DataFrame
 162            The exported data as a Pandas DataFrame.
 163        """
 164
 165        columns = self._init_columns()
 166
 167        dict_data_list = self.get_list_dict_data(self.gcms)
 168
 169        df = DataFrame(dict_data_list, columns=columns)
 170
 171        df.name = self.gcms.sample_name
 172
 173        return df
 174
 175    def get_json(self, nan=False, id_label="corems:"):
 176        """Get the exported data as a JSON string.
 177
 178        Parameters:
 179        ----------
 180        nan : bool, optional
 181            Whether to include NaN values in the JSON string. Default is False.
 182        id_label : str, optional
 183            The ID label for the data. Default is "corems:".
 184
             Returns:
             -------
             str
                 The exported data as a JSON string.
  185        """
 186
 187        import json
 188
 189        dict_data_list = self.get_list_dict_data(self.gcms)
 190
 191        return json.dumps(
 192            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
 193        )
 194
 195    def to_pandas(self, write_metadata=True, id_label="corems:"):
 196        """Export the data to a Pandas DataFrame and save it as a pickle file.
 197
 198        Parameters:
 199        ----------
 200        write_metadata : bool, optional
 201            Whether to write metadata to the output file.
 202        id_label : str, optional
 203            The ID label for the data.
 204        """
 205
 206        columns = self._init_columns()
 207
 208        dict_data_list = self.get_list_dict_data(self.gcms)
 209
 210        df = DataFrame(dict_data_list, columns=columns)
 211
 212        df.to_pickle(self.output_file.with_suffix(".pkl"))
 213
 214        if write_metadata:
 215            self.write_settings(
 216                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
 217            )
 218
 219    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
 220        """Export the data to an Excel file.
 221
 222        Parameters:
 223        ----------
 224        write_mode : str, optional
 225            The write mode for the Excel file. Default is 'a' (append).
 226        write_metadata : bool, optional
 227            Whether to write metadata to the output file. Default is True.
 228        id_label : str, optional
 229            The ID label for the data. Default is "corems:".
 230        """
 231
 232        out_put_path = self.output_file.with_suffix(".xlsx")
 233
 234        columns = self._init_columns()
 235
 236        dict_data_list = self.get_list_dict_data(self.gcms)
 237
 238        df = DataFrame(dict_data_list, columns=columns)
 239
  240        if write_mode == "a" and out_put_path.exists():
  241            # append the new rows below the existing data; pandas >= 1.4
  242            # supports openpyxl append mode with if_sheet_exists="overlay"
  243            reader = read_excel(out_put_path)
  244            with ExcelWriter(
  245                out_put_path,
  246                engine="openpyxl",
  247                mode="a",
  248                if_sheet_exists="overlay",
  249            ) as writer:
  250                # write the new rows without repeating the header
  251                df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
 252        else:
 253            df.to_excel(
 254                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
 255            )
 256
 257        if write_metadata:
 258            self.write_settings(out_put_path, self.gcms, id_label=id_label)
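         # Usage note (illustrative): with write_mode="a" and an existing
         # my_sample.xlsx, the new rows are appended below the previously
         # exported rows; otherwise a fresh workbook is written.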
 259
 260    def to_csv(
 261        self,
 262        separate_output=False,
 263        write_mode="w",
 264        write_metadata=True,
 265        id_label="corems:",
 266    ):
 267        """Export the data to a CSV file.
 268
 269        Parameters:
 270        ----------
 271        separate_output : bool, optional
 272            Whether to separate the output into multiple files. Default is False.
 273        write_mode : str, optional
 274            The write mode for the CSV file. Default is 'w' (write).
 275        write_metadata : bool, optional
 276            Whether to write metadata to the output file. Default is True.
 277        id_label : str, optional
 278            The ID label for the data. Default is "corems:".
 279        """
 280
 281        if separate_output:
 282            # set write mode to write
 283            # this mode will overwrite the file without warning
 284            write_mode = "w"
 285        else:
 286            # set write mode to append
 287            write_mode = "a"
 288
 289        columns = self._init_columns()
 290
 291        dict_data_list = self.get_list_dict_data(self.gcms)
 292
 293        out_put_path = self.output_file.with_suffix(".csv")
 294
  295        write_header = (write_mode == "w") or not out_put_path.exists()
 296
 297        try:
 298            with open(out_put_path, write_mode, newline="") as csvfile:
 299                writer = csv.DictWriter(csvfile, fieldnames=columns)
 300                if write_header:
 301                    writer.writeheader()
 302                for data in dict_data_list:
 303                    writer.writerow(data)
 304
 305            if write_metadata:
 306                self.write_settings(out_put_path, self.gcms, id_label=id_label)
 307
 308        except IOError as ioerror:
 309            print(ioerror)
 310
 311    def to_hdf(self, id_label="corems:"):
 312        """Export the data to an HDF5 file.
 313
 314        Parameters:
 315        ----------
 316        id_label : str, optional
 317            The ID label for the data. Default is "corems:".
 318        """
 319
 320        # save sample at a time
 321        def add_compound(gc_peak, compound_obj):
 322            modifier = compound_obj.classify if compound_obj.classify else ""
 323            compound_group = compound_obj.name.replace("/", "") + " " + modifier
 324
 325            if compound_group not in peak_group:
 326                compound_group = peak_group.create_group(compound_group)
 327
 328                # compound_group.attrs["retention_time"] = compound_obj.retention_time
 329                compound_group.attrs["retention_index"] = compound_obj.ri
 330                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
 331                compound_group.attrs["spectral_similarity_score"] = (
 332                    compound_obj.spectral_similarity_score
 333                )
 334                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
 335
  336                compound_mz = compound_group.create_dataset(
 337                    "mz", data=np.array(compound_obj.mz), dtype="f8"
 338                )
  339                compound_abundance = compound_group.create_dataset(
 340                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
 341                )
 342
 343                if self.gcms.molecular_search_settings.exploratory_mode:
 344                    compound_group.attrs["Spectral Similarities"] = json.dumps(
 345                        compound_obj.spectral_similarity_scores,
 346                        sort_keys=False,
 347                        indent=4,
 348                        separators=(",", ":"),
 349                    )
 350            else:
 351                warnings.warn("Skipping duplicate reference compound.")
 352
 353        import json
 354        from datetime import datetime, timezone
 355
 356        import h5py
 357        import numpy as np
 358
 359        output_path = self.output_file.with_suffix(".hdf5")
 360
 361        with h5py.File(output_path, "w") as hdf_handle:
 362            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
 363            hdf_handle.attrs["time_stamp"] = timenow
 364            hdf_handle.attrs["data_structure"] = "gcms"
 365            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
 366            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
 367
 368            hdf_handle.attrs["sample_id"] = "self.gcms.id"
 369            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
 370            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
 371            hdf_handle.attrs["output_data"] = str(output_path)
 372            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
 373            hdf_handle.attrs["corems_version"] = __version__
 374
 375            hdf_handle.attrs["Stats"] = json.dumps(
 376                self.get_data_stats(self.gcms),
 377                sort_keys=False,
 378                indent=4,
 379                separators=(",", ": "),
 380            )
 381            hdf_handle.attrs["Calibration"] = json.dumps(
 382                self.get_calibration_stats(self.gcms, id_label),
 383                sort_keys=False,
 384                indent=4,
 385                separators=(",", ": "),
 386            )
 387            hdf_handle.attrs["Blank"] = json.dumps(
 388                self.get_blank_stats(self.gcms),
 389                sort_keys=False,
 390                indent=4,
 391                separators=(",", ": "),
 392            )
 393
 394            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
 395            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
 396                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
 397            )
 398
 399            scans_dataset = hdf_handle.create_dataset(
 400                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
 401            )
 402            rt_dataset = hdf_handle.create_dataset(
 403                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
 404            )
 405            tic_dataset = hdf_handle.create_dataset(
 406                "tic", data=np.array(self.gcms.tic), dtype="f8"
 407            )
 408            processed_tic_dataset = hdf_handle.create_dataset(
 409                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
 410            )
 411
 412            output_score_method = (
 413                self.gcms.molecular_search_settings.output_score_method
 414            )
 415
 416            for gc_peak in self.gcms:
 417                # print(gc_peak.retention_time)
 418                # print(gc_peak.tic)
 419
 420                # check if there is a compound candidate
 421                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
 422                peak_group.attrs["deconvolution"] = int(
 423                    self.gcms.chromatogram_settings.use_deconvolution
 424                )
 425
 426                peak_group.attrs["start_scan"] = gc_peak.start_scan
 427                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
 428                peak_group.attrs["final_scan"] = gc_peak.final_scan
 429
 430                peak_group.attrs["retention_index"] = gc_peak.ri
 431                peak_group.attrs["retention_time"] = gc_peak.retention_time
 432                peak_group.attrs["area"] = gc_peak.area
 433
 434                mz = peak_group.create_dataset(
 435                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
 436                )
 437                abundance = peak_group.create_dataset(
 438                    "abundance",
 439                    data=np.array(gc_peak.mass_spectrum.abundance),
 440                    dtype="f8",
 441                )
 442
 443                if gc_peak:
 444                    if output_score_method == "highest_sim_score":
 445                        compound_obj = gc_peak.highest_score_compound
 446                        add_compound(gc_peak, compound_obj)
 447
 448                    elif output_score_method == "highest_ss":
 449                        compound_obj = gc_peak.highest_ss_compound
 450                        add_compound(gc_peak, compound_obj)
 451
 452                    else:
 453                        for compound_obj in gc_peak:
 454                            add_compound(gc_peak, compound_obj)
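         # A read-back sketch for the hdf5 layout written above (hedged; the
         # output name "my_sample.hdf5" is illustrative):
         #
         #     import json
         #
         #     import h5py
         #
         #     with h5py.File("my_sample.hdf5", "r") as hdf:
         #         stats = json.loads(hdf.attrs["Stats"])
         #         for key, item in hdf.items():
         #             if isinstance(item, h5py.Group):  # one group per GC peak
         #                 mz = item["mz"][:]
         #                 abundance = item["abundance"][:]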
 455
 456    def get_data_stats(self, gcms):
 457        """Get statistics about the GCMS data.
 458
 459        Parameters:
 460        ----------
 461        gcms : object
 462            The low resolution GCMS object.
 463
 464        Returns:
 465        -------
 466        dict
 467            A dictionary containing the data statistics.
 468        """
 469
 470        matched_peaks = gcms.matched_peaks
 471        no_matched_peaks = gcms.no_matched_peaks
 472        unique_metabolites = gcms.unique_metabolites
 473
  474        peak_matches_above_0p85 = 0
  475        unique_peak_match_above_0p85 = 0
  476        for match_peak in matched_peaks:
  477            matches_above_85 = list(
  478                filter(lambda m: m.similarity_score >= 0.85, match_peak)
  479            )
  480            if matches_above_85:
  481                peak_matches_above_0p85 += 1
  482            if len(matches_above_85) == 1:
  483                unique_peak_match_above_0p85 += 1
 485
 486        data_stats = {}
 487        data_stats["average_signal_noise"] = "ni"
 488        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
 489        data_stats["total_number_peaks"] = len(gcms)
 490        data_stats["total_peaks_matched"] = len(matched_peaks)
 491        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
  492        data_stats["total_matches_above_similarity_score_0.85"] = peak_matches_above_0p85
 493        data_stats["single_matches_above_similarity_score_0.85"] = (
 494            unique_peak_match_above_0p85
 495        )
 496        data_stats["unique_metabolites"] = len(unique_metabolites)
 497
 498        return data_stats
 499
 500    def get_calibration_stats(self, gcms, id_label):
 501        """Get statistics about the GC-MS calibration.
 502
  503        Parameters:
  504        ----------
             gcms : object
                 The low resolution GCMS object.
             id_label : str
                 The ID label for the data.

             Returns:
             -------
             dict
                 A dictionary containing the calibration parameters.
  505        """
 506        calibration_parameters = {}
 507
 508        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
 509        calibration_parameters["data_url"] = str(gcms.cal_file_path)
 510        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
 511        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
 512        calibration_parameters["calibration_method"] = ""
 513
 514        return calibration_parameters
 515
 516    def get_blank_stats(self, gcms):
 517        """Get statistics about the GC-MS blank."""
 518        blank_parameters = {}
 519
 520        blank_parameters["data_name"] = "ni"
 521        blank_parameters["blank_id"] = "ni"
 522        blank_parameters["data_url"] = "ni"
 523        blank_parameters["has_input"] = "ni"
 524        blank_parameters["common_features_to_blank"] = "ni"
 525
 526        return blank_parameters
 527
 528    def get_instrument_metadata(self, gcms):
 529        """Get metadata about the GC-MS instrument."""
 530        instrument_metadata = {}
 531
 532        instrument_metadata["analyzer"] = gcms.analyzer
 533        instrument_metadata["instrument_label"] = gcms.instrument_label
 534        instrument_metadata["instrument_id"] = uuid.uuid4().hex
 535
 536        return instrument_metadata
 537
 538    def get_data_metadata(self, gcms, id_label, output_path):
 539        """Get metadata about the GC-MS data.
 540
 541        Parameters:
 542        ----------
 543        gcms : object
 544            The low resolution GCMS object.
 545        id_label : str
 546            The ID label for the data.
 547        output_path : str
 548            The output file path.
 549
 550        Returns:
 551        -------
 552        dict
 553            A dictionary containing the data metadata.
 554        """
 555        if isinstance(output_path, str):
 556            output_path = Path(output_path)
 557
  558        parameters_path = output_path.with_suffix(".json")
  559
  560        if parameters_path.exists():
  561            with parameters_path.open() as current_param:
 562                metadata = json.load(current_param)
 563                data_metadata = metadata.get("Data")
 564        else:
 565            data_metadata = {}
 566            data_metadata["data_name"] = []
 567            data_metadata["input_data_url"] = []
 568            data_metadata["has_input"] = []
 569
 570        data_metadata["data_name"].append(gcms.sample_name)
 571        data_metadata["input_data_url"].append(str(gcms.file_location))
 572        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
 573
 574        data_metadata["output_data_name"] = str(output_path.stem)
 575        data_metadata["output_data_url"] = str(output_path)
 576        data_metadata["has_output"] = id_label + corems_md5(output_path)
 577
 578        return data_metadata
 579
 580    def get_parameters_json(self, gcms, id_label, output_path):
 581        """Get the parameters as a JSON string.
 582
 583        Parameters:
 584        ----------
 585        gcms : GCMS object
 586            The low resolution GCMS object.
 587        id_label : str
 588            The ID label for the data.
 589        output_path : str
 590            The output file path.
 591
 592        Returns:
 593        -------
 594        str
 595            The parameters as a JSON string.
 596        """
 597
 598        output_parameters_dict = {}
 599        output_parameters_dict["Data"] = self.get_data_metadata(
 600            gcms, id_label, output_path
 601        )
 602        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
 603        output_parameters_dict["Calibration"] = self.get_calibration_stats(
 604            gcms, id_label
 605        )
 606        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
 607        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
 608        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
 609        corems_dict_setting["corems_version"] = __version__
 610        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
 611        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
 612        output = json.dumps(
 613            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
 614        )
 615
 616        return output
 617
 618    def write_settings(self, output_path, gcms, id_label="emsl:"):
 619        """Write the settings to a JSON file.
 620
 621        Parameters:
 622        ----------
 623        output_path : str
 624            The output file path.
 625        gcms : GCMS object
 626            The low resolution GCMS object.
 627        id_label : str
 628            The ID label for the data. Default is "emsl:".
 629
 630        """
 631
 632        output = self.get_parameters_json(gcms, id_label, output_path)
 633
 634        with open(
 635            output_path.with_suffix(".json"),
 636            "w",
 637            encoding="utf8",
 638        ) as outfile:
 639            outfile.write(output)
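         # The resulting <output>.json contains the top-level sections built by
         # get_parameters_json above: "Data", "Stats", "Calibration", "Blank",
         # "Instrument", "CoreMSParameters" and "has_metabolite".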
 640
 641    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
 642        """Get the exported data as a list of dictionaries.
 643
 644        Parameters:
 645        ----------
 646        gcms : object
 647            The low resolution GCMS object.
 648        include_no_match : bool, optional
 649            Whether to include no match data. Default is True.
 650        no_match_inline : bool, optional
 651            Whether to include no match data inline. Default is False.
 652
 653        Returns:
 654        -------
 655        list
 656            The exported data as a list of dictionaries.
 657        """
 658
 659        output_score_method = gcms.molecular_search_settings.output_score_method
 660
 661        dict_data_list = []
 662
 663        def add_match_dict_data():
 664            derivatization = "{}:{}:{}".format(
 665                compound_obj.classify,
 666                compound_obj.derivativenum,
 667                compound_obj.derivatization,
 668            )
 669            out_dict = {
 670                "Sample name": gcms.sample_name,
 671                "Peak Index": gcpeak_index,
 672                "Retention Time": gc_peak.retention_time,
 673                "Retention Time Ref": compound_obj.retention_time,
 674                "Peak Height": gc_peak.tic,
 675                "Peak Area": gc_peak.area,
 676                "Retention index": gc_peak.ri,
 677                "Retention index Ref": compound_obj.ri,
 678                "Retention Index Score": compound_obj.ri_score,
 679                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
 680                "Similarity Score": compound_obj.similarity_score,
 681                "Compound Name": compound_obj.name,
 682                "Chebi ID": compound_obj.metadata.chebi,
 683                "Kegg Compound ID": compound_obj.metadata.kegg,
 684                "Inchi": compound_obj.metadata.inchi,
 685                "Inchi Key": compound_obj.metadata.inchikey,
 686                "Smiles": compound_obj.metadata.smiles,
 687                "Molecular Formula": compound_obj.formula,
 688                "IUPAC Name": compound_obj.metadata.iupac_name,
 689                "Traditional Name": compound_obj.metadata.traditional_name,
 690                "Common Name": compound_obj.metadata.common_name,
 691                "Derivatization": derivatization,
 692            }
 693
 694            if self.gcms.molecular_search_settings.exploratory_mode:
 695                out_dict.update(
 696                    {
 697                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
 698                            "weighted_cosine_correlation"
 699                        ),
 700                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
 701                            "cosine_correlation"
 702                        ),
 703                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
 704                            "stein_scott_similarity"
 705                        ),
 706                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
 707                            "pearson_correlation"
 708                        ),
 709                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
 710                            "spearman_correlation"
 711                        ),
 712                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
 713                            "kendall_tau_correlation"
 714                        ),
 715                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
 716                            "dft_correlation"
 717                        ),
 718                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
 719                            "dwt_correlation"
 720                        ),
 721                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
 722                            "euclidean_distance"
 723                        ),
 724                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
 725                            "manhattan_distance"
 726                        ),
 727                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
 728                            "jaccard_distance"
 729                        ),
 730                    }
 731                )
 732                for method in methods_name:
 733                    out_dict[methods_name.get(method)] = (
 734                        compound_obj.spectral_similarity_scores.get(method)
 735                    )
 736
 737            dict_data_list.append(out_dict)
 738
 739        def add_no_match_dict_data():
 740            dict_data_list.append(
 741                {
 742                    "Sample name": gcms.sample_name,
 743                    "Peak Index": gcpeak_index,
 744                    "Retention Time": gc_peak.retention_time,
 745                    "Peak Height": gc_peak.tic,
 746                    "Peak Area": gc_peak.area,
 747                    "Retention index": gc_peak.ri,
 748                }
 749            )
 750
 751        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
 752            # check if there is a compound candidate
 753            if gc_peak:
 754                if output_score_method == "highest_sim_score":
 755                    compound_obj = gc_peak.highest_score_compound
 756                    add_match_dict_data()
 757
 758                elif output_score_method == "highest_ss":
 759                    compound_obj = gc_peak.highest_ss_compound
 760                    add_match_dict_data()
 761
 762                else:
 763                    for compound_obj in gc_peak:
 764                        add_match_dict_data()  # add monoisotopic peak
 765
 766            else:
 767                # include not_match
 768                if include_no_match and no_match_inline:
 769                    add_no_match_dict_data()
 770
 771        if include_no_match and not no_match_inline:
 772            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
 773                if not gc_peak:
 774                    add_no_match_dict_data()
 775
 776        return dict_data_list
 777
 778
 779class HighResMassSpectraExport(HighResMassSpecExport):
 780    """A class to export high resolution mass spectra data.
 781
 782    This class provides methods to export high resolution mass spectra data to various formats
 783    such as Excel, CSV, HDF5, and Pandas DataFrame.
 784
 785    Parameters
 786    ----------
 787    out_file_path : str | Path
 788        The output file path.
 789    mass_spectra : object
 790        The high resolution mass spectra object.
 791    output_type : str, optional
 792        The output type. Default is 'excel'.
 793
 794    Attributes
 795    ----------
 796    output_file : Path
 797        The output file path without suffix
 798    dir_loc : Path
 799        The directory location for the output file,
 800        by default this will be the output_file + ".corems" and all output files will be
 801        written into this location
 802    mass_spectra : MassSpectraBase
 803        The high resolution mass spectra object.
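
     Examples
     --------
     A minimal sketch (assumes ``my_mass_spectra`` is an already-processed
     mass spectra object; file names are illustrative):

     >>> exporter = HighResMassSpectraExport("my_sample", my_mass_spectra)
     >>> exporter.to_csv(write_metadata=True)
     >>> exporter.to_hdf(overwrite=True)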
 804    """
 805
 806    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 807        super().__init__(
 808            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 809        )
 810
  811        self.dir_loc = Path(str(out_file_path) + ".corems")
 812        self.dir_loc.mkdir(exist_ok=True)
 813        # Place the output file in the directory
 814        self.output_file = self.dir_loc / Path(out_file_path).name
 815        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 816        self.mass_spectra = mass_spectra
 817        self.atoms_order_list = None
 818        self._init_columns()
 819
 820    def get_pandas_df(self):
 821        """Get the mass spectra as a list of Pandas DataFrames."""
 822
 823        list_df = []
 824
 825        for mass_spectrum in self.mass_spectra:
 826            columns = self.columns_label + self.get_all_used_atoms_in_order(
 827                mass_spectrum
 828            )
 829
 830            dict_data_list = self.get_list_dict_data(mass_spectrum)
 831
 832            df = DataFrame(dict_data_list, columns=columns)
 833
 834            scan_number = mass_spectrum.scan_number
 835
 836            df.name = str(self.output_file) + "_" + str(scan_number)
 837
 838            list_df.append(df)
 839
 840        return list_df
 841
 842    def to_pandas(self, write_metadata=True):
 843        """Export the data to a Pandas DataFrame and save it as a pickle file.
 844
 845        Parameters:
 846        ----------
 847        write_metadata : bool, optional
 848            Whether to write metadata to the output file. Default is True.
 849        """
 850
 851        for mass_spectrum in self.mass_spectra:
 852            columns = self.columns_label + self.get_all_used_atoms_in_order(
 853                mass_spectrum
 854            )
 855
 856            dict_data_list = self.get_list_dict_data(mass_spectrum)
 857
 858            df = DataFrame(dict_data_list, columns=columns)
 859
 860            scan_number = mass_spectrum.scan_number
 861
 862            out_filename = Path(
 863                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 864            )
 865
 866            df.to_pickle(self.dir_loc / out_filename)
 867
 868            if write_metadata:
 869                self.write_settings(
 870                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 871                )
 872
 873    def to_excel(self, write_metadata=True):
 874        """Export the data to an Excel file.
 875
 876        Parameters:
 877        ----------
 878        write_metadata : bool, optional
 879            Whether to write metadata to the output file. Default is True.
 880        """
 881        for mass_spectrum in self.mass_spectra:
 882            columns = self.columns_label + self.get_all_used_atoms_in_order(
 883                mass_spectrum
 884            )
 885
 886            dict_data_list = self.get_list_dict_data(mass_spectrum)
 887
 888            df = DataFrame(dict_data_list, columns=columns)
 889
 890            scan_number = mass_spectrum.scan_number
 891
 892            out_filename = Path(
 893                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 894            )
 895
 896            df.to_excel(self.dir_loc / out_filename)
 897
 898            if write_metadata:
 899                self.write_settings(
 900                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 901                )
 902
 903    def to_csv(self, write_metadata=True):
 904        """Export the data to a CSV file.
 905
 906        Parameters:
 907        ----------
 908        write_metadata : bool, optional
 909            Whether to write metadata to the output file. Default is True.
 910        """
 911        import csv
 912
 913        for mass_spectrum in self.mass_spectra:
 914            columns = self.columns_label + self.get_all_used_atoms_in_order(
 915                mass_spectrum
 916            )
 917
 918            scan_number = mass_spectrum.scan_number
 919
 920            dict_data_list = self.get_list_dict_data(mass_spectrum)
 921
 922            out_filename = Path(
 923                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 924            )
 925
 926            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 927                writer = csv.DictWriter(csvfile, fieldnames=columns)
 928                writer.writeheader()
 929                for data in dict_data_list:
 930                    writer.writerow(data)
 931
 932            if write_metadata:
 933                self.write_settings(
 934                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 935                )
 936
 937    def get_mass_spectra_attrs(self):
 938        """Get the mass spectra attributes as a JSON string.
 939
 945        Returns:
 946        -------
 947        str
 948            The mass spectra attributes as a JSON string.
 949        """
 950        dict_ms_attrs = {}
 951        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 952        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 953        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 954
 955        return json.dumps(
 956            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 957        )
 958
 959    def to_hdf(self, overwrite=False, export_raw=True):
 960        """Export the data to an HDF5 file.
 961
 962        Parameters
 963        ----------
 964        overwrite : bool, optional
 965            Whether to overwrite the output file. Default is False.
 966        export_raw : bool, optional
 967            Whether to export the raw mass spectra data. Default is True.
 968        """
 969        if overwrite:
 970            if self.output_file.with_suffix(".hdf5").exists():
 971                self.output_file.with_suffix(".hdf5").unlink()
 972
 973        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 974            if not hdf_handle.attrs.get("date_utc"):
 975                # Set metadata for all mass spectra
 976                timenow = str(
 977                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 978                )
 979                hdf_handle.attrs["date_utc"] = timenow
 980                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 981                hdf_handle.attrs["data_structure"] = "mass_spectra"
 982                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 983                hdf_handle.attrs["instrument_label"] = (
 984                    self.mass_spectra.instrument_label
 985                )
 986                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 987                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 988                hdf_handle.attrs["parser_type"] = (
 989                    self.mass_spectra.spectra_parser_class.__name__
 990                )
  991                hdf_handle.attrs["original_file_location"] = str(
  992                    self.mass_spectra.file_location
  993                )
 994
 995            if "mass_spectra" not in hdf_handle:
 996                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 997            else:
 998                mass_spectra_group = hdf_handle.get("mass_spectra")
 999
1000            for mass_spectrum in self.mass_spectra:
1001                group_key = str(int(mass_spectrum.scan_number))
1002
1003                self.add_mass_spectrum_to_hdf5(
1004                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1005                )
1006
1007
1008class LCMSExport(HighResMassSpectraExport):
1009    """A class to export high resolution LC-MS data.
1010
1011    This class provides methods to export high resolution LC-MS data to HDF5.
1012
1013    Parameters
1014    ----------
1015    out_file_path : str | Path
1016        The output file path, do not include the file extension.
 1017    mass_spectra : LCMSBase
 1018        The high resolution LC-MS object.
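
     Examples
     --------
     A minimal sketch (assumes ``my_lcms`` is a processed LCMSBase object;
     the file name is illustrative):

     >>> exporter = LCMSExport("my_sample", my_lcms)
     >>> exporter.to_hdf(overwrite=True, parameter_format="toml")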
1019    """
1020
1021    def __init__(self, out_file_path, mass_spectra):
1022        super().__init__(out_file_path, mass_spectra, output_type="hdf5")
1023
1024    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1025        """Export the data to an HDF5.
1026
1027        Parameters
1028        ----------
1029        overwrite : bool, optional
1030            Whether to overwrite the output file. Default is False.
1031        save_parameters : bool, optional
1032            Whether to save the parameters as a separate json or toml file. Default is True.
1033        parameter_format : str, optional
1034            The format to save the parameters in. Default is 'toml'.
1035
1036        Raises
1037        ------
1038        ValueError
1039            If parameter_format is not 'json' or 'toml'.
1040        """
1041        export_profile_spectra = (
1042            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1043        )
1044
1045        # Write the mass spectra data to the hdf5 file
1046        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1047
1048        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1049        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1050            # Add scan_info to hdf5 file
1051            if "scan_info" not in hdf_handle:
1052                scan_info_group = hdf_handle.create_group("scan_info")
1053                for k, v in self.mass_spectra._scan_info.items():
1054                    array = np.array(list(v.values()))
1055                    if array.dtype.str[0:2] == "<U":
1056                        array = array.astype("S")
1057                    scan_info_group.create_dataset(k, data=array)
1058
1059            # Add ms_unprocessed to hdf5 file
1060            export_unprocessed_ms1 = (
1061                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1062            )
1063            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1064                if "ms_unprocessed" not in hdf_handle:
1065                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1066                else:
1067                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1068                for k, v in self.mass_spectra._ms_unprocessed.items():
1069                    array = np.array(v)
1070                    ms_unprocessed_group.create_dataset(str(k), data=array)
1071
1072            # Add LCMS mass features to hdf5 file
1073            if len(self.mass_spectra.mass_features) > 0:
1074                if "mass_features" not in hdf_handle:
1075                    mass_features_group = hdf_handle.create_group("mass_features")
1076                else:
1077                    mass_features_group = hdf_handle.get("mass_features")
1078
1079                # Create group for each mass feature, with key as the mass feature id
1080                for k, v in self.mass_spectra.mass_features.items():
1081                    mass_features_group.create_group(str(k))
1082                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1083                    for k2, v2 in v.__dict__.items():
1084                        if v2 is not None:
1085                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1086                            if k2 not in [
1087                                "chromatogram_parent",
1088                                "ms2_mass_spectra",
1089                                "mass_spectrum",
1090                                "_eic_data",
1091                                "ms2_similarity_results",
1092                            ]:
 1093                                # array-like attributes are stored as datasets
 1094                                if k2 in (
 1095                                    "ms2_scan_numbers",
 1096                                    "_half_height_width",
 1097                                    "_ms_deconvoluted_idx",
 1098                                    "associated_mass_features_deconvoluted",
 1099                                ):
 1100                                    array = np.array(v2)
 1101                                    mass_features_group[str(k)].create_dataset(
 1102                                        str(k2), data=array
 1103                                    )
 1104                                # scalar attributes are stored as hdf5 attributes
 1105                                elif isinstance(
 1106                                    v2, (int, float, str, np.integer, np.bool_)
 1107                                ):
 1108                                    mass_features_group[str(k)].attrs[str(k2)] = v2
 1109                                else:
 1110                                    raise TypeError(
 1111                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
 1112                                    )
1125
1126            # Add EIC data to hdf5 file
1127            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1128            if len(self.mass_spectra.eics) > 0 and export_eics:
1129                if "eics" not in hdf_handle:
1130                    eic_group = hdf_handle.create_group("eics")
1131                else:
1132                    eic_group = hdf_handle.get("eics")
1133
1134                # Create group for each eic
1135                for k, v in self.mass_spectra.eics.items():
1136                    eic_group.create_group(str(k))
1137                    eic_group[str(k)].attrs["mz"] = k
1138                    # Loop through each of the attributes and add them as datasets (if array)
1139                    for k2, v2 in v.__dict__.items():
1140                        if v2 is not None:
1141                            array = np.array(v2)
1142                            eic_group[str(k)].create_dataset(str(k2), data=array)
1143
1144            # Add ms2_search results to hdf5 file
1145            if len(self.mass_spectra.spectral_search_results) > 0:
1146                if "spectral_search_results" not in hdf_handle:
1147                    spectral_search_results = hdf_handle.create_group(
1148                        "spectral_search_results"
1149                    )
1150                else:
1151                    spectral_search_results = hdf_handle.get("spectral_search_results")
1152                # Create group for each search result by ms2_scan / precursor_mz
1153                for k, v in self.mass_spectra.spectral_search_results.items():
1154                    spectral_search_results.create_group(str(k))
1155                    for k2, v2 in v.items():
1156                        spectral_search_results[str(k)].create_group(str(k2))
1157                        spectral_search_results[str(k)][str(k2)].attrs[
1158                            "precursor_mz"
1159                        ] = v2.precursor_mz
1160                        spectral_search_results[str(k)][str(k2)].attrs[
1161                            "query_spectrum_id"
1162                        ] = v2.query_spectrum_id
1163                        # Loop through each of the attributes and add them as datasets (if array)
1164                        for k3, v3 in v2.__dict__.items():
1165                            if v3 is not None and k3 not in [
1166                                "query_spectrum",
1167                                "precursor_mz",
1168                                "query_spectrum_id",
1169                            ]:
 1170                                if k3 in ("query_frag_types", "ref_frag_types"):
 1171                                    v3 = [", ".join(x) for x in v3]
                                     # only create the dataset when no element is None;
                                     # otherwise `array` could be stale from a previous key
 1172                                if all(x is not None for x in v3):
 1173                                    array = np.array(v3)
 1174                                    if array.dtype.str[0:2] == "<U":
 1175                                        array = array.astype("S")
 1176                                    spectral_search_results[str(k)][str(k2)].create_dataset(
 1177                                        str(k3), data=array
 1178                                    )
1179
1180        # Save parameters as separate json
1181        if save_parameters:
1182            # Check if parameter_format is valid
1183            if parameter_format not in ["json", "toml"]:
1184                raise ValueError("parameter_format must be 'json' or 'toml'")
1185
1186            if parameter_format == "json":
1187                dump_lcms_settings_json(
1188                    filename=self.output_file.with_suffix(".json"),
1189                    lcms_obj=self.mass_spectra,
1190                )
1191            elif parameter_format == "toml":
1192                dump_lcms_settings_toml(
1193                    filename=self.output_file.with_suffix(".toml"),
1194                    lcms_obj=self.mass_spectra,
1195                )
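         # Read-back sketch for the groups written above (hedged; the file name
         # is illustrative):
         #
         #     import h5py
         #
         #     with h5py.File("my_sample.hdf5", "r") as hdf:
         #         scan_info = {k: hdf["scan_info"][k][:] for k in hdf["scan_info"]}
         #         feature_ids = list(hdf["mass_features"]) if "mass_features" in hdf else []
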
1196
1197class LCMSMetabolomicsExport(LCMSExport):
1198    """A class to export LCMS metabolite data.
1199
1200    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.
1201
1202    Parameters
1203    ----------
1204    out_file_path : str | Path
1205        The output file path, do not include the file extension.
1206    mass_spectra : object
1207        The high resolution mass spectra object.
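
     Examples
     --------
     A minimal sketch (assumes ``my_lcms`` is a processed LC-MS object with
     metabolite annotations; names are illustrative):

     >>> exporter = LCMSMetabolomicsExport("my_sample", my_lcms)
     >>> exporter.get_ion_formula("C6 H12 O6", "[M+Na]+")  # adds one Na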
1208    """
1209
1210    def __init__(self, out_file_path, mass_spectra):
1211        super().__init__(out_file_path, mass_spectra)
1212        self.ion_type_dict = ion_type_dict
1213    
1214    @staticmethod
1215    def get_ion_formula(neutral_formula, ion_type):
1216        """From a neutral formula and an ion type, return the formula of the ion.
1217
1218        Notes
1219        -----
1220        This is a static method.
1221        If the neutral_formula is not a string, this method will return None.
1222
1223        Parameters
1224        ----------
1225        neutral_formula : str
1226            The neutral formula, this should be a string form from the MolecularFormula class
1227            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1228            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
 1229            e.g. MgCl2 is parsed as 'Mg Cl2'.
1230        ion_type : str
1231            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1232            See the self.ion_type_dict for the available ion types.
1233
1234        Returns
1235        -------
1236        str
1237            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
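
         Examples
         --------
         A hedged illustration (the exact spacing and atom ordering of the
         returned string come from MolecularFormula.string):

         >>> LCMSMetabolomicsExport.get_ion_formula("C2 H4 O2", "[M-H]-")  # ~ 'C2 H3 O2'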
1238        """
1239        # If neutral_formula is not a string, return None
1240        if not isinstance(neutral_formula, str):
1241            return None
1242
1243        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1244        if re.search(r"\s", neutral_formula):
1245            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1246        else:
1247            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1248            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1249            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1250            neutral_formula = MolecularFormula(
1251                dict(
1252                    zip(
1253                        [x[0] for x in elements],
1254                        [int(x[0]) if x else 1 for x in counts],
1255                    )
1256                ),
1257                ion_charge=0,
1258            )
1259        neutral_formula_dict = neutral_formula.to_dict().copy()
1260
1261        adduct_add_dict = ion_type_dict[ion_type][0]
1262        for key in adduct_add_dict:
1263            if key in neutral_formula_dict.keys():
1264                neutral_formula_dict[key] += adduct_add_dict[key]
1265            else:
1266                neutral_formula_dict[key] = adduct_add_dict[key]
1267
1268        adduct_subtract = ion_type_dict[ion_type][1]
1269        for key in adduct_subtract:
1270            neutral_formula_dict[key] -= adduct_subtract[key]
1271
1272        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1273
1274    @staticmethod
1275    def get_isotope_type(ion_formula):
1276        """From an ion formula, return the 13C isotope type of the ion.
1277
1278        Notes
1279        -----
1280        This is a static method.
1281        If the ion_formula is not a string, this method will return None.
1282        This is currently only functional for 13C isotopes.
1283
1284        Parameters
1285        ----------
1286        ion_formula : str
1287            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1288
1289        Returns
1290        -------
1291        str
1292            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1293
1294        Raises
1295        ------
1296        ValueError
 1297            If the ion_formula is a string without spaces (i.e. not in the 'C2 H4 O2' form).
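
         Examples
         --------
         >>> LCMSMetabolomicsExport.get_isotope_type("C6 H12 O6")  # no 13C, returns None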
1298        """
1299        if not isinstance(ion_formula, str):
1300            return None
1301
1302        if re.search(r"\s", ion_formula):
1303            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1304        else:
1305            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1306        ion_formula_dict = ion_formula.to_dict().copy()
1307
1308        try:
1309            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1310        except KeyError:
1311            iso_class = None
1312
1313        return iso_class
1314    
1315    def report_to_csv(self, molecular_metadata=None):
1316        """Create a report of the mass features and their annotations and save it as a CSV file.
1317
1318        Parameters
1319        ----------
1320        molecular_metadata : dict, optional
1321            The molecular metadata. Default is None.
1322        """
1323        report = self.to_report(molecular_metadata=molecular_metadata)
1324        out_file = self.output_file.with_suffix(".csv")
1325        report.to_csv(out_file, index=False)
1326    
1327    def clean_ms1_report(self, ms1_summary_full):
1328        """Clean the MS1 report.
1329
1330        Parameters
1331        ----------
1332        ms1_summary_full : DataFrame
1333            The full MS1 summary DataFrame.
1334
1335        Returns
1336        -------
1337        DataFrame
1338            The cleaned MS1 summary DataFrame.
1339        """
1340        ms1_summary_full = ms1_summary_full.reset_index()
1341        cols_to_keep = [
1342            "mf_id",
1343            "Molecular Formula",
1344            "Ion Type",
1345            "Calculated m/z",
1346            "m/z Error (ppm)",
1347            "m/z Error Score",
1348            "Is Isotopologue",
1349            "Isotopologue Similarity",
1350            "Confidence Score",
1351        ]
1352        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1353        ms1_summary["ion_formula"] = [
1354            self.get_ion_formula(f, a)
1355            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1356        ]
1357        ms1_summary["isotopologue_type"] = [
1358            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1359        ]
1360
1361        # Reorder columns
1362        ms1_summary = ms1_summary[
1363            [
1364                "mf_id",
1365                "ion_formula",
1366                "isotopologue_type",
1367                "Calculated m/z",
1368                "m/z Error (ppm)",
1369                "m/z Error Score",
1370                "Isotopologue Similarity",
1371                "Confidence Score",
1372            ]
1373        ]
1374
1375        # Set the index to mf_id
1376        ms1_summary = ms1_summary.set_index("mf_id")
1377
1378        return ms1_summary
1379    
1380    def summarize_ms2_report(self, ms2_annot_report):
1381        """
1382        Summarize the MS2 report.
1383
1384        Parameters
1385        ----------
1386        ms2_annot_report : DataFrame
1387            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
1388        Returns
1389        -------
1390        None
1391            Not yet implemented; this placeholder currently returns None.
1392        """
1393    def summarize_metabolomics_report(self, ms2_annot_report):
1394        """Summarize the MS2 hits for a metabolomics report
1395        
1396        Parameters
1397        ----------
1398        ms2_annot_report : DataFrame
1399            The MS2 annotation DataFrame with all annotations.
1400
1401        Returns
1402        -------
1403        DataFrame
1404            The summarized metabolomics report.
1405        """
1406        columns_to_drop = [
1407            "precursor_mz",
1408            "precursor_mz_error_ppm",
1409            "cas",
1410            "data_id",
1411            "iupac_name",
1412            "traditional_name",
1413            "common_name",
1414            "casno",
1415        ]
1416        ms2_annot = ms2_annot_report.drop(
1417            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1418        )
1419        
1420        # Prepare information about the search results, pulling out the best hit for the final report
1421        # Group by (mf_id, ref_mol_id) and grab the row with the highest entropy similarity
1422        ms2_annot = ms2_annot.reset_index()
1423        # Add an "n_spectra_contributing" column: the number of unique query_spectrum_id values per (mf_id, ref_mol_id)
1424        ms2_annot["n_spectra_contributing"] = (
1425            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1426            .transform("nunique")
1427        )
1428        # Sort by entropy similarity
1429        ms2_annot = ms2_annot.sort_values(
1430            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1431        )
1432        best_entropy = ms2_annot.drop_duplicates(
1433            subset=["mf_id", "ref_mol_id"], keep="first"
1434        )
1435
1436        return best_entropy
1437
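    # The best-hit selection above is a standard pandas pattern; a minimal,
    # self-contained sketch with a toy frame (values are illustrative only):
    #
    #     import pandas as pd
    #     toy = pd.DataFrame({
    #         "mf_id": [1, 1, 1],
    #         "ref_mol_id": ["a", "a", "b"],
    #         "query_spectrum_id": [10, 11, 10],
    #         "entropy_similarity": [0.7, 0.9, 0.5],
    #     })
    #     toy["n_spectra_contributing"] = (
    #         toy.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"].transform("nunique")
    #     )
    #     best = toy.sort_values(
    #         ["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
    #     ).drop_duplicates(subset=["mf_id", "ref_mol_id"], keep="first")
    #     # best keeps one row per (mf_id, ref_mol_id): similarity 0.9 for "a", 0.5 for "b";
    #     # n_spectra_contributing is 2 for "a" (spectra 10 and 11) and 1 for "b"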
1438    def clean_ms2_report(self, metabolite_summary):
1439        """Clean the MS2 report.
1440
1441        Parameters
1442        ----------
1443        metabolite_summary : DataFrame
1444            The full metabolomics summary DataFrame.
1445
1446        Returns
1447        -------
1448        DataFrame
1449            The cleaned metabolomics summary DataFrame.
1450        """
1451        metabolite_summary = metabolite_summary.reset_index()
1452        metabolite_summary["ion_formula"] = [
1453            self.get_ion_formula(f, a)
1454            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1455        ]
1456
1457        col_order = [
1458            "mf_id",
1459            "ion_formula",
1460            "ref_ion_type",
1461            "formula",
1462            "inchikey",
1463            "name",
1464            "inchi",
1465            "chebi",
1466            "smiles",
1467            "kegg",
1468            "cas",
1469            "database_name",
1470            "ref_ms_id",
1471            "entropy_similarity",
1472            "ref_mz_in_query_fract",
1473            "n_spectra_contributing",
1474        ]
1475
1476        # Reorder columns
1477        metabolite_summary = metabolite_summary[
1478            [col for col in col_order if col in metabolite_summary.columns]
1479        ]
1480
1481        # Convert chebi (if present) to int:
1482        if "chebi" in metabolite_summary.columns:
1483            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1484                "Int64", errors="ignore"
1485            )
1486
1487        # Set the index to mf_id
1488        metabolite_summary = metabolite_summary.set_index("mf_id")
1489
1490        return metabolite_summary
1491    
1492    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1493        """Combine the mass feature report with the MS1 and MS2 reports.
1494
1495        Parameters
1496        ----------
1497        mf_report : DataFrame
1498            The mass feature report DataFrame.
1499        ms1_annot_report : DataFrame
1500            The MS1 annotation report DataFrame.
1501        ms2_annot_report : DataFrame
1502            The MS2 annotation report DataFrame.
1503        """
1504        # If there is an ms1_annot_report, merge it with the mf_report
1505        if not ms1_annot_report.empty:
1506            # MS1 has been run and has molecular formula information
1507            mf_report = pd.merge(
1508                mf_report,
1509                ms1_annot_report,
1510                how="left",
1511                on=["mf_id", "isotopologue_type"],
1512            )
1513        if ms2_annot_report is not None:
1514            # Pull out the records without an ion_formula and drop the ion_formula column (ideally this subset is empty if MS1 molecular formula assignment is working correctly)
1515            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1516            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1517            mf_no_ion_formula = pd.merge(
1518                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1519            )
1520
1521            # pull out the records with ion_formula
1522            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1523            mf_with_ion_formula = pd.merge(
1524                mf_with_ion_formula,
1525                ms2_annot_report,
1526                how="left",
1527                on=["mf_id", "ion_formula"],
1528            )
1529
1530            # put back together
1531            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1532
1533        # Rename columns
1534        rename_dict = {
1535            "mf_id": "Mass Feature ID",
1536            "scan_time": "Retention Time (min)",
1537            "mz": "m/z",
1538            "apex_scan": "Apex Scan Number",
1539            "intensity": "Intensity",
1540            "persistence": "Persistence",
1541            "area": "Area",
1542            "half_height_width": "Half Height Width (min)",
1543            "tailing_factor": "Tailing Factor",
1544            "dispersity_index": "Dispersity Index",
1545            "ms2_spectrum": "MS2 Spectrum",
1546            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1547            "isotopologue_type": "Isotopologue Type",
1548            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1549            "associated_mass_features": "Associated Mass Features after Deconvolution",
1550            "ion_formula": "Ion Formula",
1551            "formula": "Molecular Formula",
1552            "ref_ion_type": "Ion Type",
1553            "annot_level": "Lipid Annotation Level",
1554            "lipid_molecular_species_id": "Lipid Molecular Species",
1555            "lipid_summed_name": "Lipid Species",
1556            "lipid_subclass": "Lipid Subclass",
1557            "lipid_class": "Lipid Class",
1558            "lipid_category": "Lipid Category",
1559            "entropy_similarity": "Entropy Similarity",
1560            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1561            "n_spectra_contributing": "Spectra with Annotation (n)",
1562        }
1563        mf_report = mf_report.rename(columns=rename_dict)
1564        mf_report["Sample Name"] = self.mass_spectra.sample_name
1565        mf_report["Polarity"] = self.mass_spectra.polarity
1566        mf_report = mf_report[
1567            ["Mass Feature ID", "Sample Name", "Polarity"]
1568            + [
1569                col
1570                for col in mf_report.columns
1571                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1572            ]
1573        ]
1574
1575        # Reorder rows by "Mass Feature ID"
1576        mf_report = mf_report.sort_values("Mass Feature ID")
1577
1578        # Reset index
1579        mf_report = mf_report.reset_index(drop=True)
1580
1581        return mf_report
1582    
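    # combine_reports merges annotations in two passes: rows without an MS1 ion_formula
    # are matched to MS2 hits on mf_id alone, rows with an ion_formula are matched on
    # (mf_id, ion_formula), and the two halves are concatenated. A minimal sketch of
    # that idea with hypothetical frames mf and ms2 (illustrative only):
    #
    #     no_mf1 = mf[mf["ion_formula"].isna()].drop(columns=["ion_formula"])
    #     no_mf1 = pd.merge(no_mf1, ms2, how="left", on=["mf_id"])
    #     with_mf1 = pd.merge(
    #         mf[~mf["ion_formula"].isna()], ms2, how="left", on=["mf_id", "ion_formula"]
    #     )
    #     combined = pd.concat([no_mf1, with_mf1]).sort_values("mf_id")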
1583    def to_report(self, molecular_metadata=None):
1584        """Create a report of the mass features and their annotations.
1585
1586        Parameters
1587        ----------
1588        molecular_metadata : dict, optional
1589            The molecular metadata. Default is None.
1590
1591        Returns
1592        -------
1593        DataFrame
1594            The report as a Pandas DataFrame.
1595        """
1596        # Get mass feature dataframe
1597        mf_report = self.mass_spectra.mass_features_to_df()
1598        mf_report = mf_report.reset_index(drop=False)
1599
1600        # Get and clean ms1 annotation dataframe
1601        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1602        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1603        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1604
1605        # Get, summarize, and clean ms2 annotation dataframe
1606        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1607            molecular_metadata=molecular_metadata
1608        )
1609        if ms2_annot_report is not None and molecular_metadata is not None:
1610            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1611            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1612            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1613            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1614        else:
1615            ms2_annot_report = None
1616
1617        report = self.combine_reports(
1618            mf_report=mf_report,
1619            ms1_annot_report=ms1_annot_report,
1620            ms2_annot_report=ms2_annot_report
1621        )
1622
1623        return report
1624class LipidomicsExport(LCMSMetabolomicsExport):
1625    """A class to export lipidomics data.
1626
1627    This class provides methods to export lipidomics data to various formats and summarize the lipid report.
1628
1629    Parameters
1630    ----------
1631    out_file_path : str | Path
1632        The output file path, do not include the file extension.
1633    mass_spectra : object
1634        The high resolution mass spectra object.
1635    """
1636
1637    def __init__(self, out_file_path, mass_spectra):
1638        super().__init__(out_file_path, mass_spectra)
1639
1640    def summarize_lipid_report(self, ms2_annot):
1641        """Summarize the lipid report.
1642
1643        Parameters
1644        ----------
1645        ms2_annot : DataFrame
1646            The MS2 annotation DataFrame with all annotations.
1647
1648        Returns
1649        -------
1650        DataFrame
1651            The summarized lipid report.
1652        """
1653        # Drop unnecessary columns for easier viewing
1654        columns_to_drop = [
1655            "precursor_mz",
1656            "precursor_mz_error_ppm",
1657            "metabref_mol_id",
1658            "metabref_precursor_mz",
1659            "cas",
1660            "inchikey",
1661            "inchi",
1662            "chebi",
1663            "smiles",
1664            "kegg",
1665            "data_id",
1666            "iupac_name",
1667            "traditional_name",
1668            "common_name",
1669            "casno",
1670        ]
1671        ms2_annot = ms2_annot.drop(
1672            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1673        )
1674
1675        # If ion_types_excluded is not empty, remove those ion types
1676        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1677            "ms2"
1678        ].molecular_search.ion_types_excluded
1679        if len(ion_types_excluded) > 0:
1680            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1681
1682        # If mf_id is not present, check that the index name is mf_id and reset the index
1683        if "mf_id" not in ms2_annot.columns:
1684            if ms2_annot.index.name == "mf_id":
1685                ms2_annot = ms2_annot.reset_index()
1686            else:
1687                raise ValueError("mf_id is not present in the dataframe")
1688
1689        # Attempt to get consensus annotations to the MLF level
1690        mlf_results_all = []
1691        for mf_id in ms2_annot["mf_id"].unique():
1692            mlf_results_perid = []
1693            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1694            # FIXME: this sets n_spectra_contributing to the number of annotation rows, not the number of unique contributing spectra
1695            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)
1696
1697            for query_scan in ms2_annot["query_spectrum_id"].unique():
1698                ms2_annot_sub = ms2_annot_mf[
1699                    ms2_annot_mf["query_spectrum_id"] == query_scan
1700                ].copy()
1701
1702                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1703                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1704                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1705                        ms2_annot_sub["entropy_max"] = (
1706                            ms2_annot_sub["entropy_similarity"]
1707                            == ms2_annot_sub["entropy_similarity"].max()
1708                        )
1709                        ms2_annot_sub["ref_match_fract_max"] = (
1710                            ms2_annot_sub["ref_mz_in_query_fract"]
1711                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1712                        )
1713                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1714                            "query_frag_types"
1715                        ].apply(lambda x: "MLF" in x)
1716
1717                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1718                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1719                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1720                        ].all(axis=1)
1721
1722                        # If there is a consensus, take the row with the highest entropy_similarity
1723                        if ms2_annot_sub["consensus"].any():
1724                            ms2_annot_sub = ms2_annot_sub[
1725                                ms2_annot_sub["entropy_similarity"]
1726                                == ms2_annot_sub["entropy_similarity"].max()
1727                            ].head(1)
1728                            mlf_results_perid.append(ms2_annot_sub)
1729            if len(mlf_results_perid) == 0:
1730                mlf_results_perid = pd.DataFrame()
1731            else:
1732                mlf_results_perid = pd.concat(mlf_results_perid)
1733                if mlf_results_perid["name"].nunique() == 1:
1734                    mlf_results_perid = mlf_results_perid[
1735                        mlf_results_perid["entropy_similarity"]
1736                        == mlf_results_perid["entropy_similarity"].max()
1737                    ].head(1)
1738                else:
1739                    mlf_results_perid = pd.DataFrame()
1740                mlf_results_all.append(mlf_results_perid)
1741
1742        # These are the consensus annotations to the MLF level
1743        if len(mlf_results_all) > 0:
1744            mlf_results_all = pd.concat(mlf_results_all)
1745            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1746        else:
1747            # Make an empty dataframe
1748            mlf_results_all = ms2_annot.head(0)
1749
1750        # For remaining mf_ids, try to get a consensus annotation to the species level
1751        species_results_all = []
1752        # Remove mf_ids that have consensus annotations to the MLF level
1753        ms2_annot_spec = ms2_annot[
1754            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1755        ]
1756        for mf_id in ms2_annot_spec["mf_id"].unique():
1757            # Do all the hits have the same lipid_summed_name?
1758            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1759            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1760
1761            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1762                # Grab the highest entropy_similarity result
1763                ms2_annot_sub = ms2_annot_sub[
1764                    ms2_annot_sub["entropy_similarity"]
1765                    == ms2_annot_sub["entropy_similarity"].max()
1766                ].head(1)
1767                species_results_all.append(ms2_annot_sub)
1768
1769        # These are the consensus annotations to the species level
1770        if len(species_results_all) > 0:
1771            species_results_all = pd.concat(species_results_all)
1772            species_results_all["annot_level"] = "species"
1773        else:
1774            # Make an empty dataframe
1775            species_results_all = ms2_annot.head(0)
1776
1777        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1778        # Remove mf_ids that have consensus annotations to the species level
1779        ms2_annot_remaining = ms2_annot_spec[
1780            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1781        ]
1782        no_consensus = []
1783        for mf_id in ms2_annot_remaining["mf_id"].unique():
1784            id_sub = []
1785            id_no_con = []
1786            ms2_annot_sub_mf = ms2_annot_remaining[
1787                ms2_annot_remaining["mf_id"] == mf_id
1788            ].copy()
1789            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1790                ms2_annot_sub = ms2_annot_sub_mf[
1791                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1792                ].copy()
1793
1794                # New columns for ranking [HIGHER RANK = BETTER]
1795                ms2_annot_sub["entropy_max"] = (
1796                    ms2_annot_sub["entropy_similarity"]
1797                    == ms2_annot_sub["entropy_similarity"].max()
1798                )
1799                ms2_annot_sub["ref_match_fract_max"] = (
1800                    ms2_annot_sub["ref_mz_in_query_fract"]
1801                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1802                )
1803                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1804                    lambda x: "MLF" in x
1805                )
1806
1807                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1808                ms2_annot_sub["consensus"] = ms2_annot_sub[
1809                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1810                ].all(axis=1)
1811                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1812                id_sub.append(ms2_annot_sub_con)
1813                id_no_con.append(ms2_annot_sub)
1814            id_sub = pd.concat(id_sub)
1815            id_no_con = pd.concat(id_no_con)
1816
1817            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1818            if (
1819                id_sub["query_frag_types"]
1820                .apply(lambda x: "MLF" in x)
1821                .all()
1822                and len(id_sub) > 0
1823            ):
1824                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1825                id_sub = id_sub.loc[idx]
1826                # Reorder so highest entropy_similarity is first
1827                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1828                id_sub["annot_level"] = id_sub["structure_level"]
1829                no_consensus.append(id_sub)
1830
1831            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1832            elif len(id_sub) == 0:
1833                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1834                    summed_sub = id_no_con[
1835                        id_no_con["lipid_summed_name"] == lipid_summed_name
1836                    ]
1837                    # Any consensus to MLF?
1838                    if summed_sub["consensus"].any():
1839                        summed_sub = summed_sub[summed_sub["consensus"]]
1840                        summed_sub["annot_level"] = summed_sub["structure_level"]
1841                        no_consensus.append(summed_sub)
1842                    else:
1843                        # Grab the highest entropy_similarity; if there are multiple, keep the first one
1844                        summed_sub = summed_sub[
1845                            summed_sub["entropy_similarity"]
1846                            == summed_sub["entropy_similarity"].max()
1847                        ].head(1)
1848                        # get first row
1849                        summed_sub["annot_level"] = "species"
1850                        summed_sub["name"] = ""
1851                        no_consensus.append(summed_sub)
1852            else:
1853                raise ValueError(f"Unexpected scenario for summarizing mf_id: {mf_id}")
1854
1855        if len(no_consensus) > 0:
1856            no_consensus = pd.concat(no_consensus)
1857        else:
1858            no_consensus = ms2_annot.head(0)
1859
1860        # Combine all the consensus annotations and reformat the dataframe for output
1861        species_results_all = species_results_all.drop(columns=["name"])
1862        species_results_all["lipid_molecular_species_id"] = ""
1863        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1864        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1865        consensus_annotations = pd.concat(
1866            [mlf_results_all, species_results_all, no_consensus]
1867        )
1868        consensus_annotations = consensus_annotations.sort_values(
1869            "mf_id", ascending=True
1870        )
1871        cols_to_keep = [
1872            "mf_id",
1873            "ref_ion_type",
1874            "entropy_similarity",
1875            "ref_mz_in_query_fract",
1876            "lipid_molecular_species_id",
1877            "lipid_summed_name",
1878            "lipid_subclass",
1879            "lipid_class",
1880            "lipid_category",
1881            "formula",
1882            "annot_level",
1883            "n_spectra_contributing",
1884        ]
1885        consensus_annotations = consensus_annotations[cols_to_keep]
1886        consensus_annotations = consensus_annotations.set_index("mf_id")
1887
1888        return consensus_annotations
1889
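    # The consensus logic above ranks every candidate hit on three boolean flags
    # (highest entropy similarity, highest fraction of library m/z values found in the
    # query, and MLF-level fragments) and keeps a hit only when a single row wins on
    # all of them. The core ranking step, condensed (`sub` is a hypothetical subset
    # of annotations for one query spectrum; column names are reused from above):
    #
    #     sub["entropy_max"] = sub["entropy_similarity"] == sub["entropy_similarity"].max()
    #     sub["ref_match_fract_max"] = (
    #         sub["ref_mz_in_query_fract"] == sub["ref_mz_in_query_fract"].max()
    #     )
    #     sub["frag_max"] = sub["query_frag_types"].apply(lambda x: "MLF" in x)
    #     sub["consensus"] = sub[["entropy_max", "ref_match_fract_max", "frag_max"]].all(axis=1)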
1890    def clean_ms2_report(self, lipid_summary):
1891        """Clean the MS2 report.
1892
1893        Parameters
1894        ----------
1895        lipid_summary : DataFrame
1896            The full lipid summary DataFrame.
1897
1898        Returns
1899        -------
1900        DataFrame
1901            The cleaned lipid summary DataFrame.
1902        """
1903        lipid_summary = lipid_summary.reset_index()
1904        lipid_summary["ion_formula"] = [
1905            self.get_ion_formula(f, a)
1906            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
1907        ]
1908
1909        # Reorder columns
1910        lipid_summary = lipid_summary[
1911            [
1912                "mf_id",
1913                "ion_formula",
1914                "ref_ion_type",
1915                "formula",
1916                "annot_level",
1917                "lipid_molecular_species_id",
1918                "lipid_summed_name",
1919                "lipid_subclass",
1920                "lipid_class",
1921                "lipid_category",
1922                "entropy_similarity",
1923                "ref_mz_in_query_fract",
1924                "n_spectra_contributing",
1925            ]
1926        ]
1927
1928        # Set the index to mf_id
1929        lipid_summary = lipid_summary.set_index("mf_id")
1930
1931        return lipid_summary
1932
1933    def to_report(self, molecular_metadata=None):
1934        """Create a report of the mass features and their annotations.
1935
1936        Parameters
1937        ----------
1938        molecular_metadata : dict, optional
1939            The molecular metadata. Default is None.
1940
1941        Returns
1942        -------
1943        DataFrame
1944            The report of the mass features and their annotations.
1945
1946        Notes
1947        -----
1948        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
1949        """
1950        # Get mass feature dataframe
1951        mf_report = self.mass_spectra.mass_features_to_df()
1952        mf_report = mf_report.reset_index(drop=False)
1953
1954        # Get and clean ms1 annotation dataframe
1955        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1956        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1957        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1958
1959        # Get, summarize, and clean ms2 annotation dataframe
1960        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1961            molecular_metadata=molecular_metadata
1962        )
1963        if ms2_annot_report is not None and molecular_metadata is not None:
1964            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
1965            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1966            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1967            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1968        report = self.combine_reports(
1969            mf_report=mf_report,
1970            ms1_annot_report=ms1_annot_report,
1971            ms2_annot_report=ms2_annot_report
1972        )
1973        return report
1974
1975        
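A minimal end-to-end usage sketch of the exporters defined above. The variable names
(lcms_obj, metadata) and the output path are assumptions for illustration; the LC-MS
object and the molecular metadata come from the rest of the CoreMS workflow.

    from corems.mass_spectra.output.export import LipidomicsExport

    exporter = LipidomicsExport("output/sample_name", lcms_obj)   # no file extension
    report = exporter.to_report(molecular_metadata=metadata)      # pandas DataFrame
    exporter.report_to_csv(molecular_metadata=metadata)           # writes output/sample_name.csv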
ion_type_dict = {'M+': [{}, {}], 'protonated': [{'H': 1}, {}], '[M+H]+': [{'H': 1}, {}], '[M+NH4]+': [{'N': 1, 'H': 4}, {}], '[M+Na]+': [{'Na': 1}, {}], '[M+K]+': [{'K': 1}, {}], '[M+2Na+Cl]+': [{'Na': 2, 'Cl': 1}, {}], '[M+2Na-H]+': [{'Na': 2}, {'H': 1}], '[M+C2H3Na2O2]+': [{'C': 2, 'H': 3, 'Na': 2, 'O': 2}, {}], '[M+C4H10N3]+': [{'C': 4, 'H': 10, 'N': 3}, {}], '[M+NH4+ACN]+': [{'C': 2, 'H': 7, 'N': 2}, {}], '[M+H-H2O]+': [{}, {'H': 1, 'O': 1}], 'de-protonated': [{}, {'H': 1}], '[M-H]-': [{}, {'H': 1}], '[M+Cl]-': [{'Cl': 1}, {}], '[M+HCOO]-': [{'C': 1, 'H': 1, 'O': 2}, {}], '[M+CH3COO]-': [{'C': 2, 'H': 3, 'O': 2}, {}], '[M+2NaAc+Cl]-': [{'Na': 2, 'C': 2, 'H': 3, 'O': 2, 'Cl': 1}, {}], '[M+K-2H]-': [{'K': 1}, {'H': 2}], '[M+Na-2H]-': [{'Na': 1}, {'H': 2}]}
class LowResGCMSExport:
 55class LowResGCMSExport:
 56    """A class to export low resolution GC-MS data.
 57
 58    This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
 59
 60    Parameters:
 61    ----------
 62    out_file_path : str
 63        The output file path.
 64    gcms : object
 65        The low resolution GCMS object.
 66
 67    Attributes:
 68    ----------
 69    output_file : Path
 70        The output file path as a Path object.
 71    gcms : object
 72        The low resolution GCMS object.
 73
 74    Methods:
 75    -------
 76    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
 77    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
 78    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
 79    * to_excel(write_mode='a', write_metadata=True, id_label="corems:"),
 80        Export the data to an Excel file.
 81    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:").
 82        Export the data to a CSV file.
 83    * to_hdf(id_label="corems:").
 84        Export the data to an HDF5 file.
 85    * get_data_stats(gcms).
 86        Get statistics about the GCMS data.
 87
 88    """
 89
 90    def __init__(self, out_file_path, gcms):
 91        self.output_file = Path(out_file_path)
 92
 93        self.gcms = gcms
 94
 95        self._init_columns()
 96
 97    def _init_columns(self):
 98        """Initialize the column names for the exported data.
 99
100        Returns:
101        -------
102        list
103            The list of column names.
104        """
105
106        columns = [
107            "Sample name",
108            "Peak Index",
109            "Retention Time",
110            "Retention Time Ref",
111            "Peak Height",
112            "Peak Area",
113            "Retention index",
114            "Retention index Ref",
115            "Retention Index Score",
116            "Similarity Score",
117            "Spectral Similarity Score",
118            "Compound Name",
119            "Chebi ID",
120            "Kegg Compound ID",
121            "Inchi",
122            "Inchi Key",
123            "Smiles",
124            "Molecular Formula",
125            "IUPAC Name",
126            "Traditional Name",
127            "Common Name",
128            "Derivatization",
129        ]
130
131        if self.gcms.molecular_search_settings.exploratory_mode:
132            columns.extend(
133                [
134                    "Weighted Cosine Correlation",
135                    "Cosine Correlation",
136                    "Stein Scott Similarity",
137                    "Pearson Correlation",
138                    "Spearman Correlation",
139                    "Kendall Tau Correlation",
140                    "Euclidean Distance",
141                    "Manhattan Distance",
142                    "Jaccard Distance",
143                    "DWT Correlation",
144                    "DFT Correlation",
145                ]
146            )
147
148            columns.extend(list(methods_name.values()))
149
150        return columns
151
152    def get_pandas_df(self, id_label="corems:"):
153        """Get the exported data as a Pandas DataFrame.
154
155        Parameters:
156        ----------
157        id_label : str, optional
158            The ID label for the data. Default is "corems:".
159
160        Returns:
161        -------
162        DataFrame
163            The exported data as a Pandas DataFrame.
164        """
165
166        columns = self._init_columns()
167
168        dict_data_list = self.get_list_dict_data(self.gcms)
169
170        df = DataFrame(dict_data_list, columns=columns)
171
172        df.name = self.gcms.sample_name
173
174        return df
175
176    def get_json(self, nan=False, id_label="corems:"):
177        """Get the exported data as a JSON string.
178
179        Parameters:
180        ----------
181        nan : bool, optional
182            Whether to include NaN values in the JSON string. Default is False.
183        id_label : str, optional
184            The ID label for the data. Default is "corems:".
185
186        """
187
188        import json
189
190        dict_data_list = self.get_list_dict_data(self.gcms)
191
192        return json.dumps(
193            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
194        )
195
196    def to_pandas(self, write_metadata=True, id_label="corems:"):
197        """Export the data to a Pandas DataFrame and save it as a pickle file.
198
199        Parameters:
200        ----------
201        write_metadata : bool, optional
202            Whether to write metadata to the output file.
203        id_label : str, optional
204            The ID label for the data.
205        """
206
207        columns = self._init_columns()
208
209        dict_data_list = self.get_list_dict_data(self.gcms)
210
211        df = DataFrame(dict_data_list, columns=columns)
212
213        df.to_pickle(self.output_file.with_suffix(".pkl"))
214
215        if write_metadata:
216            self.write_settings(
217                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
218            )
219
220    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
221        """Export the data to an Excel file.
222
223        Parameters:
224        ----------
225        write_mode : str, optional
226            The write mode for the Excel file. Default is 'a' (append).
227        write_metadata : bool, optional
228            Whether to write metadata to the output file. Default is True.
229        id_label : str, optional
230            The ID label for the data. Default is "corems:".
231        """
232
233        out_put_path = self.output_file.with_suffix(".xlsx")
234
235        columns = self._init_columns()
236
237        dict_data_list = self.get_list_dict_data(self.gcms)
238
239        df = DataFrame(dict_data_list, columns=columns)
240
241        if write_mode == "a" and out_put_path.exists():
242            writer = ExcelWriter(out_put_path, engine="openpyxl")
243            # try to open an existing workbook
244            writer.book = load_workbook(out_put_path)
245            # copy existing sheets
246            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
247            # read existing file
248            reader = read_excel(out_put_path)
249            # write out the new sheet
250            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
251
252            writer.close()
253        else:
254            df.to_excel(
255                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
256            )
257
258        if write_metadata:
259            self.write_settings(out_put_path, self.gcms, id_label=id_label)
260
261    def to_csv(
262        self,
263        separate_output=False,
264        write_mode="w",
265        write_metadata=True,
266        id_label="corems:",
267    ):
268        """Export the data to a CSV file.
269
270        Parameters:
271        ----------
272        separate_output : bool, optional
273            Whether to separate the output into multiple files. Default is False.
274        write_mode : str, optional
275            The write mode for the CSV file. Default is 'w' (write).
276        write_metadata : bool, optional
277            Whether to write metadata to the output file. Default is True.
278        id_label : str, optional
279            The ID label for the data. Default is "corems:".
280        """
281
282        if separate_output:
283            # set write mode to write
284            # this mode will overwrite the file without warning
285            write_mode = "w"
286        else:
287            # set write mode to append
288            write_mode = "a"
289
290        columns = self._init_columns()
291
292        dict_data_list = self.get_list_dict_data(self.gcms)
293
294        out_put_path = self.output_file.with_suffix(".csv")
295
296        write_header = not out_put_path.exists()
297
298        try:
299            with open(out_put_path, write_mode, newline="") as csvfile:
300                writer = csv.DictWriter(csvfile, fieldnames=columns)
301                if write_header:
302                    writer.writeheader()
303                for data in dict_data_list:
304                    writer.writerow(data)
305
306            if write_metadata:
307                self.write_settings(out_put_path, self.gcms, id_label=id_label)
308
309        except IOError as ioerror:
310            print(ioerror)
311
312    def to_hdf(self, id_label="corems:"):
313        """Export the data to an HDF5 file.
314
315        Parameters:
316        ----------
317        id_label : str, optional
318            The ID label for the data. Default is "corems:".
319        """
320
321        # save sample at a time
322        def add_compound(gc_peak, compound_obj):
323            modifier = compound_obj.classify if compound_obj.classify else ""
324            compound_group = compound_obj.name.replace("/", "") + " " + modifier
325
326            if compound_group not in peak_group:
327                compound_group = peak_group.create_group(compound_group)
328
329                # compound_group.attrs["retention_time"] = compound_obj.retention_time
330                compound_group.attrs["retention_index"] = compound_obj.ri
331                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
332                compound_group.attrs["spectral_similarity_score"] = (
333                    compound_obj.spectral_similarity_score
334                )
335                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
336
337                compond_mz = compound_group.create_dataset(
338                    "mz", data=np.array(compound_obj.mz), dtype="f8"
339                )
340                compond_abundance = compound_group.create_dataset(
341                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
342                )
343
344                if self.gcms.molecular_search_settings.exploratory_mode:
345                    compound_group.attrs["Spectral Similarities"] = json.dumps(
346                        compound_obj.spectral_similarity_scores,
347                        sort_keys=False,
348                        indent=4,
349                        separators=(",", ":"),
350                    )
351            else:
352                warnings.warn("Skipping duplicate reference compound.")
353
354        import json
355        from datetime import datetime, timezone
356
357        import h5py
358        import numpy as np
359
360        output_path = self.output_file.with_suffix(".hdf5")
361
362        with h5py.File(output_path, "w") as hdf_handle:
363            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
364            hdf_handle.attrs["time_stamp"] = timenow
365            hdf_handle.attrs["data_structure"] = "gcms"
366            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
367            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
368
369            hdf_handle.attrs["sample_id"] = "self.gcms.id"
370            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
371            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
372            hdf_handle.attrs["output_data"] = str(output_path)
373            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
374            hdf_handle.attrs["corems_version"] = __version__
375
376            hdf_handle.attrs["Stats"] = json.dumps(
377                self.get_data_stats(self.gcms),
378                sort_keys=False,
379                indent=4,
380                separators=(",", ": "),
381            )
382            hdf_handle.attrs["Calibration"] = json.dumps(
383                self.get_calibration_stats(self.gcms, id_label),
384                sort_keys=False,
385                indent=4,
386                separators=(",", ": "),
387            )
388            hdf_handle.attrs["Blank"] = json.dumps(
389                self.get_blank_stats(self.gcms),
390                sort_keys=False,
391                indent=4,
392                separators=(",", ": "),
393            )
394
395            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
396            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
397                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
398            )
399
400            scans_dataset = hdf_handle.create_dataset(
401                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
402            )
403            rt_dataset = hdf_handle.create_dataset(
404                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
405            )
406            tic_dataset = hdf_handle.create_dataset(
407                "tic", data=np.array(self.gcms.tic), dtype="f8"
408            )
409            processed_tic_dataset = hdf_handle.create_dataset(
410                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
411            )
412
413            output_score_method = (
414                self.gcms.molecular_search_settings.output_score_method
415            )
416
417            for gc_peak in self.gcms:
418                # print(gc_peak.retention_time)
419                # print(gc_peak.tic)
420
421                # check if there is a compound candidate
422                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
423                peak_group.attrs["deconvolution"] = int(
424                    self.gcms.chromatogram_settings.use_deconvolution
425                )
426
427                peak_group.attrs["start_scan"] = gc_peak.start_scan
428                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
429                peak_group.attrs["final_scan"] = gc_peak.final_scan
430
431                peak_group.attrs["retention_index"] = gc_peak.ri
432                peak_group.attrs["retention_time"] = gc_peak.retention_time
433                peak_group.attrs["area"] = gc_peak.area
434
435                mz = peak_group.create_dataset(
436                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
437                )
438                abundance = peak_group.create_dataset(
439                    "abundance",
440                    data=np.array(gc_peak.mass_spectrum.abundance),
441                    dtype="f8",
442                )
443
444                if gc_peak:
445                    if output_score_method == "highest_sim_score":
446                        compound_obj = gc_peak.highest_score_compound
447                        add_compound(gc_peak, compound_obj)
448
449                    elif output_score_method == "highest_ss":
450                        compound_obj = gc_peak.highest_ss_compound
451                        add_compound(gc_peak, compound_obj)
452
453                    else:
454                        for compound_obj in gc_peak:
455                            add_compound(gc_peak, compound_obj)
456
457    def get_data_stats(self, gcms):
458        """Get statistics about the GCMS data.
459
460        Parameters:
461        ----------
462        gcms : object
463            The low resolution GCMS object.
464
465        Returns:
466        -------
467        dict
468            A dictionary containing the data statistics.
469        """
470
471        matched_peaks = gcms.matched_peaks
472        no_matched_peaks = gcms.no_matched_peaks
473        unique_metabolites = gcms.unique_metabolites
474
475        peak_matchs_above_0p85 = 0
476        unique_peak_match_above_0p85 = 0
477        for match_peak in matched_peaks:
478            gc_peak_above_85 = 0
479            matches_above_85 = list(
480                filter(lambda m: m.similarity_score >= 0.85, match_peak)
481            )
482            if matches_above_85:
483                peak_matchs_above_0p85 += 1
484            if len(matches_above_85) == 1:
485                unique_peak_match_above_0p85 += 1
486
487        data_stats = {}
488        data_stats["average_signal_noise"] = "ni"
489        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
490        data_stats["total_number_peaks"] = len(gcms)
491        data_stats["total_peaks_matched"] = len(matched_peaks)
492        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
493        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
494        data_stats["single_matches_above_similarity_score_0.85"] = (
495            unique_peak_match_above_0p85
496        )
497        data_stats["unique_metabolites"] = len(unique_metabolites)
498
499        return data_stats
500
501    def get_calibration_stats(self, gcms, id_label):
502        """Get statistics about the GC-MS calibration.
503
504        Parameters:
505        ----------
506        """
507        calibration_parameters = {}
508
509        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
510        calibration_parameters["data_url"] = str(gcms.cal_file_path)
511        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
512        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
513        calibration_parameters["calibration_method"] = ""
514
515        return calibration_parameters
516
517    def get_blank_stats(self, gcms):
518        """Get statistics about the GC-MS blank."""
519        blank_parameters = {}
520
521        blank_parameters["data_name"] = "ni"
522        blank_parameters["blank_id"] = "ni"
523        blank_parameters["data_url"] = "ni"
524        blank_parameters["has_input"] = "ni"
525        blank_parameters["common_features_to_blank"] = "ni"
526
527        return blank_parameters
528
529    def get_instrument_metadata(self, gcms):
530        """Get metadata about the GC-MS instrument."""
531        instrument_metadata = {}
532
533        instrument_metadata["analyzer"] = gcms.analyzer
534        instrument_metadata["instrument_label"] = gcms.instrument_label
535        instrument_metadata["instrument_id"] = uuid.uuid4().hex
536
537        return instrument_metadata
538
539    def get_data_metadata(self, gcms, id_label, output_path):
540        """Get metadata about the GC-MS data.
541
542        Parameters:
543        ----------
544        gcms : object
545            The low resolution GCMS object.
546        id_label : str
547            The ID label for the data.
548        output_path : str
549            The output file path.
550
551        Returns:
552        -------
553        dict
554            A dictionary containing the data metadata.
555        """
556        if isinstance(output_path, str):
557            output_path = Path(output_path)
558
559        paramaters_path = output_path.with_suffix(".json")
560
561        if paramaters_path.exists():
562            with paramaters_path.open() as current_param:
563                metadata = json.load(current_param)
564                data_metadata = metadata.get("Data")
565        else:
566            data_metadata = {}
567            data_metadata["data_name"] = []
568            data_metadata["input_data_url"] = []
569            data_metadata["has_input"] = []
570
571        data_metadata["data_name"].append(gcms.sample_name)
572        data_metadata["input_data_url"].append(str(gcms.file_location))
573        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
574
575        data_metadata["output_data_name"] = str(output_path.stem)
576        data_metadata["output_data_url"] = str(output_path)
577        data_metadata["has_output"] = id_label + corems_md5(output_path)
578
579        return data_metadata
580
581    def get_parameters_json(self, gcms, id_label, output_path):
582        """Get the parameters as a JSON string.
583
584        Parameters:
585        ----------
586        gcms : GCMS object
587            The low resolution GCMS object.
588        id_label : str
589            The ID label for the data.
590        output_path : str
591            The output file path.
592
593        Returns:
594        -------
595        str
596            The parameters as a JSON string.
597        """
598
599        output_parameters_dict = {}
600        output_parameters_dict["Data"] = self.get_data_metadata(
601            gcms, id_label, output_path
602        )
603        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
604        output_parameters_dict["Calibration"] = self.get_calibration_stats(
605            gcms, id_label
606        )
607        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
608        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
609        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
610        corems_dict_setting["corems_version"] = __version__
611        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
612        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
613        output = json.dumps(
614            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
615        )
616
617        return output
618
619    def write_settings(self, output_path, gcms, id_label="emsl:"):
620        """Write the settings to a JSON file.
621
622        Parameters:
623        ----------
624        output_path : str
625            The output file path.
626        gcms : GCMS object
627            The low resolution GCMS object.
628        id_label : str
629            The ID label for the data. Default is "emsl:".
630
631        """
632
633        output = self.get_parameters_json(gcms, id_label, output_path)
634
635        with open(
636            output_path.with_suffix(".json"),
637            "w",
638            encoding="utf8",
639        ) as outfile:
640            outfile.write(output)
641
642    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
643        """Get the exported data as a list of dictionaries.
644
645        Parameters:
646        ----------
647        gcms : object
648            The low resolution GCMS object.
649        include_no_match : bool, optional
650            Whether to include no match data. Default is True.
651        no_match_inline : bool, optional
652            Whether to include no match data inline. Default is False.
653
654        Returns:
655        -------
656        list
657            The exported data as a list of dictionaries.
658        """
659
660        output_score_method = gcms.molecular_search_settings.output_score_method
661
662        dict_data_list = []
663
664        def add_match_dict_data():
665            derivatization = "{}:{}:{}".format(
666                compound_obj.classify,
667                compound_obj.derivativenum,
668                compound_obj.derivatization,
669            )
670            out_dict = {
671                "Sample name": gcms.sample_name,
672                "Peak Index": gcpeak_index,
673                "Retention Time": gc_peak.retention_time,
674                "Retention Time Ref": compound_obj.retention_time,
675                "Peak Height": gc_peak.tic,
676                "Peak Area": gc_peak.area,
677                "Retention index": gc_peak.ri,
678                "Retention index Ref": compound_obj.ri,
679                "Retention Index Score": compound_obj.ri_score,
680                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
681                "Similarity Score": compound_obj.similarity_score,
682                "Compound Name": compound_obj.name,
683                "Chebi ID": compound_obj.metadata.chebi,
684                "Kegg Compound ID": compound_obj.metadata.kegg,
685                "Inchi": compound_obj.metadata.inchi,
686                "Inchi Key": compound_obj.metadata.inchikey,
687                "Smiles": compound_obj.metadata.smiles,
688                "Molecular Formula": compound_obj.formula,
689                "IUPAC Name": compound_obj.metadata.iupac_name,
690                "Traditional Name": compound_obj.metadata.traditional_name,
691                "Common Name": compound_obj.metadata.common_name,
692                "Derivatization": derivatization,
693            }
694
695            if self.gcms.molecular_search_settings.exploratory_mode:
696                out_dict.update(
697                    {
698                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
699                            "weighted_cosine_correlation"
700                        ),
701                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
702                            "cosine_correlation"
703                        ),
704                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
705                            "stein_scott_similarity"
706                        ),
707                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
708                            "pearson_correlation"
709                        ),
710                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
711                            "spearman_correlation"
712                        ),
713                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
714                            "kendall_tau_correlation"
715                        ),
716                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
717                            "dft_correlation"
718                        ),
719                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
720                            "dwt_correlation"
721                        ),
722                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
723                            "euclidean_distance"
724                        ),
725                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
726                            "manhattan_distance"
727                        ),
728                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
729                            "jaccard_distance"
730                        ),
731                    }
732                )
733                for method in methods_name:
734                    out_dict[methods_name.get(method)] = (
735                        compound_obj.spectral_similarity_scores.get(method)
736                    )
737
738            dict_data_list.append(out_dict)
739
740        def add_no_match_dict_data():
741            dict_data_list.append(
742                {
743                    "Sample name": gcms.sample_name,
744                    "Peak Index": gcpeak_index,
745                    "Retention Time": gc_peak.retention_time,
746                    "Peak Height": gc_peak.tic,
747                    "Peak Area": gc_peak.area,
748                    "Retention index": gc_peak.ri,
749                }
750            )
751
752        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
753            # check if there is a compound candidate
754            if gc_peak:
755                if output_score_method == "highest_sim_score":
756                    compound_obj = gc_peak.highest_score_compound
757                    add_match_dict_data()
758
759                elif output_score_method == "highest_ss":
760                    compound_obj = gc_peak.highest_ss_compound
761                    add_match_dict_data()
762
763                else:
764                    for compound_obj in gc_peak:
765                        add_match_dict_data()  # add monoisotopic peak
766
767            else:
768                # include not_match
769                if include_no_match and no_match_inline:
770                    add_no_match_dict_data()
771
772        if include_no_match and not no_match_inline:
773            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
774                if not gc_peak:
775                    add_no_match_dict_data()
776
777        return dict_data_list

A class to export low resolution GC-MS data.

This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters:

out_file_path : str The output file path. gcms : object The low resolution GCMS object.

Attributes:

output_file : Path The output file path as a Path object. gcms : object The low resolution GCMS object.

Methods:

  • get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  • get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  • to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
  • to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
  • to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
  • to_hdf(id_label="corems:"). Export the data to an HDF5 file.
  • get_data_stats(gcms). Get statistics about the GCMS data.
LowResGCMSExport(out_file_path, gcms)
90    def __init__(self, out_file_path, gcms):
91        self.output_file = Path(out_file_path)
92
93        self.gcms = gcms
94
95        self._init_columns()
output_file
gcms
def get_pandas_df(self, id_label='corems:'):
152    def get_pandas_df(self, id_label="corems:"):
153        """Get the exported data as a Pandas DataFrame.
154
155        Parameters:
156        ----------
157        id_label : str, optional
158            The ID label for the data. Default is "corems:".
159
160        Returns:
161        -------
162        DataFrame
163            The exported data as a Pandas DataFrame.
164        """
165
166        columns = self._init_columns()
167
168        dict_data_list = self.get_list_dict_data(self.gcms)
169
170        df = DataFrame(dict_data_list, columns=columns)
171
172        df.name = self.gcms.sample_name
173
174        return df

Get the exported data as a Pandas DataFrame.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".

Returns:

DataFrame The exported data as a Pandas DataFrame.
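
Under the hood, get_pandas_df builds the frame from the list of row dictionaries returned by get_list_dict_data, with the column order fixed by _init_columns. The same construction on toy rows (illustrative values, not CoreMS output):

    from pandas import DataFrame

    # Toy rows standing in for the dictionaries produced by get_list_dict_data.
    columns = ["Sample name", "Retention Time", "Compound Name"]
    rows = [
        {"Sample name": "demo", "Retention Time": 4.25, "Compound Name": "hexadecane"},
        {"Sample name": "demo", "Retention Time": 7.80, "Compound Name": None},
    ]
    print(DataFrame(rows, columns=columns))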

def get_json(self, nan=False, id_label='corems:'):
176    def get_json(self, nan=False, id_label="corems:"):
177        """Get the exported data as a JSON string.
178
179        Parameters:
180        ----------
181        nan : bool, optional
182            Whether to include NaN values in the JSON string. Default is False.
183        id_label : str, optional
184            The ID label for the data. Default is "corems:".
185
186        """
187
188        import json
189
190        dict_data_list = self.get_list_dict_data(self.gcms)
191
192        return json.dumps(
193            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
194        )

Get the exported data as a JSON string.

Parameters:

  • nan (bool, optional): Whether to include NaN values in the JSON string. Default is False.
  • id_label (str, optional): The ID label for the data. Default is "corems:".
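
The returned string is plain json.dumps output with a four-space indent; a standalone illustration on a made-up row (not CoreMS data):

    import json

    rows = [{"Sample name": "demo", "Retention Time": 12.34, "Peak Area": 5678.9}]
    # Same formatting options used by the exporter: no key sorting, indent of 4.
    print(json.dumps(rows, sort_keys=False, indent=4, separators=(",", ": ")))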

def to_pandas(self, write_metadata=True, id_label='corems:'):
196    def to_pandas(self, write_metadata=True, id_label="corems:"):
197        """Export the data to a Pandas DataFrame and save it as a pickle file.
198
199        Parameters:
200        ----------
201        write_metadata : bool, optional
202            Whether to write metadata to the output file.
203        id_label : str, optional
204            The ID label for the data.
205        """
206
207        columns = self._init_columns()
208
209        dict_data_list = self.get_list_dict_data(self.gcms)
210
211        df = DataFrame(dict_data_list, columns=columns)
212
213        df.to_pickle(self.output_file.with_suffix(".pkl"))
214
215        if write_metadata:
216            self.write_settings(
217                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
218            )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

  • write_metadata (bool, optional): Whether to write metadata to the output file.
  • id_label (str, optional): The ID label for the data.
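
The pickle written here is an ordinary pandas pickle, so it round-trips with pandas alone; a self-contained illustration with a toy frame (the file name is arbitrary, not a CoreMS convention):

    import pandas as pd

    df = pd.DataFrame({"Retention Time": [4.25, 7.80], "Peak Area": [1200.0, 530.5]})
    df.to_pickle("demo_report.pkl")            # what to_pandas() does per sample
    print(pd.read_pickle("demo_report.pkl").shape)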

def to_excel(self, write_mode='a', write_metadata=True, id_label='corems:'):
220    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
221        """Export the data to an Excel file.
222
223        Parameters:
224        ----------
225        write_mode : str, optional
226            The write mode for the Excel file. Default is 'a' (append).
227        write_metadata : bool, optional
228            Whether to write metadata to the output file. Default is True.
229        id_label : str, optional
230            The ID label for the data. Default is "corems:".
231        """
232
233        out_put_path = self.output_file.with_suffix(".xlsx")
234
235        columns = self._init_columns()
236
237        dict_data_list = self.get_list_dict_data(self.gcms)
238
239        df = DataFrame(dict_data_list, columns=columns)
240
241        if write_mode == "a" and out_put_path.exists():
242            writer = ExcelWriter(out_put_path, engine="openpyxl")
243            # try to open an existing workbook
244            writer.book = load_workbook(out_put_path)
245            # copy existing sheets
246            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
247            # read existing file
248            reader = read_excel(out_put_path)
249            # write out the new sheet
250            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
251
252            writer.close()
253        else:
254            df.to_excel(
255                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
256            )
257
258        if write_metadata:
259            self.write_settings(out_put_path, self.gcms, id_label=id_label)

Export the data to an Excel file.

Parameters:

  • write_mode (str, optional): The write mode for the Excel file. Default is 'a' (append).
  • write_metadata (bool, optional): Whether to write metadata to the output file. Default is True.
  • id_label (str, optional): The ID label for the data. Default is "corems:".
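
A caveat on the append branch above: assigning writer.book directly relies on older pandas/openpyxl behaviour and may fail on recent pandas releases. A hedged sketch of the same append expressed through the mode='a' / if_sheet_exists API instead (requires pandas 1.4 or newer; the sheet name and file are illustrative, not CoreMS behaviour):

    import pandas as pd

    # Create a file to append to (first write), then append below the existing rows.
    pd.DataFrame({"Compound Name": ["demo-1"], "Peak Area": [100.0]}).to_excel(
        "demo_report.xlsx", index=False, engine="openpyxl"
    )

    extra = pd.DataFrame({"Compound Name": ["demo-2"], "Peak Area": [250.0]})
    with pd.ExcelWriter(
        "demo_report.xlsx", mode="a", engine="openpyxl", if_sheet_exists="overlay"
    ) as writer:
        startrow = writer.book["Sheet1"].max_row      # rows already present
        extra.to_excel(writer, sheet_name="Sheet1", index=False,
                       header=False, startrow=startrow)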

def to_csv( self, separate_output=False, write_mode='w', write_metadata=True, id_label='corems:'):
261    def to_csv(
262        self,
263        separate_output=False,
264        write_mode="w",
265        write_metadata=True,
266        id_label="corems:",
267    ):
268        """Export the data to a CSV file.
269
270        Parameters:
271        ----------
272        separate_output : bool, optional
273            Whether to separate the output into multiple files. Default is False.
274        write_mode : str, optional
275            The write mode for the CSV file. Default is 'w' (write).
276        write_metadata : bool, optional
277            Whether to write metadata to the output file. Default is True.
278        id_label : str, optional
279            The ID label for the data. Default is "corems:".
280        """
281
282        if separate_output:
283            # set write mode to write
284            # this mode will overwrite the file without warning
285            write_mode = "w"
286        else:
287            # set write mode to append
288            write_mode = "a"
289
290        columns = self._init_columns()
291
292        dict_data_list = self.get_list_dict_data(self.gcms)
293
294        out_put_path = self.output_file.with_suffix(".csv")
295
296        write_header = not out_put_path.exists()
297
298        try:
299            with open(out_put_path, write_mode, newline="") as csvfile:
300                writer = csv.DictWriter(csvfile, fieldnames=columns)
301                if write_header:
302                    writer.writeheader()
303                for data in dict_data_list:
304                    writer.writerow(data)
305
306            if write_metadata:
307                self.write_settings(out_put_path, self.gcms, id_label=id_label)
308
309        except IOError as ioerror:
310            print(ioerror)

Export the data to a CSV file.

Parameters:

  • separate_output (bool, optional): Whether to separate the output into multiple files. Default is False.
  • write_mode (str, optional): The write mode for the CSV file. Default is 'w' (write).
  • write_metadata (bool, optional): Whether to write metadata to the output file. Default is True.
  • id_label (str, optional): The ID label for the data. Default is "corems:".
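
The CSV writing itself is the standard csv.DictWriter pattern used above (the exporter only writes the header when the file is new); a self-contained illustration with dummy rows:

    import csv

    columns = ["Sample name", "Retention Time", "Peak Area"]
    rows = [{"Sample name": "demo", "Retention Time": 12.34, "Peak Area": 5678.9}]

    with open("demo_output.csv", "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()          # column header row
        writer.writerows(rows)        # one row per dictionary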

def to_hdf(self, id_label='corems:'):
312    def to_hdf(self, id_label="corems:"):
313        """Export the data to an HDF5 file.
314
315        Parameters:
316        ----------
317        id_label : str, optional
318            The ID label for the data. Default is "corems:".
319        """
320
321        # save sample at a time
322        def add_compound(gc_peak, compound_obj):
323            modifier = compound_obj.classify if compound_obj.classify else ""
324            compound_group = compound_obj.name.replace("/", "") + " " + modifier
325
326            if compound_group not in peak_group:
327                compound_group = peak_group.create_group(compound_group)
328
329                # compound_group.attrs["retention_time"] = compound_obj.retention_time
330                compound_group.attrs["retention_index"] = compound_obj.ri
331                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
332                compound_group.attrs["spectral_similarity_score"] = (
333                    compound_obj.spectral_similarity_score
334                )
335                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
336
337                compond_mz = compound_group.create_dataset(
338                    "mz", data=np.array(compound_obj.mz), dtype="f8"
339                )
340                compond_abundance = compound_group.create_dataset(
341                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
342                )
343
344                if self.gcms.molecular_search_settings.exploratory_mode:
345                    compound_group.attrs["Spectral Similarities"] = json.dumps(
346                        compound_obj.spectral_similarity_scores,
347                        sort_keys=False,
348                        indent=4,
349                        separators=(",", ":"),
350                    )
351            else:
352                warnings.warn("Skipping duplicate reference compound.")
353
354        import json
355        from datetime import datetime, timezone
356
357        import h5py
358        import numpy as np
359
360        output_path = self.output_file.with_suffix(".hdf5")
361
362        with h5py.File(output_path, "w") as hdf_handle:
363            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
364            hdf_handle.attrs["time_stamp"] = timenow
365            hdf_handle.attrs["data_structure"] = "gcms"
366            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
367            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
368
369            hdf_handle.attrs["sample_id"] = "self.gcms.id"
370            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
371            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
372            hdf_handle.attrs["output_data"] = str(output_path)
373            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
374            hdf_handle.attrs["corems_version"] = __version__
375
376            hdf_handle.attrs["Stats"] = json.dumps(
377                self.get_data_stats(self.gcms),
378                sort_keys=False,
379                indent=4,
380                separators=(",", ": "),
381            )
382            hdf_handle.attrs["Calibration"] = json.dumps(
383                self.get_calibration_stats(self.gcms, id_label),
384                sort_keys=False,
385                indent=4,
386                separators=(",", ": "),
387            )
388            hdf_handle.attrs["Blank"] = json.dumps(
389                self.get_blank_stats(self.gcms),
390                sort_keys=False,
391                indent=4,
392                separators=(",", ": "),
393            )
394
395            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
396            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
397                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
398            )
399
400            scans_dataset = hdf_handle.create_dataset(
401                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
402            )
403            rt_dataset = hdf_handle.create_dataset(
404                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
405            )
406            tic_dataset = hdf_handle.create_dataset(
407                "tic", data=np.array(self.gcms.tic), dtype="f8"
408            )
409            processed_tic_dataset = hdf_handle.create_dataset(
410                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
411            )
412
413            output_score_method = (
414                self.gcms.molecular_search_settings.output_score_method
415            )
416
417            for gc_peak in self.gcms:
418                # print(gc_peak.retention_time)
419                # print(gc_peak.tic)
420
421                # check if there is a compound candidate
422                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
423                peak_group.attrs["deconvolution"] = int(
424                    self.gcms.chromatogram_settings.use_deconvolution
425                )
426
427                peak_group.attrs["start_scan"] = gc_peak.start_scan
428                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
429                peak_group.attrs["final_scan"] = gc_peak.final_scan
430
431                peak_group.attrs["retention_index"] = gc_peak.ri
432                peak_group.attrs["retention_time"] = gc_peak.retention_time
433                peak_group.attrs["area"] = gc_peak.area
434
435                mz = peak_group.create_dataset(
436                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
437                )
438                abundance = peak_group.create_dataset(
439                    "abundance",
440                    data=np.array(gc_peak.mass_spectrum.abundance),
441                    dtype="f8",
442                )
443
444                if gc_peak:
445                    if output_score_method == "highest_sim_score":
446                        compound_obj = gc_peak.highest_score_compound
447                        add_compound(gc_peak, compound_obj)
448
449                    elif output_score_method == "highest_ss":
450                        compound_obj = gc_peak.highest_ss_compound
451                        add_compound(gc_peak, compound_obj)
452
453                    else:
454                        for compound_obj in gc_peak:
455                            add_compound(gc_peak, compound_obj)

Export the data to an HDF5 file.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".
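
The layout written above keeps run-level metadata in root attributes and one group per chromatographic peak, keyed by retention time. The underlying h5py calls can be tried on dummy values:

    import h5py
    import numpy as np

    with h5py.File("demo_gcms.hdf5", "w") as hdf_handle:
        hdf_handle.attrs["data_structure"] = "gcms"                       # file-level metadata
        hdf_handle.create_dataset("tic", data=np.array([1.0, 2.0, 3.0]), dtype="f8")
        peak_group = hdf_handle.create_group("4.25")                      # keyed by retention time
        peak_group.attrs["area"] = 123.4
        peak_group.create_dataset("mz", data=np.array([50.0, 77.0, 105.0]), dtype="f8")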

def get_data_stats(self, gcms):
457    def get_data_stats(self, gcms):
458        """Get statistics about the GCMS data.
459
460        Parameters:
461        ----------
462        gcms : object
463            The low resolution GCMS object.
464
465        Returns:
466        -------
467        dict
468            A dictionary containing the data statistics.
469        """
470
471        matched_peaks = gcms.matched_peaks
472        no_matched_peaks = gcms.no_matched_peaks
473        unique_metabolites = gcms.unique_metabolites
474
475        peak_matchs_above_0p85 = 0
476        unique_peak_match_above_0p85 = 0
477        for match_peak in matched_peaks:
478            gc_peak_above_85 = 0
479            matches_above_85 = list(
480                filter(lambda m: m.similarity_score >= 0.85, match_peak)
481            )
482            if matches_above_85:
483                peak_matchs_above_0p85 += 1
484            if len(matches_above_85) == 1:
485                unique_peak_match_above_0p85 += 1
486
487        data_stats = {}
488        data_stats["average_signal_noise"] = "ni"
489        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
490        data_stats["total_number_peaks"] = len(gcms)
491        data_stats["total_peaks_matched"] = len(matched_peaks)
492        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
493        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
494        data_stats["single_matches_above_similarity_score_0.85"] = (
495            unique_peak_match_above_0p85
496        )
497        data_stats["unique_metabolites"] = len(unique_metabolites)
498
499        return data_stats

Get statistics about the GCMS data.

Parameters:

gcms : object The low resolution GCMS object.

Returns:

dict A dictionary containing the data statistics.
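
The 0.85 cut-off is applied per matched peak: a peak counts once if any candidate clears the threshold, and once more toward the single-match tally if exactly one does. A toy version of the same counting (namedtuples standing in for compound candidates):

    from collections import namedtuple

    Candidate = namedtuple("Candidate", "similarity_score")
    matched_peaks = [
        [Candidate(0.91), Candidate(0.40)],   # one candidate above 0.85
        [Candidate(0.88), Candidate(0.87)],   # two above 0.85
        [Candidate(0.60)],                    # none above 0.85
    ]

    peaks_above = sum(
        1 for peak in matched_peaks
        if any(c.similarity_score >= 0.85 for c in peak)
    )
    single_above = sum(
        1 for peak in matched_peaks
        if sum(c.similarity_score >= 0.85 for c in peak) == 1
    )
    print(peaks_above, single_above)  # 2 1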

def get_calibration_stats(self, gcms, id_label):
501    def get_calibration_stats(self, gcms, id_label):
502        """Get statistics about the GC-MS calibration.
503
504        Parameters:
505        ----------
506        """
507        calibration_parameters = {}
508
509        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
510        calibration_parameters["data_url"] = str(gcms.cal_file_path)
511        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
512        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
513        calibration_parameters["calibration_method"] = ""
514
515        return calibration_parameters

Get statistics about the GC-MS calibration.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • id_label (str): The ID label prefixed to the calibration file checksum.

def get_blank_stats(self, gcms):
517    def get_blank_stats(self, gcms):
518        """Get statistics about the GC-MS blank."""
519        blank_parameters = {}
520
521        blank_parameters["data_name"] = "ni"
522        blank_parameters["blank_id"] = "ni"
523        blank_parameters["data_url"] = "ni"
524        blank_parameters["has_input"] = "ni"
525        blank_parameters["common_features_to_blank"] = "ni"
526
527        return blank_parameters

Get statistics about the GC-MS blank.

def get_instrument_metadata(self, gcms):
529    def get_instrument_metadata(self, gcms):
530        """Get metadata about the GC-MS instrument."""
531        instrument_metadata = {}
532
533        instrument_metadata["analyzer"] = gcms.analyzer
534        instrument_metadata["instrument_label"] = gcms.instrument_label
535        instrument_metadata["instrument_id"] = uuid.uuid4().hex
536
537        return instrument_metadata

Get metadata about the GC-MS instrument.

def get_data_metadata(self, gcms, id_label, output_path):
539    def get_data_metadata(self, gcms, id_label, output_path):
540        """Get metadata about the GC-MS data.
541
542        Parameters:
543        ----------
544        gcms : object
545            The low resolution GCMS object.
546        id_label : str
547            The ID label for the data.
548        output_path : str
549            The output file path.
550
551        Returns:
552        -------
553        dict
554            A dictionary containing the data metadata.
555        """
556        if isinstance(output_path, str):
557            output_path = Path(output_path)
558
559        paramaters_path = output_path.with_suffix(".json")
560
561        if paramaters_path.exists():
562            with paramaters_path.open() as current_param:
563                metadata = json.load(current_param)
564                data_metadata = metadata.get("Data")
565        else:
566            data_metadata = {}
567            data_metadata["data_name"] = []
568            data_metadata["input_data_url"] = []
569            data_metadata["has_input"] = []
570
571        data_metadata["data_name"].append(gcms.sample_name)
572        data_metadata["input_data_url"].append(str(gcms.file_location))
573        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
574
575        data_metadata["output_data_name"] = str(output_path.stem)
576        data_metadata["output_data_url"] = str(output_path)
577        data_metadata["has_output"] = id_label + corems_md5(output_path)
578
579        return data_metadata

Get metadata about the GC-MS data.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • id_label (str): The ID label for the data.
  • output_path (str): The output file path.

Returns:

dict A dictionary containing the data metadata.
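
The provenance block is kept in a .json sidecar next to the output file and is appended to rather than overwritten when it already exists. A stripped-down sketch of that read-append pattern (the function name and arguments are placeholders, not the CoreMS API):

    import json
    from pathlib import Path

    def append_provenance(output_path, sample_name, input_url, checksum):
        """Reload an existing .json sidecar (if any) and append one sample's record."""
        sidecar = Path(output_path).with_suffix(".json")
        if sidecar.exists():
            data = json.loads(sidecar.read_text()).get("Data", {})
        else:
            data = {"data_name": [], "input_data_url": [], "has_input": []}
        data["data_name"].append(sample_name)
        data["input_data_url"].append(input_url)
        data["has_input"].append(checksum)
        return data

    print(append_provenance("demo_report", "sampleA", "/raw/sampleA.cdf", "corems:abc123"))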

def get_parameters_json(self, gcms, id_label, output_path):
581    def get_parameters_json(self, gcms, id_label, output_path):
582        """Get the parameters as a JSON string.
583
584        Parameters:
585        ----------
586        gcms : GCMS object
587            The low resolution GCMS object.
588        id_label : str
589            The ID label for the data.
590        output_path : str
591            The output file path.
592
593        Returns:
594        -------
595        str
596            The parameters as a JSON string.
597        """
598
599        output_parameters_dict = {}
600        output_parameters_dict["Data"] = self.get_data_metadata(
601            gcms, id_label, output_path
602        )
603        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
604        output_parameters_dict["Calibration"] = self.get_calibration_stats(
605            gcms, id_label
606        )
607        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
608        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
609        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
610        corems_dict_setting["corems_version"] = __version__
611        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
612        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
613        output = json.dumps(
614            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
615        )
616
617        return output

Get the parameters as a JSON string.

Parameters:

  • gcms (GCMS object): The low resolution GCMS object.
  • id_label (str): The ID label for the data.
  • output_path (str): The output file path.

Returns:

str The parameters as a JSON string.
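
The returned string is a single JSON object whose top-level keys are exactly the ones assembled above; a skeletal stand-in (empty sections and a placeholder version, not real output) shows the shape of the document:

    import json

    output_parameters = {
        "Data": {},
        "Stats": {},
        "Calibration": {},
        "Blank": {},
        "Instrument": {},
        "CoreMSParameters": {"corems_version": "x.y.z"},  # placeholder version string
        "has_metabolite": [],                             # placeholder for metabolites_data
    }
    print(json.dumps(output_parameters, sort_keys=False, indent=4, separators=(",", ": ")))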

def write_settings(self, output_path, gcms, id_label='emsl:'):
619    def write_settings(self, output_path, gcms, id_label="emsl:"):
620        """Write the settings to a JSON file.
621
622        Parameters:
623        ----------
624        output_path : str
625            The output file path.
626        gcms : GCMS object
627            The low resolution GCMS object.
628        id_label : str
629            The ID label for the data. Default is "emsl:".
630
631        """
632
633        output = self.get_parameters_json(gcms, id_label, output_path)
634
635        with open(
636            output_path.with_suffix(".json"),
637            "w",
638            encoding="utf8",
639        ) as outfile:
640            outfile.write(output)

Write the settings to a JSON file.

Parameters:

  • output_path (str): The output file path.
  • gcms (GCMS object): The low resolution GCMS object.
  • id_label (str): The ID label for the data. Default is "emsl:".

def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
642    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
643        """Get the exported data as a list of dictionaries.
644
645        Parameters:
646        ----------
647        gcms : object
648            The low resolution GCMS object.
649        include_no_match : bool, optional
650            Whether to include no match data. Default is True.
651        no_match_inline : bool, optional
652            Whether to include no match data inline. Default is False.
653
654        Returns:
655        -------
656        list
657            The exported data as a list of dictionaries.
658        """
659
660        output_score_method = gcms.molecular_search_settings.output_score_method
661
662        dict_data_list = []
663
664        def add_match_dict_data():
665            derivatization = "{}:{}:{}".format(
666                compound_obj.classify,
667                compound_obj.derivativenum,
668                compound_obj.derivatization,
669            )
670            out_dict = {
671                "Sample name": gcms.sample_name,
672                "Peak Index": gcpeak_index,
673                "Retention Time": gc_peak.retention_time,
674                "Retention Time Ref": compound_obj.retention_time,
675                "Peak Height": gc_peak.tic,
676                "Peak Area": gc_peak.area,
677                "Retention index": gc_peak.ri,
678                "Retention index Ref": compound_obj.ri,
679                "Retention Index Score": compound_obj.ri_score,
680                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
681                "Similarity Score": compound_obj.similarity_score,
682                "Compound Name": compound_obj.name,
683                "Chebi ID": compound_obj.metadata.chebi,
684                "Kegg Compound ID": compound_obj.metadata.kegg,
685                "Inchi": compound_obj.metadata.inchi,
686                "Inchi Key": compound_obj.metadata.inchikey,
687                "Smiles": compound_obj.metadata.smiles,
688                "Molecular Formula": compound_obj.formula,
689                "IUPAC Name": compound_obj.metadata.iupac_name,
690                "Traditional Name": compound_obj.metadata.traditional_name,
691                "Common Name": compound_obj.metadata.common_name,
692                "Derivatization": derivatization,
693            }
694
695            if self.gcms.molecular_search_settings.exploratory_mode:
696                out_dict.update(
697                    {
698                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
699                            "weighted_cosine_correlation"
700                        ),
701                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
702                            "cosine_correlation"
703                        ),
704                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
705                            "stein_scott_similarity"
706                        ),
707                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
708                            "pearson_correlation"
709                        ),
710                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
711                            "spearman_correlation"
712                        ),
713                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
714                            "kendall_tau_correlation"
715                        ),
716                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
717                            "dft_correlation"
718                        ),
719                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
720                            "dwt_correlation"
721                        ),
722                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
723                            "euclidean_distance"
724                        ),
725                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
726                            "manhattan_distance"
727                        ),
728                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
729                            "jaccard_distance"
730                        ),
731                    }
732                )
733                for method in methods_name:
734                    out_dict[methods_name.get(method)] = (
735                        compound_obj.spectral_similarity_scores.get(method)
736                    )
737
738            dict_data_list.append(out_dict)
739
740        def add_no_match_dict_data():
741            dict_data_list.append(
742                {
743                    "Sample name": gcms.sample_name,
744                    "Peak Index": gcpeak_index,
745                    "Retention Time": gc_peak.retention_time,
746                    "Peak Height": gc_peak.tic,
747                    "Peak Area": gc_peak.area,
748                    "Retention index": gc_peak.ri,
749                }
750            )
751
752        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
753            # check if there is a compound candidate
754            if gc_peak:
755                if output_score_method == "highest_sim_score":
756                    compound_obj = gc_peak.highest_score_compound
757                    add_match_dict_data()
758
759                elif output_score_method == "highest_ss":
760                    compound_obj = gc_peak.highest_ss_compound
761                    add_match_dict_data()
762
763                else:
764                    for compound_obj in gc_peak:
765                        add_match_dict_data()  # add monoisotopic peak
766
767            else:
768                # include not_match
769                if include_no_match and no_match_inline:
770                    add_no_match_dict_data()
771
772        if include_no_match and not no_match_inline:
773            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
774                if not gc_peak:
775                    add_no_match_dict_data()
776
777        return dict_data_list

Get the exported data as a list of dictionaries.

Parameters:

  • gcms (object): The low resolution GCMS object.
  • include_no_match (bool, optional): Whether to include no match data. Default is True.
  • no_match_inline (bool, optional): Whether to include no match data inline. Default is False.

Returns:

list The exported data as a list of dictionaries.
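
The output_score_method setting controls whether one candidate or every candidate is reported per peak. A toy dispatcher that mirrors the branching above (how the "highest" candidate is chosen here is an illustrative assumption; CoreMS exposes it through gc_peak properties):

    def candidates_to_report(candidates, output_score_method):
        """Mimic the reporting dispatch: one best candidate or every candidate."""
        if not candidates:
            return []
        if output_score_method == "highest_sim_score":
            return [max(candidates, key=lambda c: c["similarity_score"])]
        if output_score_method == "highest_ss":
            return [max(candidates, key=lambda c: c["spectral_similarity_score"])]
        return list(candidates)

    toy = [{"name": "A", "similarity_score": 0.90, "spectral_similarity_score": 0.70},
           {"name": "B", "similarity_score": 0.80, "spectral_similarity_score": 0.95}]
    print([c["name"] for c in candidates_to_report(toy, "highest_ss")])  # ['B']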

class HighResMassSpectraExport(corems.mass_spectrum.output.export.HighResMassSpecExport):
 780class HighResMassSpectraExport(HighResMassSpecExport):
 781    """A class to export high resolution mass spectra data.
 782
 783    This class provides methods to export high resolution mass spectra data to various formats
 784    such as Excel, CSV, HDF5, and Pandas DataFrame.
 785
 786    Parameters
 787    ----------
 788    out_file_path : str | Path
 789        The output file path.
 790    mass_spectra : object
 791        The high resolution mass spectra object.
 792    output_type : str, optional
 793        The output type. Default is 'excel'.
 794
 795    Attributes
 796    ----------
 797    output_file : Path
 798        The output file path without suffix
 799    dir_loc : Path
 800        The directory location for the output file,
 801        by default this will be the output_file + ".corems" and all output files will be
 802        written into this location
 803    mass_spectra : MassSpectraBase
 804        The high resolution mass spectra object.
 805    """
 806
 807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 808        super().__init__(
 809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 810        )
 811
 812        self.dir_loc = Path(out_file_path + ".corems")
 813        self.dir_loc.mkdir(exist_ok=True)
 814        # Place the output file in the directory
 815        self.output_file = self.dir_loc / Path(out_file_path).name
 816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 817        self.mass_spectra = mass_spectra
 818        self.atoms_order_list = None
 819        self._init_columns()
 820
 821    def get_pandas_df(self):
 822        """Get the mass spectra as a list of Pandas DataFrames."""
 823
 824        list_df = []
 825
 826        for mass_spectrum in self.mass_spectra:
 827            columns = self.columns_label + self.get_all_used_atoms_in_order(
 828                mass_spectrum
 829            )
 830
 831            dict_data_list = self.get_list_dict_data(mass_spectrum)
 832
 833            df = DataFrame(dict_data_list, columns=columns)
 834
 835            scan_number = mass_spectrum.scan_number
 836
 837            df.name = str(self.output_file) + "_" + str(scan_number)
 838
 839            list_df.append(df)
 840
 841        return list_df
 842
 843    def to_pandas(self, write_metadata=True):
 844        """Export the data to a Pandas DataFrame and save it as a pickle file.
 845
 846        Parameters:
 847        ----------
 848        write_metadata : bool, optional
 849            Whether to write metadata to the output file. Default is True.
 850        """
 851
 852        for mass_spectrum in self.mass_spectra:
 853            columns = self.columns_label + self.get_all_used_atoms_in_order(
 854                mass_spectrum
 855            )
 856
 857            dict_data_list = self.get_list_dict_data(mass_spectrum)
 858
 859            df = DataFrame(dict_data_list, columns=columns)
 860
 861            scan_number = mass_spectrum.scan_number
 862
 863            out_filename = Path(
 864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 865            )
 866
 867            df.to_pickle(self.dir_loc / out_filename)
 868
 869            if write_metadata:
 870                self.write_settings(
 871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 872                )
 873
 874    def to_excel(self, write_metadata=True):
 875        """Export the data to an Excel file.
 876
 877        Parameters:
 878        ----------
 879        write_metadata : bool, optional
 880            Whether to write metadata to the output file. Default is True.
 881        """
 882        for mass_spectrum in self.mass_spectra:
 883            columns = self.columns_label + self.get_all_used_atoms_in_order(
 884                mass_spectrum
 885            )
 886
 887            dict_data_list = self.get_list_dict_data(mass_spectrum)
 888
 889            df = DataFrame(dict_data_list, columns=columns)
 890
 891            scan_number = mass_spectrum.scan_number
 892
 893            out_filename = Path(
 894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 895            )
 896
 897            df.to_excel(self.dir_loc / out_filename)
 898
 899            if write_metadata:
 900                self.write_settings(
 901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 902                )
 903
 904    def to_csv(self, write_metadata=True):
 905        """Export the data to a CSV file.
 906
 907        Parameters:
 908        ----------
 909        write_metadata : bool, optional
 910            Whether to write metadata to the output file. Default is True.
 911        """
 912        import csv
 913
 914        for mass_spectrum in self.mass_spectra:
 915            columns = self.columns_label + self.get_all_used_atoms_in_order(
 916                mass_spectrum
 917            )
 918
 919            scan_number = mass_spectrum.scan_number
 920
 921            dict_data_list = self.get_list_dict_data(mass_spectrum)
 922
 923            out_filename = Path(
 924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 925            )
 926
 927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 928                writer = csv.DictWriter(csvfile, fieldnames=columns)
 929                writer.writeheader()
 930                for data in dict_data_list:
 931                    writer.writerow(data)
 932
 933            if write_metadata:
 934                self.write_settings(
 935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 936                )
 937
 938    def get_mass_spectra_attrs(self):
 939        """Get the mass spectra attributes as a JSON string.
 940
 941        Parameters:
 942        ----------
 943        mass_spectra : object
 944            The high resolution mass spectra object.
 945
 946        Returns:
 947        -------
 948        str
 949            The mass spectra attributes as a JSON string.
 950        """
 951        dict_ms_attrs = {}
 952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 955
 956        return json.dumps(
 957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 958        )
 959
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

A class to export high resolution mass spectra data.

This class provides methods to export high resolution mass spectra data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters
  • out_file_path (str | Path): The output file path.
  • mass_spectra (object): The high resolution mass spectra object.
  • output_type (str, optional): The output type. Default is 'excel'.
Attributes
  • output_file (Path): The output file path without suffix
  • dir_loc (Path): The directory location for the output file, by default this will be the output_file + ".corems" and all output files will be written into this location
  • mass_spectra (MassSpectraBase): The high resolution mass spectra object.
HighResMassSpectraExport(out_file_path, mass_spectra, output_type='excel')
807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
808        super().__init__(
809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
810        )
811
812        self.dir_loc = Path(out_file_path + ".corems")
813        self.dir_loc.mkdir(exist_ok=True)
814        # Place the output file in the directory
815        self.output_file = self.dir_loc / Path(out_file_path).name
816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
817        self.mass_spectra = mass_spectra
818        self.atoms_order_list = None
819        self._init_columns()

dir_loc
output_file
mass_spectra
atoms_order_list
def get_pandas_df(self):
821    def get_pandas_df(self):
822        """Get the mass spectra as a list of Pandas DataFrames."""
823
824        list_df = []
825
826        for mass_spectrum in self.mass_spectra:
827            columns = self.columns_label + self.get_all_used_atoms_in_order(
828                mass_spectrum
829            )
830
831            dict_data_list = self.get_list_dict_data(mass_spectrum)
832
833            df = DataFrame(dict_data_list, columns=columns)
834
835            scan_number = mass_spectrum.scan_number
836
837            df.name = str(self.output_file) + "_" + str(scan_number)
838
839            list_df.append(df)
840
841        return list_df

Get the mass spectra as a list of Pandas DataFrames.

def to_pandas(self, write_metadata=True):
843    def to_pandas(self, write_metadata=True):
844        """Export the data to a Pandas DataFrame and save it as a pickle file.
845
846        Parameters:
847        ----------
848        write_metadata : bool, optional
849            Whether to write metadata to the output file. Default is True.
850        """
851
852        for mass_spectrum in self.mass_spectra:
853            columns = self.columns_label + self.get_all_used_atoms_in_order(
854                mass_spectrum
855            )
856
857            dict_data_list = self.get_list_dict_data(mass_spectrum)
858
859            df = DataFrame(dict_data_list, columns=columns)
860
861            scan_number = mass_spectrum.scan_number
862
863            out_filename = Path(
864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
865            )
866
867            df.to_pickle(self.dir_loc / out_filename)
868
869            if write_metadata:
870                self.write_settings(
871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
872                )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_excel(self, write_metadata=True):
874    def to_excel(self, write_metadata=True):
875        """Export the data to an Excel file.
876
877        Parameters:
878        ----------
879        write_metadata : bool, optional
880            Whether to write metadata to the output file. Default is True.
881        """
882        for mass_spectrum in self.mass_spectra:
883            columns = self.columns_label + self.get_all_used_atoms_in_order(
884                mass_spectrum
885            )
886
887            dict_data_list = self.get_list_dict_data(mass_spectrum)
888
889            df = DataFrame(dict_data_list, columns=columns)
890
891            scan_number = mass_spectrum.scan_number
892
893            out_filename = Path(
894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
895            )
896
897            df.to_excel(self.dir_loc / out_filename)
898
899            if write_metadata:
900                self.write_settings(
901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
902                )

Export the data to an Excel file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_csv(self, write_metadata=True):
904    def to_csv(self, write_metadata=True):
905        """Export the data to a CSV file.
906
907        Parameters:
908        ----------
909        write_metadata : bool, optional
910            Whether to write metadata to the output file. Default is True.
911        """
912        import csv
913
914        for mass_spectrum in self.mass_spectra:
915            columns = self.columns_label + self.get_all_used_atoms_in_order(
916                mass_spectrum
917            )
918
919            scan_number = mass_spectrum.scan_number
920
921            dict_data_list = self.get_list_dict_data(mass_spectrum)
922
923            out_filename = Path(
924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
925            )
926
927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
928                writer = csv.DictWriter(csvfile, fieldnames=columns)
929                writer.writeheader()
930                for data in dict_data_list:
931                    writer.writerow(data)
932
933            if write_metadata:
934                self.write_settings(
935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
936                )

Export the data to a CSV file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def get_mass_spectra_attrs(self):
938    def get_mass_spectra_attrs(self):
939        """Get the mass spectra attributes as a JSON string.
940
941        Parameters:
942        ----------
943        mass_spectra : object
944            The high resolution mass spectra object.
945
946        Returns:
947        -------
948        str
949            The mass spectra attributes as a JSON string.
950        """
951        dict_ms_attrs = {}
952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
955
956        return json.dumps(
957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
958        )

Get the mass spectra attributes as a JSON string.

The attributes are read from the exporter's own mass_spectra object; the method takes no arguments.

Returns:

str The mass spectra attributes as a JSON string.

def to_hdf(self, overwrite=False, export_raw=True):
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

Export the data to an HDF5 file.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • export_raw (bool, optional): Whether to export the raw mass spectra data. Default is True.
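
Because the HDF5 file is opened in append mode, the date_utc guard and the "mass_spectra" membership check above keep repeated calls from recreating or clobbering existing groups. The same open-or-create idiom in isolation:

    import h5py

    with h5py.File("demo_spectra.hdf5", "a") as hdf_handle:
        if not hdf_handle.attrs.get("data_structure"):
            hdf_handle.attrs["data_structure"] = "mass_spectra"   # set only on first write
        if "mass_spectra" not in hdf_handle:
            group = hdf_handle.create_group("mass_spectra")
        else:
            group = hdf_handle.get("mass_spectra")
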
class LCMSExport(HighResMassSpectraExport):
1009class LCMSExport(HighResMassSpectraExport):
1010    """A class to export high resolution LC-MS data.
1011
1012    This class provides methods to export high resolution LC-MS data to HDF5.
1013
1014    Parameters
1015    ----------
1016    out_file_path : str | Path
1017        The output file path, do not include the file extension.
1018    lcms_object : LCMSBase
1019        The high resolution lc-ms object.
1020    """
1021
1022    def __init__(self, out_file_path, mass_spectra):
1023        super().__init__(out_file_path, mass_spectra, output_type="hdf5")
1024
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Write the mass spectra data to the hdf5 file
1047        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1048
1049        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1050        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1051            # Add scan_info to hdf5 file
1052            if "scan_info" not in hdf_handle:
1053                scan_info_group = hdf_handle.create_group("scan_info")
1054                for k, v in self.mass_spectra._scan_info.items():
1055                    array = np.array(list(v.values()))
1056                    if array.dtype.str[0:2] == "<U":
1057                        array = array.astype("S")
1058                    scan_info_group.create_dataset(k, data=array)
1059
1060            # Add ms_unprocessed to hdf5 file
1061            export_unprocessed_ms1 = (
1062                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1063            )
1064            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1065                if "ms_unprocessed" not in hdf_handle:
1066                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1067                else:
1068                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1069                for k, v in self.mass_spectra._ms_unprocessed.items():
1070                    array = np.array(v)
1071                    ms_unprocessed_group.create_dataset(str(k), data=array)
1072
1073            # Add LCMS mass features to hdf5 file
1074            if len(self.mass_spectra.mass_features) > 0:
1075                if "mass_features" not in hdf_handle:
1076                    mass_features_group = hdf_handle.create_group("mass_features")
1077                else:
1078                    mass_features_group = hdf_handle.get("mass_features")
1079
1080                # Create group for each mass feature, with key as the mass feature id
1081                for k, v in self.mass_spectra.mass_features.items():
1082                    mass_features_group.create_group(str(k))
1083                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1084                    for k2, v2 in v.__dict__.items():
1085                        if v2 is not None:
1086                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1087                            if k2 not in [
1088                                "chromatogram_parent",
1089                                "ms2_mass_spectra",
1090                                "mass_spectrum",
1091                                "_eic_data",
1092                                "ms2_similarity_results",
1093                            ]:
1094                                if k2 == "ms2_scan_numbers":
1095                                    array = np.array(v2)
1096                                    mass_features_group[str(k)].create_dataset(
1097                                        str(k2), data=array
1098                                    )
1099                                elif k2 == "_half_height_width":
1100                                    array = np.array(v2)
1101                                    mass_features_group[str(k)].create_dataset(
1102                                        str(k2), data=array
1103                                    )
1104                                elif k2 == "_ms_deconvoluted_idx":
1105                                    array = np.array(v2)
1106                                    mass_features_group[str(k)].create_dataset(
1107                                        str(k2), data=array
1108                                    )
1109                                elif k2 == "associated_mass_features_deconvoluted":
1110                                    array = np.array(v2)
1111                                    mass_features_group[str(k)].create_dataset(
1112                                        str(k2), data=array
1113                                    )
1114                                elif (
1115                                    isinstance(v2, int)
1116                                    or isinstance(v2, float)
1117                                    or isinstance(v2, str)
1118                                    or isinstance(v2, np.integer)
1119                                    or isinstance(v2, np.bool_)
1120                                ):
1121                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1122                                else:
1123                                    raise TypeError(
1124                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1125                                    )
1126
1127            # Add EIC data to hdf5 file
1128            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1129            if len(self.mass_spectra.eics) > 0 and export_eics:
1130                if "eics" not in hdf_handle:
1131                    eic_group = hdf_handle.create_group("eics")
1132                else:
1133                    eic_group = hdf_handle.get("eics")
1134
1135                # Create group for each eic
1136                for k, v in self.mass_spectra.eics.items():
1137                    eic_group.create_group(str(k))
1138                    eic_group[str(k)].attrs["mz"] = k
1139                    # Loop through each of the attributes and add them as datasets (if array)
1140                    for k2, v2 in v.__dict__.items():
1141                        if v2 is not None:
1142                            array = np.array(v2)
1143                            eic_group[str(k)].create_dataset(str(k2), data=array)
1144
1145            # Add ms2_search results to hdf5 file
1146            if len(self.mass_spectra.spectral_search_results) > 0:
1147                if "spectral_search_results" not in hdf_handle:
1148                    spectral_search_results = hdf_handle.create_group(
1149                        "spectral_search_results"
1150                    )
1151                else:
1152                    spectral_search_results = hdf_handle.get("spectral_search_results")
1153                # Create group for each search result by ms2_scan / precursor_mz
1154                for k, v in self.mass_spectra.spectral_search_results.items():
1155                    spectral_search_results.create_group(str(k))
1156                    for k2, v2 in v.items():
1157                        spectral_search_results[str(k)].create_group(str(k2))
1158                        spectral_search_results[str(k)][str(k2)].attrs[
1159                            "precursor_mz"
1160                        ] = v2.precursor_mz
1161                        spectral_search_results[str(k)][str(k2)].attrs[
1162                            "query_spectrum_id"
1163                        ] = v2.query_spectrum_id
1164                        # Loop through each of the attributes and add them as datasets (if array)
1165                        for k3, v3 in v2.__dict__.items():
1166                            if v3 is not None and k3 not in [
1167                                "query_spectrum",
1168                                "precursor_mz",
1169                                "query_spectrum_id",
1170                            ]:
1171                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1172                                    v3 = [", ".join(x) for x in v3]
1173                                if all(v3 is not None for v3 in v3):
1174                                    array = np.array(v3)
1175                                if array.dtype.str[0:2] == "<U":
1176                                    array = array.astype("S")
1177                                spectral_search_results[str(k)][str(k2)].create_dataset(
1178                                    str(k3), data=array
1179                                )
1180
1181        # Save parameters as a separate json or toml file
1182        if save_parameters:
1183            # Check if parameter_format is valid
1184            if parameter_format not in ["json", "toml"]:
1185                raise ValueError("parameter_format must be 'json' or 'toml'")
1186
1187            if parameter_format == "json":
1188                dump_lcms_settings_json(
1189                    filename=self.output_file.with_suffix(".json"),
1190                    lcms_obj=self.mass_spectra,
1191                )
1192            elif parameter_format == "toml":
1193                dump_lcms_settings_toml(
1194                    filename=self.output_file.with_suffix(".toml"),
1195                    lcms_obj=self.mass_spectra,
1196                )

A class to export high resolution LC-MS data.

This class provides methods to export high resolution LC-MS data to HDF5.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • mass_spectra (LCMSBase): The high resolution LC-MS object.
LCMSExport(out_file_path, mass_spectra)
1022    def __init__(self, out_file_path, mass_spectra):
1023        super().__init__(out_file_path, mass_spectra, output_type="hdf5")


def to_hdf(self, overwrite=False, save_parameters=True, parameter_format='toml'):
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5 file.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Write the mass spectra data to the hdf5 file
1047        super().to_hdf(overwrite=overwrite, export_raw=export_profile_spectra)
1048
1049        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1050        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1051            # Add scan_info to hdf5 file
1052            if "scan_info" not in hdf_handle:
1053                scan_info_group = hdf_handle.create_group("scan_info")
1054                for k, v in self.mass_spectra._scan_info.items():
1055                    array = np.array(list(v.values()))
1056                    if array.dtype.str[0:2] == "<U":
1057                        array = array.astype("S")
1058                    scan_info_group.create_dataset(k, data=array)
1059
1060            # Add ms_unprocessed to hdf5 file
1061            export_unprocessed_ms1 = (
1062                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1063            )
1064            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1065                if "ms_unprocessed" not in hdf_handle:
1066                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1067                else:
1068                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1069                for k, v in self.mass_spectra._ms_unprocessed.items():
1070                    array = np.array(v)
1071                    ms_unprocessed_group.create_dataset(str(k), data=array)
1072
1073            # Add LCMS mass features to hdf5 file
1074            if len(self.mass_spectra.mass_features) > 0:
1075                if "mass_features" not in hdf_handle:
1076                    mass_features_group = hdf_handle.create_group("mass_features")
1077                else:
1078                    mass_features_group = hdf_handle.get("mass_features")
1079
1080                # Create group for each mass feature, with key as the mass feature id
1081                for k, v in self.mass_spectra.mass_features.items():
1082                    mass_features_group.create_group(str(k))
1083                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1084                    for k2, v2 in v.__dict__.items():
1085                        if v2 is not None:
1086                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1087                            if k2 not in [
1088                                "chromatogram_parent",
1089                                "ms2_mass_spectra",
1090                                "mass_spectrum",
1091                                "_eic_data",
1092                                "ms2_similarity_results",
1093                            ]:
1094                                if k2 == "ms2_scan_numbers":
1095                                    array = np.array(v2)
1096                                    mass_features_group[str(k)].create_dataset(
1097                                        str(k2), data=array
1098                                    )
1099                                elif k2 == "_half_height_width":
1100                                    array = np.array(v2)
1101                                    mass_features_group[str(k)].create_dataset(
1102                                        str(k2), data=array
1103                                    )
1104                                elif k2 == "_ms_deconvoluted_idx":
1105                                    array = np.array(v2)
1106                                    mass_features_group[str(k)].create_dataset(
1107                                        str(k2), data=array
1108                                    )
1109                                elif k2 == "associated_mass_features_deconvoluted":
1110                                    array = np.array(v2)
1111                                    mass_features_group[str(k)].create_dataset(
1112                                        str(k2), data=array
1113                                    )
1114                                elif (
1115                                    isinstance(v2, int)
1116                                    or isinstance(v2, float)
1117                                    or isinstance(v2, str)
1118                                    or isinstance(v2, np.integer)
1119                                    or isinstance(v2, np.bool_)
1120                                ):
1121                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1122                                else:
1123                                    raise TypeError(
1124                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1125                                    )
1126
1127            # Add EIC data to hdf5 file
1128            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1129            if len(self.mass_spectra.eics) > 0 and export_eics:
1130                if "eics" not in hdf_handle:
1131                    eic_group = hdf_handle.create_group("eics")
1132                else:
1133                    eic_group = hdf_handle.get("eics")
1134
1135                # Create group for each eic
1136                for k, v in self.mass_spectra.eics.items():
1137                    eic_group.create_group(str(k))
1138                    eic_group[str(k)].attrs["mz"] = k
1139                    # Loop through each of the attributes and add them as datasets (if array)
1140                    for k2, v2 in v.__dict__.items():
1141                        if v2 is not None:
1142                            array = np.array(v2)
1143                            eic_group[str(k)].create_dataset(str(k2), data=array)
1144
1145            # Add ms2_search results to hdf5 file
1146            if len(self.mass_spectra.spectral_search_results) > 0:
1147                if "spectral_search_results" not in hdf_handle:
1148                    spectral_search_results = hdf_handle.create_group(
1149                        "spectral_search_results"
1150                    )
1151                else:
1152                    spectral_search_results = hdf_handle.get("spectral_search_results")
1153                # Create group for each search result by ms2_scan / precursor_mz
1154                for k, v in self.mass_spectra.spectral_search_results.items():
1155                    spectral_search_results.create_group(str(k))
1156                    for k2, v2 in v.items():
1157                        spectral_search_results[str(k)].create_group(str(k2))
1158                        spectral_search_results[str(k)][str(k2)].attrs[
1159                            "precursor_mz"
1160                        ] = v2.precursor_mz
1161                        spectral_search_results[str(k)][str(k2)].attrs[
1162                            "query_spectrum_id"
1163                        ] = v2.query_spectrum_id
1164                        # Loop through each of the attributes and add them as datasets (if array)
1165                        for k3, v3 in v2.__dict__.items():
1166                            if v3 is not None and k3 not in [
1167                                "query_spectrum",
1168                                "precursor_mz",
1169                                "query_spectrum_id",
1170                            ]:
1171                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1172                                    v3 = [", ".join(x) for x in v3]
1173                                if all(v3 is not None for v3 in v3):
1174                                    array = np.array(v3)
1175                                if array.dtype.str[0:2] == "<U":
1176                                    array = array.astype("S")
1177                                spectral_search_results[str(k)][str(k2)].create_dataset(
1178                                    str(k3), data=array
1179                                )
1180
1181        # Save parameters as a separate json or toml file
1182        if save_parameters:
1183            # Check if parameter_format is valid
1184            if parameter_format not in ["json", "toml"]:
1185                raise ValueError("parameter_format must be 'json' or 'toml'")
1186
1187            if parameter_format == "json":
1188                dump_lcms_settings_json(
1189                    filename=self.output_file.with_suffix(".json"),
1190                    lcms_obj=self.mass_spectra,
1191                )
1192            elif parameter_format == "toml":
1193                dump_lcms_settings_toml(
1194                    filename=self.output_file.with_suffix(".toml"),
1195                    lcms_obj=self.mass_spectra,
1196                )

Export the data to an HDF5 file.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • save_parameters (bool, optional): Whether to save the parameters as a separate json or toml file. Default is True.
  • parameter_format (str, optional): The format to save the parameters in. Default is 'toml'.
Raises
  • ValueError: If parameter_format is not 'json' or 'toml'.
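
For orientation, here is a minimal usage sketch (not taken from the CoreMS sources): it assumes an already-processed LCMSBase object named lcms_obj and simply lists the top-level HDF5 groups that to_hdf may create (scan_info, ms_unprocessed, mass_features, eics, spectral_search_results), which depend on the lc_ms export settings and on what was processed.

    from pathlib import Path

    import h5py

    from corems.mass_spectra.output.export import LCMSExport

    out_stem = Path("results/sample_01")  # no file extension, per the docstring
    exporter = LCMSExport(out_stem, lcms_obj)  # lcms_obj: assumed, processed LCMSBase
    exporter.to_hdf(overwrite=True, save_parameters=True, parameter_format="toml")

    # The parameters land next to the data as results/sample_01.toml; the HDF5
    # groups present depend on the export flags and the processing that was run.
    with h5py.File(out_stem.with_suffix(".hdf5"), "r") as f:
        print(list(f.keys()))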
class LCMSMetabolomicsExport(LCMSExport):
1198class LCMSMetabolomicsExport(LCMSExport):
1199    """A class to export LCMS metabolite data.
1200
1201    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.
1202
1203    Parameters
1204    ----------
1205    out_file_path : str | Path
1206        The output file path, do not include the file extension.
1207    mass_spectra : object
1208        The high resolution mass spectra object.
1209    """
1210
1211    def __init__(self, out_file_path, mass_spectra):
1212        super().__init__(out_file_path, mass_spectra)
1213        self.ion_type_dict = ion_type_dict
1214    
1215    @staticmethod
1216    def get_ion_formula(neutral_formula, ion_type):
1217        """From a neutral formula and an ion type, return the formula of the ion.
1218
1219        Notes
1220        -----
1221        This is a static method.
1222        If the neutral_formula is not a string, this method will return None.
1223
1224        Parameters
1225        ----------
1226        neutral_formula : str
1227            The neutral formula, this should be a string form from the MolecularFormula class
1228            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1229            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1230            e.g. MgCl2 is parsed as 'Mg Cl2'.
1231        ion_type : str
1232            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1233            See the self.ion_type_dict for the available ion types.
1234
1235        Returns
1236        -------
1237        str
1238            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1239        """
1240        # If neutral_formula is not a string, return None
1241        if not isinstance(neutral_formula, str):
1242            return None
1243
1244        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1245        if re.search(r"\s", neutral_formula):
1246            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1247        else:
1248            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1249            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1250            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1251            neutral_formula = MolecularFormula(
1252                dict(
1253                    zip(
1254                        [x[0] for x in elements],
1255                        [int(x[0]) if x else 1 for x in counts],
1256                    )
1257                ),
1258                ion_charge=0,
1259            )
1260        neutral_formula_dict = neutral_formula.to_dict().copy()
1261
1262        adduct_add_dict = ion_type_dict[ion_type][0]
1263        for key in adduct_add_dict:
1264            if key in neutral_formula_dict.keys():
1265                neutral_formula_dict[key] += adduct_add_dict[key]
1266            else:
1267                neutral_formula_dict[key] = adduct_add_dict[key]
1268
1269        adduct_subtract = ion_type_dict[ion_type][1]
1270        for key in adduct_subtract:
1271            neutral_formula_dict[key] -= adduct_subtract[key]
1272
1273        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1274
1275    @staticmethod
1276    def get_isotope_type(ion_formula):
1277        """From an ion formula, return the 13C isotope type of the ion.
1278
1279        Notes
1280        -----
1281        This is a static method.
1282        If the ion_formula is not a string, this method will return None.
1283        This is currently only functional for 13C isotopes.
1284
1285        Parameters
1286        ----------
1287        ion_formula : str
1288            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1289
1290        Returns
1291        -------
1292        str
1293            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1294
1295        Raises
1296        ------
1297        ValueError
1298            If the ion_formula is not a string.
1299        """
1300        if not isinstance(ion_formula, str):
1301            return None
1302
1303        if re.search(r"\s", ion_formula):
1304            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1305        else:
1306            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1307        ion_formula_dict = ion_formula.to_dict().copy()
1308
1309        try:
1310            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1311        except KeyError:
1312            iso_class = None
1313
1314        return iso_class
1315    
1316    def report_to_csv(self, molecular_metadata=None):
1317        """Create a report of the mass features and their annotations and save it as a CSV file.
1318
1319        Parameters
1320        ----------
1321        molecular_metadata : dict, optional
1322            The molecular metadata. Default is None.
1323        """
1324        report = self.to_report(molecular_metadata=molecular_metadata)
1325        out_file = self.output_file.with_suffix(".csv")
1326        report.to_csv(out_file, index=False)
1327    
1328    def clean_ms1_report(self, ms1_summary_full):
1329        """Clean the MS1 report.
1330
1331        Parameters
1332        ----------
1333        ms1_summary_full : DataFrame
1334            The full MS1 summary DataFrame.
1335
1336        Returns
1337        -------
1338        DataFrame
1339            The cleaned MS1 summary DataFrame.
1340        """
1341        ms1_summary_full = ms1_summary_full.reset_index()
1342        cols_to_keep = [
1343            "mf_id",
1344            "Molecular Formula",
1345            "Ion Type",
1346            "Calculated m/z",
1347            "m/z Error (ppm)",
1348            "m/z Error Score",
1349            "Is Isotopologue",
1350            "Isotopologue Similarity",
1351            "Confidence Score",
1352        ]
1353        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1354        ms1_summary["ion_formula"] = [
1355            self.get_ion_formula(f, a)
1356            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1357        ]
1358        ms1_summary["isotopologue_type"] = [
1359            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1360        ]
1361
1362        # Reorder columns
1363        ms1_summary = ms1_summary[
1364            [
1365                "mf_id",
1366                "ion_formula",
1367                "isotopologue_type",
1368                "Calculated m/z",
1369                "m/z Error (ppm)",
1370                "m/z Error Score",
1371                "Isotopologue Similarity",
1372                "Confidence Score",
1373            ]
1374        ]
1375
1376        # Set the index to mf_id
1377        ms1_summary = ms1_summary.set_index("mf_id")
1378
1379        return ms1_summary
1380    
1381    def summarize_ms2_report(self, ms2_annot_report):
1382        """
1383        Summarize the MS2 report.
1384
1385        Parameters
1386        ----------
1387        ms2_annot_report : DataFrame
1388            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
1389        
1390        Returns
1391        -------
1392        """
1393    
1394    def summarize_metabolomics_report(self, ms2_annot_report):
1395        """Summarize the MS2 hits for a metabolomics report
1396        
1397        Parameters
1398        ----------
1399        ms2_annot_report : DataFrame
1400            The MS2 annotation DataFrame with all annotations.
1401
1402        Returns
1403        -------
1404        DataFrame
1405            The summarized metabolomics report.
1406        """
1407        columns_to_drop = [
1408            "precursor_mz",
1409            "precursor_mz_error_ppm",
1410            "cas",
1411            "data_id",
1412            "iupac_name",
1413            "traditional_name",
1414            "common_name",
1415            "casno",
1416        ]
1417        ms2_annot = ms2_annot_report.drop(
1418            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1419        )
1420        
1421        # Prepare information about the search results, pulling out the best hit for the single report
1422        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1423        ms2_annot = ms2_annot.reset_index()
1424        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1425        ms2_annot["n_spectra_contributing"] = (
1426            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1427            .transform("nunique")
1428        )
1429        # Sort by entropy similarity
1430        ms2_annot = ms2_annot.sort_values(
1431            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1432        )
1433        best_entropy = ms2_annot.drop_duplicates(
1434            subset=["mf_id", "ref_mol_id"], keep="first"
1435        )
1436
1437        return best_entropy
1438
1439    def clean_ms2_report(self, metabolite_summary):
1440        """Clean the MS2 report.
1441
1442        Parameters
1443        ----------
1444        metabolite_summary : DataFrame
1445            The full metabolomics summary DataFrame.
1446
1447        Returns
1448        -------
1449        DataFrame
1450            The cleaned metabolomics summary DataFrame.
1451        """
1452        metabolite_summary = metabolite_summary.reset_index()
1453        metabolite_summary["ion_formula"] = [
1454            self.get_ion_formula(f, a)
1455            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1456        ]
1457
1458        col_order = [
1459            "mf_id",
1460            "ion_formula",
1461            "ref_ion_type",
1462            "formula",
1463            "inchikey",
1464            "name",
1465            "inchi",
1466            "chebi",
1467            "smiles",
1468            "kegg",
1469            "cas",
1470            "database_name",
1471            "ref_ms_id",
1472            "entropy_similarity",
1473            "ref_mz_in_query_fract",
1474            "n_spectra_contributing",
1475        ]
1476
1477        # Reorder columns
1478        metabolite_summary = metabolite_summary[
1479            [col for col in col_order if col in metabolite_summary.columns]
1480        ]
1481
1482        # Convert chebi (if present) to int:
1483        if "chebi" in metabolite_summary.columns:
1484            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1485                "Int64", errors="ignore"
1486            )
1487
1488        # Set the index to mf_id
1489        metabolite_summary = metabolite_summary.set_index("mf_id")
1490
1491        return metabolite_summary
1492    
1493    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1494        """Combine the mass feature report with the MS1 and MS2 reports.
1495
1496        Parameters
1497        ----------
1498        mf_report : DataFrame
1499            The mass feature report DataFrame.
1500        ms1_annot_report : DataFrame
1501            The MS1 annotation report DataFrame.
1502        ms2_annot_report : DataFrame
1503            The MS2 annotation report DataFrame.
1504        """
1505        # If there is an ms1_annot_report, merge it with the mf_report
1506        if not ms1_annot_report.empty:
1507            # MS1 has been run and has molecular formula information
1508            mf_report = pd.merge(
1509                mf_report,
1510                ms1_annot_report,
1511                how="left",
1512                on=["mf_id", "isotopologue_type"],
1513            )
1514        if ms2_annot_report is not None:
1515            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1516            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1517            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1518            mf_no_ion_formula = pd.merge(
1519                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1520            )
1521
1522            # pull out the records with ion_formula
1523            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1524            mf_with_ion_formula = pd.merge(
1525                mf_with_ion_formula,
1526                ms2_annot_report,
1527                how="left",
1528                on=["mf_id", "ion_formula"],
1529            )
1530
1531            # put back together
1532            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1533
1534        # Rename columns
1535        rename_dict = {
1536            "mf_id": "Mass Feature ID",
1537            "scan_time": "Retention Time (min)",
1538            "mz": "m/z",
1539            "apex_scan": "Apex Scan Number",
1540            "intensity": "Intensity",
1541            "persistence": "Persistence",
1542            "area": "Area",
1543            "half_height_width": "Half Height Width (min)",
1544            "tailing_factor": "Tailing Factor",
1545            "dispersity_index": "Dispersity Index",
1546            "ms2_spectrum": "MS2 Spectrum",
1547            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1548            "isotopologue_type": "Isotopologue Type",
1549            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1550            "associated_mass_features": "Associated Mass Features after Deconvolution",
1551            "ion_formula": "Ion Formula",
1552            "formula": "Molecular Formula",
1553            "ref_ion_type": "Ion Type",
1554            "annot_level": "Lipid Annotation Level",
1555            "lipid_molecular_species_id": "Lipid Molecular Species",
1556            "lipid_summed_name": "Lipid Species",
1557            "lipid_subclass": "Lipid Subclass",
1558            "lipid_class": "Lipid Class",
1559            "lipid_category": "Lipid Category",
1560            "entropy_similarity": "Entropy Similarity",
1561            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1562            "n_spectra_contributing": "Spectra with Annotation (n)",
1563        }
1564        mf_report = mf_report.rename(columns=rename_dict)
1565        mf_report["Sample Name"] = self.mass_spectra.sample_name
1566        mf_report["Polarity"] = self.mass_spectra.polarity
1567        mf_report = mf_report[
1568            ["Mass Feature ID", "Sample Name", "Polarity"]
1569            + [
1570                col
1571                for col in mf_report.columns
1572                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1573            ]
1574        ]
1575
1576        # Reorder rows by "Mass Feature ID"
1577        mf_report = mf_report.sort_values("Mass Feature ID")
1578
1579        # Reset index
1580        mf_report = mf_report.reset_index(drop=True)
1581
1582        return mf_report
1583    
1584    def to_report(self, molecular_metadata=None):
1585        """Create a report of the mass features and their annotations.
1586
1587        Parameters
1588        ----------
1589        molecular_metadata : dict, optional
1590            The molecular metadata. Default is None.
1591
1592        Returns
1593        -------
1594        DataFrame
1595            The report as a Pandas DataFrame.
1596        """
1597        # Get mass feature dataframe
1598        mf_report = self.mass_spectra.mass_features_to_df()
1599        mf_report = mf_report.reset_index(drop=False)
1600
1601        # Get and clean ms1 annotation dataframe
1602        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1603        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1604        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1605
1606        # Get, summarize, and clean ms2 annotation dataframe
1607        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1608            molecular_metadata=molecular_metadata
1609        )
1610        if ms2_annot_report is not None and molecular_metadata is not None:
1611            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1612            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1613            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1614            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1615        else:
1616            ms2_annot_report = None
1617
1618        report = self.combine_reports(
1619            mf_report=mf_report,
1620            ms1_annot_report=ms1_annot_report,
1621            ms2_annot_report=ms2_annot_report
1622        )
1623
1624        return report

A class to export LCMS metabolite data.

This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • mass_spectra (object): The high resolution mass spectra object.
LCMSMetabolomicsExport(out_file_path, mass_spectra)
1211    def __init__(self, out_file_path, mass_spectra):
1212        super().__init__(out_file_path, mass_spectra)
1213        self.ion_type_dict = ion_type_dict


ion_type_dict
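
The module-level ion_type_dict (re-exposed here as an instance attribute) maps each ion type to a pair of dictionaries: atoms to add and atoms to subtract when deriving an ion formula from a neutral formula. A quick look-up, for illustration:

    from corems.mass_spectra.output.export import ion_type_dict

    # Water loss from a protonated molecule: nothing is added, one H and one O
    # are subtracted from the neutral formula.
    atoms_to_add, atoms_to_subtract = ion_type_dict["[M+H-H2O]+"]
    print(atoms_to_add)        # {}
    print(atoms_to_subtract)   # {'H': 1, 'O': 1}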
@staticmethod
def get_ion_formula(neutral_formula, ion_type):
1215    @staticmethod
1216    def get_ion_formula(neutral_formula, ion_type):
1217        """From a neutral formula and an ion type, return the formula of the ion.
1218
1219        Notes
1220        -----
1221        This is a static method.
1222        If the neutral_formula is not a string, this method will return None.
1223
1224        Parameters
1225        ----------
1226        neutral_formula : str
1227            The neutral formula, this should be a string form from the MolecularFormula class
1228            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1229            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1230            e.g. MgCl2 is parsed as 'Mg Cl2'.
1231        ion_type : str
1232            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1233            See the self.ion_type_dict for the available ion types.
1234
1235        Returns
1236        -------
1237        str
1238            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1239        """
1240        # If neutral_formula is not a string, return None
1241        if not isinstance(neutral_formula, str):
1242            return None
1243
1244        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1245        if re.search(r"\s", neutral_formula):
1246            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1247        else:
1248            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1249            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1250            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1251            neutral_formula = MolecularFormula(
1252                dict(
1253                    zip(
1254                        [x[0] for x in elements],
1255                        [int(x[0]) if x else 1 for x in counts],
1256                    )
1257                ),
1258                ion_charge=0,
1259            )
1260        neutral_formula_dict = neutral_formula.to_dict().copy()
1261
1262        adduct_add_dict = ion_type_dict[ion_type][0]
1263        for key in adduct_add_dict:
1264            if key in neutral_formula_dict.keys():
1265                neutral_formula_dict[key] += adduct_add_dict[key]
1266            else:
1267                neutral_formula_dict[key] = adduct_add_dict[key]
1268
1269        adduct_subtract = ion_type_dict[ion_type][1]
1270        for key in adduct_subtract:
1271            neutral_formula_dict[key] -= adduct_subtract[key]
1272
1273        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

From a neutral formula and an ion type, return the formula of the ion.

Notes

This is a static method. If the neutral_formula is not a string, this method will return None.

Parameters
  • neutral_formula (str): The neutral formula, this should be a string form from the MolecularFormula class (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). In the case of a simple string, the atoms are parsed based on the presence of capital letters, e.g. MgCl2 is parsed as 'Mg Cl2'.
  • ion_type (str): The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. See the self.ion_type_dict for the available ion types.
Returns
  • str: The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
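
A short, hypothetical illustration of the two accepted input forms (the exact formatting of the returned string is whatever the MolecularFormula class produces):

    from corems.mass_spectra.output.export import LCMSMetabolomicsExport

    # MolecularFormula-style string (space-separated atoms, isotopes allowed)
    print(LCMSMetabolomicsExport.get_ion_formula("C2 H4 O2", "[M+Na]+"))

    # Simple string: atoms are split on capital letters, no isotope handling
    print(LCMSMetabolomicsExport.get_ion_formula("C6H12O6", "de-protonated"))

    # Non-string input (e.g. NaN coming from a DataFrame) returns None
    assert LCMSMetabolomicsExport.get_ion_formula(float("nan"), "[M+H]+") is None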
@staticmethod
def get_isotope_type(ion_formula):
1275    @staticmethod
1276    def get_isotope_type(ion_formula):
1277        """From an ion formula, return the 13C isotope type of the ion.
1278
1279        Notes
1280        -----
1281        This is a static method.
1282        If the ion_formula is not a string, this method will return None.
1283        This is currently only functional for 13C isotopes.
1284
1285        Parameters
1286        ----------
1287        ion_formula : str
1288            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1289
1290        Returns
1291        -------
1292        str
1293            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1294
1295        Raises
1296        ------
1297        ValueError
1298            If the ion_formula is not a string.
1299        """
1300        if not isinstance(ion_formula, str):
1301            return None
1302
1303        if re.search(r"\s", ion_formula):
1304            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1305        else:
1306            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1307        ion_formula_dict = ion_formula.to_dict().copy()
1308
1309        try:
1310            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1311        except KeyError:
1312            iso_class = None
1313
1314        return iso_class

From an ion formula, return the 13C isotope type of the ion.

Notes

This is a static method. If the ion_formula is not a string, this method will return None. This is currently only functional for 13C isotopes.

Parameters
  • ion_formula (str): The formula of the ion, expected to be a string like 'C2 H4 O2'.
Returns
  • str: The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
Raises
  • ValueError: If the ion_formula is not a string.
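
A behaviour sketch for the edge cases spelled out above; the carbon-13 example is commented out because the exact isotope token is assumed to follow the MolecularFormula string format ('13C'):

    from corems.mass_spectra.output.export import LCMSMetabolomicsExport

    # No 13C in the formula -> None
    assert LCMSMetabolomicsExport.get_isotope_type("C6 H12 O6") is None

    # Non-string input (e.g. a missing ion formula) -> None
    assert LCMSMetabolomicsExport.get_isotope_type(None) is None

    # A formula containing carbon-13 returns its isotope class, e.g. '13C1'
    # (assumed token format):
    # LCMSMetabolomicsExport.get_isotope_type("C5 13C1 H12 O6")  # -> '13C1'

    # A compact string without spaces is rejected:
    # LCMSMetabolomicsExport.get_isotope_type("C6H12O6")  # raises ValueError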
def report_to_csv(self, molecular_metadata=None):
1316    def report_to_csv(self, molecular_metadata=None):
1317        """Create a report of the mass features and their annotations and save it as a CSV file.
1318
1319        Parameters
1320        ----------
1321        molecular_metadata : dict, optional
1322            The molecular metadata. Default is None.
1323        """
1324        report = self.to_report(molecular_metadata=molecular_metadata)
1325        out_file = self.output_file.with_suffix(".csv")
1326        report.to_csv(out_file, index=False)

Create a report of the mass features and their annotations and save it as a CSV file.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
def clean_ms1_report(self, ms1_summary_full):
1328    def clean_ms1_report(self, ms1_summary_full):
1329        """Clean the MS1 report.
1330
1331        Parameters
1332        ----------
1333        ms1_summary_full : DataFrame
1334            The full MS1 summary DataFrame.
1335
1336        Returns
1337        -------
1338        DataFrame
1339            The cleaned MS1 summary DataFrame.
1340        """
1341        ms1_summary_full = ms1_summary_full.reset_index()
1342        cols_to_keep = [
1343            "mf_id",
1344            "Molecular Formula",
1345            "Ion Type",
1346            "Calculated m/z",
1347            "m/z Error (ppm)",
1348            "m/z Error Score",
1349            "Is Isotopologue",
1350            "Isotopologue Similarity",
1351            "Confidence Score",
1352        ]
1353        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1354        ms1_summary["ion_formula"] = [
1355            self.get_ion_formula(f, a)
1356            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1357        ]
1358        ms1_summary["isotopologue_type"] = [
1359            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1360        ]
1361
1362        # Reorder columns
1363        ms1_summary = ms1_summary[
1364            [
1365                "mf_id",
1366                "ion_formula",
1367                "isotopologue_type",
1368                "Calculated m/z",
1369                "m/z Error (ppm)",
1370                "m/z Error Score",
1371                "Isotopologue Similarity",
1372                "Confidence Score",
1373            ]
1374        ]
1375
1376        # Set the index to mf_id
1377        ms1_summary = ms1_summary.set_index("mf_id")
1378
1379        return ms1_summary

Clean the MS1 report.

Parameters
  • ms1_summary_full (DataFrame): The full MS1 summary DataFrame.
Returns
  • DataFrame: The cleaned MS1 summary DataFrame.
def summarize_ms2_report(self, ms2_annot_report):
1381    def summarize_ms2_report(self, ms2_annot_report):
1382        """
1383        Summarize the MS2 report.
1384
1385        Parameters
1386        ----------
1387        ms2_annot_report : DataFrame
1388            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
1389        
1390        Returns
1391        -------
1392        """

Summarize the MS2 report.

Parameters
  • ms2_annot_report (DataFrame): The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
Returns
  • None: This method is a stub with no implementation; it currently returns None.
def summarize_metabolomics_report(self, ms2_annot_report):
1394    def summarize_metabolomics_report(self, ms2_annot_report):
1395        """Summarize the MS2 hits for a metabolomics report
1396        
1397        Parameters
1398        ----------
1399        ms2_annot_report : DataFrame
1400            The MS2 annotation DataFrame with all annotations.
1401
1402        Returns
1403        -------
1404        DataFrame
1405            The summarized metabolomics report.
1406        """
1407        columns_to_drop = [
1408            "precursor_mz",
1409            "precursor_mz_error_ppm",
1410            "cas",
1411            "data_id",
1412            "iupac_name",
1413            "traditional_name",
1414            "common_name",
1415            "casno",
1416        ]
1417        ms2_annot = ms2_annot_report.drop(
1418            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1419        )
1420        
1421        # Prepare information about the search results, pulling out the best hit for the single report
1422        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1423        ms2_annot = ms2_annot.reset_index()
1424        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1425        ms2_annot["n_spectra_contributing"] = (
1426            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1427            .transform("nunique")
1428        )
1429        # Sort by entropy similarity
1430        ms2_annot = ms2_annot.sort_values(
1431            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1432        )
1433        best_entropy = ms2_annot.drop_duplicates(
1434            subset=["mf_id", "ref_mol_id"], keep="first"
1435        )
1436
1437        return best_entropy

Summarize the MS2 hits for a metabolomics report

Parameters
  • ms2_annot_report (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
  • DataFrame: The summarized metabolomics report.
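
At its core this is a standard pandas best-hit-per-group pattern; the toy frame below (invented values, same column names as above) isolates the transform / sort / drop_duplicates steps:

    import pandas as pd

    annots = pd.DataFrame(
        {
            "mf_id": [1, 1, 1, 2],
            "ref_mol_id": ["A", "A", "B", "C"],
            "query_spectrum_id": [10, 11, 10, 12],
            "entropy_similarity": [0.62, 0.81, 0.40, 0.93],
        }
    )

    # How many distinct query spectra support each (mf_id, ref_mol_id) pair
    annots["n_spectra_contributing"] = annots.groupby(["mf_id", "ref_mol_id"])[
        "query_spectrum_id"
    ].transform("nunique")

    # Keep the single highest-entropy-similarity hit per pair
    best = annots.sort_values(
        ["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
    ).drop_duplicates(subset=["mf_id", "ref_mol_id"], keep="first")
    print(best)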
def clean_ms2_report(self, metabolite_summary):
1439    def clean_ms2_report(self, metabolite_summary):
1440        """Clean the MS2 report.
1441
1442        Parameters
1443        ----------
1444        metabolite_summary : DataFrame
1445            The full metabolomics summary DataFrame.
1446
1447        Returns
1448        -------
1449        DataFrame
1450            The cleaned metabolomics summary DataFrame.
1451        """
1452        metabolite_summary = metabolite_summary.reset_index()
1453        metabolite_summary["ion_formula"] = [
1454            self.get_ion_formula(f, a)
1455            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1456        ]
1457
1458        col_order = [
1459            "mf_id",
1460            "ion_formula",
1461            "ref_ion_type",
1462            "formula",
1463            "inchikey",
1464            "name",
1465            "inchi",
1466            "chebi",
1467            "smiles",
1468            "kegg",
1469            "cas",
1470            "database_name",
1471            "ref_ms_id",
1472            "entropy_similarity",
1473            "ref_mz_in_query_fract",
1474            "n_spectra_contributing",
1475        ]
1476
1477        # Reorder columns
1478        metabolite_summary = metabolite_summary[
1479            [col for col in col_order if col in metabolite_summary.columns]
1480        ]
1481
1482        # Convert chebi (if present) to int:
1483        if "chebi" in metabolite_summary.columns:
1484            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1485                "Int64", errors="ignore"
1486            )
1487
1488        # Set the index to mf_id
1489        metabolite_summary = metabolite_summary.set_index("mf_id")
1490
1491        return metabolite_summary

Clean the MS2 report.

Parameters
  • metabolite_summary (DataFrame): The full metabolomics summary DataFrame.
Returns
  • DataFrame: The cleaned metabolomics summary DataFrame.
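
One detail worth calling out: the chebi column is cast to pandas' nullable Int64 dtype so that missing ChEBI identifiers stay as <NA> instead of forcing the column to float. A minimal illustration with invented values:

    import pandas as pd

    chebi = pd.Series([17234.0, None, 15377.0])  # float because of the missing value
    print(chebi.astype("Int64"))  # 17234, <NA>, 15377 with dtype Int64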
def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1493    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1494        """Combine the mass feature report with the MS1 and MS2 reports.
1495
1496        Parameters
1497        ----------
1498        mf_report : DataFrame
1499            The mass feature report DataFrame.
1500        ms1_annot_report : DataFrame
1501            The MS1 annotation report DataFrame.
1502        ms2_annot_report : DataFrame
1503            The MS2 annotation report DataFrame.
1504        """
1505        # If there is an ms1_annot_report, merge it with the mf_report
1506        if not ms1_annot_report.empty:
1507            # MS1 has been run and has molecular formula information
1508            mf_report = pd.merge(
1509                mf_report,
1510                ms1_annot_report,
1511                how="left",
1512                on=["mf_id", "isotopologue_type"],
1513            )
1514        if ms2_annot_report is not None:
1515            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1516            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1517            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1518            mf_no_ion_formula = pd.merge(
1519                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1520            )
1521
1522            # pull out the records with ion_formula
1523            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1524            mf_with_ion_formula = pd.merge(
1525                mf_with_ion_formula,
1526                ms2_annot_report,
1527                how="left",
1528                on=["mf_id", "ion_formula"],
1529            )
1530
1531            # put back together
1532            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1533
1534        # Rename columns
1535        rename_dict = {
1536            "mf_id": "Mass Feature ID",
1537            "scan_time": "Retention Time (min)",
1538            "mz": "m/z",
1539            "apex_scan": "Apex Scan Number",
1540            "intensity": "Intensity",
1541            "persistence": "Persistence",
1542            "area": "Area",
1543            "half_height_width": "Half Height Width (min)",
1544            "tailing_factor": "Tailing Factor",
1545            "dispersity_index": "Dispersity Index",
1546            "ms2_spectrum": "MS2 Spectrum",
1547            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1548            "isotopologue_type": "Isotopologue Type",
1549            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1550            "associated_mass_features": "Associated Mass Features after Deconvolution",
1551            "ion_formula": "Ion Formula",
1552            "formula": "Molecular Formula",
1553            "ref_ion_type": "Ion Type",
1554            "annot_level": "Lipid Annotation Level",
1555            "lipid_molecular_species_id": "Lipid Molecular Species",
1556            "lipid_summed_name": "Lipid Species",
1557            "lipid_subclass": "Lipid Subclass",
1558            "lipid_class": "Lipid Class",
1559            "lipid_category": "Lipid Category",
1560            "entropy_similarity": "Entropy Similarity",
1561            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1562            "n_spectra_contributing": "Spectra with Annotation (n)",
1563        }
1564        mf_report = mf_report.rename(columns=rename_dict)
1565        mf_report["Sample Name"] = self.mass_spectra.sample_name
1566        mf_report["Polarity"] = self.mass_spectra.polarity
1567        mf_report = mf_report[
1568            ["Mass Feature ID", "Sample Name", "Polarity"]
1569            + [
1570                col
1571                for col in mf_report.columns
1572                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1573            ]
1574        ]
1575
1576        # Reorder rows by "Mass Feature ID"
1577        mf_report = mf_report.sort_values("Mass Feature ID")
1578
1579        # Reset index
1580        mf_report = mf_report.reset_index(drop=True)
1581
1582        return mf_report

Combine the mass feature report with the MS1 and MS2 reports.

Parameters
  • mf_report (DataFrame): The mass feature report DataFrame.
  • ms1_annot_report (DataFrame): The MS1 annotation report DataFrame.
  • ms2_annot_report (DataFrame): The MS2 annotation report DataFrame.
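
A compact sketch of the merge logic above, on invented miniature frames: MS2 hits are joined on ["mf_id", "ion_formula"] when an MS1 ion formula exists and on ["mf_id"] alone when it does not, then the two halves are concatenated back together.

    import pandas as pd

    mf = pd.DataFrame(
        {"mf_id": [1, 2], "mz": [181.07, 203.05], "ion_formula": ["C6 H13 O6", None]}
    )
    ms2 = pd.DataFrame(
        {
            "mf_id": [1, 2],
            "ion_formula": ["C6 H13 O6", "C9 H14 N O2"],
            "name": ["hit A", "hit B"],
        }
    )

    no_formula = mf[mf["ion_formula"].isna()].drop(columns=["ion_formula"])
    with_formula = mf[~mf["ion_formula"].isna()]

    combined = (
        pd.concat(
            [
                pd.merge(no_formula, ms2, how="left", on=["mf_id"]),
                pd.merge(with_formula, ms2, how="left", on=["mf_id", "ion_formula"]),
            ]
        )
        .sort_values("mf_id")
        .reset_index(drop=True)
    )
    print(combined)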
def to_report(self, molecular_metadata=None):
1584    def to_report(self, molecular_metadata=None):
1585        """Create a report of the mass features and their annotations.
1586
1587        Parameters
1588        ----------
1589        molecular_metadata : dict, optional
1590            The molecular metadata. Default is None.
1591
1592        Returns
1593        -------
1594        DataFrame
1595            The report as a Pandas DataFrame.
1596        """
1597        # Get mass feature dataframe
1598        mf_report = self.mass_spectra.mass_features_to_df()
1599        mf_report = mf_report.reset_index(drop=False)
1600
1601        # Get and clean ms1 annotation dataframe
1602        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1603        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1604        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1605
1606        # Get, summarize, and clean ms2 annotation dataframe
1607        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1608            molecular_metadata=molecular_metadata
1609        )
1610        if ms2_annot_report is not None and molecular_metadata is not None:
1611            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1612            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1613            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1614            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1615        else:
1616            ms2_annot_report = None
1617
1618        report = self.combine_reports(
1619            mf_report=mf_report,
1620            ms1_annot_report=ms1_annot_report,
1621            ms2_annot_report=ms2_annot_report
1622        )
1623
1624        return report

Create a report of the mass features and their annotations.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
  • DataFrame: The report as a Pandas DataFrame.
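
A hypothetical end-to-end call, assuming a processed LCMSBase object lcms_obj and a molecular-metadata dict mol_metadata produced by the upstream search step (both names are illustrative):

    from corems.mass_spectra.output.export import LCMSMetabolomicsExport

    exporter = LCMSMetabolomicsExport("results/sample_01", lcms_obj)

    # Combined mass-feature / MS1 / MS2 report as a DataFrame
    report = exporter.to_report(molecular_metadata=mol_metadata)

    # Or write it straight to results/sample_01.csv
    exporter.report_to_csv(molecular_metadata=mol_metadata)

    # The HDF5 export inherited from LCMSExport is still available
    exporter.to_hdf(overwrite=True)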
class LipidomicsExport(LCMSMetabolomicsExport):
1625class LipidomicsExport(LCMSMetabolomicsExport):
1626    """A class to export lipidomics data.
1627
1628    This class provides methods to export lipidomics data to various formats and summarize the lipid report.
1629
1630    Parameters
1631    ----------
1632    out_file_path : str | Path
1633        The output file path, do not include the file extension.
1634    mass_spectra : object
1635        The high resolution mass spectra object.
1636    """
1637
1638    def __init__(self, out_file_path, mass_spectra):
1639        super().__init__(out_file_path, mass_spectra)
1640
1641    def summarize_lipid_report(self, ms2_annot):
1642        """Summarize the lipid report.
1643
1644        Parameters
1645        ----------
1646        ms2_annot : DataFrame
1647            The MS2 annotation DataFrame with all annotations.
1648
1649        Returns
1650        -------
1651        DataFrame
1652            The summarized lipid report.
1653        """
1654        # Drop unnecessary columns for easier viewing
1655        columns_to_drop = [
1656            "precursor_mz",
1657            "precursor_mz_error_ppm",
1658            "metabref_mol_id",
1659            "metabref_precursor_mz",
1660            "cas",
1661            "inchikey",
1662            "inchi",
1663            "chebi",
1664            "smiles",
1665            "kegg",
1666            "data_id",
1667            "iupac_name",
1668            "traditional_name",
1669            "common_name",
1670            "casno",
1671        ]
1672        ms2_annot = ms2_annot.drop(
1673            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1674        )
1675
1676        # If ion_types_excluded is not empty, remove those ion types
1677        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1678            "ms2"
1679        ].molecular_search.ion_types_excluded
1680        if len(ion_types_excluded) > 0:
1681            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1682
1683        # If mf_id is not present, check that the index name is mf_id and reset the index
1684        if "mf_id" not in ms2_annot.columns:
1685            if ms2_annot.index.name == "mf_id":
1686                ms2_annot = ms2_annot.reset_index()
1687            else:
1688                raise ValueError("mf_id is not present in the dataframe")
1689
1690        # Attempt to get consensus annotations to the MLF level
1691        mlf_results_all = []
1692        for mf_id in ms2_annot["mf_id"].unique():
1693            mlf_results_perid = []
1694            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1695            #FIXME: Fix this - it's not giving what we want!
1696            ms2_annot_mf["n_spectra_contributing"] = len(ms2_annot_mf)
1697
1698            for query_scan in ms2_annot["query_spectrum_id"].unique():
1699                ms2_annot_sub = ms2_annot_mf[
1700                    ms2_annot_mf["query_spectrum_id"] == query_scan
1701                ].copy()
1702
1703                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1704                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1705                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1706                        ms2_annot_sub["entropy_max"] = (
1707                            ms2_annot_sub["entropy_similarity"]
1708                            == ms2_annot_sub["entropy_similarity"].max()
1709                        )
1710                        ms2_annot_sub["ref_match_fract_max"] = (
1711                            ms2_annot_sub["ref_mz_in_query_fract"]
1712                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1713                        )
1714                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1715                            "query_frag_types"
1716                        ].apply(lambda x: True if "MLF" in x else False)
1717
1718                        # New column that checks whether there is a consensus between the ranks (one row that is highest in all ranks)
1719                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1720                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1721                        ].all(axis=1)
1722
1723                        # If there is a consensus, take the row with the highest entropy_similarity
1724                        if ms2_annot_sub["consensus"].any():
1725                            ms2_annot_sub = ms2_annot_sub[
1726                                ms2_annot_sub["entropy_similarity"]
1727                                == ms2_annot_sub["entropy_similarity"].max()
1728                            ].head(1)
1729                            mlf_results_perid.append(ms2_annot_sub)
1730            if len(mlf_results_perid) == 0:
1731                mlf_results_perid = pd.DataFrame()
1732            else:
1733                mlf_results_perid = pd.concat(mlf_results_perid)
1734                if mlf_results_perid["name"].nunique() == 1:
1735                    mlf_results_perid = mlf_results_perid[
1736                        mlf_results_perid["entropy_similarity"]
1737                        == mlf_results_perid["entropy_similarity"].max()
1738                    ].head(1)
1739                else:
1740                    mlf_results_perid = pd.DataFrame()
1741                mlf_results_all.append(mlf_results_perid)
1742
1743        # These are the consensus annotations to the MLF level
1744        if len(mlf_results_all) > 0:
1745            mlf_results_all = pd.concat(mlf_results_all)
1746            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1747        else:
1748            # Make an empty dataframe
1749            mlf_results_all = ms2_annot.head(0)
1750
1751        # For remaining mf_ids, try to get a consensus annotation to the species level
1752        species_results_all = []
1753        # Remove mf_ids that have consensus annotations to the MLF level
1754        ms2_annot_spec = ms2_annot[
1755            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1756        ]
1757        for mf_id in ms2_annot_spec["mf_id"].unique():
1758            # Do all the hits have the same lipid_summed_name?
1759            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1760            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1761
1762            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1763                # Grab the highest entropy_similarity result
1764                ms2_annot_sub = ms2_annot_sub[
1765                    ms2_annot_sub["entropy_similarity"]
1766                    == ms2_annot_sub["entropy_similarity"].max()
1767                ].head(1)
1768                species_results_all.append(ms2_annot_sub)
1769
1770        # These are the consensus annotations to the species level
1771        if len(species_results_all) > 0:
1772            species_results_all = pd.concat(species_results_all)
1773            species_results_all["annot_level"] = "species"
1774        else:
1775            # Make an empty dataframe
1776            species_results_all = ms2_annot.head(0)
1777
1778        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1779        # Remove mf_ids that have consensus annotations to the species level
1780        ms2_annot_remaining = ms2_annot_spec[
1781            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1782        ]
1783        no_consensus = []
1784        for mf_id in ms2_annot_remaining["mf_id"].unique():
1785            id_sub = []
1786            id_no_con = []
1787            ms2_annot_sub_mf = ms2_annot_remaining[
1788                ms2_annot_remaining["mf_id"] == mf_id
1789            ].copy()
1790            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1791                ms2_annot_sub = ms2_annot_sub_mf[
1792                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1793                ].copy()
1794
1795                # New columns for ranking [HIGHER RANK = BETTER]
1796                ms2_annot_sub["entropy_max"] = (
1797                    ms2_annot_sub["entropy_similarity"]
1798                    == ms2_annot_sub["entropy_similarity"].max()
1799                )
1800                ms2_annot_sub["ref_match_fract_max"] = (
1801                    ms2_annot_sub["ref_mz_in_query_fract"]
1802                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1803                )
1804                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1805                    lambda x: True if "MLF" in x else False
1806                )
1807
1808                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1809                ms2_annot_sub["consensus"] = ms2_annot_sub[
1810                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1811                ].all(axis=1)
1812                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1813                id_sub.append(ms2_annot_sub_con)
1814                id_no_con.append(ms2_annot_sub)
1815            id_sub = pd.concat(id_sub)
1816            id_no_con = pd.concat(id_no_con)
1817
1818            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1819            if (
1820                id_sub["query_frag_types"]
1821                .apply(lambda x: True if "MLF" in x else False)
1822                .all()
1823                and len(id_sub) > 0
1824            ):
1825                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1826                id_sub = id_sub.loc[idx]
1827                # Reorder so highest entropy_similarity is first
1828                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1829                id_sub["annot_level"] = id_sub["structure_level"]
1830                no_consensus.append(id_sub)
1831
1832            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1833            elif len(id_sub) == 0:
1834                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1835                    summed_sub = id_no_con[
1836                        id_no_con["lipid_summed_name"] == lipid_summed_name
1837                    ]
1838                    # Any consensus to MLF?
1839                    if summed_sub["consensus"].any():
1840                        summed_sub = summed_sub[summed_sub["consensus"]]
1841                        summed_sub["annot_level"] = summed_sub["structure_level"]
1842                        no_consensus.append(summed_sub)
1843                    else:
1844                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
1845                        summed_sub = summed_sub[
1846                            summed_sub["entropy_similarity"]
1847                            == summed_sub["entropy_similarity"].max()
1848                        ].head(1)
1849                        # get first row
1850                        summed_sub["annot_level"] = "species"
1851                        summed_sub["name"] = ""
1852                        no_consensus.append(summed_sub)
1853            else:
1854                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)
1855
1856        if len(no_consensus) > 0:
1857            no_consensus = pd.concat(no_consensus)
1858        else:
1859            no_consensus = ms2_annot.head(0)
1860
1861        # Combine all the consensus annotations and reformat the dataframe for output
1862        species_results_all = species_results_all.drop(columns=["name"])
1863        species_results_all["lipid_molecular_species_id"] = ""
1864        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1865        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1866        consensus_annotations = pd.concat(
1867            [mlf_results_all, species_results_all, no_consensus]
1868        )
1869        consensus_annotations = consensus_annotations.sort_values(
1870            "mf_id", ascending=True
1871        )
1872        cols_to_keep = [
1873            "mf_id",
1874            "ref_ion_type",
1875            "entropy_similarity",
1876            "ref_mz_in_query_fract",
1877            "lipid_molecular_species_id",
1878            "lipid_summed_name",
1879            "lipid_subclass",
1880            "lipid_class",
1881            "lipid_category",
1882            "formula",
1883            "annot_level",
1884            "n_spectra_contributing",
1885        ]
1886        consensus_annotations = consensus_annotations[cols_to_keep]
1887        consensus_annotations = consensus_annotations.set_index("mf_id")
1888
1889        return consensus_annotations
1890
1891    def clean_ms2_report(self, lipid_summary):
1892        """Clean the MS2 report.
1893
1894        Parameters
1895        ----------
1896        lipid_summary : DataFrame
1897            The full lipid summary DataFrame.
1898
1899        Returns
1900        -------
1901        DataFrame
1902            The cleaned lipid summary DataFrame.
1903        """
1904        lipid_summary = lipid_summary.reset_index()
1905        lipid_summary["ion_formula"] = [
1906            self.get_ion_formula(f, a)
1907            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
1908        ]
1909
1910        # Reorder columns
1911        lipid_summary = lipid_summary[
1912            [
1913                "mf_id",
1914                "ion_formula",
1915                "ref_ion_type",
1916                "formula",
1917                "annot_level",
1918                "lipid_molecular_species_id",
1919                "lipid_summed_name",
1920                "lipid_subclass",
1921                "lipid_class",
1922                "lipid_category",
1923                "entropy_similarity",
1924                "ref_mz_in_query_fract",
1925                "n_spectra_contributing",
1926            ]
1927        ]
1928
1929        # Set the index to mf_id
1930        lipid_summary = lipid_summary.set_index("mf_id")
1931
1932        return lipid_summary
1933
1934    def to_report(self, molecular_metadata=None):
1935        """Create a report of the mass features and their annotations.
1936
1937        Parameters
1938        ----------
1939        molecular_metadata : dict, optional
1940            The molecular metadata. Default is None.
1941
1942        Returns
1943        -------
1944        DataFrame
1945            The report of the mass features and their annotations.
1946
1947        Notes
1948        -----
1949        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
1950        """
1951        # Get mass feature dataframe
1952        mf_report = self.mass_spectra.mass_features_to_df()
1953        mf_report = mf_report.reset_index(drop=False)
1954
1955        # Get and clean ms1 annotation dataframe
1956        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1957        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1958        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1959
1960        # Get, summarize, and clean ms2 annotation dataframe
1961        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1962            molecular_metadata=molecular_metadata
1963        )
1964        if ms2_annot_report is not None and molecular_metadata is not None:
1965            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
1966            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1967            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1968            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1969        report = self.combine_reports(
1970            mf_report=mf_report,
1971            ms1_annot_report=ms1_annot_report,
1972            ms2_annot_report=ms2_annot_report
1973        )
1974        return report

A class to export lipidomics data.

This class provides methods to export lipidomics data to various formats and summarize the lipid report.

Parameters
  • out_file_path (str | Path): The output file path; do not include the file extension.
  • mass_spectra (object): The high resolution mass spectra object.
LipidomicsExport(out_file_path, mass_spectra)
1638    def __init__(self, out_file_path, mass_spectra):
1639        super().__init__(out_file_path, mass_spectra)

The constructor takes the output file path (without extension) and the high resolution mass spectra object, and simply delegates to the parent exporter's constructor.
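A minimal usage sketch, assuming an LC-MS/MS dataset has already been processed elsewhere in a CoreMS workflow into a high resolution mass spectra object with mass features and annotations assigned (my_lcms_obj is a hypothetical placeholder for that object):

    from corems.mass_spectra.output.export import LipidomicsExport

    # my_lcms_obj is a placeholder for a processed high resolution
    # LC-MS/MS object produced elsewhere in a CoreMS workflow.
    exporter = LipidomicsExport("my_sample_report", my_lcms_obj)

    # Build the combined mass feature / annotation report. Passing molecular
    # metadata (see to_report below) enables the lipid-level MS2 summarization.
    report_df = exporter.to_report()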

def summarize_lipid_report(self, ms2_annot):

Summarize the lipid report.

Parameters
  • ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
  • DataFrame: The summarized lipid report.
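To make the per-scan consensus test in summarize_lipid_report concrete, here is a small self-contained sketch. The column names and ranking logic follow the source above, but the lipid names, scores, and the non-"MLF" fragment type label are made-up toy values:

    import pandas as pd

    # Toy candidate hits for a single mass feature and a single query MS2 spectrum.
    hits = pd.DataFrame(
        {
            "name": ["PC 16:0_18:1", "PC 16:1_18:0"],  # made-up molecular species
            "entropy_similarity": [0.82, 0.64],
            "ref_mz_in_query_fract": [0.90, 0.75],
            "query_frag_types": [["MLF"], ["other"]],
        }
    )

    # Rank flags: True where a row is best in each metric (higher is better).
    hits["entropy_max"] = hits["entropy_similarity"] == hits["entropy_similarity"].max()
    hits["ref_match_fract_max"] = (
        hits["ref_mz_in_query_fract"] == hits["ref_mz_in_query_fract"].max()
    )
    hits["frag_max"] = hits["query_frag_types"].apply(lambda x: "MLF" in x)

    # Consensus: the same row wins all three ranks; if so, keep the hit with the
    # highest entropy similarity as the molecular-species-level annotation.
    hits["consensus"] = hits[["entropy_max", "ref_match_fract_max", "frag_max"]].all(axis=1)
    if hits["consensus"].any():
        best = hits[hits["entropy_similarity"] == hits["entropy_similarity"].max()].head(1)
        print(best[["name", "entropy_similarity"]])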
def clean_ms2_report(self, lipid_summary):

Clean the MS2 report.

Parameters
  • lipid_summary (DataFrame): The full lipid summary DataFrame.
Returns
  • DataFrame: The cleaned lipid summary DataFrame.
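The ion_formula column is built by pairing each neutral formula with its reference ion type through get_ion_formula. A simplified sketch of that pattern, using a hypothetical stand-in helper (mock_ion_formula) and made-up formulas rather than the real adduct arithmetic:

    import pandas as pd

    def mock_ion_formula(formula, ion_type):
        # Hypothetical stand-in for get_ion_formula(): it only tags the adduct
        # instead of recomputing the elemental composition of the ion.
        return f"{formula} {ion_type}"

    lipid_summary = pd.DataFrame(
        {
            "formula": ["C42H82NO8P", "C44H84NO8P"],  # made-up neutral formulas
            "ref_ion_type": ["[M+H]+", "[M+H]+"],
        }
    )

    # Same zip-over-two-columns pattern used in clean_ms2_report above.
    lipid_summary["ion_formula"] = [
        mock_ion_formula(f, a)
        for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
    ]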
def to_report(self, molecular_metadata=None):

Create a report of the mass features and their annotations.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
  • DataFrame: The report of the mass features and their annotations.
Notes

The report will contain the mass features and their annotations from MS1 and MS2 (if available).
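A hedged sketch continuing the earlier usage example: passing the molecular metadata from the MS2 spectral search enables the lipid summarization path, and the returned DataFrame can be written out with plain pandas. Here exporter is the LipidomicsExport instance from the constructor sketch and mol_metadata is a hypothetical variable name for the metadata dictionary produced upstream:

    # mol_metadata (hypothetical name) holds the molecular metadata produced by
    # the MS2 spectral search, which enables the lipid-level summarization.
    report = exporter.to_report(molecular_metadata=mol_metadata)

    # The report is a plain pandas DataFrame, so it can be written out directly.
    report.to_csv("my_sample_report.csv", index=False)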