# corems.mass_spectra.output.export

   1__author__ = "Yuri E. Corilo"
   2__date__ = "Dec 14, 2010"
   3
   4
   5import csv
   6import json
   7import re
   8import uuid
   9import warnings
  10from datetime import datetime, timezone
  11from pathlib import Path
  12
  13import h5py
  14import numpy as np
  15import pandas as pd
  16from openpyxl import load_workbook
  17from pandas import DataFrame, ExcelWriter, read_excel
  18
  19from corems import __version__, corems_md5
  20from corems.encapsulation.output import parameter_to_dict
  21from corems.encapsulation.output.parameter_to_json import (
  22    dump_lcms_settings_json,
  23    dump_lcms_settings_toml,
  24)
  25from corems.mass_spectrum.output.export import HighResMassSpecExport
  26from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula
  27from corems.molecular_id.calc.SpectralSimilarity import methods_name
  28
# Mapping of adduct/ion type -> [atoms to add, atoms to subtract] used when
# calculating the molecular formula of an ion from its neutral formula.
ion_type_dict = {
    "M+": [{}, {}],
    "protonated": [{"H": 1}, {}],
    "[M+H]+": [{"H": 1}, {}],
    "[M+NH4]+": [{"N": 1, "H": 4}, {}],  # ammonium
    "[M+Na]+": [{"Na": 1}, {}],
    "[M+K]+": [{"K": 1}, {}],
    "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}],
    "[M+2Na-H]+": [{"Na": 2}, {"H": 1}],
    "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}],
    "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}],
    "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}],  # ammonium + acetonitrile
    "[M+H-H2O]+": [{}, {"H": 1, "O": 1}],  # water loss
    "de-protonated": [{}, {"H": 1}],
    "[M-H]-": [{}, {"H": 1}],
    "[M+Cl]-": [{"Cl": 1}, {}],
    "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}],  # formate
    "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}],  # acetate
    "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}],
    "[M+K-2H]-": [{"K": 1}, {"H": 2}],
    "[M+Na-2H]-": [{"Na": 1}, {"H": 2}],
}
  52
  53
  54class LowResGCMSExport:
  55    """A class to export low resolution GC-MS data.
  56
  57    This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.
  58
  59    Parameters:
  60    ----------
  61    out_file_path : str
  62        The output file path.
  63    gcms : object
  64        The low resolution GCMS object.
  65
  66    Attributes:
  67    ----------
  68    output_file : Path
  69        The output file path as a Path object.
  70    gcms : object
  71        The low resolution GCMS object.
  72
  73    Methods:
  74    -------
  75    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  76    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  77    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
  78    * to_excel(write_mode='a', write_metadata=True, id_label="corems:"),
  79        Export the data to an Excel file.
  80    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:").
  81        Export the data to a CSV file.
  82    * to_hdf(id_label="corems:").
  83        Export the data to an HDF5 file.
  84    * get_data_stats(gcms).
  85        Get statistics about the GCMS data.
  86
  87    """
  88
  89    def __init__(self, out_file_path, gcms):
  90        self.output_file = Path(out_file_path)
  91
  92        self.gcms = gcms
  93
  94        self._init_columns()
  95
  96    def _init_columns(self):
  97        """Initialize the column names for the exported data.
  98
  99        Returns:
 100        -------
 101        list
 102            The list of column names.
 103        """
 104
 105        columns = [
 106            "Sample name",
 107            "Peak Index",
 108            "Retention Time",
 109            "Retention Time Ref",
 110            "Peak Height",
 111            "Peak Area",
 112            "Retention index",
 113            "Retention index Ref",
 114            "Retention Index Score",
 115            "Similarity Score",
 116            "Spectral Similarity Score",
 117            "Compound Name",
 118            "Chebi ID",
 119            "Kegg Compound ID",
 120            "Inchi",
 121            "Inchi Key",
 122            "Smiles",
 123            "Molecular Formula",
 124            "IUPAC Name",
 125            "Traditional Name",
 126            "Common Name",
 127            "Derivatization",
 128        ]
 129
 130        if self.gcms.molecular_search_settings.exploratory_mode:
 131            columns.extend(
 132                [
 133                    "Weighted Cosine Correlation",
 134                    "Cosine Correlation",
 135                    "Stein Scott Similarity",
 136                    "Pearson Correlation",
 137                    "Spearman Correlation",
 138                    "Kendall Tau Correlation",
 139                    "Euclidean Distance",
 140                    "Manhattan Distance",
 141                    "Jaccard Distance",
 142                    "DWT Correlation",
 143                    "DFT Correlation",
 144                ]
 145            )
 146
 147            columns.extend(list(methods_name.values()))
 148
 149        return columns
 150
 151    def get_pandas_df(self, id_label="corems:"):
 152        """Get the exported data as a Pandas DataFrame.
 153
 154        Parameters:
 155        ----------
 156        id_label : str, optional
 157            The ID label for the data. Default is "corems:".
 158
 159        Returns:
 160        -------
 161        DataFrame
 162            The exported data as a Pandas DataFrame.
 163        """
 164
 165        columns = self._init_columns()
 166
 167        dict_data_list = self.get_list_dict_data(self.gcms)
 168
 169        df = DataFrame(dict_data_list, columns=columns)
 170
 171        df.name = self.gcms.sample_name
 172
 173        return df
 174
 175    def get_json(self, nan=False, id_label="corems:"):
 176        """Get the exported data as a JSON string.
 177
 178        Parameters:
 179        ----------
 180        nan : bool, optional
 181            Whether to include NaN values in the JSON string. Default is False.
 182        id_label : str, optional
 183            The ID label for the data. Default is "corems:".
 184
 185        """
 186
 187        import json
 188
 189        dict_data_list = self.get_list_dict_data(self.gcms)
 190
 191        return json.dumps(
 192            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
 193        )
 194
 195    def to_pandas(self, write_metadata=True, id_label="corems:"):
 196        """Export the data to a Pandas DataFrame and save it as a pickle file.
 197
 198        Parameters:
 199        ----------
 200        write_metadata : bool, optional
 201            Whether to write metadata to the output file.
 202        id_label : str, optional
 203            The ID label for the data.
 204        """
 205
 206        columns = self._init_columns()
 207
 208        dict_data_list = self.get_list_dict_data(self.gcms)
 209
 210        df = DataFrame(dict_data_list, columns=columns)
 211
 212        df.to_pickle(self.output_file.with_suffix(".pkl"))
 213
 214        if write_metadata:
 215            self.write_settings(
 216                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
 217            )
 218
 219    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
 220        """Export the data to an Excel file.
 221
 222        Parameters:
 223        ----------
 224        write_mode : str, optional
 225            The write mode for the Excel file. Default is 'a' (append).
 226        write_metadata : bool, optional
 227            Whether to write metadata to the output file. Default is True.
 228        id_label : str, optional
 229            The ID label for the data. Default is "corems:".
 230        """
 231
 232        out_put_path = self.output_file.with_suffix(".xlsx")
 233
 234        columns = self._init_columns()
 235
 236        dict_data_list = self.get_list_dict_data(self.gcms)
 237
 238        df = DataFrame(dict_data_list, columns=columns)
 239
 240        if write_mode == "a" and out_put_path.exists():
 241            writer = ExcelWriter(out_put_path, engine="openpyxl")
 242            # try to open an existing workbook
 243            writer.book = load_workbook(out_put_path)
 244            # copy existing sheets
 245            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
 246            # read existing file
 247            reader = read_excel(out_put_path)
 248            # write out the new sheet
 249            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
 250
 251            writer.close()
 252        else:
 253            df.to_excel(
 254                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
 255            )
 256
 257        if write_metadata:
 258            self.write_settings(out_put_path, self.gcms, id_label=id_label)
 259
 260    def to_csv(
 261        self,
 262        separate_output=False,
 263        write_mode="w",
 264        write_metadata=True,
 265        id_label="corems:",
 266    ):
 267        """Export the data to a CSV file.
 268
 269        Parameters:
 270        ----------
 271        separate_output : bool, optional
 272            Whether to separate the output into multiple files. Default is False.
 273        write_mode : str, optional
 274            The write mode for the CSV file. Default is 'w' (write).
 275        write_metadata : bool, optional
 276            Whether to write metadata to the output file. Default is True.
 277        id_label : str, optional
 278            The ID label for the data. Default is "corems:".
 279        """
 280
 281        if separate_output:
 282            # set write mode to write
 283            # this mode will overwrite the file without warning
 284            write_mode = "w"
 285        else:
 286            # set write mode to append
 287            write_mode = "a"
 288
 289        columns = self._init_columns()
 290
 291        dict_data_list = self.get_list_dict_data(self.gcms)
 292
 293        out_put_path = self.output_file.with_suffix(".csv")
 294
 295        write_header = not out_put_path.exists()
 296
 297        try:
 298            with open(out_put_path, write_mode, newline="") as csvfile:
 299                writer = csv.DictWriter(csvfile, fieldnames=columns)
 300                if write_header:
 301                    writer.writeheader()
 302                for data in dict_data_list:
 303                    writer.writerow(data)
 304
 305            if write_metadata:
 306                self.write_settings(out_put_path, self.gcms, id_label=id_label)
 307
 308        except IOError as ioerror:
 309            print(ioerror)
 310
 311    def to_hdf(self, id_label="corems:"):
 312        """Export the data to an HDF5 file.
 313
 314        Parameters:
 315        ----------
 316        id_label : str, optional
 317            The ID label for the data. Default is "corems:".
 318        """
 319
 320        # save sample at a time
 321        def add_compound(gc_peak, compound_obj):
 322            modifier = compound_obj.classify if compound_obj.classify else ""
 323            compound_group = compound_obj.name.replace("/", "") + " " + modifier
 324
 325            if compound_group not in peak_group:
 326                compound_group = peak_group.create_group(compound_group)
 327
 328                # compound_group.attrs["retention_time"] = compound_obj.retention_time
 329                compound_group.attrs["retention_index"] = compound_obj.ri
 330                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
 331                compound_group.attrs["spectral_similarity_score"] = (
 332                    compound_obj.spectral_similarity_score
 333                )
 334                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
 335
 336                compond_mz = compound_group.create_dataset(
 337                    "mz", data=np.array(compound_obj.mz), dtype="f8"
 338                )
 339                compond_abundance = compound_group.create_dataset(
 340                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
 341                )
 342
 343                if self.gcms.molecular_search_settings.exploratory_mode:
 344                    compound_group.attrs["Spectral Similarities"] = json.dumps(
 345                        compound_obj.spectral_similarity_scores,
 346                        sort_keys=False,
 347                        indent=4,
 348                        separators=(",", ":"),
 349                    )
 350            else:
 351                warnings.warn("Skipping duplicate reference compound.")
 352
 353        import json
 354        from datetime import datetime, timezone
 355
 356        import h5py
 357        import numpy as np
 358
 359        output_path = self.output_file.with_suffix(".hdf5")
 360
 361        with h5py.File(output_path, "w") as hdf_handle:
 362            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
 363            hdf_handle.attrs["time_stamp"] = timenow
 364            hdf_handle.attrs["data_structure"] = "gcms"
 365            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
 366            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
 367
 368            hdf_handle.attrs["sample_id"] = "self.gcms.id"
 369            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
 370            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
 371            hdf_handle.attrs["output_data"] = str(output_path)
 372            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
 373            hdf_handle.attrs["corems_version"] = __version__
 374
 375            hdf_handle.attrs["Stats"] = json.dumps(
 376                self.get_data_stats(self.gcms),
 377                sort_keys=False,
 378                indent=4,
 379                separators=(",", ": "),
 380            )
 381            hdf_handle.attrs["Calibration"] = json.dumps(
 382                self.get_calibration_stats(self.gcms, id_label),
 383                sort_keys=False,
 384                indent=4,
 385                separators=(",", ": "),
 386            )
 387            hdf_handle.attrs["Blank"] = json.dumps(
 388                self.get_blank_stats(self.gcms),
 389                sort_keys=False,
 390                indent=4,
 391                separators=(",", ": "),
 392            )
 393
 394            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
 395            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
 396                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
 397            )
 398
 399            scans_dataset = hdf_handle.create_dataset(
 400                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
 401            )
 402            rt_dataset = hdf_handle.create_dataset(
 403                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
 404            )
 405            tic_dataset = hdf_handle.create_dataset(
 406                "tic", data=np.array(self.gcms.tic), dtype="f8"
 407            )
 408            processed_tic_dataset = hdf_handle.create_dataset(
 409                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
 410            )
 411
 412            output_score_method = (
 413                self.gcms.molecular_search_settings.output_score_method
 414            )
 415
 416            for gc_peak in self.gcms:
 417                # print(gc_peak.retention_time)
 418                # print(gc_peak.tic)
 419
 420                # check if there is a compound candidate
 421                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
 422                peak_group.attrs["deconvolution"] = int(
 423                    self.gcms.chromatogram_settings.use_deconvolution
 424                )
 425
 426                peak_group.attrs["start_scan"] = gc_peak.start_scan
 427                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
 428                peak_group.attrs["final_scan"] = gc_peak.final_scan
 429
 430                peak_group.attrs["retention_index"] = gc_peak.ri
 431                peak_group.attrs["retention_time"] = gc_peak.retention_time
 432                peak_group.attrs["area"] = gc_peak.area
 433
 434                mz = peak_group.create_dataset(
 435                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
 436                )
 437                abundance = peak_group.create_dataset(
 438                    "abundance",
 439                    data=np.array(gc_peak.mass_spectrum.abundance),
 440                    dtype="f8",
 441                )
 442
 443                if gc_peak:
 444                    if output_score_method == "highest_sim_score":
 445                        compound_obj = gc_peak.highest_score_compound
 446                        add_compound(gc_peak, compound_obj)
 447
 448                    elif output_score_method == "highest_ss":
 449                        compound_obj = gc_peak.highest_ss_compound
 450                        add_compound(gc_peak, compound_obj)
 451
 452                    else:
 453                        for compound_obj in gc_peak:
 454                            add_compound(gc_peak, compound_obj)
 455
 456    def get_data_stats(self, gcms):
 457        """Get statistics about the GCMS data.
 458
 459        Parameters:
 460        ----------
 461        gcms : object
 462            The low resolution GCMS object.
 463
 464        Returns:
 465        -------
 466        dict
 467            A dictionary containing the data statistics.
 468        """
 469
 470        matched_peaks = gcms.matched_peaks
 471        no_matched_peaks = gcms.no_matched_peaks
 472        unique_metabolites = gcms.unique_metabolites
 473
 474        peak_matchs_above_0p85 = 0
 475        unique_peak_match_above_0p85 = 0
 476        for match_peak in matched_peaks:
 477            gc_peak_above_85 = 0
 478            matches_above_85 = list(
 479                filter(lambda m: m.similarity_score >= 0.85, match_peak)
 480            )
 481            if matches_above_85:
 482                peak_matchs_above_0p85 += 1
 483            if len(matches_above_85) == 1:
 484                unique_peak_match_above_0p85 += 1
 485
 486        data_stats = {}
 487        data_stats["average_signal_noise"] = "ni"
 488        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
 489        data_stats["total_number_peaks"] = len(gcms)
 490        data_stats["total_peaks_matched"] = len(matched_peaks)
 491        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
 492        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
 493        data_stats["single_matches_above_similarity_score_0.85"] = (
 494            unique_peak_match_above_0p85
 495        )
 496        data_stats["unique_metabolites"] = len(unique_metabolites)
 497
 498        return data_stats
 499
    def get_calibration_stats(self, gcms, id_label):
        """Get statistics about the GC-MS calibration.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object; must expose ``ri_pairs_ref``
            and ``cal_file_path``.
        id_label : str
            Prefix used to build the ``has_input`` identifier.

        Returns:
        -------
        dict
            Calibration metadata: RT/RI reference pairs, the calibration
            file URL/name and its content hash.
        """
        calibration_parameters = {}

        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
        calibration_parameters["data_url"] = str(gcms.cal_file_path)
        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
        # calibration_method is intentionally left blank (not tracked yet).
        calibration_parameters["calibration_method"] = ""

        return calibration_parameters
 515
 516    def get_blank_stats(self, gcms):
 517        """Get statistics about the GC-MS blank."""
 518        blank_parameters = {}
 519
 520        blank_parameters["data_name"] = "ni"
 521        blank_parameters["blank_id"] = "ni"
 522        blank_parameters["data_url"] = "ni"
 523        blank_parameters["has_input"] = "ni"
 524        blank_parameters["common_features_to_blank"] = "ni"
 525
 526        return blank_parameters
 527
 528    def get_instrument_metadata(self, gcms):
 529        """Get metadata about the GC-MS instrument."""
 530        instrument_metadata = {}
 531
 532        instrument_metadata["analyzer"] = gcms.analyzer
 533        instrument_metadata["instrument_label"] = gcms.instrument_label
 534        instrument_metadata["instrument_id"] = uuid.uuid4().hex
 535
 536        return instrument_metadata
 537
 538    def get_data_metadata(self, gcms, id_label, output_path):
 539        """Get metadata about the GC-MS data.
 540
 541        Parameters:
 542        ----------
 543        gcms : object
 544            The low resolution GCMS object.
 545        id_label : str
 546            The ID label for the data.
 547        output_path : str
 548            The output file path.
 549
 550        Returns:
 551        -------
 552        dict
 553            A dictionary containing the data metadata.
 554        """
 555        if isinstance(output_path, str):
 556            output_path = Path(output_path)
 557
 558        paramaters_path = output_path.with_suffix(".json")
 559
 560        if paramaters_path.exists():
 561            with paramaters_path.open() as current_param:
 562                metadata = json.load(current_param)
 563                data_metadata = metadata.get("Data")
 564        else:
 565            data_metadata = {}
 566            data_metadata["data_name"] = []
 567            data_metadata["input_data_url"] = []
 568            data_metadata["has_input"] = []
 569
 570        data_metadata["data_name"].append(gcms.sample_name)
 571        data_metadata["input_data_url"].append(str(gcms.file_location))
 572        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
 573
 574        data_metadata["output_data_name"] = str(output_path.stem)
 575        data_metadata["output_data_url"] = str(output_path)
 576        data_metadata["has_output"] = id_label + corems_md5(output_path)
 577
 578        return data_metadata
 579
 580    def get_parameters_json(self, gcms, id_label, output_path):
 581        """Get the parameters as a JSON string.
 582
 583        Parameters:
 584        ----------
 585        gcms : GCMS object
 586            The low resolution GCMS object.
 587        id_label : str
 588            The ID label for the data.
 589        output_path : str
 590            The output file path.
 591
 592        Returns:
 593        -------
 594        str
 595            The parameters as a JSON string.
 596        """
 597
 598        output_parameters_dict = {}
 599        output_parameters_dict["Data"] = self.get_data_metadata(
 600            gcms, id_label, output_path
 601        )
 602        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
 603        output_parameters_dict["Calibration"] = self.get_calibration_stats(
 604            gcms, id_label
 605        )
 606        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
 607        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
 608        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
 609        corems_dict_setting["corems_version"] = __version__
 610        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
 611        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
 612        output = json.dumps(
 613            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
 614        )
 615
 616        return output
 617
 618    def write_settings(self, output_path, gcms, id_label="emsl:"):
 619        """Write the settings to a JSON file.
 620
 621        Parameters:
 622        ----------
 623        output_path : str
 624            The output file path.
 625        gcms : GCMS object
 626            The low resolution GCMS object.
 627        id_label : str
 628            The ID label for the data. Default is "emsl:".
 629
 630        """
 631
 632        output = self.get_parameters_json(gcms, id_label, output_path)
 633
 634        with open(
 635            output_path.with_suffix(".json"),
 636            "w",
 637            encoding="utf8",
 638        ) as outfile:
 639            outfile.write(output)
 640
    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
        """Get the exported data as a list of dictionaries.

        Parameters:
        ----------
        gcms : object
            The low resolution GCMS object.
        include_no_match : bool, optional
            Whether to include no match data. Default is True.
        no_match_inline : bool, optional
            Whether to include no match data inline. Default is False.

        Returns:
        -------
        list
            The exported data as a list of dictionaries.
        """

        output_score_method = gcms.molecular_search_settings.output_score_method

        dict_data_list = []

        # NOTE: both closures below read gcpeak_index, gc_peak and
        # compound_obj from the enclosing loop scope at call time.
        def add_match_dict_data():
            # "classification:derivative number:derivatization agent"
            derivatization = "{}:{}:{}".format(
                compound_obj.classify,
                compound_obj.derivativenum,
                compound_obj.derivatization,
            )
            out_dict = {
                "Sample name": gcms.sample_name,
                "Peak Index": gcpeak_index,
                "Retention Time": gc_peak.retention_time,
                "Retention Time Ref": compound_obj.retention_time,
                "Peak Height": gc_peak.tic,
                "Peak Area": gc_peak.area,
                "Retention index": gc_peak.ri,
                "Retention index Ref": compound_obj.ri,
                "Retention Index Score": compound_obj.ri_score,
                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
                "Similarity Score": compound_obj.similarity_score,
                "Compound Name": compound_obj.name,
                "Chebi ID": compound_obj.metadata.chebi,
                "Kegg Compound ID": compound_obj.metadata.kegg,
                "Inchi": compound_obj.metadata.inchi,
                "Inchi Key": compound_obj.metadata.inchikey,
                "Smiles": compound_obj.metadata.smiles,
                "Molecular Formula": compound_obj.formula,
                "IUPAC Name": compound_obj.metadata.iupac_name,
                "Traditional Name": compound_obj.metadata.traditional_name,
                "Common Name": compound_obj.metadata.common_name,
                "Derivatization": derivatization,
            }

            if self.gcms.molecular_search_settings.exploratory_mode:
                # Add the individual spectral-similarity metric columns.
                out_dict.update(
                    {
                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "weighted_cosine_correlation"
                        ),
                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "cosine_correlation"
                        ),
                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
                            "stein_scott_similarity"
                        ),
                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
                            "pearson_correlation"
                        ),
                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
                            "spearman_correlation"
                        ),
                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
                            "kendall_tau_correlation"
                        ),
                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dft_correlation"
                        ),
                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dwt_correlation"
                        ),
                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
                            "euclidean_distance"
                        ),
                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
                            "manhattan_distance"
                        ),
                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
                            "jaccard_distance"
                        ),
                    }
                )
                # Also add every method registered in methods_name (may
                # re-set some of the keys above with identical values).
                for method in methods_name:
                    out_dict[methods_name.get(method)] = (
                        compound_obj.spectral_similarity_scores.get(method)
                    )

            dict_data_list.append(out_dict)

        def add_no_match_dict_data():
            # Minimal row for a peak without any compound candidate.
            dict_data_list.append(
                {
                    "Sample name": gcms.sample_name,
                    "Peak Index": gcpeak_index,
                    "Retention Time": gc_peak.retention_time,
                    "Peak Height": gc_peak.tic,
                    "Peak Area": gc_peak.area,
                    "Retention index": gc_peak.ri,
                }
            )

        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
            # check if there is a compound candidate
            if gc_peak:
                if output_score_method == "highest_sim_score":
                    compound_obj = gc_peak.highest_score_compound
                    add_match_dict_data()

                elif output_score_method == "highest_ss":
                    compound_obj = gc_peak.highest_ss_compound
                    add_match_dict_data()

                else:
                    for compound_obj in gc_peak:
                        add_match_dict_data()  # add monoisotopic peak

            else:
                # include not_match
                if include_no_match and no_match_inline:
                    add_no_match_dict_data()

        # When not inline, unmatched peaks are appended after all matches.
        if include_no_match and not no_match_inline:
            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
                if not gc_peak:
                    add_no_match_dict_data()

        return dict_data_list
 777
 778
 779class HighResMassSpectraExport(HighResMassSpecExport):
 780    """A class to export high resolution mass spectra data.
 781
 782    This class provides methods to export high resolution mass spectra data to various formats
 783    such as Excel, CSV, HDF5, and Pandas DataFrame.
 784
 785    Parameters
 786    ----------
 787    out_file_path : str | Path
 788        The output file path.
 789    mass_spectra : object
 790        The high resolution mass spectra object.
 791    output_type : str, optional
 792        The output type. Default is 'excel'.
 793
 794    Attributes
 795    ----------
 796    output_file : Path
 797        The output file path without suffix
 798    dir_loc : Path
 799        The directory location for the output file,
 800        by default this will be the output_file + ".corems" and all output files will be
 801        written into this location
 802    mass_spectra : MassSpectraBase
 803        The high resolution mass spectra object.
 804    """
 805
 806    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 807        super().__init__(
 808            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 809        )
 810
 811        self.dir_loc = Path(out_file_path + ".corems")
 812        self.dir_loc.mkdir(exist_ok=True)
 813        # Place the output file in the directory
 814        self.output_file = self.dir_loc / Path(out_file_path).name
 815        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 816        self.mass_spectra = mass_spectra
 817        self.atoms_order_list = None
 818        self._init_columns()
 819
 820    def get_pandas_df(self):
 821        """Get the mass spectra as a list of Pandas DataFrames."""
 822
 823        list_df = []
 824
 825        for mass_spectrum in self.mass_spectra:
 826            columns = self.columns_label + self.get_all_used_atoms_in_order(
 827                mass_spectrum
 828            )
 829
 830            dict_data_list = self.get_list_dict_data(mass_spectrum)
 831
 832            df = DataFrame(dict_data_list, columns=columns)
 833
 834            scan_number = mass_spectrum.scan_number
 835
 836            df.name = str(self.output_file) + "_" + str(scan_number)
 837
 838            list_df.append(df)
 839
 840        return list_df
 841
 842    def to_pandas(self, write_metadata=True):
 843        """Export the data to a Pandas DataFrame and save it as a pickle file.
 844
 845        Parameters:
 846        ----------
 847        write_metadata : bool, optional
 848            Whether to write metadata to the output file. Default is True.
 849        """
 850
 851        for mass_spectrum in self.mass_spectra:
 852            columns = self.columns_label + self.get_all_used_atoms_in_order(
 853                mass_spectrum
 854            )
 855
 856            dict_data_list = self.get_list_dict_data(mass_spectrum)
 857
 858            df = DataFrame(dict_data_list, columns=columns)
 859
 860            scan_number = mass_spectrum.scan_number
 861
 862            out_filename = Path(
 863                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 864            )
 865
 866            df.to_pickle(self.dir_loc / out_filename)
 867
 868            if write_metadata:
 869                self.write_settings(
 870                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 871                )
 872
 873    def to_excel(self, write_metadata=True):
 874        """Export the data to an Excel file.
 875
 876        Parameters:
 877        ----------
 878        write_metadata : bool, optional
 879            Whether to write metadata to the output file. Default is True.
 880        """
 881        for mass_spectrum in self.mass_spectra:
 882            columns = self.columns_label + self.get_all_used_atoms_in_order(
 883                mass_spectrum
 884            )
 885
 886            dict_data_list = self.get_list_dict_data(mass_spectrum)
 887
 888            df = DataFrame(dict_data_list, columns=columns)
 889
 890            scan_number = mass_spectrum.scan_number
 891
 892            out_filename = Path(
 893                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 894            )
 895
 896            df.to_excel(self.dir_loc / out_filename)
 897
 898            if write_metadata:
 899                self.write_settings(
 900                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 901                )
 902
 903    def to_csv(self, write_metadata=True):
 904        """Export the data to a CSV file.
 905
 906        Parameters:
 907        ----------
 908        write_metadata : bool, optional
 909            Whether to write metadata to the output file. Default is True.
 910        """
 911        import csv
 912
 913        for mass_spectrum in self.mass_spectra:
 914            columns = self.columns_label + self.get_all_used_atoms_in_order(
 915                mass_spectrum
 916            )
 917
 918            scan_number = mass_spectrum.scan_number
 919
 920            dict_data_list = self.get_list_dict_data(mass_spectrum)
 921
 922            out_filename = Path(
 923                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 924            )
 925
 926            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 927                writer = csv.DictWriter(csvfile, fieldnames=columns)
 928                writer.writeheader()
 929                for data in dict_data_list:
 930                    writer.writerow(data)
 931
 932            if write_metadata:
 933                self.write_settings(
 934                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 935                )
 936
 937    def get_mass_spectra_attrs(self):
 938        """Get the mass spectra attributes as a JSON string.
 939
 940        Parameters:
 941        ----------
 942        mass_spectra : object
 943            The high resolution mass spectra object.
 944
 945        Returns:
 946        -------
 947        str
 948            The mass spectra attributes as a JSON string.
 949        """
 950        dict_ms_attrs = {}
 951        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 952        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 953        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 954
 955        return json.dumps(
 956            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 957        )
 958
 959    def to_hdf(self, overwrite=False, export_raw=True):
 960        """Export the data to an HDF5 file.
 961
 962        Parameters
 963        ----------
 964        overwrite : bool, optional
 965            Whether to overwrite the output file. Default is False.
 966        export_raw : bool, optional
 967            Whether to export the raw mass spectra data. Default is True.
 968        """
 969        if overwrite:
 970            if self.output_file.with_suffix(".hdf5").exists():
 971                self.output_file.with_suffix(".hdf5").unlink()
 972
 973        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 974            if not hdf_handle.attrs.get("date_utc"):
 975                # Set metadata for all mass spectra
 976                timenow = str(
 977                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 978                )
 979                hdf_handle.attrs["date_utc"] = timenow
 980                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 981                hdf_handle.attrs["data_structure"] = "mass_spectra"
 982                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 983                hdf_handle.attrs["instrument_label"] = (
 984                    self.mass_spectra.instrument_label
 985                )
 986                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 987                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 988                hdf_handle.attrs["parser_type"] = (
 989                    self.mass_spectra.spectra_parser_class.__name__
 990                )
 991                hdf_handle.attrs["original_file_location"] = (
 992                    self.mass_spectra.file_location._str
 993                )
 994
 995            if "mass_spectra" not in hdf_handle:
 996                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 997            else:
 998                mass_spectra_group = hdf_handle.get("mass_spectra")
 999
1000            for mass_spectrum in self.mass_spectra:
1001                group_key = str(int(mass_spectrum.scan_number))
1002
1003                self.add_mass_spectrum_to_hdf5(
1004                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1005                )
1006
1007
class LCMSExport(HighResMassSpectraExport):
    """A class to export high resolution LC-MS data.

    This class provides methods to export high resolution LC-MS data to HDF5.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : LCMSBase
        The high resolution lc-ms object.
    """

    def __init__(self, out_file_path, mass_spectra):
        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
        """Export the data to an HDF5.

        Parameters
        ----------
        overwrite : bool, optional
            Whether to overwrite the output file. Default is False.
        save_parameters : bool, optional
            Whether to save the parameters as a separate json or toml file. Default is True.
        parameter_format : str, optional
            The format to save the parameters in. Default is 'toml'.

        Raises
        ------
        ValueError
            If parameter_format is not 'json' or 'toml'.
        """
        export_profile_spectra = (
            self.mass_spectra.parameters.lc_ms.export_profile_spectra
        )

        # Parameterized export: all spectra (default) or only relevant spectra
        export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra
        if export_only_relevant:
            relevant_scan_numbers = set()
            # Add MS1 spectra associated with mass features (apex scans) and best MS2 spectra
            for mass_feature in self.mass_spectra.mass_features.values():
                relevant_scan_numbers.add(mass_feature.apex_scan)
                if mass_feature.best_ms2 is not None:
                    relevant_scan_numbers.add(mass_feature.best_ms2.scan_number)
        if overwrite:
            if self.output_file.with_suffix(".hdf5").exists():
                self.output_file.with_suffix(".hdf5").unlink()

        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            if not hdf_handle.attrs.get("date_utc"):
                # Set metadata for all mass spectra
                timenow = str(
                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
                )
                hdf_handle.attrs["date_utc"] = timenow
                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
                hdf_handle.attrs["data_structure"] = "mass_spectra"
                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
                hdf_handle.attrs["instrument_label"] = (
                    self.mass_spectra.instrument_label
                )
                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
                hdf_handle.attrs["parser_type"] = (
                    self.mass_spectra.spectra_parser_class.__name__
                )
                # str() instead of the private Path._str attribute, which is
                # lazily populated and may not exist yet
                hdf_handle.attrs["original_file_location"] = str(
                    self.mass_spectra.file_location
                )

            if "mass_spectra" not in hdf_handle:
                mass_spectra_group = hdf_handle.create_group("mass_spectra")
            else:
                mass_spectra_group = hdf_handle.get("mass_spectra")

            # Export logic based on parameter
            for mass_spectrum in self.mass_spectra:
                if export_only_relevant:
                    if mass_spectrum.scan_number in relevant_scan_numbers:
                        group_key = str(int(mass_spectrum.scan_number))
                        self.add_mass_spectrum_to_hdf5(
                            hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                        )
                else:
                    group_key = str(int(mass_spectrum.scan_number))
                    self.add_mass_spectrum_to_hdf5(
                        hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
                    )

        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
            # Add scan_info to hdf5 file
            if "scan_info" not in hdf_handle:
                scan_info_group = hdf_handle.create_group("scan_info")
                for k, v in self.mass_spectra._scan_info.items():
                    array = np.array(list(v.values()))
                    if array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded bytes first
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                    else:
                        # Apply data type optimization for numeric data
                        if array.dtype == np.float64:
                            array = array.astype(np.float32)
                        elif array.dtype == np.int64:
                            array = array.astype(np.int32)
                        scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms_unprocessed to hdf5 file
            export_unprocessed_ms1 = (
                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
            )
            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
                if "ms_unprocessed" not in hdf_handle:
                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
                else:
                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
                for k, v in self.mass_spectra._ms_unprocessed.items():
                    array = np.array(v)
                    # Apply data type optimization and compression
                    if array.dtype == np.int64:
                        array = array.astype(np.int32)
                    elif array.dtype == np.float64:
                        array = array.astype(np.float32)
                    elif array.dtype.str[0:2] == "<U":
                        # Convert Unicode strings to UTF-8 encoded strings
                        string_data = [str(item) for item in array]
                        string_dtype = h5py.string_dtype(encoding='utf-8')
                        ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                        continue
                    ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add LCMS mass features to hdf5 file
            if len(self.mass_spectra.mass_features) > 0:
                if "mass_features" not in hdf_handle:
                    mass_features_group = hdf_handle.create_group("mass_features")
                else:
                    mass_features_group = hdf_handle.get("mass_features")

                # Create group for each mass feature, with key as the mass feature id
                for k, v in self.mass_spectra.mass_features.items():
                    mass_features_group.create_group(str(k))
                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            # Skip attributes that reference other objects and are
                            # serialized elsewhere (or not at all)
                            if k2 not in [
                                "chromatogram_parent",
                                "ms2_mass_spectra",
                                "mass_spectrum",
                                "_eic_data",
                                "ms2_similarity_results",
                            ]:
                                if k2 == "ms2_scan_numbers":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_half_height_width":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_ms_deconvoluted_idx":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "associated_mass_features_deconvoluted":
                                    array = np.array(v2)
                                    # Convert int64 to int32
                                    if array.dtype == np.int64:
                                        array = array.astype(np.int32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif k2 == "_noise_score":
                                    array = np.array(v2)
                                    # Convert float64 to float32
                                    if array.dtype == np.float64:
                                        array = array.astype(np.float32)
                                    mass_features_group[str(k)].create_dataset(
                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
                                    )
                                elif (
                                    isinstance(v2, int)
                                    or isinstance(v2, float)
                                    or isinstance(v2, str)
                                    or isinstance(v2, np.integer)
                                    or isinstance(v2, np.float32)
                                    or isinstance(v2, np.float64)
                                    or isinstance(v2, np.bool_)
                                ):
                                    # Convert numpy types to smaller precision for storage
                                    if isinstance(v2, np.int64):
                                        v2 = np.int32(v2)
                                    elif isinstance(v2, np.float64):
                                        v2 = np.float32(v2)
                                    mass_features_group[str(k)].attrs[str(k2)] = v2
                                else:
                                    raise TypeError(
                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
                                    )

            # Add EIC data to hdf5 file
            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
            if len(self.mass_spectra.eics) > 0 and export_eics:
                if "eics" not in hdf_handle:
                    eic_group = hdf_handle.create_group("eics")
                else:
                    eic_group = hdf_handle.get("eics")

                # Create group for each eic
                for k, v in self.mass_spectra.eics.items():
                    eic_group.create_group(str(k))
                    eic_group[str(k)].attrs["mz"] = k
                    # Loop through each of the attributes and add them as datasets (if array)
                    for k2, v2 in v.__dict__.items():
                        if v2 is not None:
                            array = np.array(v2)
                            # Apply data type optimization and compression
                            if array.dtype == np.int64:
                                array = array.astype(np.int32)
                            elif array.dtype == np.float64:
                                array = array.astype(np.float32)
                            elif array.dtype.str[0:2] == "<U":
                                # Convert Unicode strings to UTF-8 encoded strings
                                string_data = [str(item) for item in array]
                                string_dtype = h5py.string_dtype(encoding='utf-8')
                                eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
                                continue
                            eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True)

            # Add ms2_search results to hdf5 file (parameterized)
            if len(self.mass_spectra.spectral_search_results) > 0:
                if "spectral_search_results" not in hdf_handle:
                    spectral_search_results = hdf_handle.create_group(
                        "spectral_search_results"
                    )
                else:
                    spectral_search_results = hdf_handle.get("spectral_search_results")
                # Create group for each search result by ms2_scan / precursor_mz
                for k, v in self.mass_spectra.spectral_search_results.items():
                    # If parameter is set, only export spectral search results for relevant scans
                    if export_only_relevant and k not in relevant_scan_numbers:
                        continue
                    spectral_search_results.create_group(str(k))
                    for k2, v2 in v.items():
                        spectral_search_results[str(k)].create_group(str(k2))
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "precursor_mz"
                        ] = v2.precursor_mz
                        spectral_search_results[str(k)][str(k2)].attrs[
                            "query_spectrum_id"
                        ] = v2.query_spectrum_id
                        # Loop through each of the attributes and add them as datasets (if array)
                        for k3, v3 in v2.__dict__.items():
                            if v3 is not None and k3 not in [
                                "query_spectrum",
                                "precursor_mz",
                                "query_spectrum_id",
                            ]:
                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
                                    v3 = [", ".join(x) for x in v3]
                                # Bug fix: previously, when v3 contained None
                                # values, `array` was never (re)assigned and the
                                # dataset was written from a stale (or undefined)
                                # value left over from the previous attribute.
                                # Skip such attributes explicitly instead.
                                if any(item is None for item in v3):
                                    continue
                                array = np.array(v3)
                                if array.dtype.str[0:2] == "<U":
                                    array = array.astype("S")
                                spectral_search_results[str(k)][str(k2)].create_dataset(
                                    str(k3), data=array
                                )
        if save_parameters:
            # Check if parameter_format is valid
            if parameter_format not in ["json", "toml"]:
                raise ValueError("parameter_format must be 'json' or 'toml'")

            if parameter_format == "json":
                dump_lcms_settings_json(
                    filename=self.output_file.with_suffix(".json"),
                    lcms_obj=self.mass_spectra,
                )
            elif parameter_format == "toml":
                dump_lcms_settings_toml(
                    filename=self.output_file.with_suffix(".toml"),
                    lcms_obj=self.mass_spectra,
                )
1306
1307class LCMSMetabolomicsExport(LCMSExport):
1308    """A class to export LCMS metabolite data.
1309
1310    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.
1311
1312    Parameters
1313    ----------
1314    out_file_path : str | Path
1315        The output file path, do not include the file extension.
1316    mass_spectra : object
1317        The high resolution mass spectra object.
1318    """
1319
1320    def __init__(self, out_file_path, mass_spectra):
1321        super().__init__(out_file_path, mass_spectra)
1322        self.ion_type_dict = ion_type_dict
1323    
1324    @staticmethod
1325    def get_ion_formula(neutral_formula, ion_type):
1326        """From a neutral formula and an ion type, return the formula of the ion.
1327
1328        Notes
1329        -----
1330        This is a static method.
1331        If the neutral_formula is not a string, this method will return None.
1332
1333        Parameters
1334        ----------
1335        neutral_formula : str
1336            The neutral formula, this should be a string form from the MolecularFormula class
1337            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1338            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1339            e.g. MgCl2 is parsed as 'Mg Cl2.
1340        ion_type : str
1341            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1342            See the self.ion_type_dict for the available ion types.
1343
1344        Returns
1345        -------
1346        str
1347            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1348        """
1349        # If neutral_formula is not a string, return None
1350        if not isinstance(neutral_formula, str):
1351            return None
1352
1353        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1354        if re.search(r"\s", neutral_formula):
1355            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1356        else:
1357            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1358            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1359            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1360            neutral_formula = MolecularFormula(
1361                dict(
1362                    zip(
1363                        [x[0] for x in elements],
1364                        [int(x[0]) if x else 1 for x in counts],
1365                    )
1366                ),
1367                ion_charge=0,
1368            )
1369        neutral_formula_dict = neutral_formula.to_dict().copy()
1370
1371        adduct_add_dict = ion_type_dict[ion_type][0]
1372        for key in adduct_add_dict:
1373            if key in neutral_formula_dict.keys():
1374                neutral_formula_dict[key] += adduct_add_dict[key]
1375            else:
1376                neutral_formula_dict[key] = adduct_add_dict[key]
1377
1378        adduct_subtract = ion_type_dict[ion_type][1]
1379        for key in adduct_subtract:
1380            neutral_formula_dict[key] -= adduct_subtract[key]
1381
1382        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1383
1384    @staticmethod
1385    def get_isotope_type(ion_formula):
1386        """From an ion formula, return the 13C isotope type of the ion.
1387
1388        Notes
1389        -----
1390        This is a static method.
1391        If the ion_formula is not a string, this method will return None.
1392        This is currently only functional for 13C isotopes.
1393
1394        Parameters
1395        ----------
1396        ion_formula : str
1397            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1398
1399        Returns
1400        -------
1401        str
1402            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1403
1404        Raises
1405        ------
1406        ValueError
1407            If the ion_formula is not a string.
1408        """
1409        if not isinstance(ion_formula, str):
1410            return None
1411
1412        if re.search(r"\s", ion_formula):
1413            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1414        else:
1415            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1416        ion_formula_dict = ion_formula.to_dict().copy()
1417
1418        try:
1419            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1420        except KeyError:
1421            iso_class = None
1422
1423        return iso_class
1424    
1425    def report_to_csv(self, molecular_metadata=None):
1426        """Create a report of the mass features and their annotations and save it as a CSV file.
1427
1428        Parameters
1429        ----------
1430        molecular_metadata : dict, optional
1431            The molecular metadata. Default is None.
1432        """
1433        report = self.to_report(molecular_metadata=molecular_metadata)
1434        out_file = self.output_file.with_suffix(".csv")
1435        report.to_csv(out_file, index=False)
1436    
1437    def clean_ms1_report(self, ms1_summary_full):
1438        """Clean the MS1 report.
1439
1440        Parameters
1441        ----------
1442        ms1_summary_full : DataFrame
1443            The full MS1 summary DataFrame.
1444
1445        Returns
1446        -------
1447        DataFrame
1448            The cleaned MS1 summary DataFrame.
1449        """
1450        ms1_summary_full = ms1_summary_full.reset_index()
1451        cols_to_keep = [
1452            "mf_id",
1453            "Molecular Formula",
1454            "Ion Type",
1455            "Calculated m/z",
1456            "m/z Error (ppm)",
1457            "m/z Error Score",
1458            "Is Isotopologue",
1459            "Isotopologue Similarity",
1460            "Confidence Score",
1461        ]
1462        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1463        ms1_summary["ion_formula"] = [
1464            self.get_ion_formula(f, a)
1465            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1466        ]
1467        ms1_summary["isotopologue_type"] = [
1468            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1469        ]
1470
1471        # Reorder columns
1472        ms1_summary = ms1_summary[
1473            [
1474                "mf_id",
1475                "ion_formula",
1476                "isotopologue_type",
1477                "Calculated m/z",
1478                "m/z Error (ppm)",
1479                "m/z Error Score",
1480                "Isotopologue Similarity",
1481                "Confidence Score",
1482            ]
1483        ]
1484
1485        # Set the index to mf_id
1486        ms1_summary = ms1_summary.set_index("mf_id")
1487
1488        return ms1_summary
1489    
    def summarize_ms2_report(self, ms2_annot_report):
        """
        Summarize the MS2 report.

        Stub: the body is empty, so this method ignores its argument and
        implicitly returns None. Concrete summarization is performed by
        summarize_metabolomics_report (and subclass equivalents such as
        LipidomicsExport.summarize_lipid_report); presumably this hook exists
        for future use or override — TODO confirm and either implement or
        remove it.

        Parameters
        ----------
        ms2_annot_report : DataFrame
            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.

        Returns
        -------
        None
            Always, in this base implementation.
        """
1502    
1503    def summarize_metabolomics_report(self, ms2_annot_report):
1504        """Summarize the MS2 hits for a metabolomics report
1505        
1506        Parameters
1507        ----------
1508        ms2_annot : DataFrame
1509            The MS2 annotation DataFrame with all annotations.
1510
1511        Returns
1512        -------
1513        DataFrame
1514            The summarized metabolomics report.
1515        """
1516        columns_to_drop = [
1517            "precursor_mz",
1518            "precursor_mz_error_ppm",
1519            "cas",
1520            "data_id",
1521            "iupac_name",
1522            "traditional_name",
1523            "common_name",
1524            "casno",
1525        ]
1526        ms2_annot = ms2_annot_report.drop(
1527            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1528        )
1529        
1530        # Prepare information about the search results, pulling out the best hit for the single report
1531        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1532        ms2_annot = ms2_annot.reset_index()
1533        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1534        ms2_annot["n_spectra_contributing"] = (
1535            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1536            .transform("nunique")
1537        )
1538        # Sort by entropy similarity
1539        ms2_annot = ms2_annot.sort_values(
1540            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1541        )
1542        best_entropy = ms2_annot.drop_duplicates(
1543            subset=["mf_id", "ref_mol_id"], keep="first"
1544        )
1545
1546        return best_entropy
1547
1548    def clean_ms2_report(self, metabolite_summary):
1549        """Clean the MS2 report.
1550
1551        Parameters
1552        ----------
1553        metabolite_summary : DataFrame
1554            The full metabolomics summary DataFrame.
1555
1556        Returns
1557        -------
1558        DataFrame
1559            The cleaned metabolomics summary DataFrame.
1560        """
1561        metabolite_summary = metabolite_summary.reset_index()
1562        metabolite_summary["ion_formula"] = [
1563            self.get_ion_formula(f, a)
1564            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1565        ]
1566
1567        col_order = [
1568            "mf_id",
1569            "ion_formula",
1570            "ref_ion_type",
1571            "formula",
1572            "inchikey",
1573            "name",
1574            "inchi",
1575            "chebi",
1576            "smiles",
1577            "kegg",
1578            "cas",
1579            "database_name",
1580            "ref_ms_id",
1581            "entropy_similarity",
1582            "ref_mz_in_query_fract",
1583            "n_spectra_contributing",
1584        ]
1585
1586        # Reorder columns
1587        metabolite_summary = metabolite_summary[
1588            [col for col in col_order if col in metabolite_summary.columns]
1589        ]
1590
1591        # Convert chebi (if present) to int:
1592        if "chebi" in metabolite_summary.columns:
1593            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1594                "Int64", errors="ignore"
1595            )
1596
1597        # Set the index to mf_id
1598        metabolite_summary = metabolite_summary.set_index("mf_id")
1599
1600        return metabolite_summary
1601    
    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
        """Combine the mass feature report with the MS1 and MS2 reports.

        Left-joins MS1 annotations onto the mass features (on mf_id and
        isotopologue type), then left-joins MS2 annotations (on mf_id, and
        additionally on ion_formula for rows that have one), renames columns
        to human-readable report names, and sorts the rows.

        Parameters
        ----------
        mf_report : DataFrame
            The mass feature report DataFrame.
        ms1_annot_report : DataFrame
            The MS1 annotation report DataFrame.
        ms2_annot_report : DataFrame
            The MS2 annotation report DataFrame, or None when MS2 annotation
            was not performed.

        Returns
        -------
        DataFrame
            The combined report, one row per mass feature / annotation pair.
        """
        # If there is an ms1_annot_report, merge it with the mf_report
        if not ms1_annot_report.empty:
            # MS1 has been run and has molecular formula information
            mf_report = pd.merge(
                mf_report,
                ms1_annot_report,
                how="left",
                on=["mf_id", "isotopologue_type"],
            )
        # NOTE(review): the branch below reads mf_report["ion_formula"], which
        # is brought in by the MS1 merge above — confirm an empty
        # ms1_annot_report combined with a non-None ms2_annot_report cannot
        # occur (otherwise this raises KeyError).
        if ms2_annot_report is not None:
            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
            # Rows without an MS1 ion formula can only be matched on mf_id.
            mf_no_ion_formula = pd.merge(
                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
            )

            # pull out the records with ion_formula
            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
            # Rows with an ion formula must also agree with the MS2 hit's formula.
            mf_with_ion_formula = pd.merge(
                mf_with_ion_formula,
                ms2_annot_report,
                how="left",
                on=["mf_id", "ion_formula"],
            )

            # put back together
            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])

        # Rename columns to their human-readable report names
        rename_dict = {
            "mf_id": "Mass Feature ID",
            "scan_time": "Retention Time (min)",
            "mz": "m/z",
            "apex_scan": "Apex Scan Number",
            "intensity": "Intensity",
            "persistence": "Persistence",
            "area": "Area",
            "half_height_width": "Half Height Width (min)",
            "tailing_factor": "Tailing Factor",
            "dispersity_index": "Dispersity Index",
            "ms2_spectrum": "MS2 Spectrum",
            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
            "isotopologue_type": "Isotopologue Type",
            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
            "associated_mass_features": "Associated Mass Features after Deconvolution",
            "ion_formula": "Ion Formula",
            "formula": "Molecular Formula",
            "ref_ion_type": "Ion Type",
            "annot_level": "Lipid Annotation Level",
            "lipid_molecular_species_id": "Lipid Molecular Species",
            "lipid_summed_name": "Lipid Species",
            "lipid_subclass": "Lipid Subclass",
            "lipid_class": "Lipid Class",
            "lipid_category": "Lipid Category",
            "entropy_similarity": "Entropy Similarity",
            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
            "n_spectra_contributing": "Spectra with Annotation (n)",
        }
        mf_report = mf_report.rename(columns=rename_dict)
        # Tag every row with the sample it came from and the polarity used.
        mf_report["Sample Name"] = self.mass_spectra.sample_name
        mf_report["Polarity"] = self.mass_spectra.polarity
        # Move the identifying columns to the front, preserving the rest.
        mf_report = mf_report[
            ["Mass Feature ID", "Sample Name", "Polarity"]
            + [
                col
                for col in mf_report.columns
                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
            ]
        ]

        # Reorder rows by "Mass Feature ID", then "Entropy Similarity" (descending), then "Confidence Score" (descending)
        if "Entropy Similarity" in mf_report.columns:
            mf_report = mf_report.sort_values(
                by=["Mass Feature ID", "Entropy Similarity", "Confidence Score"],
                ascending=[True, False, False],
            )
        elif "Confidence Score" in mf_report.columns:
             mf_report = mf_report.sort_values(
                by=["Mass Feature ID", "Confidence Score"],
                ascending=[True, False],
            )
        # If neither "Entropy Similarity" nor "Confidence Score" are in the columns, just sort by "Mass Feature ID"
        else:
            mf_report = mf_report.sort_values("Mass Feature ID")

        # Reset index
        mf_report = mf_report.reset_index(drop=True)

        return mf_report
1704    
1705    def to_report(self, molecular_metadata=None):
1706        """Create a report of the mass features and their annotations.
1707
1708        Parameters
1709        ----------
1710        molecular_metadata : dict, optional
1711            The molecular metadata. Default is None.
1712
1713        Returns
1714        -------
1715        DataFrame
1716            The report as a Pandas DataFrame.
1717        """
1718        # Get mass feature dataframe
1719        mf_report = self.mass_spectra.mass_features_to_df()
1720        mf_report = mf_report.reset_index(drop=False)
1721
1722        # Get and clean ms1 annotation dataframe
1723        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1724        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1725        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1726
1727        # Get, summarize, and clean ms2 annotation dataframe
1728        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1729            molecular_metadata=molecular_metadata
1730        )
1731        if ms2_annot_report is not None and molecular_metadata is not None:
1732            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1733            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1734            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1735            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1736        else:
1737            ms2_annot_report = None
1738
1739        report = self.combine_reports(
1740            mf_report=mf_report,
1741            ms1_annot_report=ms1_annot_report,
1742            ms2_annot_report=ms2_annot_report
1743        )
1744
1745        return report
class LipidomicsExport(LCMSMetabolomicsExport):
    """A class to export lipidomics data.

    This class provides methods to export lipidomics data to various formats and summarize the lipid report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        # Delegates entirely to LCMSMetabolomicsExport.__init__.
        super().__init__(out_file_path, mass_spectra)

    def summarize_lipid_report(self, ms2_annot):
        """Summarize the lipid report.

        Parameters
        ----------
        ms2_annot : DataFrame
            The MS2 annotation DataFrame with all annotations.

        Returns
        -------
        DataFrame
            The summarized lipid report, indexed by mf_id.

        Notes
        -----
        Annotations are resolved per mass feature in three tiers:

        1. Consensus to the molecular lipid feature (MLF) level: a single hit
           wins all three ranks (highest entropy similarity, highest fraction
           of reference m/z matched, and MLF fragment evidence) in each scan,
           and all scans agree on one name.
        2. Otherwise, consensus to the species level: every hit for the mass
           feature shares one lipid_summed_name.
        3. Otherwise, per-scan resolution that may keep multiple co-eluting
           annotations for the same mass feature.
        """
        # Drop unnecessary columns for easier viewing
        columns_to_drop = [
            "precursor_mz",
            "precursor_mz_error_ppm",
            "metabref_mol_id",
            "metabref_precursor_mz",
            "cas",
            "inchikey",
            "inchi",
            "chebi",
            "smiles",
            "kegg",
            "data_id",
            "iupac_name",
            "traditional_name",
            "common_name",
            "casno",
        ]
        ms2_annot = ms2_annot.drop(
            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
        )

        # If ion_types_excluded is not empty, remove those ion types
        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
            "ms2"
        ].molecular_search.ion_types_excluded
        if len(ion_types_excluded) > 0:
            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]

        # If mf_id is not present, check that the index name is mf_id and reset the index
        if "mf_id" not in ms2_annot.columns:
            if ms2_annot.index.name == "mf_id":
                ms2_annot = ms2_annot.reset_index()
            else:
                raise ValueError("mf_id is not present in the dataframe")

        # Attempt to get consensus annotations to the MLF level
        mlf_results_all = []
        for mf_id in ms2_annot["mf_id"].unique():
            mlf_results_perid = []
            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
            ms2_annot_mf["n_spectra_contributing"] = ms2_annot_mf.query_spectrum_id.nunique()

            # NOTE(review): this iterates the spectrum ids of the WHOLE table
            # (ms2_annot), not just this mass feature (ms2_annot_mf). Scans
            # belonging to other mf_ids yield empty subsets that fail the
            # nunique() == 1 test below, so the result should be unchanged,
            # but the loop does wasted work — presumably ms2_annot_mf was
            # intended; confirm before changing.
            for query_scan in ms2_annot["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_mf[
                    ms2_annot_mf["query_spectrum_id"] == query_scan
                ].copy()

                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
                    # NOTE(review): this inner check repeats the outer
                    # condition verbatim and is always True here — redundant.
                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                        # Rank flags: True on the row(s) that are best in each
                        # criterion [HIGHER = BETTER].
                        ms2_annot_sub["entropy_max"] = (
                            ms2_annot_sub["entropy_similarity"]
                            == ms2_annot_sub["entropy_similarity"].max()
                        )
                        ms2_annot_sub["ref_match_fract_max"] = (
                            ms2_annot_sub["ref_mz_in_query_fract"]
                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
                        )
                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
                            "query_frag_types"
                        ].apply(lambda x: True if "MLF" in x else False)

                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                        ms2_annot_sub["consensus"] = ms2_annot_sub[
                            ["entropy_max", "ref_match_fract_max", "frag_max"]
                        ].all(axis=1)

                        # If there is a consensus, take the row with the highest entropy_similarity
                        if ms2_annot_sub["consensus"].any():
                            ms2_annot_sub = ms2_annot_sub[
                                ms2_annot_sub["entropy_similarity"]
                                == ms2_annot_sub["entropy_similarity"].max()
                            ].head(1)
                            mlf_results_perid.append(ms2_annot_sub)
            # Collapse the per-scan winners: accept the MLF consensus only if
            # every contributing scan agreed on a single name.
            if len(mlf_results_perid) == 0:
                mlf_results_perid = pd.DataFrame()
            else:
                mlf_results_perid = pd.concat(mlf_results_perid)
                if mlf_results_perid["name"].nunique() == 1:
                    mlf_results_perid = mlf_results_perid[
                        mlf_results_perid["entropy_similarity"]
                        == mlf_results_perid["entropy_similarity"].max()
                    ].head(1)
                else:
                    mlf_results_perid = pd.DataFrame()
                mlf_results_all.append(mlf_results_perid)

        # These are the consensus annotations to the MLF level
        if len(mlf_results_all) > 0:
            mlf_results_all = pd.concat(mlf_results_all)
            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
        else:
            # Make an empty dataframe
            mlf_results_all = ms2_annot.head(0)

        # For remaining mf_ids, try to get a consensus annotation to the species level
        species_results_all = []
        # Remove mf_ids that have consensus annotations to the MLF level
        ms2_annot_spec = ms2_annot[
            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
        ]
        for mf_id in ms2_annot_spec["mf_id"].unique():
            # Do all the hits have the same lipid_summed_name?
            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)

            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
                # Grab the highest entropy_similarity result
                ms2_annot_sub = ms2_annot_sub[
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                ].head(1)
                species_results_all.append(ms2_annot_sub)

        # These are the consensus annotations to the species level
        if len(species_results_all) > 0:
            species_results_all = pd.concat(species_results_all)
            species_results_all["annot_level"] = "species"
        else:
            # Make an empty dataframe
            species_results_all = ms2_annot.head(0)

        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
        # Remove mf_ids that have consensus annotations to the species level
        ms2_annot_remaining = ms2_annot_spec[
            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
        ]
        no_consensus = []
        for mf_id in ms2_annot_remaining["mf_id"].unique():
            id_sub = []
            id_no_con = []
            ms2_annot_sub_mf = ms2_annot_remaining[
                ms2_annot_remaining["mf_id"] == mf_id
            ].copy()
            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
                ms2_annot_sub = ms2_annot_sub_mf[
                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
                ].copy()

                # New columns for ranking [HIGHER RANK = BETTER]
                ms2_annot_sub["entropy_max"] = (
                    ms2_annot_sub["entropy_similarity"]
                    == ms2_annot_sub["entropy_similarity"].max()
                )
                ms2_annot_sub["ref_match_fract_max"] = (
                    ms2_annot_sub["ref_mz_in_query_fract"]
                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
                )
                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
                    lambda x: True if "MLF" in x else False
                )

                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
                ms2_annot_sub["consensus"] = ms2_annot_sub[
                    ["entropy_max", "ref_match_fract_max", "frag_max"]
                ].all(axis=1)
                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
                id_sub.append(ms2_annot_sub_con)
                id_no_con.append(ms2_annot_sub)
            id_sub = pd.concat(id_sub)
            id_no_con = pd.concat(id_no_con)

            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
            if (
                id_sub["query_frag_types"]
                .apply(lambda x: True if "MLF" in x else False)
                .all()
                and len(id_sub) > 0
            ):
                # Keep the best-scoring hit per distinct name.
                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
                id_sub = id_sub.loc[idx]
                # Reorder so highest entropy_similarity is first
                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
                id_sub["annot_level"] = id_sub["structure_level"]
                no_consensus.append(id_sub)

            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
            elif len(id_sub) == 0:
                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
                    summed_sub = id_no_con[
                        id_no_con["lipid_summed_name"] == lipid_summed_name
                    ]
                    # Any consensus to MLF?
                    if summed_sub["consensus"].any():
                        summed_sub = summed_sub[summed_sub["consensus"]]
                        summed_sub["annot_level"] = summed_sub["structure_level"]
                        no_consensus.append(summed_sub)
                    else:
                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
                        summed_sub = summed_sub[
                            summed_sub["entropy_similarity"]
                            == summed_sub["entropy_similarity"].max()
                        ].head(1)
                        # get first row
                        summed_sub["annot_level"] = "species"
                        summed_sub["name"] = ""
                        no_consensus.append(summed_sub)
            else:
                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)

        if len(no_consensus) > 0:
            no_consensus = pd.concat(no_consensus)
        else:
            no_consensus = ms2_annot.head(0)

        # Combine all the consensus annotations and reformat the dataframe for output
        species_results_all = species_results_all.drop(columns=["name"])
        species_results_all["lipid_molecular_species_id"] = ""
        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
        consensus_annotations = pd.concat(
            [mlf_results_all, species_results_all, no_consensus]
        )
        consensus_annotations = consensus_annotations.sort_values(
            "mf_id", ascending=True
        )
        cols_to_keep = [
            "mf_id",
            "ref_ion_type",
            "entropy_similarity",
            "ref_mz_in_query_fract",
            "lipid_molecular_species_id",
            "lipid_summed_name",
            "lipid_subclass",
            "lipid_class",
            "lipid_category",
            "formula",
            "annot_level",
            "n_spectra_contributing",
        ]
        consensus_annotations = consensus_annotations[cols_to_keep]
        consensus_annotations = consensus_annotations.set_index("mf_id")

        return consensus_annotations

    def clean_ms2_report(self, lipid_summary):
        """Clean the MS2 report.

        Adds the ion formula for each annotation, reorders the reporting
        columns, and reindexes the frame by mf_id.

        Parameters
        ----------
        lipid_summary : DataFrame
            The full lipid summary DataFrame.

        Returns
        -------
        DataFrame
            The cleaned lipid summary DataFrame, indexed by mf_id.
        """
        lipid_summary = lipid_summary.reset_index()
        # Derive the ion formula from the neutral formula and the reference
        # ion (adduct) type.
        lipid_summary["ion_formula"] = [
            self.get_ion_formula(f, a)
            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
        ]

        # Reorder columns
        lipid_summary = lipid_summary[
            [
                "mf_id",
                "ion_formula",
                "ref_ion_type",
                "formula",
                "annot_level",
                "lipid_molecular_species_id",
                "lipid_summed_name",
                "lipid_subclass",
                "lipid_class",
                "lipid_category",
                "entropy_similarity",
                "ref_mz_in_query_fract",
                "n_spectra_contributing",
            ]
        ]

        # Set the index to mf_id
        lipid_summary = lipid_summary.set_index("mf_id")

        return lipid_summary

    def to_report(self, molecular_metadata=None):
        """Create a report of the mass features and their annotations.

        Parameters
        ----------
        molecular_metadata : dict, optional
            The molecular metadata. Default is None.

        Returns
        -------
        DataFrame
            The report of the mass features and their annotations.

        Notes
        -----
        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
        """
        # Get mass feature dataframe
        mf_report = self.mass_spectra.mass_features_to_df()
        mf_report = mf_report.reset_index(drop=False)

        # Get and clean ms1 annotation dataframe
        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
        ms1_annot_report = ms1_annot_report.reset_index(drop=False)

        # Get, summarize, and clean ms2 annotation dataframe
        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
            molecular_metadata=molecular_metadata
        )
        if ms2_annot_report is not None and molecular_metadata is not None:
            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
        # NOTE(review): unlike LCMSMetabolomicsExport.to_report, there is no
        # else branch forcing ms2_annot_report to None here — if the raw MS2
        # table is non-None while molecular_metadata is None, the
        # unsummarized frame is passed straight to combine_reports. Confirm
        # that is intended.
        report = self.combine_reports(
            mf_report=mf_report,
            ms1_annot_report=ms1_annot_report,
            ms2_annot_report=ms2_annot_report
        )
        return report
2095
2096        
# NOTE(review): this re-assignment duplicates the module-level ion_type_dict
# defined at the top of the file with identical contents. Rebinding it here is
# a behavioral no-op, but the duplication should probably be removed.
ion_type_dict = {
    # adduct : [atoms to add, atoms to subtract] when deriving the ion formula
    "M+": [{}, {}],
    "protonated": [{"H": 1}, {}],
    "[M+H]+": [{"H": 1}, {}],
    "[M+NH4]+": [{"N": 1, "H": 4}, {}],  # ammonium
    "[M+Na]+": [{"Na": 1}, {}],
    "[M+K]+": [{"K": 1}, {}],
    "[M+2Na+Cl]+": [{"Na": 2, "Cl": 1}, {}],
    "[M+2Na-H]+": [{"Na": 2}, {"H": 1}],
    "[M+C2H3Na2O2]+": [{"C": 2, "H": 3, "Na": 2, "O": 2}, {}],
    "[M+C4H10N3]+": [{"C": 4, "H": 10, "N": 3}, {}],
    "[M+NH4+ACN]+": [{"C": 2, "H": 7, "N": 2}, {}],
    "[M+H-H2O]+": [{}, {"H": 1, "O": 1}],
    "de-protonated": [{}, {"H": 1}],
    "[M-H]-": [{}, {"H": 1}],
    "[M+Cl]-": [{"Cl": 1}, {}],
    "[M+HCOO]-": [{"C": 1, "H": 1, "O": 2}, {}],  # formate
    "[M+CH3COO]-": [{"C": 2, "H": 3, "O": 2}, {}],  # acetate
    "[M+2NaAc+Cl]-": [{"Na": 2, "C": 2, "H": 3, "O": 2, "Cl": 1}, {}],
    "[M+K-2H]-": [{"K": 1}, {"H": 2}],
    "[M+Na-2H]-": [{"Na": 1}, {"H": 2}],
}
class LowResGCMSExport:
    """A class to export low resolution GC-MS data.

    This class provides methods to export low resolution GC-MS data to
    various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

    Parameters
    ----------
    out_file_path : str
        The output file path.
    gcms : object
        The low resolution GCMS object.

    Attributes
    ----------
    output_file : Path
        The output file path as a Path object.
    gcms : object
        The low resolution GCMS object.

    Methods
    -------
    * get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
    * get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
    * to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
    * to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
    * to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
    * to_hdf(id_label="corems:"). Export the data to an HDF5 file.
    * get_data_stats(gcms). Get statistics about the GCMS data.
    """

    def __init__(self, out_file_path, gcms):
        # Normalize to a Path so the export methods can swap suffixes
        # (.pkl/.xlsx/.csv/.hdf5/.json) off a single base path.
        self.output_file = Path(out_file_path)

        self.gcms = gcms

        self._init_columns()

    def _init_columns(self):
        """Initialize the column names for the exported data.

        Returns
        -------
        list
            The list of column names.
        """

        columns = [
            "Sample name",
            "Peak Index",
            "Retention Time",
            "Retention Time Ref",
            "Peak Height",
            "Peak Area",
            "Retention index",
            "Retention index Ref",
            "Retention Index Score",
            "Similarity Score",
            "Spectral Similarity Score",
            "Compound Name",
            "Chebi ID",
            "Kegg Compound ID",
            "Inchi",
            "Inchi Key",
            "Smiles",
            "Molecular Formula",
            "IUPAC Name",
            "Traditional Name",
            "Common Name",
            "Derivatization",
        ]

        # Exploratory mode adds one column per spectral-similarity metric.
        if self.gcms.molecular_search_settings.exploratory_mode:
            columns.extend(
                [
                    "Weighted Cosine Correlation",
                    "Cosine Correlation",
                    "Stein Scott Similarity",
                    "Pearson Correlation",
                    "Spearman Correlation",
                    "Kendall Tau Correlation",
                    "Euclidean Distance",
                    "Manhattan Distance",
                    "Jaccard Distance",
                    "DWT Correlation",
                    "DFT Correlation",
                ]
            )

            columns.extend(list(methods_name.values()))

        return columns

    def get_pandas_df(self, id_label="corems:"):
        """Get the exported data as a Pandas DataFrame.

        Parameters
        ----------
        id_label : str, optional
            The ID label for the data. Default is "corems:".

        Returns
        -------
        DataFrame
            The exported data as a Pandas DataFrame.
        """

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        df = DataFrame(dict_data_list, columns=columns)

        df.name = self.gcms.sample_name

        return df

    def get_json(self, nan=False, id_label="corems:"):
        """Get the exported data as a JSON string.

        Parameters
        ----------
        nan : bool, optional
            Whether to include NaN values in the JSON string. Default is False.
            NOTE(review): currently unused by the implementation — confirm
            intended behavior before relying on it.
        id_label : str, optional
            The ID label for the data. Default is "corems:".

        Returns
        -------
        str
            The exported data serialized as JSON.
        """
        # json is imported at module level; no local re-import needed.
        dict_data_list = self.get_list_dict_data(self.gcms)

        return json.dumps(
            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
        )

    def to_pandas(self, write_metadata=True, id_label="corems:"):
        """Export the data to a Pandas DataFrame and save it as a pickle file.

        Parameters
        ----------
        write_metadata : bool, optional
            Whether to write metadata to the output file.
        id_label : str, optional
            The ID label for the data.
        """

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        df = DataFrame(dict_data_list, columns=columns)

        df.to_pickle(self.output_file.with_suffix(".pkl"))

        if write_metadata:
            # Forward the caller's id_label (previously hard-coded to "corems:").
            self.write_settings(
                self.output_file.with_suffix(".pkl"), self.gcms, id_label=id_label
            )

    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
        """Export the data to an Excel file.

        Parameters
        ----------
        write_mode : str, optional
            The write mode for the Excel file. Default is 'a' (append).
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """

        out_put_path = self.output_file.with_suffix(".xlsx")

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        df = DataFrame(dict_data_list, columns=columns)

        if write_mode == "a" and out_put_path.exists():
            # Append the new rows below the existing ones. Re-reading the
            # sheet and rewriting it avoids assigning to ExcelWriter.book /
            # ExcelWriter.sheets, which are read-only in pandas >= 1.5 and
            # made the old append path crash.
            existing = read_excel(out_put_path)
            combined = pd.concat([existing, df], ignore_index=True)
            combined.to_excel(out_put_path, index=False, engine="openpyxl")
        else:
            df.to_excel(out_put_path, index=False, engine="openpyxl")

        if write_metadata:
            self.write_settings(out_put_path, self.gcms, id_label=id_label)

    def to_csv(
        self,
        separate_output=False,
        write_mode="w",
        write_metadata=True,
        id_label="corems:",
    ):
        """Export the data to a CSV file.

        Parameters
        ----------
        separate_output : bool, optional
            Whether to separate the output into multiple files. Default is False.
        write_mode : str, optional
            The write mode for the CSV file. Default is 'w' (write).
            NOTE(review): this argument is always overridden below
            (separate_output=True -> "w", otherwise "a") — kept for
            backward compatibility of the signature.
        write_metadata : bool, optional
            Whether to write metadata to the output file. Default is True.
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """

        if separate_output:
            # set write mode to write
            # this mode will overwrite the file without warning
            write_mode = "w"
        else:
            # set write mode to append
            write_mode = "a"

        columns = self._init_columns()

        dict_data_list = self.get_list_dict_data(self.gcms)

        out_put_path = self.output_file.with_suffix(".csv")

        # Only emit the header when creating the file, so appends don't
        # produce duplicate header rows.
        write_header = not out_put_path.exists()

        try:
            with open(out_put_path, write_mode, newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=columns)
                if write_header:
                    writer.writeheader()
                for data in dict_data_list:
                    writer.writerow(data)

            if write_metadata:
                self.write_settings(out_put_path, self.gcms, id_label=id_label)

        except IOError as ioerror:
            print(ioerror)

    def to_hdf(self, id_label="corems:"):
        """Export the data to an HDF5 file.

        Parameters
        ----------
        id_label : str, optional
            The ID label for the data. Default is "corems:".
        """

        # One HDF5 group per GC peak (keyed by retention time); each matched
        # reference compound becomes a subgroup of its peak.
        def add_compound(gc_peak, compound_obj):
            """Store one reference-compound match under the current peak group."""
            modifier = compound_obj.classify if compound_obj.classify else ""
            group_name = compound_obj.name.replace("/", "") + " " + modifier

            if group_name not in peak_group:
                compound_group = peak_group.create_group(group_name)

                # compound_group.attrs["retention_time"] = compound_obj.retention_time
                compound_group.attrs["retention_index"] = compound_obj.ri
                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
                compound_group.attrs["spectral_similarity_score"] = (
                    compound_obj.spectral_similarity_score
                )
                compound_group.attrs["similarity_score"] = compound_obj.similarity_score

                compound_group.create_dataset(
                    "mz", data=np.array(compound_obj.mz), dtype="f8"
                )
                compound_group.create_dataset(
                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
                )

                if self.gcms.molecular_search_settings.exploratory_mode:
                    compound_group.attrs["Spectral Similarities"] = json.dumps(
                        compound_obj.spectral_similarity_scores,
                        sort_keys=False,
                        indent=4,
                        separators=(",", ":"),
                    )
            else:
                warnings.warn("Skipping duplicate reference compound.")

        # json, h5py, np, datetime and timezone are module-level imports;
        # the previous function-local re-imports were redundant.
        output_path = self.output_file.with_suffix(".hdf5")

        with h5py.File(output_path, "w") as hdf_handle:
            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
            hdf_handle.attrs["time_stamp"] = timenow
            hdf_handle.attrs["data_structure"] = "gcms"
            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label

            # Fix: was the literal string "self.gcms.id" instead of the value.
            hdf_handle.attrs["sample_id"] = str(self.gcms.id)
            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
            hdf_handle.attrs["output_data"] = str(output_path)
            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
            hdf_handle.attrs["corems_version"] = __version__

            hdf_handle.attrs["Stats"] = json.dumps(
                self.get_data_stats(self.gcms),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )
            hdf_handle.attrs["Calibration"] = json.dumps(
                self.get_calibration_stats(self.gcms, id_label),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )
            hdf_handle.attrs["Blank"] = json.dumps(
                self.get_blank_stats(self.gcms),
                sort_keys=False,
                indent=4,
                separators=(",", ": "),
            )

            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
            )

            # Chromatogram-level datasets.
            hdf_handle.create_dataset(
                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
            )
            hdf_handle.create_dataset(
                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
            )
            hdf_handle.create_dataset(
                "tic", data=np.array(self.gcms.tic), dtype="f8"
            )
            hdf_handle.create_dataset(
                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
            )

            output_score_method = (
                self.gcms.molecular_search_settings.output_score_method
            )

            for gc_peak in self.gcms:
                # check if there is a compound candidate
                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
                peak_group.attrs["deconvolution"] = int(
                    self.gcms.chromatogram_settings.use_deconvolution
                )

                peak_group.attrs["start_scan"] = gc_peak.start_scan
                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
                peak_group.attrs["final_scan"] = gc_peak.final_scan

                peak_group.attrs["retention_index"] = gc_peak.ri
                peak_group.attrs["retention_time"] = gc_peak.retention_time
                peak_group.attrs["area"] = gc_peak.area

                peak_group.create_dataset(
                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
                )
                peak_group.create_dataset(
                    "abundance",
                    data=np.array(gc_peak.mass_spectrum.abundance),
                    dtype="f8",
                )

                if gc_peak:
                    if output_score_method == "highest_sim_score":
                        add_compound(gc_peak, gc_peak.highest_score_compound)

                    elif output_score_method == "highest_ss":
                        add_compound(gc_peak, gc_peak.highest_ss_compound)

                    else:
                        for compound_obj in gc_peak:
                            add_compound(gc_peak, compound_obj)

    def get_data_stats(self, gcms):
        """Get statistics about the GCMS data.

        Parameters
        ----------
        gcms : object
            The low resolution GCMS object.

        Returns
        -------
        dict
            A dictionary containing the data statistics.
        """

        matched_peaks = gcms.matched_peaks
        no_matched_peaks = gcms.no_matched_peaks
        unique_metabolites = gcms.unique_metabolites

        peak_matchs_above_0p85 = 0
        unique_peak_match_above_0p85 = 0
        for match_peak in matched_peaks:
            matches_above_85 = list(
                filter(lambda m: m.similarity_score >= 0.85, match_peak)
            )
            if matches_above_85:
                peak_matchs_above_0p85 += 1
            # Exactly one candidate above the cutoff counts as a unique match.
            if len(matches_above_85) == 1:
                unique_peak_match_above_0p85 += 1

        data_stats = {}
        data_stats["average_signal_noise"] = "ni"  # "ni" = not implemented
        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
        data_stats["total_number_peaks"] = len(gcms)
        data_stats["total_peaks_matched"] = len(matched_peaks)
        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
        data_stats["single_matches_above_similarity_score_0.85"] = (
            unique_peak_match_above_0p85
        )
        data_stats["unique_metabolites"] = len(unique_metabolites)

        return data_stats

    def get_calibration_stats(self, gcms, id_label):
        """Get statistics about the GC-MS calibration.

        Parameters
        ----------
        gcms : object
            The low resolution GCMS object.
        id_label : str
            The ID label prefixed to the calibration-file checksum.

        Returns
        -------
        dict
            A dictionary describing the retention-index calibration.
        """
        calibration_parameters = {}

        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
        calibration_parameters["data_url"] = str(gcms.cal_file_path)
        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
        calibration_parameters["calibration_method"] = ""

        return calibration_parameters

    def get_blank_stats(self, gcms):
        """Get statistics about the GC-MS blank.

        All fields are currently placeholders ("ni" = not implemented).
        """
        blank_parameters = {}

        blank_parameters["data_name"] = "ni"
        blank_parameters["blank_id"] = "ni"
        blank_parameters["data_url"] = "ni"
        blank_parameters["has_input"] = "ni"
        blank_parameters["common_features_to_blank"] = "ni"

        return blank_parameters

    def get_instrument_metadata(self, gcms):
        """Get metadata about the GC-MS instrument."""
        instrument_metadata = {}

        instrument_metadata["analyzer"] = gcms.analyzer
        instrument_metadata["instrument_label"] = gcms.instrument_label
        # A fresh random ID per call; no persistent instrument identity is kept.
        instrument_metadata["instrument_id"] = uuid.uuid4().hex

        return instrument_metadata

    def get_data_metadata(self, gcms, id_label, output_path):
        """Get metadata about the GC-MS data.

        Parameters
        ----------
        gcms : object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data.
        output_path : str
            The output file path.

        Returns
        -------
        dict
            A dictionary containing the data metadata.
        """
        if isinstance(output_path, str):
            output_path = Path(output_path)

        parameters_path = output_path.with_suffix(".json")

        # If a settings JSON already exists next to the output, extend its
        # "Data" section so repeated exports accumulate provenance entries.
        if parameters_path.exists():
            with parameters_path.open() as current_param:
                metadata = json.load(current_param)
                data_metadata = metadata.get("Data")
        else:
            data_metadata = {}
            data_metadata["data_name"] = []
            data_metadata["input_data_url"] = []
            data_metadata["has_input"] = []

        data_metadata["data_name"].append(gcms.sample_name)
        data_metadata["input_data_url"].append(str(gcms.file_location))
        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))

        data_metadata["output_data_name"] = str(output_path.stem)
        data_metadata["output_data_url"] = str(output_path)
        data_metadata["has_output"] = id_label + corems_md5(output_path)

        return data_metadata

    def get_parameters_json(self, gcms, id_label, output_path):
        """Get the parameters as a JSON string.

        Parameters
        ----------
        gcms : GCMS object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data.
        output_path : str
            The output file path.

        Returns
        -------
        str
            The parameters as a JSON string.
        """

        output_parameters_dict = {}
        output_parameters_dict["Data"] = self.get_data_metadata(
            gcms, id_label, output_path
        )
        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
        output_parameters_dict["Calibration"] = self.get_calibration_stats(
            gcms, id_label
        )
        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
        corems_dict_setting["corems_version"] = __version__
        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
        output = json.dumps(
            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
        )

        return output

    def write_settings(self, output_path, gcms, id_label="emsl:"):
        """Write the settings to a JSON file.

        Parameters
        ----------
        output_path : str or Path
            The output file path; the settings file gets a ".json" suffix.
        gcms : GCMS object
            The low resolution GCMS object.
        id_label : str
            The ID label for the data. Default is "emsl:".
        """

        output = self.get_parameters_json(gcms, id_label, output_path)

        with open(
            output_path.with_suffix(".json"),
            "w",
            encoding="utf8",
        ) as outfile:
            outfile.write(output)

    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
        """Get the exported data as a list of dictionaries.

        Parameters
        ----------
        gcms : object
            The low resolution GCMS object.
        include_no_match : bool, optional
            Whether to include no match data. Default is True.
        no_match_inline : bool, optional
            Whether to include no match data inline (in peak order) rather
            than appended at the end. Default is False.

        Returns
        -------
        list
            The exported data as a list of dictionaries.
        """

        output_score_method = gcms.molecular_search_settings.output_score_method

        dict_data_list = []

        # Both helpers close over the loop variables gcpeak_index, gc_peak
        # and compound_obj set in the loops below.
        def add_match_dict_data():
            """Append one row for the current peak/compound match."""
            derivatization = "{}:{}:{}".format(
                compound_obj.classify,
                compound_obj.derivativenum,
                compound_obj.derivatization,
            )
            out_dict = {
                "Sample name": gcms.sample_name,
                "Peak Index": gcpeak_index,
                "Retention Time": gc_peak.retention_time,
                "Retention Time Ref": compound_obj.retention_time,
                "Peak Height": gc_peak.tic,
                "Peak Area": gc_peak.area,
                "Retention index": gc_peak.ri,
                "Retention index Ref": compound_obj.ri,
                "Retention Index Score": compound_obj.ri_score,
                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
                "Similarity Score": compound_obj.similarity_score,
                "Compound Name": compound_obj.name,
                "Chebi ID": compound_obj.metadata.chebi,
                "Kegg Compound ID": compound_obj.metadata.kegg,
                "Inchi": compound_obj.metadata.inchi,
                "Inchi Key": compound_obj.metadata.inchikey,
                "Smiles": compound_obj.metadata.smiles,
                "Molecular Formula": compound_obj.formula,
                "IUPAC Name": compound_obj.metadata.iupac_name,
                "Traditional Name": compound_obj.metadata.traditional_name,
                "Common Name": compound_obj.metadata.common_name,
                "Derivatization": derivatization,
            }

            if self.gcms.molecular_search_settings.exploratory_mode:
                out_dict.update(
                    {
                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "weighted_cosine_correlation"
                        ),
                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
                            "cosine_correlation"
                        ),
                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
                            "stein_scott_similarity"
                        ),
                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
                            "pearson_correlation"
                        ),
                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
                            "spearman_correlation"
                        ),
                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
                            "kendall_tau_correlation"
                        ),
                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dft_correlation"
                        ),
                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
                            "dwt_correlation"
                        ),
                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
                            "euclidean_distance"
                        ),
                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
                            "manhattan_distance"
                        ),
                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
                            "jaccard_distance"
                        ),
                    }
                )
                for method in methods_name:
                    out_dict[methods_name.get(method)] = (
                        compound_obj.spectral_similarity_scores.get(method)
                    )

            dict_data_list.append(out_dict)

        def add_no_match_dict_data():
            """Append a reduced row for a peak without any compound match."""
            dict_data_list.append(
                {
                    "Sample name": gcms.sample_name,
                    "Peak Index": gcpeak_index,
                    "Retention Time": gc_peak.retention_time,
                    "Peak Height": gc_peak.tic,
                    "Peak Area": gc_peak.area,
                    "Retention index": gc_peak.ri,
                }
            )

        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
            # check if there is a compound candidate
            if gc_peak:
                if output_score_method == "highest_sim_score":
                    compound_obj = gc_peak.highest_score_compound
                    add_match_dict_data()

                elif output_score_method == "highest_ss":
                    compound_obj = gc_peak.highest_ss_compound
                    add_match_dict_data()

                else:
                    for compound_obj in gc_peak:
                        add_match_dict_data()  # add monoisotopic peak

            else:
                # include not_match
                if include_no_match and no_match_inline:
                    add_no_match_dict_data()

        # When not inline, all unmatched peaks are appended after the matches.
        if include_no_match and not no_match_inline:
            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
                if not gc_peak:
                    add_no_match_dict_data()

        return dict_data_list

A class to export low resolution GC-MS data.

This class provides methods to export low resolution GC-MS data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters:

out_file_path : str — the output file path.
gcms : object — the low resolution GCMS object.

Attributes:

output_file : Path The output file path as a Path object. gcms : object The low resolution GCMS object.

Methods:

  • get_pandas_df(id_label="corems:"). Get the exported data as a Pandas DataFrame.
  • get_json(nan=False, id_label="corems:"). Get the exported data as a JSON string.
  • to_pandas(write_metadata=True, id_label="corems:"). Export the data to a Pandas DataFrame and save it as a pickle file.
  • to_excel(write_mode='a', write_metadata=True, id_label="corems:"). Export the data to an Excel file.
  • to_csv(separate_output=False, write_mode="w", write_metadata=True, id_label="corems:"). Export the data to a CSV file.
  • to_hdf(id_label="corems:"). Export the data to an HDF5 file.
  • get_data_stats(gcms). Get statistics about the GCMS data.
LowResGCMSExport(out_file_path, gcms)
90    def __init__(self, out_file_path, gcms):
91        self.output_file = Path(out_file_path)
92
93        self.gcms = gcms
94
95        self._init_columns()
output_file
gcms
def get_pandas_df(self, id_label='corems:'):
152    def get_pandas_df(self, id_label="corems:"):
153        """Get the exported data as a Pandas DataFrame.
154
155        Parameters:
156        ----------
157        id_label : str, optional
158            The ID label for the data. Default is "corems:".
159
160        Returns:
161        -------
162        DataFrame
163            The exported data as a Pandas DataFrame.
164        """
165
166        columns = self._init_columns()
167
168        dict_data_list = self.get_list_dict_data(self.gcms)
169
170        df = DataFrame(dict_data_list, columns=columns)
171
172        df.name = self.gcms.sample_name
173
174        return df

Get the exported data as a Pandas DataFrame.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".

Returns:

DataFrame The exported data as a Pandas DataFrame.

def get_json(self, nan=False, id_label='corems:'):
176    def get_json(self, nan=False, id_label="corems:"):
177        """Get the exported data as a JSON string.
178
179        Parameters:
180        ----------
181        nan : bool, optional
182            Whether to include NaN values in the JSON string. Default is False.
183        id_label : str, optional
184            The ID label for the data. Default is "corems:".
185
186        """
187
188        import json
189
190        dict_data_list = self.get_list_dict_data(self.gcms)
191
192        return json.dumps(
193            dict_data_list, sort_keys=False, indent=4, separators=(",", ": ")
194        )

Get the exported data as a JSON string.

Parameters:

nan : bool, optional Whether to include NaN values in the JSON string. Default is False. id_label : str, optional The ID label for the data. Default is "corems:".

def to_pandas(self, write_metadata=True, id_label='corems:'):
196    def to_pandas(self, write_metadata=True, id_label="corems:"):
197        """Export the data to a Pandas DataFrame and save it as a pickle file.
198
199        Parameters:
200        ----------
201        write_metadata : bool, optional
202            Whether to write metadata to the output file.
203        id_label : str, optional
204            The ID label for the data.
205        """
206
207        columns = self._init_columns()
208
209        dict_data_list = self.get_list_dict_data(self.gcms)
210
211        df = DataFrame(dict_data_list, columns=columns)
212
213        df.to_pickle(self.output_file.with_suffix(".pkl"))
214
215        if write_metadata:
216            self.write_settings(
217                self.output_file.with_suffix(".pkl"), self.gcms, id_label="corems:"
218            )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. id_label : str, optional The ID label for the data.

def to_excel(self, write_mode='a', write_metadata=True, id_label='corems:'):
220    def to_excel(self, write_mode="a", write_metadata=True, id_label="corems:"):
221        """Export the data to an Excel file.
222
223        Parameters:
224        ----------
225        write_mode : str, optional
226            The write mode for the Excel file. Default is 'a' (append).
227        write_metadata : bool, optional
228            Whether to write metadata to the output file. Default is True.
229        id_label : str, optional
230            The ID label for the data. Default is "corems:".
231        """
232
233        out_put_path = self.output_file.with_suffix(".xlsx")
234
235        columns = self._init_columns()
236
237        dict_data_list = self.get_list_dict_data(self.gcms)
238
239        df = DataFrame(dict_data_list, columns=columns)
240
241        if write_mode == "a" and out_put_path.exists():
242            writer = ExcelWriter(out_put_path, engine="openpyxl")
243            # try to open an existing workbook
244            writer.book = load_workbook(out_put_path)
245            # copy existing sheets
246            writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
247            # read existing file
248            reader = read_excel(out_put_path)
249            # write out the new sheet
250            df.to_excel(writer, index=False, header=False, startrow=len(reader) + 1)
251
252            writer.close()
253        else:
254            df.to_excel(
255                self.output_file.with_suffix(".xlsx"), index=False, engine="openpyxl"
256            )
257
258        if write_metadata:
259            self.write_settings(out_put_path, self.gcms, id_label=id_label)

Export the data to an Excel file.

Parameters:

write_mode : str, optional The write mode for the Excel file. Default is 'a' (append). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".

def to_csv( self, separate_output=False, write_mode='w', write_metadata=True, id_label='corems:'):
261    def to_csv(
262        self,
263        separate_output=False,
264        write_mode="w",
265        write_metadata=True,
266        id_label="corems:",
267    ):
268        """Export the data to a CSV file.
269
270        Parameters:
271        ----------
272        separate_output : bool, optional
273            Whether to separate the output into multiple files. Default is False.
274        write_mode : str, optional
275            The write mode for the CSV file. Default is 'w' (write).
276        write_metadata : bool, optional
277            Whether to write metadata to the output file. Default is True.
278        id_label : str, optional
279            The ID label for the data. Default is "corems:".
280        """
281
282        if separate_output:
283            # set write mode to write
284            # this mode will overwrite the file without warning
285            write_mode = "w"
286        else:
287            # set write mode to append
288            write_mode = "a"
289
290        columns = self._init_columns()
291
292        dict_data_list = self.get_list_dict_data(self.gcms)
293
294        out_put_path = self.output_file.with_suffix(".csv")
295
296        write_header = not out_put_path.exists()
297
298        try:
299            with open(out_put_path, write_mode, newline="") as csvfile:
300                writer = csv.DictWriter(csvfile, fieldnames=columns)
301                if write_header:
302                    writer.writeheader()
303                for data in dict_data_list:
304                    writer.writerow(data)
305
306            if write_metadata:
307                self.write_settings(out_put_path, self.gcms, id_label=id_label)
308
309        except IOError as ioerror:
310            print(ioerror)

Export the data to a CSV file.

Parameters:

separate_output : bool, optional Whether to separate the output into multiple files. Default is False. write_mode : str, optional The write mode for the CSV file. Default is 'w' (write). write_metadata : bool, optional Whether to write metadata to the output file. Default is True. id_label : str, optional The ID label for the data. Default is "corems:".

def to_hdf(self, id_label='corems:'):
312    def to_hdf(self, id_label="corems:"):
313        """Export the data to an HDF5 file.
314
315        Parameters:
316        ----------
317        id_label : str, optional
318            The ID label for the data. Default is "corems:".
319        """
320
321        # save sample at a time
322        def add_compound(gc_peak, compound_obj):
323            modifier = compound_obj.classify if compound_obj.classify else ""
324            compound_group = compound_obj.name.replace("/", "") + " " + modifier
325
326            if compound_group not in peak_group:
327                compound_group = peak_group.create_group(compound_group)
328
329                # compound_group.attrs["retention_time"] = compound_obj.retention_time
330                compound_group.attrs["retention_index"] = compound_obj.ri
331                compound_group.attrs["retention_index_score"] = compound_obj.ri_score
332                compound_group.attrs["spectral_similarity_score"] = (
333                    compound_obj.spectral_similarity_score
334                )
335                compound_group.attrs["similarity_score"] = compound_obj.similarity_score
336
337                compond_mz = compound_group.create_dataset(
338                    "mz", data=np.array(compound_obj.mz), dtype="f8"
339                )
340                compond_abundance = compound_group.create_dataset(
341                    "abundance", data=np.array(compound_obj.abundance), dtype="f8"
342                )
343
344                if self.gcms.molecular_search_settings.exploratory_mode:
345                    compound_group.attrs["Spectral Similarities"] = json.dumps(
346                        compound_obj.spectral_similarity_scores,
347                        sort_keys=False,
348                        indent=4,
349                        separators=(",", ":"),
350                    )
351            else:
352                warnings.warn("Skipping duplicate reference compound.")
353
354        import json
355        from datetime import datetime, timezone
356
357        import h5py
358        import numpy as np
359
360        output_path = self.output_file.with_suffix(".hdf5")
361
362        with h5py.File(output_path, "w") as hdf_handle:
363            timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
364            hdf_handle.attrs["time_stamp"] = timenow
365            hdf_handle.attrs["data_structure"] = "gcms"
366            hdf_handle.attrs["analyzer"] = self.gcms.analyzer
367            hdf_handle.attrs["instrument_label"] = self.gcms.instrument_label
368
369            hdf_handle.attrs["sample_id"] = "self.gcms.id"
370            hdf_handle.attrs["sample_name"] = self.gcms.sample_name
371            hdf_handle.attrs["input_data"] = str(self.gcms.file_location)
372            hdf_handle.attrs["output_data"] = str(output_path)
373            hdf_handle.attrs["output_data_id"] = id_label + uuid.uuid4().hex
374            hdf_handle.attrs["corems_version"] = __version__
375
376            hdf_handle.attrs["Stats"] = json.dumps(
377                self.get_data_stats(self.gcms),
378                sort_keys=False,
379                indent=4,
380                separators=(",", ": "),
381            )
382            hdf_handle.attrs["Calibration"] = json.dumps(
383                self.get_calibration_stats(self.gcms, id_label),
384                sort_keys=False,
385                indent=4,
386                separators=(",", ": "),
387            )
388            hdf_handle.attrs["Blank"] = json.dumps(
389                self.get_blank_stats(self.gcms),
390                sort_keys=False,
391                indent=4,
392                separators=(",", ": "),
393            )
394
395            corems_dict_setting = parameter_to_dict.get_dict_data_gcms(self.gcms)
396            hdf_handle.attrs["CoreMSParameters"] = json.dumps(
397                corems_dict_setting, sort_keys=False, indent=4, separators=(",", ": ")
398            )
399
400            scans_dataset = hdf_handle.create_dataset(
401                "scans", data=np.array(self.gcms.scans_number), dtype="f8"
402            )
403            rt_dataset = hdf_handle.create_dataset(
404                "rt", data=np.array(self.gcms.retention_time), dtype="f8"
405            )
406            tic_dataset = hdf_handle.create_dataset(
407                "tic", data=np.array(self.gcms.tic), dtype="f8"
408            )
409            processed_tic_dataset = hdf_handle.create_dataset(
410                "processed_tic", data=np.array(self.gcms.processed_tic), dtype="f8"
411            )
412
413            output_score_method = (
414                self.gcms.molecular_search_settings.output_score_method
415            )
416
417            for gc_peak in self.gcms:
418                # print(gc_peak.retention_time)
419                # print(gc_peak.tic)
420
421                # check if there is a compound candidate
422                peak_group = hdf_handle.create_group(str(gc_peak.retention_time))
423                peak_group.attrs["deconvolution"] = int(
424                    self.gcms.chromatogram_settings.use_deconvolution
425                )
426
427                peak_group.attrs["start_scan"] = gc_peak.start_scan
428                peak_group.attrs["apex_scan"] = gc_peak.apex_scan
429                peak_group.attrs["final_scan"] = gc_peak.final_scan
430
431                peak_group.attrs["retention_index"] = gc_peak.ri
432                peak_group.attrs["retention_time"] = gc_peak.retention_time
433                peak_group.attrs["area"] = gc_peak.area
434
435                mz = peak_group.create_dataset(
436                    "mz", data=np.array(gc_peak.mass_spectrum.mz_exp), dtype="f8"
437                )
438                abundance = peak_group.create_dataset(
439                    "abundance",
440                    data=np.array(gc_peak.mass_spectrum.abundance),
441                    dtype="f8",
442                )
443
444                if gc_peak:
445                    if output_score_method == "highest_sim_score":
446                        compound_obj = gc_peak.highest_score_compound
447                        add_compound(gc_peak, compound_obj)
448
449                    elif output_score_method == "highest_ss":
450                        compound_obj = gc_peak.highest_ss_compound
451                        add_compound(gc_peak, compound_obj)
452
453                    else:
454                        for compound_obj in gc_peak:
455                            add_compound(gc_peak, compound_obj)

Export the data to an HDF5 file.

Parameters:

id_label : str, optional The ID label for the data. Default is "corems:".

def get_data_stats(self, gcms):
457    def get_data_stats(self, gcms):
458        """Get statistics about the GCMS data.
459
460        Parameters:
461        ----------
462        gcms : object
463            The low resolution GCMS object.
464
465        Returns:
466        -------
467        dict
468            A dictionary containing the data statistics.
469        """
470
471        matched_peaks = gcms.matched_peaks
472        no_matched_peaks = gcms.no_matched_peaks
473        unique_metabolites = gcms.unique_metabolites
474
475        peak_matchs_above_0p85 = 0
476        unique_peak_match_above_0p85 = 0
477        for match_peak in matched_peaks:
478            gc_peak_above_85 = 0
479            matches_above_85 = list(
480                filter(lambda m: m.similarity_score >= 0.85, match_peak)
481            )
482            if matches_above_85:
483                peak_matchs_above_0p85 += 1
484            if len(matches_above_85) == 1:
485                unique_peak_match_above_0p85 += 1
486
487        data_stats = {}
488        data_stats["average_signal_noise"] = "ni"
489        data_stats["chromatogram_dynamic_range"] = gcms.dynamic_range
490        data_stats["total_number_peaks"] = len(gcms)
491        data_stats["total_peaks_matched"] = len(matched_peaks)
492        data_stats["total_peaks_without_matches"] = len(no_matched_peaks)
493        data_stats["total_matches_above_similarity_score_0.85"] = peak_matchs_above_0p85
494        data_stats["single_matches_above_similarity_score_0.85"] = (
495            unique_peak_match_above_0p85
496        )
497        data_stats["unique_metabolites"] = len(unique_metabolites)
498
499        return data_stats

Get statistics about the GCMS data.

Parameters:

gcms : object The low resolution GCMS object.

Returns:

dict A dictionary containing the data statistics.

def get_calibration_stats(self, gcms, id_label):
501    def get_calibration_stats(self, gcms, id_label):
502        """Get statistics about the GC-MS calibration.
503
504        Parameters:
505        ----------
506        """
507        calibration_parameters = {}
508
509        calibration_parameters["calibration_rt_ri_pairs_ref"] = gcms.ri_pairs_ref
510        calibration_parameters["data_url"] = str(gcms.cal_file_path)
511        calibration_parameters["has_input"] = id_label + corems_md5(gcms.cal_file_path)
512        calibration_parameters["data_name"] = str(gcms.cal_file_path.stem)
513        calibration_parameters["calibration_method"] = ""
514
515        return calibration_parameters

Get statistics about the GC-MS calibration.

Parameters:

gcms : object The low resolution GCMS object. id_label : str The ID label for the data.

def get_blank_stats(self, gcms):
517    def get_blank_stats(self, gcms):
518        """Get statistics about the GC-MS blank."""
519        blank_parameters = {}
520
521        blank_parameters["data_name"] = "ni"
522        blank_parameters["blank_id"] = "ni"
523        blank_parameters["data_url"] = "ni"
524        blank_parameters["has_input"] = "ni"
525        blank_parameters["common_features_to_blank"] = "ni"
526
527        return blank_parameters

Get statistics about the GC-MS blank.

def get_instrument_metadata(self, gcms):
529    def get_instrument_metadata(self, gcms):
530        """Get metadata about the GC-MS instrument."""
531        instrument_metadata = {}
532
533        instrument_metadata["analyzer"] = gcms.analyzer
534        instrument_metadata["instrument_label"] = gcms.instrument_label
535        instrument_metadata["instrument_id"] = uuid.uuid4().hex
536
537        return instrument_metadata

Get metadata about the GC-MS instrument.

def get_data_metadata(self, gcms, id_label, output_path):
539    def get_data_metadata(self, gcms, id_label, output_path):
540        """Get metadata about the GC-MS data.
541
542        Parameters:
543        ----------
544        gcms : object
545            The low resolution GCMS object.
546        id_label : str
547            The ID label for the data.
548        output_path : str
549            The output file path.
550
551        Returns:
552        -------
553        dict
554            A dictionary containing the data metadata.
555        """
556        if isinstance(output_path, str):
557            output_path = Path(output_path)
558
559        paramaters_path = output_path.with_suffix(".json")
560
561        if paramaters_path.exists():
562            with paramaters_path.open() as current_param:
563                metadata = json.load(current_param)
564                data_metadata = metadata.get("Data")
565        else:
566            data_metadata = {}
567            data_metadata["data_name"] = []
568            data_metadata["input_data_url"] = []
569            data_metadata["has_input"] = []
570
571        data_metadata["data_name"].append(gcms.sample_name)
572        data_metadata["input_data_url"].append(str(gcms.file_location))
573        data_metadata["has_input"].append(id_label + corems_md5(gcms.file_location))
574
575        data_metadata["output_data_name"] = str(output_path.stem)
576        data_metadata["output_data_url"] = str(output_path)
577        data_metadata["has_output"] = id_label + corems_md5(output_path)
578
579        return data_metadata

Get metadata about the GC-MS data.

Parameters:

gcms : object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.

Returns:

dict A dictionary containing the data metadata.

def get_parameters_json(self, gcms, id_label, output_path):
581    def get_parameters_json(self, gcms, id_label, output_path):
582        """Get the parameters as a JSON string.
583
584        Parameters:
585        ----------
586        gcms : GCMS object
587            The low resolution GCMS object.
588        id_label : str
589            The ID label for the data.
590        output_path : str
591            The output file path.
592
593        Returns:
594        -------
595        str
596            The parameters as a JSON string.
597        """
598
599        output_parameters_dict = {}
600        output_parameters_dict["Data"] = self.get_data_metadata(
601            gcms, id_label, output_path
602        )
603        output_parameters_dict["Stats"] = self.get_data_stats(gcms)
604        output_parameters_dict["Calibration"] = self.get_calibration_stats(
605            gcms, id_label
606        )
607        output_parameters_dict["Blank"] = self.get_blank_stats(gcms)
608        output_parameters_dict["Instrument"] = self.get_instrument_metadata(gcms)
609        corems_dict_setting = parameter_to_dict.get_dict_data_gcms(gcms)
610        corems_dict_setting["corems_version"] = __version__
611        output_parameters_dict["CoreMSParameters"] = corems_dict_setting
612        output_parameters_dict["has_metabolite"] = gcms.metabolites_data
613        output = json.dumps(
614            output_parameters_dict, sort_keys=False, indent=4, separators=(",", ": ")
615        )
616
617        return output

Get the parameters as a JSON string.

Parameters:

gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. output_path : str The output file path.

Returns:

str The parameters as a JSON string.

def write_settings(self, output_path, gcms, id_label='emsl:'):
619    def write_settings(self, output_path, gcms, id_label="emsl:"):
620        """Write the settings to a JSON file.
621
622        Parameters:
623        ----------
624        output_path : str
625            The output file path.
626        gcms : GCMS object
627            The low resolution GCMS object.
628        id_label : str
629            The ID label for the data. Default is "emsl:".
630
631        """
632
633        output = self.get_parameters_json(gcms, id_label, output_path)
634
635        with open(
636            output_path.with_suffix(".json"),
637            "w",
638            encoding="utf8",
639        ) as outfile:
640            outfile.write(output)

Write the settings to a JSON file.

Parameters:

output_path : str The output file path. gcms : GCMS object The low resolution GCMS object. id_label : str The ID label for the data. Default is "emsl:".

def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
642    def get_list_dict_data(self, gcms, include_no_match=True, no_match_inline=False):
643        """Get the exported data as a list of dictionaries.
644
645        Parameters:
646        ----------
647        gcms : object
648            The low resolution GCMS object.
649        include_no_match : bool, optional
650            Whether to include no match data. Default is True.
651        no_match_inline : bool, optional
652            Whether to include no match data inline. Default is False.
653
654        Returns:
655        -------
656        list
657            The exported data as a list of dictionaries.
658        """
659
660        output_score_method = gcms.molecular_search_settings.output_score_method
661
662        dict_data_list = []
663
664        def add_match_dict_data():
665            derivatization = "{}:{}:{}".format(
666                compound_obj.classify,
667                compound_obj.derivativenum,
668                compound_obj.derivatization,
669            )
670            out_dict = {
671                "Sample name": gcms.sample_name,
672                "Peak Index": gcpeak_index,
673                "Retention Time": gc_peak.retention_time,
674                "Retention Time Ref": compound_obj.retention_time,
675                "Peak Height": gc_peak.tic,
676                "Peak Area": gc_peak.area,
677                "Retention index": gc_peak.ri,
678                "Retention index Ref": compound_obj.ri,
679                "Retention Index Score": compound_obj.ri_score,
680                "Spectral Similarity Score": compound_obj.spectral_similarity_score,
681                "Similarity Score": compound_obj.similarity_score,
682                "Compound Name": compound_obj.name,
683                "Chebi ID": compound_obj.metadata.chebi,
684                "Kegg Compound ID": compound_obj.metadata.kegg,
685                "Inchi": compound_obj.metadata.inchi,
686                "Inchi Key": compound_obj.metadata.inchikey,
687                "Smiles": compound_obj.metadata.smiles,
688                "Molecular Formula": compound_obj.formula,
689                "IUPAC Name": compound_obj.metadata.iupac_name,
690                "Traditional Name": compound_obj.metadata.traditional_name,
691                "Common Name": compound_obj.metadata.common_name,
692                "Derivatization": derivatization,
693            }
694
695            if self.gcms.molecular_search_settings.exploratory_mode:
696                out_dict.update(
697                    {
698                        "Weighted Cosine Correlation": compound_obj.spectral_similarity_scores.get(
699                            "weighted_cosine_correlation"
700                        ),
701                        "Cosine Correlation": compound_obj.spectral_similarity_scores.get(
702                            "cosine_correlation"
703                        ),
704                        "Stein Scott Similarity": compound_obj.spectral_similarity_scores.get(
705                            "stein_scott_similarity"
706                        ),
707                        "Pearson Correlation": compound_obj.spectral_similarity_scores.get(
708                            "pearson_correlation"
709                        ),
710                        "Spearman Correlation": compound_obj.spectral_similarity_scores.get(
711                            "spearman_correlation"
712                        ),
713                        "Kendall Tau Correlation": compound_obj.spectral_similarity_scores.get(
714                            "kendall_tau_correlation"
715                        ),
716                        "DFT Correlation": compound_obj.spectral_similarity_scores.get(
717                            "dft_correlation"
718                        ),
719                        "DWT Correlation": compound_obj.spectral_similarity_scores.get(
720                            "dwt_correlation"
721                        ),
722                        "Euclidean Distance": compound_obj.spectral_similarity_scores.get(
723                            "euclidean_distance"
724                        ),
725                        "Manhattan Distance": compound_obj.spectral_similarity_scores.get(
726                            "manhattan_distance"
727                        ),
728                        "Jaccard Distance": compound_obj.spectral_similarity_scores.get(
729                            "jaccard_distance"
730                        ),
731                    }
732                )
733                for method in methods_name:
734                    out_dict[methods_name.get(method)] = (
735                        compound_obj.spectral_similarity_scores.get(method)
736                    )
737
738            dict_data_list.append(out_dict)
739
740        def add_no_match_dict_data():
741            dict_data_list.append(
742                {
743                    "Sample name": gcms.sample_name,
744                    "Peak Index": gcpeak_index,
745                    "Retention Time": gc_peak.retention_time,
746                    "Peak Height": gc_peak.tic,
747                    "Peak Area": gc_peak.area,
748                    "Retention index": gc_peak.ri,
749                }
750            )
751
752        for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
753            # check if there is a compound candidate
754            if gc_peak:
755                if output_score_method == "highest_sim_score":
756                    compound_obj = gc_peak.highest_score_compound
757                    add_match_dict_data()
758
759                elif output_score_method == "highest_ss":
760                    compound_obj = gc_peak.highest_ss_compound
761                    add_match_dict_data()
762
763                else:
764                    for compound_obj in gc_peak:
765                        add_match_dict_data()  # add monoisotopic peak
766
767            else:
768                # include not_match
769                if include_no_match and no_match_inline:
770                    add_no_match_dict_data()
771
772        if include_no_match and not no_match_inline:
773            for gcpeak_index, gc_peak in enumerate(gcms.sorted_gcpeaks):
774                if not gc_peak:
775                    add_no_match_dict_data()
776
777        return dict_data_list

Get the exported data as a list of dictionaries.

Parameters:

gcms : object The low resolution GCMS object. include_no_match : bool, optional Whether to include no match data. Default is True. no_match_inline : bool, optional Whether to include no match data inline. Default is False.

Returns:

list The exported data as a list of dictionaries.

class HighResMassSpectraExport(corems.mass_spectrum.output.export.HighResMassSpecExport):
 780class HighResMassSpectraExport(HighResMassSpecExport):
 781    """A class to export high resolution mass spectra data.
 782
 783    This class provides methods to export high resolution mass spectra data to various formats
 784    such as Excel, CSV, HDF5, and Pandas DataFrame.
 785
 786    Parameters
 787    ----------
 788    out_file_path : str | Path
 789        The output file path.
 790    mass_spectra : object
 791        The high resolution mass spectra object.
 792    output_type : str, optional
 793        The output type. Default is 'excel'.
 794
 795    Attributes
 796    ----------
 797    output_file : Path
 798        The output file path without suffix
 799    dir_loc : Path
 800        The directory location for the output file,
 801        by default this will be the output_file + ".corems" and all output files will be
 802        written into this location
 803    mass_spectra : MassSpectraBase
 804        The high resolution mass spectra object.
 805    """
 806
 807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
 808        super().__init__(
 809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
 810        )
 811
 812        self.dir_loc = Path(out_file_path + ".corems")
 813        self.dir_loc.mkdir(exist_ok=True)
 814        # Place the output file in the directory
 815        self.output_file = self.dir_loc / Path(out_file_path).name
 816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
 817        self.mass_spectra = mass_spectra
 818        self.atoms_order_list = None
 819        self._init_columns()
 820
 821    def get_pandas_df(self):
 822        """Get the mass spectra as a list of Pandas DataFrames."""
 823
 824        list_df = []
 825
 826        for mass_spectrum in self.mass_spectra:
 827            columns = self.columns_label + self.get_all_used_atoms_in_order(
 828                mass_spectrum
 829            )
 830
 831            dict_data_list = self.get_list_dict_data(mass_spectrum)
 832
 833            df = DataFrame(dict_data_list, columns=columns)
 834
 835            scan_number = mass_spectrum.scan_number
 836
 837            df.name = str(self.output_file) + "_" + str(scan_number)
 838
 839            list_df.append(df)
 840
 841        return list_df
 842
 843    def to_pandas(self, write_metadata=True):
 844        """Export the data to a Pandas DataFrame and save it as a pickle file.
 845
 846        Parameters:
 847        ----------
 848        write_metadata : bool, optional
 849            Whether to write metadata to the output file. Default is True.
 850        """
 851
 852        for mass_spectrum in self.mass_spectra:
 853            columns = self.columns_label + self.get_all_used_atoms_in_order(
 854                mass_spectrum
 855            )
 856
 857            dict_data_list = self.get_list_dict_data(mass_spectrum)
 858
 859            df = DataFrame(dict_data_list, columns=columns)
 860
 861            scan_number = mass_spectrum.scan_number
 862
 863            out_filename = Path(
 864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
 865            )
 866
 867            df.to_pickle(self.dir_loc / out_filename)
 868
 869            if write_metadata:
 870                self.write_settings(
 871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 872                )
 873
 874    def to_excel(self, write_metadata=True):
 875        """Export the data to an Excel file.
 876
 877        Parameters:
 878        ----------
 879        write_metadata : bool, optional
 880            Whether to write metadata to the output file. Default is True.
 881        """
 882        for mass_spectrum in self.mass_spectra:
 883            columns = self.columns_label + self.get_all_used_atoms_in_order(
 884                mass_spectrum
 885            )
 886
 887            dict_data_list = self.get_list_dict_data(mass_spectrum)
 888
 889            df = DataFrame(dict_data_list, columns=columns)
 890
 891            scan_number = mass_spectrum.scan_number
 892
 893            out_filename = Path(
 894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
 895            )
 896
 897            df.to_excel(self.dir_loc / out_filename)
 898
 899            if write_metadata:
 900                self.write_settings(
 901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 902                )
 903
 904    def to_csv(self, write_metadata=True):
 905        """Export the data to a CSV file.
 906
 907        Parameters:
 908        ----------
 909        write_metadata : bool, optional
 910            Whether to write metadata to the output file. Default is True.
 911        """
 912        import csv
 913
 914        for mass_spectrum in self.mass_spectra:
 915            columns = self.columns_label + self.get_all_used_atoms_in_order(
 916                mass_spectrum
 917            )
 918
 919            scan_number = mass_spectrum.scan_number
 920
 921            dict_data_list = self.get_list_dict_data(mass_spectrum)
 922
 923            out_filename = Path(
 924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
 925            )
 926
 927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
 928                writer = csv.DictWriter(csvfile, fieldnames=columns)
 929                writer.writeheader()
 930                for data in dict_data_list:
 931                    writer.writerow(data)
 932
 933            if write_metadata:
 934                self.write_settings(
 935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
 936                )
 937
 938    def get_mass_spectra_attrs(self):
 939        """Get the mass spectra attributes as a JSON string.
 940
 941        Parameters:
 942        ----------
 943        mass_spectra : object
 944            The high resolution mass spectra object.
 945
 946        Returns:
 947        -------
 948        str
 949            The mass spectra attributes as a JSON string.
 950        """
 951        dict_ms_attrs = {}
 952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
 953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
 954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
 955
 956        return json.dumps(
 957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
 958        )
 959
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

A class to export high resolution mass spectra data.

This class provides methods to export high resolution mass spectra data to various formats such as Excel, CSV, HDF5, and Pandas DataFrame.

Parameters
  • out_file_path (str | Path): The output file path.
  • mass_spectra (object): The high resolution mass spectra object.
  • output_type (str, optional): The output type. Default is 'excel'.
Attributes
  • output_file (Path): The output file path without suffix
  • dir_loc (Path): The directory location for the output file, by default this will be the output_file + ".corems" and all output files will be written into this location
  • mass_spectra (MassSpectraBase): The high resolution mass spectra object.
HighResMassSpectraExport(out_file_path, mass_spectra, output_type='excel')
807    def __init__(self, out_file_path, mass_spectra, output_type="excel"):
808        super().__init__(
809            out_file_path=out_file_path, mass_spectrum=None, output_type=output_type
810        )
811
812        self.dir_loc = Path(out_file_path + ".corems")
813        self.dir_loc.mkdir(exist_ok=True)
814        # Place the output file in the directory
815        self.output_file = self.dir_loc / Path(out_file_path).name
816        self._output_type = output_type  # 'excel', 'csv', 'pandas' or 'hdf5'
817        self.mass_spectra = mass_spectra
818        self.atoms_order_list = None
819        self._init_columns()

Constructor for the high resolution mass spectra exporter. Arguments are:

out_file_path is the output file path (a str or Path, without extension). A
"<out_file_path>.corems" directory is created next to it, and all output files
are written into that directory.

mass_spectra is the high resolution mass spectra object to export.

output_type selects the export format: 'excel' (default), 'csv', 'pandas' or
'hdf5'.

If a subclass overrides the constructor, it must invoke the base class
constructor (HighResMassSpecExport.__init__()) before doing anything else.

dir_loc
output_file
mass_spectra
atoms_order_list
def get_pandas_df(self):
821    def get_pandas_df(self):
822        """Get the mass spectra as a list of Pandas DataFrames."""
823
824        list_df = []
825
826        for mass_spectrum in self.mass_spectra:
827            columns = self.columns_label + self.get_all_used_atoms_in_order(
828                mass_spectrum
829            )
830
831            dict_data_list = self.get_list_dict_data(mass_spectrum)
832
833            df = DataFrame(dict_data_list, columns=columns)
834
835            scan_number = mass_spectrum.scan_number
836
837            df.name = str(self.output_file) + "_" + str(scan_number)
838
839            list_df.append(df)
840
841        return list_df

Get the mass spectra as a list of Pandas DataFrames.

def to_pandas(self, write_metadata=True):
843    def to_pandas(self, write_metadata=True):
844        """Export the data to a Pandas DataFrame and save it as a pickle file.
845
846        Parameters:
847        ----------
848        write_metadata : bool, optional
849            Whether to write metadata to the output file. Default is True.
850        """
851
852        for mass_spectrum in self.mass_spectra:
853            columns = self.columns_label + self.get_all_used_atoms_in_order(
854                mass_spectrum
855            )
856
857            dict_data_list = self.get_list_dict_data(mass_spectrum)
858
859            df = DataFrame(dict_data_list, columns=columns)
860
861            scan_number = mass_spectrum.scan_number
862
863            out_filename = Path(
864                "%s_scan%s%s" % (self.output_file, str(scan_number), ".pkl")
865            )
866
867            df.to_pickle(self.dir_loc / out_filename)
868
869            if write_metadata:
870                self.write_settings(
871                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
872                )

Export the data to a Pandas DataFrame and save it as a pickle file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_excel(self, write_metadata=True):
874    def to_excel(self, write_metadata=True):
875        """Export the data to an Excel file.
876
877        Parameters:
878        ----------
879        write_metadata : bool, optional
880            Whether to write metadata to the output file. Default is True.
881        """
882        for mass_spectrum in self.mass_spectra:
883            columns = self.columns_label + self.get_all_used_atoms_in_order(
884                mass_spectrum
885            )
886
887            dict_data_list = self.get_list_dict_data(mass_spectrum)
888
889            df = DataFrame(dict_data_list, columns=columns)
890
891            scan_number = mass_spectrum.scan_number
892
893            out_filename = Path(
894                "%s_scan%s%s" % (self.output_file, str(scan_number), ".xlsx")
895            )
896
897            df.to_excel(self.dir_loc / out_filename)
898
899            if write_metadata:
900                self.write_settings(
901                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
902                )

Export the data to an Excel file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def to_csv(self, write_metadata=True):
904    def to_csv(self, write_metadata=True):
905        """Export the data to a CSV file.
906
907        Parameters:
908        ----------
909        write_metadata : bool, optional
910            Whether to write metadata to the output file. Default is True.
911        """
912        import csv
913
914        for mass_spectrum in self.mass_spectra:
915            columns = self.columns_label + self.get_all_used_atoms_in_order(
916                mass_spectrum
917            )
918
919            scan_number = mass_spectrum.scan_number
920
921            dict_data_list = self.get_list_dict_data(mass_spectrum)
922
923            out_filename = Path(
924                "%s_scan%s%s" % (self.output_file, str(scan_number), ".csv")
925            )
926
927            with open(self.dir_loc / out_filename, "w", newline="") as csvfile:
928                writer = csv.DictWriter(csvfile, fieldnames=columns)
929                writer.writeheader()
930                for data in dict_data_list:
931                    writer.writerow(data)
932
933            if write_metadata:
934                self.write_settings(
935                    self.dir_loc / out_filename.with_suffix(""), mass_spectrum
936                )

Export the data to a CSV file.

Parameters:

write_metadata : bool, optional Whether to write metadata to the output file. Default is True.

def get_mass_spectra_attrs(self):
938    def get_mass_spectra_attrs(self):
939        """Get the mass spectra attributes as a JSON string.
940
941        Parameters:
942        ----------
943        mass_spectra : object
944            The high resolution mass spectra object.
945
946        Returns:
947        -------
948        str
949            The mass spectra attributes as a JSON string.
950        """
951        dict_ms_attrs = {}
952        dict_ms_attrs["analyzer"] = self.mass_spectra.analyzer
953        dict_ms_attrs["instrument_label"] = self.mass_spectra.instrument_label
954        dict_ms_attrs["sample_name"] = self.mass_spectra.sample_name
955
956        return json.dumps(
957            dict_ms_attrs, sort_keys=False, indent=4, separators=(",", ": ")
958        )

Get the mass spectra attributes as a JSON string.

This method takes no arguments; it reads the ``analyzer``, ``instrument_label``
and ``sample_name`` attributes of the instance's ``mass_spectra`` object.

Returns:

str The mass spectra attributes as a JSON string.

def to_hdf(self, overwrite=False, export_raw=True):
 960    def to_hdf(self, overwrite=False, export_raw=True):
 961        """Export the data to an HDF5 file.
 962
 963        Parameters
 964        ----------
 965        overwrite : bool, optional
 966            Whether to overwrite the output file. Default is False.
 967        export_raw : bool, optional
 968            Whether to export the raw mass spectra data. Default is True.
 969        """
 970        if overwrite:
 971            if self.output_file.with_suffix(".hdf5").exists():
 972                self.output_file.with_suffix(".hdf5").unlink()
 973
 974        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
 975            if not hdf_handle.attrs.get("date_utc"):
 976                # Set metadata for all mass spectra
 977                timenow = str(
 978                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
 979                )
 980                hdf_handle.attrs["date_utc"] = timenow
 981                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
 982                hdf_handle.attrs["data_structure"] = "mass_spectra"
 983                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
 984                hdf_handle.attrs["instrument_label"] = (
 985                    self.mass_spectra.instrument_label
 986                )
 987                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
 988                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
 989                hdf_handle.attrs["parser_type"] = (
 990                    self.mass_spectra.spectra_parser_class.__name__
 991                )
 992                hdf_handle.attrs["original_file_location"] = (
 993                    self.mass_spectra.file_location._str
 994                )
 995
 996            if "mass_spectra" not in hdf_handle:
 997                mass_spectra_group = hdf_handle.create_group("mass_spectra")
 998            else:
 999                mass_spectra_group = hdf_handle.get("mass_spectra")
1000
1001            for mass_spectrum in self.mass_spectra:
1002                group_key = str(int(mass_spectrum.scan_number))
1003
1004                self.add_mass_spectrum_to_hdf5(
1005                    hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_raw
1006                )

Export the data to an HDF5 file.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • export_raw (bool, optional): Whether to export the raw mass spectra data. Default is True.
class LCMSExport(HighResMassSpectraExport):
1009class LCMSExport(HighResMassSpectraExport):
1010    """A class to export high resolution LC-MS data.
1011
1012    This class provides methods to export high resolution LC-MS data to HDF5.
1013
1014    Parameters
1015    ----------
1016    out_file_path : str | Path
1017        The output file path, do not include the file extension.
1018    lcms_object : LCMSBase
1019        The high resolution lc-ms object.
1020    """
1021
1022    def __init__(self, out_file_path, mass_spectra):
1023        super().__init__(out_file_path, mass_spectra, output_type="hdf5")
1024
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Parameterized export: all spectra (default) or only relevant spectra
1047        export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra
1048        if export_only_relevant:
1049            relevant_scan_numbers = set()
1050            # Add MS1 spectra associated with mass features (apex scans) and best MS2 spectra
1051            for mass_feature in self.mass_spectra.mass_features.values():
1052                relevant_scan_numbers.add(mass_feature.apex_scan)
1053                if mass_feature.best_ms2 is not None:
1054                    relevant_scan_numbers.add(mass_feature.best_ms2.scan_number)
1055        if overwrite:
1056            if self.output_file.with_suffix(".hdf5").exists():
1057                self.output_file.with_suffix(".hdf5").unlink()
1058
1059        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1060            if not hdf_handle.attrs.get("date_utc"):
1061                # Set metadata for all mass spectra
1062                timenow = str(
1063                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
1064                )
1065                hdf_handle.attrs["date_utc"] = timenow
1066                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
1067                hdf_handle.attrs["data_structure"] = "mass_spectra"
1068                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
1069                hdf_handle.attrs["instrument_label"] = (
1070                    self.mass_spectra.instrument_label
1071                )
1072                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
1073                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
1074                hdf_handle.attrs["parser_type"] = (
1075                    self.mass_spectra.spectra_parser_class.__name__
1076                )
1077                hdf_handle.attrs["original_file_location"] = (
1078                    self.mass_spectra.file_location._str
1079                )
1080
1081            if "mass_spectra" not in hdf_handle:
1082                mass_spectra_group = hdf_handle.create_group("mass_spectra")
1083            else:
1084                mass_spectra_group = hdf_handle.get("mass_spectra")
1085
1086            # Export logic based on parameter
1087            for mass_spectrum in self.mass_spectra:
1088                if export_only_relevant:
1089                    if mass_spectrum.scan_number in relevant_scan_numbers:
1090                        group_key = str(int(mass_spectrum.scan_number))
1091                        self.add_mass_spectrum_to_hdf5(
1092                            hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
1093                        )
1094                else:
1095                    group_key = str(int(mass_spectrum.scan_number))
1096                    self.add_mass_spectrum_to_hdf5(
1097                        hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
1098                    )
1099
1100        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1101        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1102            # Add scan_info to hdf5 file
1103            if "scan_info" not in hdf_handle:
1104                scan_info_group = hdf_handle.create_group("scan_info")
1105                for k, v in self.mass_spectra._scan_info.items():
1106                    array = np.array(list(v.values()))
1107                    if array.dtype.str[0:2] == "<U":
1108                        # Convert Unicode strings to UTF-8 encoded bytes first
1109                        string_data = [str(item) for item in array]
1110                        string_dtype = h5py.string_dtype(encoding='utf-8')
1111                        scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1112                    else:
1113                        # Apply data type optimization for numeric data
1114                        if array.dtype == np.float64:
1115                            array = array.astype(np.float32)
1116                        elif array.dtype == np.int64:
1117                            array = array.astype(np.int32)
1118                        scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True)
1119
1120            # Add ms_unprocessed to hdf5 file
1121            export_unprocessed_ms1 = (
1122                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1123            )
1124            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1125                if "ms_unprocessed" not in hdf_handle:
1126                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1127                else:
1128                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1129                for k, v in self.mass_spectra._ms_unprocessed.items():
1130                    array = np.array(v)
1131                    # Apply data type optimization and compression
1132                    if array.dtype == np.int64:
1133                        array = array.astype(np.int32)
1134                    elif array.dtype == np.float64:
1135                        array = array.astype(np.float32)
1136                    elif array.dtype.str[0:2] == "<U":
1137                        # Convert Unicode strings to UTF-8 encoded strings
1138                        string_data = [str(item) for item in array]
1139                        string_dtype = h5py.string_dtype(encoding='utf-8')
1140                        ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1141                        continue
1142                    ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True)
1143
1144            # Add LCMS mass features to hdf5 file
1145            if len(self.mass_spectra.mass_features) > 0:
1146                if "mass_features" not in hdf_handle:
1147                    mass_features_group = hdf_handle.create_group("mass_features")
1148                else:
1149                    mass_features_group = hdf_handle.get("mass_features")
1150
1151                # Create group for each mass feature, with key as the mass feature id
1152                for k, v in self.mass_spectra.mass_features.items():
1153                    mass_features_group.create_group(str(k))
1154                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1155                    for k2, v2 in v.__dict__.items():
1156                        if v2 is not None:
1157                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1158                            if k2 not in [
1159                                "chromatogram_parent",
1160                                "ms2_mass_spectra",
1161                                "mass_spectrum",
1162                                "_eic_data",
1163                                "ms2_similarity_results",
1164                            ]:
1165                                if k2 == "ms2_scan_numbers":
1166                                    array = np.array(v2)
1167                                    # Convert int64 to int32
1168                                    if array.dtype == np.int64:
1169                                        array = array.astype(np.int32)
1170                                    mass_features_group[str(k)].create_dataset(
1171                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1172                                    )
1173                                elif k2 == "_half_height_width":
1174                                    array = np.array(v2)
1175                                    # Convert float64 to float32
1176                                    if array.dtype == np.float64:
1177                                        array = array.astype(np.float32)
1178                                    mass_features_group[str(k)].create_dataset(
1179                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1180                                    )
1181                                elif k2 == "_ms_deconvoluted_idx":
1182                                    array = np.array(v2)
1183                                    # Convert int64 to int32
1184                                    if array.dtype == np.int64:
1185                                        array = array.astype(np.int32)
1186                                    mass_features_group[str(k)].create_dataset(
1187                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1188                                    )
1189                                elif k2 == "associated_mass_features_deconvoluted":
1190                                    array = np.array(v2)
1191                                    # Convert int64 to int32
1192                                    if array.dtype == np.int64:
1193                                        array = array.astype(np.int32)
1194                                    mass_features_group[str(k)].create_dataset(
1195                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1196                                    )
1197                                elif k2 == "_noise_score":
1198                                    array = np.array(v2)
1199                                    # Convert float64 to float32
1200                                    if array.dtype == np.float64:
1201                                        array = array.astype(np.float32)
1202                                    mass_features_group[str(k)].create_dataset(
1203                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1204                                    )
1205                                elif (
1206                                    isinstance(v2, int)
1207                                    or isinstance(v2, float)
1208                                    or isinstance(v2, str)
1209                                    or isinstance(v2, np.integer)
1210                                    or isinstance(v2, np.float32)
1211                                    or isinstance(v2, np.float64)
1212                                    or isinstance(v2, np.bool_)
1213                                ):
1214                                    # Convert numpy types to smaller precision for storage
1215                                    if isinstance(v2, np.int64):
1216                                        v2 = np.int32(v2)
1217                                    elif isinstance(v2, np.float64):
1218                                        v2 = np.float32(v2)
1219                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1220                                else:
1221                                    raise TypeError(
1222                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1223                                    )
1224
1225            # Add EIC data to hdf5 file
1226            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1227            if len(self.mass_spectra.eics) > 0 and export_eics:
1228                if "eics" not in hdf_handle:
1229                    eic_group = hdf_handle.create_group("eics")
1230                else:
1231                    eic_group = hdf_handle.get("eics")
1232
1233                # Create group for each eic
1234                for k, v in self.mass_spectra.eics.items():
1235                    eic_group.create_group(str(k))
1236                    eic_group[str(k)].attrs["mz"] = k
1237                    # Loop through each of the attributes and add them as datasets (if array)
1238                    for k2, v2 in v.__dict__.items():
1239                        if v2 is not None:
1240                            array = np.array(v2)
1241                            # Apply data type optimization and compression
1242                            if array.dtype == np.int64:
1243                                array = array.astype(np.int32)
1244                            elif array.dtype == np.float64:
1245                                array = array.astype(np.float32)
1246                            elif array.dtype.str[0:2] == "<U":
1247                                # Convert Unicode strings to UTF-8 encoded strings
1248                                string_data = [str(item) for item in array]
1249                                string_dtype = h5py.string_dtype(encoding='utf-8')
1250                                eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1251                                continue
1252                            eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True)
1253
1254            # Add ms2_search results to hdf5 file (parameterized)
1255            if len(self.mass_spectra.spectral_search_results) > 0:
1256                if "spectral_search_results" not in hdf_handle:
1257                    spectral_search_results = hdf_handle.create_group(
1258                        "spectral_search_results"
1259                    )
1260                else:
1261                    spectral_search_results = hdf_handle.get("spectral_search_results")
1262                # Create group for each search result by ms2_scan / precursor_mz
1263                for k, v in self.mass_spectra.spectral_search_results.items():
1264                    # If parameter is set, only export spectral search results for relevant scans
1265                    if export_only_relevant and k not in relevant_scan_numbers:
1266                        continue
1267                    spectral_search_results.create_group(str(k))
1268                    for k2, v2 in v.items():
1269                        spectral_search_results[str(k)].create_group(str(k2))
1270                        spectral_search_results[str(k)][str(k2)].attrs[
1271                            "precursor_mz"
1272                        ] = v2.precursor_mz
1273                        spectral_search_results[str(k)][str(k2)].attrs[
1274                            "query_spectrum_id"
1275                        ] = v2.query_spectrum_id
1276                        # Loop through each of the attributes and add them as datasets (if array)
1277                        for k3, v3 in v2.__dict__.items():
1278                            if v3 is not None and k3 not in [
1279                                "query_spectrum",
1280                                "precursor_mz",
1281                                "query_spectrum_id",
1282                            ]:
1283                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1284                                    v3 = [", ".join(x) for x in v3]
1285                                if all(v3 is not None for v3 in v3):
1286                                    array = np.array(v3)
1287                                if array.dtype.str[0:2] == "<U":
1288                                    array = array.astype("S")
1289                                spectral_search_results[str(k)][str(k2)].create_dataset(
1290                                    str(k3), data=array
1291                                )
1292        if save_parameters:
1293            # Check if parameter_format is valid
1294            if parameter_format not in ["json", "toml"]:
1295                raise ValueError("parameter_format must be 'json' or 'toml'")
1296
1297            if parameter_format == "json":
1298                dump_lcms_settings_json(
1299                    filename=self.output_file.with_suffix(".json"),
1300                    lcms_obj=self.mass_spectra,
1301                )
1302            elif parameter_format == "toml":
1303                dump_lcms_settings_toml(
1304                    filename=self.output_file.with_suffix(".toml"),
1305                    lcms_obj=self.mass_spectra,
1306                )

A class to export high resolution LC-MS data.

This class provides methods to export high resolution LC-MS data to HDF5.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • lcms_object (LCMSBase): The high resolution lc-ms object.
LCMSExport(out_file_path, mass_spectra)
    def __init__(self, out_file_path, mass_spectra):
        # Reuse the high-resolution mass-spectrum exporter, fixing the output
        # container format to HDF5.
        super().__init__(out_file_path, mass_spectra, output_type="hdf5")

The constructor takes the output file path (without extension) and the high resolution
LC-MS object, and delegates to the HighResMassSpecExport base constructor with
output_type="hdf5".

(Note: the text previously shown here described `threading.Thread.__init__` and appears
to have been mis-joined from unrelated documentation.)

def to_hdf(self, overwrite=False, save_parameters=True, parameter_format='toml'):
1025    def to_hdf(self, overwrite=False, save_parameters=True, parameter_format="toml"):
1026        """Export the data to an HDF5.
1027
1028        Parameters
1029        ----------
1030        overwrite : bool, optional
1031            Whether to overwrite the output file. Default is False.
1032        save_parameters : bool, optional
1033            Whether to save the parameters as a separate json or toml file. Default is True.
1034        parameter_format : str, optional
1035            The format to save the parameters in. Default is 'toml'.
1036
1037        Raises
1038        ------
1039        ValueError
1040            If parameter_format is not 'json' or 'toml'.
1041        """
1042        export_profile_spectra = (
1043            self.mass_spectra.parameters.lc_ms.export_profile_spectra
1044        )
1045
1046        # Parameterized export: all spectra (default) or only relevant spectra
1047        export_only_relevant = self.mass_spectra.parameters.lc_ms.export_only_relevant_mass_spectra
1048        if export_only_relevant:
1049            relevant_scan_numbers = set()
1050            # Add MS1 spectra associated with mass features (apex scans) and best MS2 spectra
1051            for mass_feature in self.mass_spectra.mass_features.values():
1052                relevant_scan_numbers.add(mass_feature.apex_scan)
1053                if mass_feature.best_ms2 is not None:
1054                    relevant_scan_numbers.add(mass_feature.best_ms2.scan_number)
1055        if overwrite:
1056            if self.output_file.with_suffix(".hdf5").exists():
1057                self.output_file.with_suffix(".hdf5").unlink()
1058
1059        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1060            if not hdf_handle.attrs.get("date_utc"):
1061                # Set metadata for all mass spectra
1062                timenow = str(
1063                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
1064                )
1065                hdf_handle.attrs["date_utc"] = timenow
1066                hdf_handle.attrs["filename"] = self.mass_spectra.file_location.name
1067                hdf_handle.attrs["data_structure"] = "mass_spectra"
1068                hdf_handle.attrs["analyzer"] = self.mass_spectra.analyzer
1069                hdf_handle.attrs["instrument_label"] = (
1070                    self.mass_spectra.instrument_label
1071                )
1072                hdf_handle.attrs["sample_name"] = self.mass_spectra.sample_name
1073                hdf_handle.attrs["polarity"] = self.mass_spectra.polarity
1074                hdf_handle.attrs["parser_type"] = (
1075                    self.mass_spectra.spectra_parser_class.__name__
1076                )
1077                hdf_handle.attrs["original_file_location"] = (
1078                    self.mass_spectra.file_location._str
1079                )
1080
1081            if "mass_spectra" not in hdf_handle:
1082                mass_spectra_group = hdf_handle.create_group("mass_spectra")
1083            else:
1084                mass_spectra_group = hdf_handle.get("mass_spectra")
1085
1086            # Export logic based on parameter
1087            for mass_spectrum in self.mass_spectra:
1088                if export_only_relevant:
1089                    if mass_spectrum.scan_number in relevant_scan_numbers:
1090                        group_key = str(int(mass_spectrum.scan_number))
1091                        self.add_mass_spectrum_to_hdf5(
1092                            hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
1093                        )
1094                else:
1095                    group_key = str(int(mass_spectrum.scan_number))
1096                    self.add_mass_spectrum_to_hdf5(
1097                        hdf_handle, mass_spectrum, group_key, mass_spectra_group, export_profile_spectra
1098                    )
1099
1100        # Write scan info, ms_unprocessed, mass features, eics, and ms2_search results to the hdf5 file
1101        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
1102            # Add scan_info to hdf5 file
1103            if "scan_info" not in hdf_handle:
1104                scan_info_group = hdf_handle.create_group("scan_info")
1105                for k, v in self.mass_spectra._scan_info.items():
1106                    array = np.array(list(v.values()))
1107                    if array.dtype.str[0:2] == "<U":
1108                        # Convert Unicode strings to UTF-8 encoded bytes first
1109                        string_data = [str(item) for item in array]
1110                        string_dtype = h5py.string_dtype(encoding='utf-8')
1111                        scan_info_group.create_dataset(k, data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1112                    else:
1113                        # Apply data type optimization for numeric data
1114                        if array.dtype == np.float64:
1115                            array = array.astype(np.float32)
1116                        elif array.dtype == np.int64:
1117                            array = array.astype(np.int32)
1118                        scan_info_group.create_dataset(k, data=array, compression="gzip", compression_opts=9, chunks=True)
1119
1120            # Add ms_unprocessed to hdf5 file
1121            export_unprocessed_ms1 = (
1122                self.mass_spectra.parameters.lc_ms.export_unprocessed_ms1
1123            )
1124            if self.mass_spectra._ms_unprocessed and export_unprocessed_ms1:
1125                if "ms_unprocessed" not in hdf_handle:
1126                    ms_unprocessed_group = hdf_handle.create_group("ms_unprocessed")
1127                else:
1128                    ms_unprocessed_group = hdf_handle.get("ms_unprocessed")
1129                for k, v in self.mass_spectra._ms_unprocessed.items():
1130                    array = np.array(v)
1131                    # Apply data type optimization and compression
1132                    if array.dtype == np.int64:
1133                        array = array.astype(np.int32)
1134                    elif array.dtype == np.float64:
1135                        array = array.astype(np.float32)
1136                    elif array.dtype.str[0:2] == "<U":
1137                        # Convert Unicode strings to UTF-8 encoded strings
1138                        string_data = [str(item) for item in array]
1139                        string_dtype = h5py.string_dtype(encoding='utf-8')
1140                        ms_unprocessed_group.create_dataset(str(k), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1141                        continue
1142                    ms_unprocessed_group.create_dataset(str(k), data=array, compression="gzip", compression_opts=9, chunks=True)
1143
1144            # Add LCMS mass features to hdf5 file
1145            if len(self.mass_spectra.mass_features) > 0:
1146                if "mass_features" not in hdf_handle:
1147                    mass_features_group = hdf_handle.create_group("mass_features")
1148                else:
1149                    mass_features_group = hdf_handle.get("mass_features")
1150
1151                # Create group for each mass feature, with key as the mass feature id
1152                for k, v in self.mass_spectra.mass_features.items():
1153                    mass_features_group.create_group(str(k))
1154                    # Loop through each of the mass feature attributes and add them as attributes (if single value) or datasets (if array)
1155                    for k2, v2 in v.__dict__.items():
1156                        if v2 is not None:
1157                            # Check if the attribute is an integer or float and set as an attribute in the mass feature group
1158                            if k2 not in [
1159                                "chromatogram_parent",
1160                                "ms2_mass_spectra",
1161                                "mass_spectrum",
1162                                "_eic_data",
1163                                "ms2_similarity_results",
1164                            ]:
1165                                if k2 == "ms2_scan_numbers":
1166                                    array = np.array(v2)
1167                                    # Convert int64 to int32
1168                                    if array.dtype == np.int64:
1169                                        array = array.astype(np.int32)
1170                                    mass_features_group[str(k)].create_dataset(
1171                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1172                                    )
1173                                elif k2 == "_half_height_width":
1174                                    array = np.array(v2)
1175                                    # Convert float64 to float32
1176                                    if array.dtype == np.float64:
1177                                        array = array.astype(np.float32)
1178                                    mass_features_group[str(k)].create_dataset(
1179                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1180                                    )
1181                                elif k2 == "_ms_deconvoluted_idx":
1182                                    array = np.array(v2)
1183                                    # Convert int64 to int32
1184                                    if array.dtype == np.int64:
1185                                        array = array.astype(np.int32)
1186                                    mass_features_group[str(k)].create_dataset(
1187                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1188                                    )
1189                                elif k2 == "associated_mass_features_deconvoluted":
1190                                    array = np.array(v2)
1191                                    # Convert int64 to int32
1192                                    if array.dtype == np.int64:
1193                                        array = array.astype(np.int32)
1194                                    mass_features_group[str(k)].create_dataset(
1195                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1196                                    )
1197                                elif k2 == "_noise_score":
1198                                    array = np.array(v2)
1199                                    # Convert float64 to float32
1200                                    if array.dtype == np.float64:
1201                                        array = array.astype(np.float32)
1202                                    mass_features_group[str(k)].create_dataset(
1203                                        str(k2), data=array, compression="gzip", compression_opts=9, chunks=True
1204                                    )
1205                                elif (
1206                                    isinstance(v2, int)
1207                                    or isinstance(v2, float)
1208                                    or isinstance(v2, str)
1209                                    or isinstance(v2, np.integer)
1210                                    or isinstance(v2, np.float32)
1211                                    or isinstance(v2, np.float64)
1212                                    or isinstance(v2, np.bool_)
1213                                ):
1214                                    # Convert numpy types to smaller precision for storage
1215                                    if isinstance(v2, np.int64):
1216                                        v2 = np.int32(v2)
1217                                    elif isinstance(v2, np.float64):
1218                                        v2 = np.float32(v2)
1219                                    mass_features_group[str(k)].attrs[str(k2)] = v2
1220                                else:
1221                                    raise TypeError(
1222                                        f"Attribute {k2} is not an integer, float, or string and cannot be added to the hdf5 file"
1223                                    )
1224
1225            # Add EIC data to hdf5 file
1226            export_eics = self.mass_spectra.parameters.lc_ms.export_eics
1227            if len(self.mass_spectra.eics) > 0 and export_eics:
1228                if "eics" not in hdf_handle:
1229                    eic_group = hdf_handle.create_group("eics")
1230                else:
1231                    eic_group = hdf_handle.get("eics")
1232
1233                # Create group for each eic
1234                for k, v in self.mass_spectra.eics.items():
1235                    eic_group.create_group(str(k))
1236                    eic_group[str(k)].attrs["mz"] = k
1237                    # Loop through each of the attributes and add them as datasets (if array)
1238                    for k2, v2 in v.__dict__.items():
1239                        if v2 is not None:
1240                            array = np.array(v2)
1241                            # Apply data type optimization and compression
1242                            if array.dtype == np.int64:
1243                                array = array.astype(np.int32)
1244                            elif array.dtype == np.float64:
1245                                array = array.astype(np.float32)
1246                            elif array.dtype.str[0:2] == "<U":
1247                                # Convert Unicode strings to UTF-8 encoded strings
1248                                string_data = [str(item) for item in array]
1249                                string_dtype = h5py.string_dtype(encoding='utf-8')
1250                                eic_group[str(k)].create_dataset(str(k2), data=string_data, dtype=string_dtype, compression="gzip", compression_opts=9, chunks=True)
1251                                continue
1252                            eic_group[str(k)].create_dataset(str(k2), data=array, compression="gzip", compression_opts=9, chunks=True)
1253
1254            # Add ms2_search results to hdf5 file (parameterized)
1255            if len(self.mass_spectra.spectral_search_results) > 0:
1256                if "spectral_search_results" not in hdf_handle:
1257                    spectral_search_results = hdf_handle.create_group(
1258                        "spectral_search_results"
1259                    )
1260                else:
1261                    spectral_search_results = hdf_handle.get("spectral_search_results")
1262                # Create group for each search result by ms2_scan / precursor_mz
1263                for k, v in self.mass_spectra.spectral_search_results.items():
1264                    # If parameter is set, only export spectral search results for relevant scans
1265                    if export_only_relevant and k not in relevant_scan_numbers:
1266                        continue
1267                    spectral_search_results.create_group(str(k))
1268                    for k2, v2 in v.items():
1269                        spectral_search_results[str(k)].create_group(str(k2))
1270                        spectral_search_results[str(k)][str(k2)].attrs[
1271                            "precursor_mz"
1272                        ] = v2.precursor_mz
1273                        spectral_search_results[str(k)][str(k2)].attrs[
1274                            "query_spectrum_id"
1275                        ] = v2.query_spectrum_id
1276                        # Loop through each of the attributes and add them as datasets (if array)
1277                        for k3, v3 in v2.__dict__.items():
1278                            if v3 is not None and k3 not in [
1279                                "query_spectrum",
1280                                "precursor_mz",
1281                                "query_spectrum_id",
1282                            ]:
1283                                if k3 == "query_frag_types" or k3 == "ref_frag_types":
1284                                    v3 = [", ".join(x) for x in v3]
1285                                if all(v3 is not None for v3 in v3):
1286                                    array = np.array(v3)
1287                                if array.dtype.str[0:2] == "<U":
1288                                    array = array.astype("S")
1289                                spectral_search_results[str(k)][str(k2)].create_dataset(
1290                                    str(k3), data=array
1291                                )
1292        if save_parameters:
1293            # Check if parameter_format is valid
1294            if parameter_format not in ["json", "toml"]:
1295                raise ValueError("parameter_format must be 'json' or 'toml'")
1296
1297            if parameter_format == "json":
1298                dump_lcms_settings_json(
1299                    filename=self.output_file.with_suffix(".json"),
1300                    lcms_obj=self.mass_spectra,
1301                )
1302            elif parameter_format == "toml":
1303                dump_lcms_settings_toml(
1304                    filename=self.output_file.with_suffix(".toml"),
1305                    lcms_obj=self.mass_spectra,
1306                )

Export the data to an HDF5.

Parameters
  • overwrite (bool, optional): Whether to overwrite the output file. Default is False.
  • save_parameters (bool, optional): Whether to save the parameters as a separate json or toml file. Default is True.
  • parameter_format (str, optional): The format to save the parameters in. Default is 'toml'.
Raises
  • ValueError: If parameter_format is not 'json' or 'toml'.
class LCMSMetabolomicsExport(LCMSExport):
1308class LCMSMetabolomicsExport(LCMSExport):
1309    """A class to export LCMS metabolite data.
1310
1311    This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.
1312
1313    Parameters
1314    ----------
1315    out_file_path : str | Path
1316        The output file path, do not include the file extension.
1317    mass_spectra : object
1318        The high resolution mass spectra object.
1319    """
1320
    def __init__(self, out_file_path, mass_spectra):
        # Delegate HDF5 export setup to LCMSExport, then attach the module-level
        # adduct table so instances can resolve ion formulas by ion type.
        super().__init__(out_file_path, mass_spectra)
        self.ion_type_dict = ion_type_dict
1324    
1325    @staticmethod
1326    def get_ion_formula(neutral_formula, ion_type):
1327        """From a neutral formula and an ion type, return the formula of the ion.
1328
1329        Notes
1330        -----
1331        This is a static method.
1332        If the neutral_formula is not a string, this method will return None.
1333
1334        Parameters
1335        ----------
1336        neutral_formula : str
1337            The neutral formula, this should be a string form from the MolecularFormula class
1338            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1339            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1340            e.g. MgCl2 is parsed as 'Mg Cl2.
1341        ion_type : str
1342            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1343            See the self.ion_type_dict for the available ion types.
1344
1345        Returns
1346        -------
1347        str
1348            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1349        """
1350        # If neutral_formula is not a string, return None
1351        if not isinstance(neutral_formula, str):
1352            return None
1353
1354        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1355        if re.search(r"\s", neutral_formula):
1356            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1357        else:
1358            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1359            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1360            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1361            neutral_formula = MolecularFormula(
1362                dict(
1363                    zip(
1364                        [x[0] for x in elements],
1365                        [int(x[0]) if x else 1 for x in counts],
1366                    )
1367                ),
1368                ion_charge=0,
1369            )
1370        neutral_formula_dict = neutral_formula.to_dict().copy()
1371
1372        adduct_add_dict = ion_type_dict[ion_type][0]
1373        for key in adduct_add_dict:
1374            if key in neutral_formula_dict.keys():
1375                neutral_formula_dict[key] += adduct_add_dict[key]
1376            else:
1377                neutral_formula_dict[key] = adduct_add_dict[key]
1378
1379        adduct_subtract = ion_type_dict[ion_type][1]
1380        for key in adduct_subtract:
1381            neutral_formula_dict[key] -= adduct_subtract[key]
1382
1383        return MolecularFormula(neutral_formula_dict, ion_charge=0).string
1384
1385    @staticmethod
1386    def get_isotope_type(ion_formula):
1387        """From an ion formula, return the 13C isotope type of the ion.
1388
1389        Notes
1390        -----
1391        This is a static method.
1392        If the ion_formula is not a string, this method will return None.
1393        This is currently only functional for 13C isotopes.
1394
1395        Parameters
1396        ----------
1397        ion_formula : str
1398            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1399
1400        Returns
1401        -------
1402        str
1403            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1404
1405        Raises
1406        ------
1407        ValueError
1408            If the ion_formula is not a string.
1409        """
1410        if not isinstance(ion_formula, str):
1411            return None
1412
1413        if re.search(r"\s", ion_formula):
1414            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1415        else:
1416            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1417        ion_formula_dict = ion_formula.to_dict().copy()
1418
1419        try:
1420            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1421        except KeyError:
1422            iso_class = None
1423
1424        return iso_class
1425    
1426    def report_to_csv(self, molecular_metadata=None):
1427        """Create a report of the mass features and their annotations and save it as a CSV file.
1428
1429        Parameters
1430        ----------
1431        molecular_metadata : dict, optional
1432            The molecular metadata. Default is None.
1433        """
1434        report = self.to_report(molecular_metadata=molecular_metadata)
1435        out_file = self.output_file.with_suffix(".csv")
1436        report.to_csv(out_file, index=False)
1437    
1438    def clean_ms1_report(self, ms1_summary_full):
1439        """Clean the MS1 report.
1440
1441        Parameters
1442        ----------
1443        ms1_summary_full : DataFrame
1444            The full MS1 summary DataFrame.
1445
1446        Returns
1447        -------
1448        DataFrame
1449            The cleaned MS1 summary DataFrame.
1450        """
1451        ms1_summary_full = ms1_summary_full.reset_index()
1452        cols_to_keep = [
1453            "mf_id",
1454            "Molecular Formula",
1455            "Ion Type",
1456            "Calculated m/z",
1457            "m/z Error (ppm)",
1458            "m/z Error Score",
1459            "Is Isotopologue",
1460            "Isotopologue Similarity",
1461            "Confidence Score",
1462        ]
1463        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1464        ms1_summary["ion_formula"] = [
1465            self.get_ion_formula(f, a)
1466            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1467        ]
1468        ms1_summary["isotopologue_type"] = [
1469            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1470        ]
1471
1472        # Reorder columns
1473        ms1_summary = ms1_summary[
1474            [
1475                "mf_id",
1476                "ion_formula",
1477                "isotopologue_type",
1478                "Calculated m/z",
1479                "m/z Error (ppm)",
1480                "m/z Error Score",
1481                "Isotopologue Similarity",
1482                "Confidence Score",
1483            ]
1484        ]
1485
1486        # Set the index to mf_id
1487        ms1_summary = ms1_summary.set_index("mf_id")
1488
1489        return ms1_summary
1490    
1491    def summarize_ms2_report(self, ms2_annot_report):
1492        """
1493        Summarize the MS2 report.
1494
1495        Parameters
1496        ----------
1497        ms2_annot_report : DataFrame
1498            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
1499        
1500        Returns
1501        -------
1502        """
1503    
1504    def summarize_metabolomics_report(self, ms2_annot_report):
1505        """Summarize the MS2 hits for a metabolomics report
1506        
1507        Parameters
1508        ----------
1509        ms2_annot : DataFrame
1510            The MS2 annotation DataFrame with all annotations.
1511
1512        Returns
1513        -------
1514        DataFrame
1515            The summarized metabolomics report.
1516        """
1517        columns_to_drop = [
1518            "precursor_mz",
1519            "precursor_mz_error_ppm",
1520            "cas",
1521            "data_id",
1522            "iupac_name",
1523            "traditional_name",
1524            "common_name",
1525            "casno",
1526        ]
1527        ms2_annot = ms2_annot_report.drop(
1528            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1529        )
1530        
1531        # Prepare information about the search results, pulling out the best hit for the single report
1532        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1533        ms2_annot = ms2_annot.reset_index()
1534        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1535        ms2_annot["n_spectra_contributing"] = (
1536            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1537            .transform("nunique")
1538        )
1539        # Sort by entropy similarity
1540        ms2_annot = ms2_annot.sort_values(
1541            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1542        )
1543        best_entropy = ms2_annot.drop_duplicates(
1544            subset=["mf_id", "ref_mol_id"], keep="first"
1545        )
1546
1547        return best_entropy
1548
1549    def clean_ms2_report(self, metabolite_summary):
1550        """Clean the MS2 report.
1551
1552        Parameters
1553        ----------
1554        metabolite_summary : DataFrame
1555            The full metabolomics summary DataFrame.
1556
1557        Returns
1558        -------
1559        DataFrame
1560            The cleaned metabolomics summary DataFrame.
1561        """
1562        metabolite_summary = metabolite_summary.reset_index()
1563        metabolite_summary["ion_formula"] = [
1564            self.get_ion_formula(f, a)
1565            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1566        ]
1567
1568        col_order = [
1569            "mf_id",
1570            "ion_formula",
1571            "ref_ion_type",
1572            "formula",
1573            "inchikey",
1574            "name",
1575            "inchi",
1576            "chebi",
1577            "smiles",
1578            "kegg",
1579            "cas",
1580            "database_name",
1581            "ref_ms_id",
1582            "entropy_similarity",
1583            "ref_mz_in_query_fract",
1584            "n_spectra_contributing",
1585        ]
1586
1587        # Reorder columns
1588        metabolite_summary = metabolite_summary[
1589            [col for col in col_order if col in metabolite_summary.columns]
1590        ]
1591
1592        # Convert chebi (if present) to int:
1593        if "chebi" in metabolite_summary.columns:
1594            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1595                "Int64", errors="ignore"
1596            )
1597
1598        # Set the index to mf_id
1599        metabolite_summary = metabolite_summary.set_index("mf_id")
1600
1601        return metabolite_summary
1602    
1603    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1604        """Combine the mass feature report with the MS1 and MS2 reports.
1605
1606        Parameters
1607        ----------
1608        mf_report : DataFrame
1609            The mass feature report DataFrame.
1610        ms1_annot_report : DataFrame
1611            The MS1 annotation report DataFrame.
1612        ms2_annot_report : DataFrame
1613            The MS2 annotation report DataFrame.
1614        """
1615        # If there is an ms1_annot_report, merge it with the mf_report
1616        if not ms1_annot_report.empty:
1617            # MS1 has been run and has molecular formula information
1618            mf_report = pd.merge(
1619                mf_report,
1620                ms1_annot_report,
1621                how="left",
1622                on=["mf_id", "isotopologue_type"],
1623            )
1624        if ms2_annot_report is not None:
1625            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1626            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1627            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1628            mf_no_ion_formula = pd.merge(
1629                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1630            )
1631
1632            # pull out the records with ion_formula
1633            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1634            mf_with_ion_formula = pd.merge(
1635                mf_with_ion_formula,
1636                ms2_annot_report,
1637                how="left",
1638                on=["mf_id", "ion_formula"],
1639            )
1640
1641            # put back together
1642            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1643
1644        # Rename colums
1645        rename_dict = {
1646            "mf_id": "Mass Feature ID",
1647            "scan_time": "Retention Time (min)",
1648            "mz": "m/z",
1649            "apex_scan": "Apex Scan Number",
1650            "intensity": "Intensity",
1651            "persistence": "Persistence",
1652            "area": "Area",
1653            "half_height_width": "Half Height Width (min)",
1654            "tailing_factor": "Tailing Factor",
1655            "dispersity_index": "Dispersity Index",
1656            "ms2_spectrum": "MS2 Spectrum",
1657            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1658            "isotopologue_type": "Isotopologue Type",
1659            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1660            "associated_mass_features": "Associated Mass Features after Deconvolution",
1661            "ion_formula": "Ion Formula",
1662            "formula": "Molecular Formula",
1663            "ref_ion_type": "Ion Type",
1664            "annot_level": "Lipid Annotation Level",
1665            "lipid_molecular_species_id": "Lipid Molecular Species",
1666            "lipid_summed_name": "Lipid Species",
1667            "lipid_subclass": "Lipid Subclass",
1668            "lipid_class": "Lipid Class",
1669            "lipid_category": "Lipid Category",
1670            "entropy_similarity": "Entropy Similarity",
1671            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1672            "n_spectra_contributing": "Spectra with Annotation (n)",
1673        }
1674        mf_report = mf_report.rename(columns=rename_dict)
1675        mf_report["Sample Name"] = self.mass_spectra.sample_name
1676        mf_report["Polarity"] = self.mass_spectra.polarity
1677        mf_report = mf_report[
1678            ["Mass Feature ID", "Sample Name", "Polarity"]
1679            + [
1680                col
1681                for col in mf_report.columns
1682                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1683            ]
1684        ]
1685
1686        # Reorder rows by "Mass Feature ID", then "Entropy Similarity" (descending), then "Confidence Score" (descending)
1687        if "Entropy Similarity" in mf_report.columns:
1688            mf_report = mf_report.sort_values(
1689                by=["Mass Feature ID", "Entropy Similarity", "Confidence Score"],
1690                ascending=[True, False, False],
1691            )
1692        elif "Confidence Score" in mf_report.columns:
1693             mf_report = mf_report.sort_values(
1694                by=["Mass Feature ID", "Confidence Score"],
1695                ascending=[True, False],
1696            )
1697        # If neither "Entropy Similarity" nor "Confidence Score" are in the columns, just sort by "Mass Feature ID"
1698        else:
1699            mf_report = mf_report.sort_values("Mass Feature ID")
1700
1701        # Reset index
1702        mf_report = mf_report.reset_index(drop=True)
1703
1704        return mf_report
1705    
1706    def to_report(self, molecular_metadata=None):
1707        """Create a report of the mass features and their annotations.
1708
1709        Parameters
1710        ----------
1711        molecular_metadata : dict, optional
1712            The molecular metadata. Default is None.
1713
1714        Returns
1715        -------
1716        DataFrame
1717            The report as a Pandas DataFrame.
1718        """
1719        # Get mass feature dataframe
1720        mf_report = self.mass_spectra.mass_features_to_df()
1721        mf_report = mf_report.reset_index(drop=False)
1722
1723        # Get and clean ms1 annotation dataframe
1724        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1725        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1726        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1727
1728        # Get, summarize, and clean ms2 annotation dataframe
1729        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1730            molecular_metadata=molecular_metadata
1731        )
1732        if ms2_annot_report is not None and molecular_metadata is not None:
1733            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1734            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1735            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1736            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1737        else:
1738            ms2_annot_report = None
1739
1740        report = self.combine_reports(
1741            mf_report=mf_report,
1742            ms1_annot_report=ms1_annot_report,
1743            ms2_annot_report=ms2_annot_report
1744        )
1745
1746        return report

A class to export LCMS metabolite data.

This class provides methods to export LCMS metabolite data to various formats and summarize the metabolite report.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • mass_spectra (object): The high resolution mass spectra object.
LCMSMetabolomicsExport(out_file_path, mass_spectra)
1321    def __init__(self, out_file_path, mass_spectra):
1322        super().__init__(out_file_path, mass_spectra)
1323        self.ion_type_dict = ion_type_dict

This constructor takes the output file path (without extension) and the high resolution mass spectra object. It invokes the base class constructor with both arguments and then stores the module-level ion_type_dict — the table of atoms to add and subtract per adduct when deriving ion formulas — on the instance as self.ion_type_dict.

ion_type_dict
@staticmethod
def get_ion_formula(neutral_formula, ion_type):
1325    @staticmethod
1326    def get_ion_formula(neutral_formula, ion_type):
1327        """From a neutral formula and an ion type, return the formula of the ion.
1328
1329        Notes
1330        -----
1331        This is a static method.
1332        If the neutral_formula is not a string, this method will return None.
1333
1334        Parameters
1335        ----------
1336        neutral_formula : str
1337            The neutral formula, this should be a string form from the MolecularFormula class
1338            (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case).
1339            In the case of a simple string, the atoms are parsed based on the presence of capital letters,
1340            e.g. MgCl2 is parsed as 'Mg Cl2.
1341        ion_type : str
1342            The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc.
1343            See the self.ion_type_dict for the available ion types.
1344
1345        Returns
1346        -------
1347        str
1348            The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
1349        """
1350        # If neutral_formula is not a string, return None
1351        if not isinstance(neutral_formula, str):
1352            return None
1353
1354        # Check if there are spaces in the formula (these are outputs of the MolecularFormula class and do not need to be processed before being passed to the class)
1355        if re.search(r"\s", neutral_formula):
1356            neutral_formula = MolecularFormula(neutral_formula, ion_charge=0)
1357        else:
1358            form_pre = re.sub(r"([A-Z])", r" \1", neutral_formula)[1:]
1359            elements = [re.findall(r"[A-Z][a-z]*", x) for x in form_pre.split()]
1360            counts = [re.findall(r"\d+", x) for x in form_pre.split()]
1361            neutral_formula = MolecularFormula(
1362                dict(
1363                    zip(
1364                        [x[0] for x in elements],
1365                        [int(x[0]) if x else 1 for x in counts],
1366                    )
1367                ),
1368                ion_charge=0,
1369            )
1370        neutral_formula_dict = neutral_formula.to_dict().copy()
1371
1372        adduct_add_dict = ion_type_dict[ion_type][0]
1373        for key in adduct_add_dict:
1374            if key in neutral_formula_dict.keys():
1375                neutral_formula_dict[key] += adduct_add_dict[key]
1376            else:
1377                neutral_formula_dict[key] = adduct_add_dict[key]
1378
1379        adduct_subtract = ion_type_dict[ion_type][1]
1380        for key in adduct_subtract:
1381            neutral_formula_dict[key] -= adduct_subtract[key]
1382
1383        return MolecularFormula(neutral_formula_dict, ion_charge=0).string

From a neutral formula and an ion type, return the formula of the ion.

Notes

This is a static method. If the neutral_formula is not a string, this method will return None.

Parameters
  • neutral_formula (str): The neutral formula, this should be a string form from the MolecularFormula class (e.g. 'C2 H4 O2', isotopes OK), or simple string (e.g. 'C2H4O2', no isotope handling in this case). In the case of a simple string, the atoms are parsed based on the presence of capital letters, e.g. MgCl2 is parsed as 'Mg Cl2'.
  • ion_type (str): The ion type, e.g. 'protonated', '[M+H]+', '[M+Na]+', etc. See the self.ion_type_dict for the available ion types.
Returns
  • str: The formula of the ion as a string (like 'C2 H4 O2'); or None if the neutral_formula is not a string.
@staticmethod
def get_isotope_type(ion_formula):
1385    @staticmethod
1386    def get_isotope_type(ion_formula):
1387        """From an ion formula, return the 13C isotope type of the ion.
1388
1389        Notes
1390        -----
1391        This is a static method.
1392        If the ion_formula is not a string, this method will return None.
1393        This is currently only functional for 13C isotopes.
1394
1395        Parameters
1396        ----------
1397        ion_formula : str
1398            The formula of the ion, expected to be a string like 'C2 H4 O2'.
1399
1400        Returns
1401        -------
1402        str
1403            The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
1404
1405        Raises
1406        ------
1407        ValueError
1408            If the ion_formula is not a string.
1409        """
1410        if not isinstance(ion_formula, str):
1411            return None
1412
1413        if re.search(r"\s", ion_formula):
1414            ion_formula = MolecularFormula(ion_formula, ion_charge=0)
1415        else:
1416            raise ValueError('ion_formula should be a string like "C2 H4 O2"')
1417        ion_formula_dict = ion_formula.to_dict().copy()
1418
1419        try:
1420            iso_class = "13C" + str(ion_formula_dict.pop("13C"))
1421        except KeyError:
1422            iso_class = None
1423
1424        return iso_class

From an ion formula, return the 13C isotope type of the ion.

Notes

This is a static method. If the ion_formula is not a string, this method will return None. This is currently only functional for 13C isotopes.

Parameters
  • ion_formula (str): The formula of the ion, expected to be a string like 'C2 H4 O2'.
Returns
  • str: The isotope type of the ion, e.g. '13C1', '13C2', etc; or None if the ion_formula does not contain a 13C isotope.
Raises
  • ValueError: If the ion_formula is not a string.
def report_to_csv(self, molecular_metadata=None):
1426    def report_to_csv(self, molecular_metadata=None):
1427        """Create a report of the mass features and their annotations and save it as a CSV file.
1428
1429        Parameters
1430        ----------
1431        molecular_metadata : dict, optional
1432            The molecular metadata. Default is None.
1433        """
1434        report = self.to_report(molecular_metadata=molecular_metadata)
1435        out_file = self.output_file.with_suffix(".csv")
1436        report.to_csv(out_file, index=False)

Create a report of the mass features and their annotations and save it as a CSV file.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
def clean_ms1_report(self, ms1_summary_full):
1438    def clean_ms1_report(self, ms1_summary_full):
1439        """Clean the MS1 report.
1440
1441        Parameters
1442        ----------
1443        ms1_summary_full : DataFrame
1444            The full MS1 summary DataFrame.
1445
1446        Returns
1447        -------
1448        DataFrame
1449            The cleaned MS1 summary DataFrame.
1450        """
1451        ms1_summary_full = ms1_summary_full.reset_index()
1452        cols_to_keep = [
1453            "mf_id",
1454            "Molecular Formula",
1455            "Ion Type",
1456            "Calculated m/z",
1457            "m/z Error (ppm)",
1458            "m/z Error Score",
1459            "Is Isotopologue",
1460            "Isotopologue Similarity",
1461            "Confidence Score",
1462        ]
1463        ms1_summary = ms1_summary_full[cols_to_keep].copy()
1464        ms1_summary["ion_formula"] = [
1465            self.get_ion_formula(f, a)
1466            for f, a in zip(ms1_summary["Molecular Formula"], ms1_summary["Ion Type"])
1467        ]
1468        ms1_summary["isotopologue_type"] = [
1469            self.get_isotope_type(f) for f in ms1_summary["ion_formula"].tolist()
1470        ]
1471
1472        # Reorder columns
1473        ms1_summary = ms1_summary[
1474            [
1475                "mf_id",
1476                "ion_formula",
1477                "isotopologue_type",
1478                "Calculated m/z",
1479                "m/z Error (ppm)",
1480                "m/z Error Score",
1481                "Isotopologue Similarity",
1482                "Confidence Score",
1483            ]
1484        ]
1485
1486        # Set the index to mf_id
1487        ms1_summary = ms1_summary.set_index("mf_id")
1488
1489        return ms1_summary

Clean the MS1 report.

Parameters
  • ms1_summary_full (DataFrame): The full MS1 summary DataFrame.
Returns
  • DataFrame: The cleaned MS1 summary DataFrame.
def summarize_ms2_report(self, ms2_annot_report):
1491    def summarize_ms2_report(self, ms2_annot_report):
1492        """
1493        Summarize the MS2 report.
1494
1495        Parameters
1496        ----------
1497        ms2_annot_report : DataFrame
1498            The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
1499        
1500        Returns
1501        -------
1502        """

Summarize the MS2 report.

Parameters
  • ms2_annot_report (DataFrame): The MS2 annotation DataFrame with all annotations, output of mass_features_ms2_annot_to_df.
Returns
  • None: placeholder — the method body is not yet implemented.
def summarize_metabolomics_report(self, ms2_annot_report):
1504    def summarize_metabolomics_report(self, ms2_annot_report):
1505        """Summarize the MS2 hits for a metabolomics report
1506        
1507        Parameters
1508        ----------
1509        ms2_annot : DataFrame
1510            The MS2 annotation DataFrame with all annotations.
1511
1512        Returns
1513        -------
1514        DataFrame
1515            The summarized metabolomics report.
1516        """
1517        columns_to_drop = [
1518            "precursor_mz",
1519            "precursor_mz_error_ppm",
1520            "cas",
1521            "data_id",
1522            "iupac_name",
1523            "traditional_name",
1524            "common_name",
1525            "casno",
1526        ]
1527        ms2_annot = ms2_annot_report.drop(
1528            columns=[col for col in columns_to_drop if col in ms2_annot_report.columns]
1529        )
1530        
1531        # Prepare information about the search results, pulling out the best hit for the single report
1532        # Group by mf_id,ref_mol_id grab row with highest entropy similarity
1533        ms2_annot = ms2_annot.reset_index()
1534        # Add column called "n_spectra_contributing" that is the number of unique values in query_spectrum_id per mf_id,ref_mol_id
1535        ms2_annot["n_spectra_contributing"] = (
1536            ms2_annot.groupby(["mf_id", "ref_mol_id"])["query_spectrum_id"]
1537            .transform("nunique")
1538        )
1539        # Sort by entropy similarity
1540        ms2_annot = ms2_annot.sort_values(
1541            by=["mf_id", "ref_mol_id", "entropy_similarity"], ascending=[True, True, False]
1542        )
1543        best_entropy = ms2_annot.drop_duplicates(
1544            subset=["mf_id", "ref_mol_id"], keep="first"
1545        )
1546
1547        return best_entropy

Summarize the MS2 hits for a metabolomics report

Parameters
  • ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
  • DataFrame: The summarized metabolomics report.
def clean_ms2_report(self, metabolite_summary):
1549    def clean_ms2_report(self, metabolite_summary):
1550        """Clean the MS2 report.
1551
1552        Parameters
1553        ----------
1554        metabolite_summary : DataFrame
1555            The full metabolomics summary DataFrame.
1556
1557        Returns
1558        -------
1559        DataFrame
1560            The cleaned metabolomics summary DataFrame.
1561        """
1562        metabolite_summary = metabolite_summary.reset_index()
1563        metabolite_summary["ion_formula"] = [
1564            self.get_ion_formula(f, a)
1565            for f, a in zip(metabolite_summary["formula"], metabolite_summary["ref_ion_type"])
1566        ]
1567
1568        col_order = [
1569            "mf_id",
1570            "ion_formula",
1571            "ref_ion_type",
1572            "formula",
1573            "inchikey",
1574            "name",
1575            "inchi",
1576            "chebi",
1577            "smiles",
1578            "kegg",
1579            "cas",
1580            "database_name",
1581            "ref_ms_id",
1582            "entropy_similarity",
1583            "ref_mz_in_query_fract",
1584            "n_spectra_contributing",
1585        ]
1586
1587        # Reorder columns
1588        metabolite_summary = metabolite_summary[
1589            [col for col in col_order if col in metabolite_summary.columns]
1590        ]
1591
1592        # Convert chebi (if present) to int:
1593        if "chebi" in metabolite_summary.columns:
1594            metabolite_summary["chebi"] = metabolite_summary["chebi"].astype(
1595                "Int64", errors="ignore"
1596            )
1597
1598        # Set the index to mf_id
1599        metabolite_summary = metabolite_summary.set_index("mf_id")
1600
1601        return metabolite_summary

Clean the MS2 report.

Parameters
  • metabolite_summary (DataFrame): The full metabolomics summary DataFrame.
Returns
  • DataFrame: The cleaned metabolomics summary DataFrame.
def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1603    def combine_reports(self, mf_report, ms1_annot_report, ms2_annot_report):
1604        """Combine the mass feature report with the MS1 and MS2 reports.
1605
1606        Parameters
1607        ----------
1608        mf_report : DataFrame
1609            The mass feature report DataFrame.
1610        ms1_annot_report : DataFrame
1611            The MS1 annotation report DataFrame.
1612        ms2_annot_report : DataFrame
1613            The MS2 annotation report DataFrame.
1614        """
1615        # If there is an ms1_annot_report, merge it with the mf_report
1616        if not ms1_annot_report.empty:
1617            # MS1 has been run and has molecular formula information
1618            mf_report = pd.merge(
1619                mf_report,
1620                ms1_annot_report,
1621                how="left",
1622                on=["mf_id", "isotopologue_type"],
1623            )
1624        if ms2_annot_report is not None:
1625            # pull out the records with ion_formula and drop the ion_formula column (these should be empty if MS1 molecular formula assignment is working correctly)
1626            mf_no_ion_formula = mf_report[mf_report["ion_formula"].isna()]
1627            mf_no_ion_formula = mf_no_ion_formula.drop(columns=["ion_formula"])
1628            mf_no_ion_formula = pd.merge(
1629                mf_no_ion_formula, ms2_annot_report, how="left", on=["mf_id"]
1630            )
1631
1632            # pull out the records with ion_formula
1633            mf_with_ion_formula = mf_report[~mf_report["ion_formula"].isna()]
1634            mf_with_ion_formula = pd.merge(
1635                mf_with_ion_formula,
1636                ms2_annot_report,
1637                how="left",
1638                on=["mf_id", "ion_formula"],
1639            )
1640
1641            # put back together
1642            mf_report = pd.concat([mf_no_ion_formula, mf_with_ion_formula])
1643
1644        # Rename colums
1645        rename_dict = {
1646            "mf_id": "Mass Feature ID",
1647            "scan_time": "Retention Time (min)",
1648            "mz": "m/z",
1649            "apex_scan": "Apex Scan Number",
1650            "intensity": "Intensity",
1651            "persistence": "Persistence",
1652            "area": "Area",
1653            "half_height_width": "Half Height Width (min)",
1654            "tailing_factor": "Tailing Factor",
1655            "dispersity_index": "Dispersity Index",
1656            "ms2_spectrum": "MS2 Spectrum",
1657            "monoisotopic_mf_id": "Monoisotopic Mass Feature ID",
1658            "isotopologue_type": "Isotopologue Type",
1659            "mass_spectrum_deconvoluted_parent": "Is Largest Ion after Deconvolution",
1660            "associated_mass_features": "Associated Mass Features after Deconvolution",
1661            "ion_formula": "Ion Formula",
1662            "formula": "Molecular Formula",
1663            "ref_ion_type": "Ion Type",
1664            "annot_level": "Lipid Annotation Level",
1665            "lipid_molecular_species_id": "Lipid Molecular Species",
1666            "lipid_summed_name": "Lipid Species",
1667            "lipid_subclass": "Lipid Subclass",
1668            "lipid_class": "Lipid Class",
1669            "lipid_category": "Lipid Category",
1670            "entropy_similarity": "Entropy Similarity",
1671            "ref_mz_in_query_fract": "Library mzs in Query (fraction)",
1672            "n_spectra_contributing": "Spectra with Annotation (n)",
1673        }
1674        mf_report = mf_report.rename(columns=rename_dict)
1675        mf_report["Sample Name"] = self.mass_spectra.sample_name
1676        mf_report["Polarity"] = self.mass_spectra.polarity
1677        mf_report = mf_report[
1678            ["Mass Feature ID", "Sample Name", "Polarity"]
1679            + [
1680                col
1681                for col in mf_report.columns
1682                if col not in ["Mass Feature ID", "Sample Name", "Polarity"]
1683            ]
1684        ]
1685
1686        # Reorder rows by "Mass Feature ID", then "Entropy Similarity" (descending), then "Confidence Score" (descending)
1687        if "Entropy Similarity" in mf_report.columns:
1688            mf_report = mf_report.sort_values(
1689                by=["Mass Feature ID", "Entropy Similarity", "Confidence Score"],
1690                ascending=[True, False, False],
1691            )
1692        elif "Confidence Score" in mf_report.columns:
1693             mf_report = mf_report.sort_values(
1694                by=["Mass Feature ID", "Confidence Score"],
1695                ascending=[True, False],
1696            )
1697        # If neither "Entropy Similarity" nor "Confidence Score" are in the columns, just sort by "Mass Feature ID"
1698        else:
1699            mf_report = mf_report.sort_values("Mass Feature ID")
1700
1701        # Reset index
1702        mf_report = mf_report.reset_index(drop=True)
1703
1704        return mf_report

Combine the mass feature report with the MS1 and MS2 reports.

Parameters
  • mf_report (DataFrame): The mass feature report DataFrame.
  • ms1_annot_report (DataFrame): The MS1 annotation report DataFrame.
  • ms2_annot_report (DataFrame): The MS2 annotation report DataFrame.
def to_report(self, molecular_metadata=None):
1706    def to_report(self, molecular_metadata=None):
1707        """Create a report of the mass features and their annotations.
1708
1709        Parameters
1710        ----------
1711        molecular_metadata : dict, optional
1712            The molecular metadata. Default is None.
1713
1714        Returns
1715        -------
1716        DataFrame
1717            The report as a Pandas DataFrame.
1718        """
1719        # Get mass feature dataframe
1720        mf_report = self.mass_spectra.mass_features_to_df()
1721        mf_report = mf_report.reset_index(drop=False)
1722
1723        # Get and clean ms1 annotation dataframe
1724        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
1725        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
1726        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
1727
1728        # Get, summarize, and clean ms2 annotation dataframe
1729        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
1730            molecular_metadata=molecular_metadata
1731        )
1732        if ms2_annot_report is not None and molecular_metadata is not None:
1733            ms2_annot_report = self.summarize_metabolomics_report(ms2_annot_report)
1734            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
1735            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
1736            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
1737        else:
1738            ms2_annot_report = None
1739
1740        report = self.combine_reports(
1741            mf_report=mf_report,
1742            ms1_annot_report=ms1_annot_report,
1743            ms2_annot_report=ms2_annot_report
1744        )
1745
1746        return report

Create a report of the mass features and their annotations.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
  • DataFrame: The report as a Pandas DataFrame.
class LipidomicsExport(LCMSMetabolomicsExport):
    """A class to export lipidomics data.

    This class provides methods to export lipidomics data to various formats and summarize the lipid report.

    Parameters
    ----------
    out_file_path : str | Path
        The output file path, do not include the file extension.
    mass_spectra : object
        The high resolution mass spectra object.
    """

    def __init__(self, out_file_path, mass_spectra):
        # All setup is inherited; this class only specializes the MS2
        # summarization/cleaning behavior for lipid annotations.
        super().__init__(out_file_path, mass_spectra)
1762
1763    def summarize_lipid_report(self, ms2_annot):
1764        """Summarize the lipid report.
1765
1766        Parameters
1767        ----------
1768        ms2_annot : DataFrame
1769            The MS2 annotation DataFrame with all annotations.
1770
1771        Returns
1772        -------
1773        DataFrame
1774            The summarized lipid report.
1775        """
1776        # Drop unnecessary columns for easier viewing
1777        columns_to_drop = [
1778            "precursor_mz",
1779            "precursor_mz_error_ppm",
1780            "metabref_mol_id",
1781            "metabref_precursor_mz",
1782            "cas",
1783            "inchikey",
1784            "inchi",
1785            "chebi",
1786            "smiles",
1787            "kegg",
1788            "data_id",
1789            "iupac_name",
1790            "traditional_name",
1791            "common_name",
1792            "casno",
1793        ]
1794        ms2_annot = ms2_annot.drop(
1795            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1796        )
1797
1798        # If ion_types_excluded is not empty, remove those ion types
1799        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1800            "ms2"
1801        ].molecular_search.ion_types_excluded
1802        if len(ion_types_excluded) > 0:
1803            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1804
1805        # If mf_id is not present, check that the index name is mf_id and reset the index
1806        if "mf_id" not in ms2_annot.columns:
1807            if ms2_annot.index.name == "mf_id":
1808                ms2_annot = ms2_annot.reset_index()
1809            else:
1810                raise ValueError("mf_id is not present in the dataframe")
1811
1812        # Attempt to get consensus annotations to the MLF level
1813        mlf_results_all = []
1814        for mf_id in ms2_annot["mf_id"].unique():
1815            mlf_results_perid = []
1816            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1817            ms2_annot_mf["n_spectra_contributing"] = ms2_annot_mf.query_spectrum_id.nunique()
1818
1819            for query_scan in ms2_annot["query_spectrum_id"].unique():
1820                ms2_annot_sub = ms2_annot_mf[
1821                    ms2_annot_mf["query_spectrum_id"] == query_scan
1822                ].copy()
1823
1824                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1825                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1826                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1827                        ms2_annot_sub["entropy_max"] = (
1828                            ms2_annot_sub["entropy_similarity"]
1829                            == ms2_annot_sub["entropy_similarity"].max()
1830                        )
1831                        ms2_annot_sub["ref_match_fract_max"] = (
1832                            ms2_annot_sub["ref_mz_in_query_fract"]
1833                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1834                        )
1835                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1836                            "query_frag_types"
1837                        ].apply(lambda x: True if "MLF" in x else False)
1838
1839                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1840                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1841                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1842                        ].all(axis=1)
1843
1844                        # If there is a consensus, take the row with the highest entropy_similarity
1845                        if ms2_annot_sub["consensus"].any():
1846                            ms2_annot_sub = ms2_annot_sub[
1847                                ms2_annot_sub["entropy_similarity"]
1848                                == ms2_annot_sub["entropy_similarity"].max()
1849                            ].head(1)
1850                            mlf_results_perid.append(ms2_annot_sub)
1851            if len(mlf_results_perid) == 0:
1852                mlf_results_perid = pd.DataFrame()
1853            else:
1854                mlf_results_perid = pd.concat(mlf_results_perid)
1855                if mlf_results_perid["name"].nunique() == 1:
1856                    mlf_results_perid = mlf_results_perid[
1857                        mlf_results_perid["entropy_similarity"]
1858                        == mlf_results_perid["entropy_similarity"].max()
1859                    ].head(1)
1860                else:
1861                    mlf_results_perid = pd.DataFrame()
1862                mlf_results_all.append(mlf_results_perid)
1863
1864        # These are the consensus annotations to the MLF level
1865        if len(mlf_results_all) > 0:
1866            mlf_results_all = pd.concat(mlf_results_all)
1867            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1868        else:
1869            # Make an empty dataframe
1870            mlf_results_all = ms2_annot.head(0)
1871
1872        # For remaining mf_ids, try to get a consensus annotation to the species level
1873        species_results_all = []
1874        # Remove mf_ids that have consensus annotations to the MLF level
1875        ms2_annot_spec = ms2_annot[
1876            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1877        ]
1878        for mf_id in ms2_annot_spec["mf_id"].unique():
1879            # Do all the hits have the same lipid_summed_name?
1880            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1881            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1882
1883            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1884                # Grab the highest entropy_similarity result
1885                ms2_annot_sub = ms2_annot_sub[
1886                    ms2_annot_sub["entropy_similarity"]
1887                    == ms2_annot_sub["entropy_similarity"].max()
1888                ].head(1)
1889                species_results_all.append(ms2_annot_sub)
1890
1891        # These are the consensus annotations to the species level
1892        if len(species_results_all) > 0:
1893            species_results_all = pd.concat(species_results_all)
1894            species_results_all["annot_level"] = "species"
1895        else:
1896            # Make an empty dataframe
1897            species_results_all = ms2_annot.head(0)
1898
1899        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1900        # Remove mf_ids that have consensus annotations to the species level
1901        ms2_annot_remaining = ms2_annot_spec[
1902            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1903        ]
1904        no_consensus = []
1905        for mf_id in ms2_annot_remaining["mf_id"].unique():
1906            id_sub = []
1907            id_no_con = []
1908            ms2_annot_sub_mf = ms2_annot_remaining[
1909                ms2_annot_remaining["mf_id"] == mf_id
1910            ].copy()
1911            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1912                ms2_annot_sub = ms2_annot_sub_mf[
1913                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1914                ].copy()
1915
1916                # New columns for ranking [HIGHER RANK = BETTER]
1917                ms2_annot_sub["entropy_max"] = (
1918                    ms2_annot_sub["entropy_similarity"]
1919                    == ms2_annot_sub["entropy_similarity"].max()
1920                )
1921                ms2_annot_sub["ref_match_fract_max"] = (
1922                    ms2_annot_sub["ref_mz_in_query_fract"]
1923                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1924                )
1925                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1926                    lambda x: True if "MLF" in x else False
1927                )
1928
1929                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1930                ms2_annot_sub["consensus"] = ms2_annot_sub[
1931                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1932                ].all(axis=1)
1933                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1934                id_sub.append(ms2_annot_sub_con)
1935                id_no_con.append(ms2_annot_sub)
1936            id_sub = pd.concat(id_sub)
1937            id_no_con = pd.concat(id_no_con)
1938
1939            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1940            if (
1941                id_sub["query_frag_types"]
1942                .apply(lambda x: True if "MLF" in x else False)
1943                .all()
1944                and len(id_sub) > 0
1945            ):
1946                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1947                id_sub = id_sub.loc[idx]
1948                # Reorder so highest entropy_similarity is first
1949                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1950                id_sub["annot_level"] = id_sub["structure_level"]
1951                no_consensus.append(id_sub)
1952
1953            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1954            elif len(id_sub) == 0:
1955                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1956                    summed_sub = id_no_con[
1957                        id_no_con["lipid_summed_name"] == lipid_summed_name
1958                    ]
1959                    # Any consensus to MLF?
1960                    if summed_sub["consensus"].any():
1961                        summed_sub = summed_sub[summed_sub["consensus"]]
1962                        summed_sub["annot_level"] = summed_sub["structure_level"]
1963                        no_consensus.append(summed_sub)
1964                    else:
1965                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
1966                        summed_sub = summed_sub[
1967                            summed_sub["entropy_similarity"]
1968                            == summed_sub["entropy_similarity"].max()
1969                        ].head(1)
1970                        # get first row
1971                        summed_sub["annot_level"] = "species"
1972                        summed_sub["name"] = ""
1973                        no_consensus.append(summed_sub)
1974            else:
1975                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)
1976
1977        if len(no_consensus) > 0:
1978            no_consensus = pd.concat(no_consensus)
1979        else:
1980            no_consensus = ms2_annot.head(0)
1981
1982        # Combine all the consensus annotations and reformat the dataframe for output
1983        species_results_all = species_results_all.drop(columns=["name"])
1984        species_results_all["lipid_molecular_species_id"] = ""
1985        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1986        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1987        consensus_annotations = pd.concat(
1988            [mlf_results_all, species_results_all, no_consensus]
1989        )
1990        consensus_annotations = consensus_annotations.sort_values(
1991            "mf_id", ascending=True
1992        )
1993        cols_to_keep = [
1994            "mf_id",
1995            "ref_ion_type",
1996            "entropy_similarity",
1997            "ref_mz_in_query_fract",
1998            "lipid_molecular_species_id",
1999            "lipid_summed_name",
2000            "lipid_subclass",
2001            "lipid_class",
2002            "lipid_category",
2003            "formula",
2004            "annot_level",
2005            "n_spectra_contributing",
2006        ]
2007        consensus_annotations = consensus_annotations[cols_to_keep]
2008        consensus_annotations = consensus_annotations.set_index("mf_id")
2009
2010        return consensus_annotations
2011
2012    def clean_ms2_report(self, lipid_summary):
2013        """Clean the MS2 report.
2014
2015        Parameters
2016        ----------
2017        lipid_summary : DataFrame
2018            The full lipid summary DataFrame.
2019
2020        Returns
2021        -------
2022        DataFrame
2023            The cleaned lipid summary DataFrame.
2024        """
2025        lipid_summary = lipid_summary.reset_index()
2026        lipid_summary["ion_formula"] = [
2027            self.get_ion_formula(f, a)
2028            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
2029        ]
2030
2031        # Reorder columns
2032        lipid_summary = lipid_summary[
2033            [
2034                "mf_id",
2035                "ion_formula",
2036                "ref_ion_type",
2037                "formula",
2038                "annot_level",
2039                "lipid_molecular_species_id",
2040                "lipid_summed_name",
2041                "lipid_subclass",
2042                "lipid_class",
2043                "lipid_category",
2044                "entropy_similarity",
2045                "ref_mz_in_query_fract",
2046                "n_spectra_contributing",
2047            ]
2048        ]
2049
2050        # Set the index to mf_id
2051        lipid_summary = lipid_summary.set_index("mf_id")
2052
2053        return lipid_summary
2054
2055    def to_report(self, molecular_metadata=None):
2056        """Create a report of the mass features and their annotations.
2057
2058        Parameters
2059        ----------
2060        molecular_metadata : dict, optional
2061            The molecular metadata. Default is None.
2062
2063        Returns
2064        -------
2065        DataFrame
2066            The report of the mass features and their annotations.
2067
2068        Notes
2069        -----
2070        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
2071        """
2072        # Get mass feature dataframe
2073        mf_report = self.mass_spectra.mass_features_to_df()
2074        mf_report = mf_report.reset_index(drop=False)
2075
2076        # Get and clean ms1 annotation dataframe
2077        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
2078        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
2079        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
2080
2081        # Get, summarize, and clean ms2 annotation dataframe
2082        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
2083            molecular_metadata=molecular_metadata
2084        )
2085        if ms2_annot_report is not None and molecular_metadata is not None:
2086            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
2087            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
2088            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
2089            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
2090        report = self.combine_reports(
2091            mf_report=mf_report,
2092            ms1_annot_report=ms1_annot_report,
2093            ms2_annot_report=ms2_annot_report
2094        )
2095        return report

A class to export lipidomics data.

This class provides methods to export lipidomics data to various formats and summarize the lipid report.

Parameters
  • out_file_path (str | Path): The output file path, do not include the file extension.
  • mass_spectra (object): The high resolution mass spectra object.
LipidomicsExport(out_file_path, mass_spectra)
1760    def __init__(self, out_file_path, mass_spectra):
1761        super().__init__(out_file_path, mass_spectra)

This constructor takes the export destination and the data to export. Arguments are:

out_file_path is the output file path; do not include the file extension.

mass_spectra is the high resolution mass spectra object whose mass features and annotations will be exported.

The constructor delegates directly to the base class constructor (LCMSMetabolomicsExport.__init__); this subclass only specializes how the MS2 lipid annotations are summarized and cleaned.

def summarize_lipid_report(self, ms2_annot):
1763    def summarize_lipid_report(self, ms2_annot):
1764        """Summarize the lipid report.
1765
1766        Parameters
1767        ----------
1768        ms2_annot : DataFrame
1769            The MS2 annotation DataFrame with all annotations.
1770
1771        Returns
1772        -------
1773        DataFrame
1774            The summarized lipid report.
1775        """
1776        # Drop unnecessary columns for easier viewing
1777        columns_to_drop = [
1778            "precursor_mz",
1779            "precursor_mz_error_ppm",
1780            "metabref_mol_id",
1781            "metabref_precursor_mz",
1782            "cas",
1783            "inchikey",
1784            "inchi",
1785            "chebi",
1786            "smiles",
1787            "kegg",
1788            "data_id",
1789            "iupac_name",
1790            "traditional_name",
1791            "common_name",
1792            "casno",
1793        ]
1794        ms2_annot = ms2_annot.drop(
1795            columns=[col for col in columns_to_drop if col in ms2_annot.columns]
1796        )
1797
1798        # If ion_types_excluded is not empty, remove those ion types
1799        ion_types_excluded = self.mass_spectra.parameters.mass_spectrum[
1800            "ms2"
1801        ].molecular_search.ion_types_excluded
1802        if len(ion_types_excluded) > 0:
1803            ms2_annot = ms2_annot[~ms2_annot["ref_ion_type"].isin(ion_types_excluded)]
1804
1805        # If mf_id is not present, check that the index name is mf_id and reset the index
1806        if "mf_id" not in ms2_annot.columns:
1807            if ms2_annot.index.name == "mf_id":
1808                ms2_annot = ms2_annot.reset_index()
1809            else:
1810                raise ValueError("mf_id is not present in the dataframe")
1811
1812        # Attempt to get consensus annotations to the MLF level
1813        mlf_results_all = []
1814        for mf_id in ms2_annot["mf_id"].unique():
1815            mlf_results_perid = []
1816            ms2_annot_mf = ms2_annot[ms2_annot["mf_id"] == mf_id].copy()
1817            ms2_annot_mf["n_spectra_contributing"] = ms2_annot_mf.query_spectrum_id.nunique()
1818
1819            for query_scan in ms2_annot["query_spectrum_id"].unique():
1820                ms2_annot_sub = ms2_annot_mf[
1821                    ms2_annot_mf["query_spectrum_id"] == query_scan
1822                ].copy()
1823
1824                if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1825                    # If there is only one lipid_summed_name, let's try to get consensus molecular species annotation
1826                    if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1827                        ms2_annot_sub["entropy_max"] = (
1828                            ms2_annot_sub["entropy_similarity"]
1829                            == ms2_annot_sub["entropy_similarity"].max()
1830                        )
1831                        ms2_annot_sub["ref_match_fract_max"] = (
1832                            ms2_annot_sub["ref_mz_in_query_fract"]
1833                            == ms2_annot_sub["ref_mz_in_query_fract"].max()
1834                        )
1835                        ms2_annot_sub["frag_max"] = ms2_annot_sub[
1836                            "query_frag_types"
1837                        ].apply(lambda x: True if "MLF" in x else False)
1838
1839                        # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1840                        ms2_annot_sub["consensus"] = ms2_annot_sub[
1841                            ["entropy_max", "ref_match_fract_max", "frag_max"]
1842                        ].all(axis=1)
1843
1844                        # If there is a consensus, take the row with the highest entropy_similarity
1845                        if ms2_annot_sub["consensus"].any():
1846                            ms2_annot_sub = ms2_annot_sub[
1847                                ms2_annot_sub["entropy_similarity"]
1848                                == ms2_annot_sub["entropy_similarity"].max()
1849                            ].head(1)
1850                            mlf_results_perid.append(ms2_annot_sub)
1851            if len(mlf_results_perid) == 0:
1852                mlf_results_perid = pd.DataFrame()
1853            else:
1854                mlf_results_perid = pd.concat(mlf_results_perid)
1855                if mlf_results_perid["name"].nunique() == 1:
1856                    mlf_results_perid = mlf_results_perid[
1857                        mlf_results_perid["entropy_similarity"]
1858                        == mlf_results_perid["entropy_similarity"].max()
1859                    ].head(1)
1860                else:
1861                    mlf_results_perid = pd.DataFrame()
1862                mlf_results_all.append(mlf_results_perid)
1863
1864        # These are the consensus annotations to the MLF level
1865        if len(mlf_results_all) > 0:
1866            mlf_results_all = pd.concat(mlf_results_all)
1867            mlf_results_all["annot_level"] = mlf_results_all["structure_level"]
1868        else:
1869            # Make an empty dataframe
1870            mlf_results_all = ms2_annot.head(0)
1871
1872        # For remaining mf_ids, try to get a consensus annotation to the species level
1873        species_results_all = []
1874        # Remove mf_ids that have consensus annotations to the MLF level
1875        ms2_annot_spec = ms2_annot[
1876            ~ms2_annot["mf_id"].isin(mlf_results_all["mf_id"].unique())
1877        ]
1878        for mf_id in ms2_annot_spec["mf_id"].unique():
1879            # Do all the hits have the same lipid_summed_name?
1880            ms2_annot_sub = ms2_annot_spec[ms2_annot_spec["mf_id"] == mf_id].copy()
1881            ms2_annot_sub["n_spectra_contributing"] = len(ms2_annot_sub)
1882
1883            if ms2_annot_sub["lipid_summed_name"].nunique() == 1:
1884                # Grab the highest entropy_similarity result
1885                ms2_annot_sub = ms2_annot_sub[
1886                    ms2_annot_sub["entropy_similarity"]
1887                    == ms2_annot_sub["entropy_similarity"].max()
1888                ].head(1)
1889                species_results_all.append(ms2_annot_sub)
1890
1891        # These are the consensus annotations to the species level
1892        if len(species_results_all) > 0:
1893            species_results_all = pd.concat(species_results_all)
1894            species_results_all["annot_level"] = "species"
1895        else:
1896            # Make an empty dataframe
1897            species_results_all = ms2_annot.head(0)
1898
1899        # Deal with the remaining mf_ids that do not have consensus annotations to the species level or MLF level
1900        # Remove mf_ids that have consensus annotations to the species level
1901        ms2_annot_remaining = ms2_annot_spec[
1902            ~ms2_annot_spec["mf_id"].isin(species_results_all["mf_id"].unique())
1903        ]
1904        no_consensus = []
1905        for mf_id in ms2_annot_remaining["mf_id"].unique():
1906            id_sub = []
1907            id_no_con = []
1908            ms2_annot_sub_mf = ms2_annot_remaining[
1909                ms2_annot_remaining["mf_id"] == mf_id
1910            ].copy()
1911            for query_scan in ms2_annot_sub_mf["query_spectrum_id"].unique():
1912                ms2_annot_sub = ms2_annot_sub_mf[
1913                    ms2_annot_sub_mf["query_spectrum_id"] == query_scan
1914                ].copy()
1915
1916                # New columns for ranking [HIGHER RANK = BETTER]
1917                ms2_annot_sub["entropy_max"] = (
1918                    ms2_annot_sub["entropy_similarity"]
1919                    == ms2_annot_sub["entropy_similarity"].max()
1920                )
1921                ms2_annot_sub["ref_match_fract_max"] = (
1922                    ms2_annot_sub["ref_mz_in_query_fract"]
1923                    == ms2_annot_sub["ref_mz_in_query_fract"].max()
1924                )
1925                ms2_annot_sub["frag_max"] = ms2_annot_sub["query_frag_types"].apply(
1926                    lambda x: True if "MLF" in x else False
1927                )
1928
1929                # New column that looks if there is a consensus between the ranks (one row that is highest in all ranks)
1930                ms2_annot_sub["consensus"] = ms2_annot_sub[
1931                    ["entropy_max", "ref_match_fract_max", "frag_max"]
1932                ].all(axis=1)
1933                ms2_annot_sub_con = ms2_annot_sub[ms2_annot_sub["consensus"]]
1934                id_sub.append(ms2_annot_sub_con)
1935                id_no_con.append(ms2_annot_sub)
1936            id_sub = pd.concat(id_sub)
1937            id_no_con = pd.concat(id_no_con)
1938
1939            # Scenario 1: Multiple scans are being resolved to different MLFs [could be coelutions and should both be kept and annotated to MS level]
1940            if (
1941                id_sub["query_frag_types"]
1942                .apply(lambda x: True if "MLF" in x else False)
1943                .all()
1944                and len(id_sub) > 0
1945            ):
1946                idx = id_sub.groupby("name")["entropy_similarity"].idxmax()
1947                id_sub = id_sub.loc[idx]
1948                # Reorder so highest entropy_similarity is first
1949                id_sub = id_sub.sort_values("entropy_similarity", ascending=False)
1950                id_sub["annot_level"] = id_sub["structure_level"]
1951                no_consensus.append(id_sub)
1952
1953            # Scenario 2: Multiple scans are being resolved to different species, keep both and annotate to appropriate level
1954            elif len(id_sub) == 0:
1955                for lipid_summed_name in id_no_con["lipid_summed_name"].unique():
1956                    summed_sub = id_no_con[
1957                        id_no_con["lipid_summed_name"] == lipid_summed_name
1958                    ]
1959                    # Any consensus to MLF?
1960                    if summed_sub["consensus"].any():
1961                        summed_sub = summed_sub[summed_sub["consensus"]]
1962                        summed_sub["annot_level"] = summed_sub["structure_level"]
1963                        no_consensus.append(summed_sub)
1964                    else:
1965                        # Grab the highest entropy_similarity, if there are multiple, grab the first one
1966                        summed_sub = summed_sub[
1967                            summed_sub["entropy_similarity"]
1968                            == summed_sub["entropy_similarity"].max()
1969                        ].head(1)
1970                        # get first row
1971                        summed_sub["annot_level"] = "species"
1972                        summed_sub["name"] = ""
1973                        no_consensus.append(summed_sub)
1974            else:
1975                raise ValueError("Unexpected scenario for summarizing mf_id: ", mf_id)
1976
1977        if len(no_consensus) > 0:
1978            no_consensus = pd.concat(no_consensus)
1979        else:
1980            no_consensus = ms2_annot.head(0)
1981
1982        # Combine all the consensus annotations and reformat the dataframe for output
1983        species_results_all = species_results_all.drop(columns=["name"])
1984        species_results_all["lipid_molecular_species_id"] = ""
1985        mlf_results_all["lipid_molecular_species_id"] = mlf_results_all["name"]
1986        no_consensus["lipid_molecular_species_id"] = no_consensus["name"]
1987        consensus_annotations = pd.concat(
1988            [mlf_results_all, species_results_all, no_consensus]
1989        )
1990        consensus_annotations = consensus_annotations.sort_values(
1991            "mf_id", ascending=True
1992        )
1993        cols_to_keep = [
1994            "mf_id",
1995            "ref_ion_type",
1996            "entropy_similarity",
1997            "ref_mz_in_query_fract",
1998            "lipid_molecular_species_id",
1999            "lipid_summed_name",
2000            "lipid_subclass",
2001            "lipid_class",
2002            "lipid_category",
2003            "formula",
2004            "annot_level",
2005            "n_spectra_contributing",
2006        ]
2007        consensus_annotations = consensus_annotations[cols_to_keep]
2008        consensus_annotations = consensus_annotations.set_index("mf_id")
2009
2010        return consensus_annotations

Summarize the lipid report.

Parameters
  • ms2_annot (DataFrame): The MS2 annotation DataFrame with all annotations.
Returns
  • DataFrame: The summarized lipid report.
def clean_ms2_report(self, lipid_summary):
2012    def clean_ms2_report(self, lipid_summary):
2013        """Clean the MS2 report.
2014
2015        Parameters
2016        ----------
2017        lipid_summary : DataFrame
2018            The full lipid summary DataFrame.
2019
2020        Returns
2021        -------
2022        DataFrame
2023            The cleaned lipid summary DataFrame.
2024        """
2025        lipid_summary = lipid_summary.reset_index()
2026        lipid_summary["ion_formula"] = [
2027            self.get_ion_formula(f, a)
2028            for f, a in zip(lipid_summary["formula"], lipid_summary["ref_ion_type"])
2029        ]
2030
2031        # Reorder columns
2032        lipid_summary = lipid_summary[
2033            [
2034                "mf_id",
2035                "ion_formula",
2036                "ref_ion_type",
2037                "formula",
2038                "annot_level",
2039                "lipid_molecular_species_id",
2040                "lipid_summed_name",
2041                "lipid_subclass",
2042                "lipid_class",
2043                "lipid_category",
2044                "entropy_similarity",
2045                "ref_mz_in_query_fract",
2046                "n_spectra_contributing",
2047            ]
2048        ]
2049
2050        # Set the index to mf_id
2051        lipid_summary = lipid_summary.set_index("mf_id")
2052
2053        return lipid_summary

Clean the MS2 report.

Parameters
  • lipid_summary (DataFrame): The full lipid summary DataFrame.
Returns
  • DataFrame: The cleaned lipid summary DataFrame.
def to_report(self, molecular_metadata=None):
2055    def to_report(self, molecular_metadata=None):
2056        """Create a report of the mass features and their annotations.
2057
2058        Parameters
2059        ----------
2060        molecular_metadata : dict, optional
2061            The molecular metadata. Default is None.
2062
2063        Returns
2064        -------
2065        DataFrame
2066            The report of the mass features and their annotations.
2067
2068        Notes
2069        -----
2070        The report will contain the mass features and their annotations from MS1 and MS2 (if available).
2071        """
2072        # Get mass feature dataframe
2073        mf_report = self.mass_spectra.mass_features_to_df()
2074        mf_report = mf_report.reset_index(drop=False)
2075
2076        # Get and clean ms1 annotation dataframe
2077        ms1_annot_report = self.mass_spectra.mass_features_ms1_annot_to_df().copy()
2078        ms1_annot_report = self.clean_ms1_report(ms1_annot_report)
2079        ms1_annot_report = ms1_annot_report.reset_index(drop=False)
2080
2081        # Get, summarize, and clean ms2 annotation dataframe
2082        ms2_annot_report = self.mass_spectra.mass_features_ms2_annot_to_df(
2083            molecular_metadata=molecular_metadata
2084        )
2085        if ms2_annot_report is not None and molecular_metadata is not None:
2086            ms2_annot_report = self.summarize_lipid_report(ms2_annot_report)
2087            ms2_annot_report = self.clean_ms2_report(ms2_annot_report)
2088            ms2_annot_report = ms2_annot_report.dropna(axis=1, how="all")
2089            ms2_annot_report = ms2_annot_report.reset_index(drop=False)
2090        report = self.combine_reports(
2091            mf_report=mf_report,
2092            ms1_annot_report=ms1_annot_report,
2093            ms2_annot_report=ms2_annot_report
2094        )
2095        return report

Create a report of the mass features and their annotations.

Parameters
  • molecular_metadata (dict, optional): The molecular metadata. Default is None.
Returns
  • DataFrame: The report of the mass features and their annotations.
Notes

The report will contain the mass features and their annotations from MS1 and MS2 (if available).