corems.mass_spectrum.output.export

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Nov 11, 2019"
  3
  4import json
  5from datetime import datetime, timezone
  6from pathlib import Path
  7from threading import Thread
  8
  9import h5py
 10import toml
 11import numpy as np
 12from numpy import NaN, empty
 13from pandas import DataFrame
 14
 15from corems.encapsulation.constant import Atoms, Labels #Labels is accessed in the eval() function
 16from corems.encapsulation.output import parameter_to_dict
 17from corems.mass_spectrum.factory.MassSpectrumClasses import MassSpecfromFreq
 18
 19
 20class HighResMassSpecExport(Thread):
 21    """A class for exporting high-resolution mass spectra.
 22
 23    Parameters
 24    ----------
 25    out_file_path : str
 26        The output file path.
 27    mass_spectrum : MassSpectrum
 28        The mass spectrum to export.
 29    output_type : str, optional
 30        The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'.
 31
 32    Attributes
 33    ----------
 34    output_file : Path
 35        The output file path.
 36    output_type : str
 37        The type of output file.
 38    mass_spectrum : MassSpectrum
 39        The mass spectrum to export.
 40    atoms_order_list : list
 41        The list of assigned atoms in the order specified by Atoms.atoms_order list.
 42    columns_label : list
 43        The column labels in order.
 44
 45    Methods
 46    -------
 47    * save().
 48        Save the mass spectrum data to the output file.
 49    * run().
 50        Run the export process.
 51    * get_pandas_df().
 52        Returns the mass spectrum data as a pandas DataFrame.
 53    * write_settings(output_path, mass_spectrum).
 54        Writes the settings of the mass spectrum to a JSON file.
 55    * to_pandas(write_metadata=True).
 56        Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
 57    * to_excel(write_metadata=True).
 58        Exports the mass spectrum data to an Excel file.
 59    * to_csv(write_metadata=True).
 60        Exports the mass spectrum data to a CSV file.
 61    * to_json().
 62        Exports the mass spectrum data to a JSON string.
 63    * to_hdf().
 64        Exports the mass spectrum data to an HDF5 file.
 65    * parameters_to_toml().
 66        Converts the mass spectrum parameters to a TOML string.
 67    * parameters_to_json().
 68        Converts the mass spectrum parameters to a JSON string.
 69    * get_mass_spec_attrs(mass_spectrum).
 70        Returns the mass spectrum attributes as a dictionary.
 71    * get_all_used_atoms_in_order(mass_spectrum).
 72        Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
 73    * list_dict_to_list(mass_spectrum, is_hdf5=False).
 74        Returns the mass spectrum data as a list of dictionaries.
 75    * get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False).
 76        Returns the mass spectrum data as a list of dictionaries.
 77
 78    """
 79
 80    def __init__(self, out_file_path, mass_spectrum, output_type="excel"):
 81        Thread.__init__(self)
 82
 83        self.output_file = Path(out_file_path)
 84
 85        # 'excel', 'csv' or 'pandas'
 86        self.output_type = output_type
 87
 88        self.mass_spectrum = mass_spectrum
 89
 90        # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list
 91        self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum)
 92
 93        self._init_columns()
 94
 95    def _init_columns(self):
 96        """Initialize the columns for the mass spectrum output."""
 97        # column labels in order
 98        self.columns_label = [
 99            "Index",
100            "m/z",
101            "Calibrated m/z",
102            "Calculated m/z",
103            "Peak Height",
104            "Peak Area",
105            "Resolving Power",
106            "S/N",
107            "Ion Charge",
108            "m/z Error (ppm)",
109            "m/z Error Score",
110            "Isotopologue Similarity",
111            "Confidence Score",
112            "DBE",
113            "O/C",
114            "H/C",
115            "Heteroatom Class",
116            "Ion Type",
117            "Adduct",
118            "Is Isotopologue",
119            "Mono Isotopic Index",
120            "Molecular Formula",
121        ]
122
123    @property
124    def output_type(self):
125        """Returns the output type of the mass spectrum."""
126        return self._output_type
127
128    @output_type.setter
129    def output_type(self, output_type):
130        output_types = ["excel", "csv", "pandas", "hdf5"]
131        if output_type in output_types:
132            self._output_type = output_type
133        else:
134            raise TypeError(
135                'Supported types are "excel", "csv" or "pandas", %s entered'
136                % output_type
137            )
138
139    def save(self):
140        """Save the mass spectrum data to the output file.
141
142        Raises
143        ------
144        ValueError
145            If the output type is not supported.
146        """
147
148        if self.output_type == "excel":
149            self.to_excel()
150        elif self.output_type == "csv":
151            self.to_csv()
152        elif self.output_type == "pandas":
153            self.to_pandas()
154        elif self.output_type == "hdf5":
155            self.to_hdf()
156        else:
157            raise ValueError(
158                "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'"
159                % self.output_type
160            )
161
162    def run(self):
163        """Run the export process.
164
165        This method is called when the thread starts.
166        It calls the save method to perform the export."""
167        self.save()
168
169    def get_pandas_df(self, additional_columns=None):
170        """Returns the mass spectrum data as a pandas DataFrame.
171
172        Parameters
173        ----------
174        additional_columns : list, optional
175            Additional columns to include in the DataFrame. Defaults to None.
176            Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'.
177
178        Returns
179        -------
180        DataFrame
181            The mass spectrum data as a pandas DataFrame.
182        """
183        if additional_columns is not None:
184            possible_additional_columns = [
185                "Aromaticity Index",
186                "NOSC",
187                "Aromaticity Index (modified)",
188            ]
189            if additional_columns:
190                for column in additional_columns:
191                    if column not in possible_additional_columns:
192                        raise ValueError("Invalid additional column: %s" % column)
193            columns = (
194                self.columns_label
195                + additional_columns
196                + self.get_all_used_atoms_in_order(self.mass_spectrum)
197            )
198        else:
199            columns = self.columns_label + self.get_all_used_atoms_in_order(
200                self.mass_spectrum
201            )
202        dict_data_list = self.get_list_dict_data(
203            self.mass_spectrum, additional_columns=additional_columns
204        )
205        df = DataFrame(dict_data_list, columns=columns)
206        df.name = self.output_file
207        return df
208
209    def write_settings(self, output_path, mass_spectrum):
210        """Writes the settings of the mass spectrum to a JSON file.
211
212        Parameters
213        ----------
214        output_path : str
215            The output file path.
216        mass_spectrum : MassSpectrum
217            The mass spectrum to export.
218        """
219
220        import json
221
222        dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum)
223
224        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum)
225        dict_setting["analyzer"] = mass_spectrum.analyzer
226        dict_setting["instrument_label"] = mass_spectrum.instrument_label
227        dict_setting["sample_name"] = mass_spectrum.sample_name
228
229        with open(
230            output_path.with_suffix(".json"),
231            "w",
232            encoding="utf8",
233        ) as outfile:
234            output = json.dumps(
235                dict_setting, sort_keys=True, indent=4, separators=(",", ": ")
236            )
237            outfile.write(output)
238
239    def to_pandas(self, write_metadata=True):
240        """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
241
242        Parameters
243        ----------
244        write_metadata : bool, optional
245            Whether to write the metadata to a JSON file. Defaults to True.
246        """
247
248        columns = self.columns_label + self.get_all_used_atoms_in_order(
249            self.mass_spectrum
250        )
251
252        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
253
254        df = DataFrame(dict_data_list, columns=columns)
255
256        df.to_pickle(self.output_file.with_suffix(".pkl"))
257
258        if write_metadata:
259            self.write_settings(self.output_file, self.mass_spectrum)
260
261    def to_excel(self, write_metadata=True):
262        """Exports the mass spectrum data to an Excel file.
263
264        Parameters
265        ----------
266        write_metadata : bool, optional
267            Whether to write the metadata to a JSON file. Defaults to True.
268        """
269
270        columns = self.columns_label + self.get_all_used_atoms_in_order(
271            self.mass_spectrum
272        )
273
274        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
275
276        df = DataFrame(dict_data_list, columns=columns)
277
278        df.to_excel(self.output_file.with_suffix(".xlsx"))
279
280        if write_metadata:
281            self.write_settings(self.output_file, self.mass_spectrum)
282
283    def to_csv(self, write_metadata=True):
284        """Exports the mass spectrum data to a CSV file.
285
286        Parameters
287        ----------
288        write_metadata : bool, optional
289            Whether to write the metadata to a JSON file. Defaults to True.
290        """
291
292        columns = self.columns_label + self.get_all_used_atoms_in_order(
293            self.mass_spectrum
294        )
295
296        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
297
298        import csv
299
300        try:
301            with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile:
302                writer = csv.DictWriter(csvfile, fieldnames=columns)
303                writer.writeheader()
304                for data in dict_data_list:
305                    writer.writerow(data)
306            if write_metadata:
307                self.write_settings(self.output_file, self.mass_spectrum)
308
309        except IOError as ioerror:
310            print(ioerror)
311
312    def to_json(self):
313        """Exports the mass spectrum data to a JSON string."""
314
315        columns = self.columns_label + self.get_all_used_atoms_in_order(
316            self.mass_spectrum
317        )
318
319        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
320
321        df = DataFrame(dict_data_list, columns=columns)
322
323        # for key, values in dict_data.items():
324        #    if not values: dict_data[key] = NaN
325
326        # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': '))
327        return df.to_json(orient="records")
328
329    def add_mass_spectrum_to_hdf5(
330        self,
331        hdf_handle,
332        mass_spectrum,
333        group_key,
334        mass_spectra_group=None,
335        export_raw=True,
336    ):
337        """Adds the mass spectrum data to an HDF5 file.
338
339        Parameters
340        ----------
341        hdf_handle : h5py.File
342            The HDF5 file handle.
343        mass_spectrum : MassSpectrum
344            The mass spectrum to add to the HDF5 file.
345        group_key : str
346            The group key (where to add the mass spectrum data within the HDF5 file).
347        mass_spectra_group : h5py.Group, optional
348            The mass spectra group. Defaults to None (no group, mass spectrum is added to the root).
349        export_raw : bool, optional
350            Whether to export the raw data. Defaults to True.
351            If False, only the processed data (peaks) is exported (essentially centroided data).
352        """
353        if mass_spectra_group is None:
354            # Check if the file has the necessary attributes and add them if not
355            # This assumes that if there is a mass_spectra_group, these attributes were already added to the file
356            if not hdf_handle.attrs.get("date_utc"):
357                timenow = str(
358                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
359                )
360                hdf_handle.attrs["date_utc"] = timenow
361                hdf_handle.attrs["file_name"] = mass_spectrum.filename.name
362                hdf_handle.attrs["data_structure"] = "mass_spectrum"
363                hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer
364                hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label
365                hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name
366
367        list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True)
368
369        dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum)
370
371        setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum)
372
373        columns_labels = json.dumps(
374            self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum),
375            sort_keys=False,
376            indent=4,
377            separators=(",", ": "),
378        )
379
380        group_key = group_key
381
382        if mass_spectra_group is not None:
383            hdf_handle = mass_spectra_group
384
385        if group_key not in hdf_handle.keys():
386            scan_group = hdf_handle.create_group(group_key)
387
388            # If there is raw data (from profile data) save it
389            if not mass_spectrum.is_centroid and export_raw:
390                mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32)
391
392                mz_abun_array[0] = mass_spectrum.abundance_profile
393                mz_abun_array[1] = mass_spectrum.mz_exp_profile
394
395                raw_ms_dataset = scan_group.create_dataset(
396                    "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
397                )
398                
399                # Add metadata to the raw_ms dataset when it exists
400                raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
401
402                if isinstance(mass_spectrum, MassSpecfromFreq):
403                    raw_ms_dataset.attrs["TransientSetting"] = json.dumps(
404                        setting_dicts.get("TransientSetting"),
405                        sort_keys=False,
406                        indent=4,
407                        separators=(",", ": "),
408                    )
409            else:
410                # For centroided data, store metadata directly on the scan group
411                scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
412                
413                if isinstance(mass_spectrum, MassSpecfromFreq):
414                    scan_group.attrs["TransientSetting"] = json.dumps(
415                        setting_dicts.get("TransientSetting"),
416                        sort_keys=False,
417                        indent=4,
418                        separators=(",", ": "),
419                    )
420
421        else:
422            scan_group = hdf_handle.get(group_key)
423
424        # if there is not processed data len = 0, otherwise len() will return next index
425        index_processed_data = str(len(scan_group.keys()))
426
427        timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
428
429        # Convert list_results to numpy array and optimize data types
430        array_results = np.array(list_results)
431        
432        # Apply data type optimization for numeric columns
433        if array_results.dtype == np.float64:
434            array_results = array_results.astype(np.float32)
435        elif array_results.dtype == np.int64:
436            array_results = array_results.astype(np.int32)
437        
438        processed_dset = scan_group.create_dataset(
439            index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
440        )
441
442        processed_dset.attrs["date_utc"] = timenow
443
444        processed_dset.attrs["ColumnsLabels"] = columns_labels
445
446        processed_dset.attrs["MoleculaSearchSetting"] = json.dumps(
447            setting_dicts.get("MoleculaSearch"),
448            sort_keys=False,
449            indent=4,
450            separators=(",", ": "),
451        )
452
453        processed_dset.attrs["MassSpecPeakSetting"] = json.dumps(
454            setting_dicts.get("MassSpecPeak"),
455            sort_keys=False,
456            indent=4,
457            separators=(",", ": "),
458        )
459
460        processed_dset.attrs["MassSpectrumSetting"] = json.dumps(
461            setting_dicts.get("MassSpectrum"),
462            sort_keys=False,
463            indent=4,
464            separators=(",", ": "),
465        )
466
467    def to_hdf(self):
468        """Exports the mass spectrum data to an HDF5 file."""
469
470        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
471            self.add_mass_spectrum_to_hdf5(
472                hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number)
473            )
474
475    def parameters_to_toml(self):
476        """Converts the mass spectrum parameters to a TOML string.
477
478        Returns
479        -------
480        str
481            The TOML string of the mass spectrum parameters.
482        """
483
484        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
485
486        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
487        dict_setting["analyzer"] = self.mass_spectrum.analyzer
488        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
489        dict_setting["sample_name"] = self.mass_spectrum.sample_name
490
491        output = toml.dumps(dict_setting)
492
493        return output
494
495    def parameters_to_json(self):
496        """Converts the mass spectrum parameters to a JSON string.
497
498        Returns
499        -------
500        str
501            The JSON string of the mass spectrum parameters.
502        """
503
504        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
505
506        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
507        dict_setting["analyzer"] = self.mass_spectrum.analyzer
508        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
509        dict_setting["sample_name"] = self.mass_spectrum.sample_name
510
511        output = json.dumps(dict_setting)
512
513        return output
514
515    def get_mass_spec_attrs(self, mass_spectrum):
516        """Returns the mass spectrum attributes as a dictionary.
517
518        Parameters
519        ----------
520        mass_spectrum : MassSpectrum
521            The mass spectrum to export.
522
523        Returns
524        -------
525        dict
526            The mass spectrum attributes.
527        """
528
529        dict_ms_attrs = {}
530        dict_ms_attrs["polarity"] = mass_spectrum.polarity
531        dict_ms_attrs["rt"] = mass_spectrum.retention_time
532        dict_ms_attrs["tic"] = mass_spectrum.tic
533        dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan
534        dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt
535        dict_ms_attrs["Aterm"] = mass_spectrum.Aterm
536        dict_ms_attrs["Bterm"] = mass_spectrum.Bterm
537        dict_ms_attrs["Cterm"] = mass_spectrum.Cterm
538        dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise
539        dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std
540
541        return dict_ms_attrs
542
543    def get_all_used_atoms_in_order(self, mass_spectrum):
544        """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
545
546        Parameters
547        ----------
548        mass_spectrum : MassSpectrum
549            The mass spectrum to export.
550
551        Returns
552        -------
553        list
554            The list of assigned atoms in the order specified by Atoms.atoms_order list.
555        """
556
557        atoms_in_order = Atoms.atoms_order
558        all_used_atoms = set()
559        if mass_spectrum:
560            for ms_peak in mass_spectrum:
561                if ms_peak:
562                    for m_formula in ms_peak:
563                        for atom in m_formula.atoms:
564                            all_used_atoms.add(atom)
565
566        def sort_method(atom):
567            return [atoms_in_order.index(atom)]
568
569        return sorted(all_used_atoms, key=sort_method)
570
571    def list_dict_to_list(self, mass_spectrum, is_hdf5=False):
572        """Returns the mass spectrum data as a list of dictionaries.
573
574        Parameters
575        ----------
576        mass_spectrum : MassSpectrum
577            The mass spectrum to export.
578        is_hdf5 : bool, optional
579            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
580
581        Returns
582        -------
583        list
584            The mass spectrum data as a list of dictionaries.
585        """
586
587        column_labels = self.columns_label + self.get_all_used_atoms_in_order(
588            mass_spectrum
589        )
590
591        dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5)
592
593        all_lines = []
594        for dict_res in dict_list:
595            result_line = [NaN] * len(column_labels)
596
597            for label, value in dict_res.items():
598                label_index = column_labels.index(label)
599                result_line[label_index] = value
600
601            all_lines.append(result_line)
602
603        return all_lines
604
605    def get_list_dict_data(
606        self,
607        mass_spectrum,
608        include_no_match=True,
609        include_isotopologues=True,
610        isotopologue_inline=True,
611        no_match_inline=False,
612        is_hdf5=False,
613        additional_columns=None,
614    ):
615        """Returns the mass spectrum data as a list of dictionaries.
616
617        Parameters
618        ----------
619        mass_spectrum : MassSpectrum
620            The mass spectrum to export.
621        include_no_match : bool, optional
622            Whether to include unassigned (no match) data. Defaults to True.
623        include_isotopologues : bool, optional
624            Whether to include isotopologues. Defaults to True.
625        isotopologue_inline : bool, optional
626            Whether to include isotopologues inline. Defaults to True.
627        no_match_inline : bool, optional
628            Whether to include unassigned (no match) data inline. Defaults to False.
629        is_hdf5 : bool, optional
630            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
631
632        Returns
633        -------
634        list
635            The mass spectrum data as a list of dictionaries.
636        """
637
638        dict_data_list = []
639
640        if is_hdf5:
641            encode = ".encode('utf-8')"
642        else:
643            encode = ""
644
645        def add_no_match_dict_data(index, ms_peak):
646            """
647            Export dictionary of mspeak info for unassigned (no match) data
648            """
649            dict_result = {
650                "Index": index,
651                "m/z": ms_peak._mz_exp,
652                "Calibrated m/z": ms_peak.mz_exp,
653                "Peak Height": ms_peak.abundance,
654                "Peak Area": ms_peak.area,
655                "Resolving Power": ms_peak.resolving_power,
656                "S/N": ms_peak.signal_to_noise,
657                "Ion Charge": ms_peak.ion_charge,
658                "Heteroatom Class": eval("Labels.unassigned{}".format(encode)),
659            }
660
661            dict_data_list.append(dict_result)
662
663        def add_match_dict_data(index, ms_peak, mformula, additional_columns=None):
664            """
665            Export dictionary of mspeak info for assigned (match) data
666            """
667            formula_dict = mformula.to_dict()
668
669            dict_result = {
670                "Index": index,
671                "m/z": ms_peak._mz_exp,
672                "Calibrated m/z": ms_peak.mz_exp,
673                "Calculated m/z": mformula.mz_calc,
674                "Peak Height": ms_peak.abundance,
675                "Peak Area": ms_peak.area,
676                "Resolving Power": ms_peak.resolving_power,
677                "S/N": ms_peak.signal_to_noise,
678                "Ion Charge": ms_peak.ion_charge,
679                "m/z Error (ppm)": mformula.mz_error,
680                "Confidence Score": mformula.confidence_score,
681                "Isotopologue Similarity": mformula.isotopologue_similarity,
682                "m/z Error Score": mformula.average_mz_error_score,
683                "DBE": mformula.dbe,
684                "Heteroatom Class": eval("mformula.class_label{}".format(encode)),
685                "H/C": mformula.H_C,
686                "O/C": mformula.O_C,
687                "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)),
688                "Is Isotopologue": int(mformula.is_isotopologue),
689                "Molecular Formula": eval("mformula.string{}".format(encode)),
690            }
691            if additional_columns is not None:
692                possible_dict = {
693                    "Aromaticity Index": mformula.A_I,
694                    "NOSC": mformula.nosc,
695                    "Aromaticity Index (modified)": mformula.A_I_mod,
696                }
697                for column in additional_columns:
698                    dict_result[column] = possible_dict.get(column)
699
700            if mformula.adduct_atom:
701                dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode))
702
703            if mformula.is_isotopologue:
704                dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic
705
706            if self.atoms_order_list is None:
707                atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum)
708            else:
709                atoms_order_list = self.atoms_order_list
710
711            for atom in atoms_order_list:
712                if atom in formula_dict.keys():
713                    dict_result[atom] = formula_dict.get(atom)
714
715            dict_data_list.append(dict_result)
716
717        score_methods = mass_spectrum.molecular_search_settings.score_methods
718        selected_score_method = (
719            mass_spectrum.molecular_search_settings.output_score_method
720        )
721
722        if selected_score_method in score_methods:
723            # temp set score method as the one chosen in the output
724            current_method = mass_spectrum.molecular_search_settings.score_method
725            mass_spectrum.molecular_search_settings.score_method = selected_score_method
726
727            for index, ms_peak in enumerate(mass_spectrum):
728                # print(ms_peak.mz_exp)
729
730                if ms_peak:
731                    m_formula = ms_peak.best_molecular_formula_candidate
732
733                    if m_formula:
734                        if not m_formula.is_isotopologue:
735                            add_match_dict_data(
736                                index,
737                                ms_peak,
738                                m_formula,
739                                additional_columns=additional_columns,
740                            )
741
742                            for (
743                                iso_mspeak_index,
744                                iso_mf_formula,
745                            ) in m_formula.mspeak_mf_isotopologues_indexes:
746                                iso_ms_peak = mass_spectrum[iso_mspeak_index]
747                                add_match_dict_data(
748                                    iso_mspeak_index,
749                                    iso_ms_peak,
750                                    iso_mf_formula,
751                                    additional_columns=additional_columns,
752                                )
753                else:
754                    if include_no_match and no_match_inline:
755                        add_no_match_dict_data(index, ms_peak)
756
757            if include_no_match and not no_match_inline:
758                for index, ms_peak in enumerate(mass_spectrum):
759                    if not ms_peak:
760                        add_no_match_dict_data(index, ms_peak)
761            # reset score method as the one chosen in the output
762            mass_spectrum.molecular_search_settings.score_method = current_method
763
764        else:
765            for index, ms_peak in enumerate(mass_spectrum):
766                # check if there is a molecular formula candidate for the msPeak
767
768                if ms_peak:
769                    # m_formula = ms_peak.molecular_formula_lowest_error
770                    for m_formula in ms_peak:
771                        if mass_spectrum.molecular_search_settings.output_min_score > 0:
772                            if (
773                                m_formula.confidence_score
774                                >= mass_spectrum.molecular_search_settings.output_min_score
775                            ):
776                                if m_formula.is_isotopologue:  # isotopologues inline
777                                    if include_isotopologues and isotopologue_inline:
778                                        add_match_dict_data(
779                                            index,
780                                            ms_peak,
781                                            m_formula,
782                                            additional_columns=additional_columns,
783                                        )
784                                else:
785                                    add_match_dict_data(
786                                        index,
787                                        ms_peak,
788                                        m_formula,
789                                        additional_columns=additional_columns,
790                                    )  # add monoisotopic peak
791
792                            # cutoff because of low score
793                            else:
794                                add_no_match_dict_data(index, ms_peak)
795
796                        else:
797                            if m_formula.is_isotopologue:  # isotopologues inline
798                                if include_isotopologues and isotopologue_inline:
799                                    add_match_dict_data(
800                                        index,
801                                        ms_peak,
802                                        m_formula,
803                                        additional_columns=additional_columns,
804                                    )
805                            else:
806                                add_match_dict_data(
807                                    index,
808                                    ms_peak,
809                                    m_formula,
810                                    additional_columns=additional_columns,
811                                )  # add monoisotopic peak
812                else:
813                    # include not_match
814                    if include_no_match and no_match_inline:
815                        add_no_match_dict_data(index, ms_peak)
816
817            if include_isotopologues and not isotopologue_inline:
818                for index, ms_peak in enumerate(mass_spectrum):
819                    for m_formula in ms_peak:
820                        if m_formula.is_isotopologue:
821                            if (
822                                m_formula.confidence_score
823                                >= mass_spectrum.molecular_search_settings.output_min_score
824                            ):
825                                add_match_dict_data(
826                                    index,
827                                    ms_peak,
828                                    m_formula,
829                                    additional_columns=additional_columns,
830                                )
831
832            if include_no_match and not no_match_inline:
833                for index, ms_peak in enumerate(mass_spectrum):
834                    if not ms_peak:
835                        add_no_match_dict_data(index, ms_peak)
836
837        # remove duplicated add_match data possibly introduced on the output_score_filter step
838        res = []
839        [res.append(x) for x in dict_data_list if x not in res]
840
841        return res
class HighResMassSpecExport(threading.Thread):
 21class HighResMassSpecExport(Thread):
 22    """A class for exporting high-resolution mass spectra.
 23
 24    Parameters
 25    ----------
 26    out_file_path : str
 27        The output file path.
 28    mass_spectrum : MassSpectrum
 29        The mass spectrum to export.
 30    output_type : str, optional
 31        The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'.
 32
 33    Attributes
 34    ----------
 35    output_file : Path
 36        The output file path.
 37    output_type : str
 38        The type of output file.
 39    mass_spectrum : MassSpectrum
 40        The mass spectrum to export.
 41    atoms_order_list : list
 42        The list of assigned atoms in the order specified by Atoms.atoms_order list.
 43    columns_label : list
 44        The column labels in order.
 45
 46    Methods
 47    -------
 48    * save().
 49        Save the mass spectrum data to the output file.
 50    * run().
 51        Run the export process.
 52    * get_pandas_df().
 53        Returns the mass spectrum data as a pandas DataFrame.
 54    * write_settings(output_path, mass_spectrum).
 55        Writes the settings of the mass spectrum to a JSON file.
 56    * to_pandas(write_metadata=True).
 57        Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
 58    * to_excel(write_metadata=True).
 59        Exports the mass spectrum data to an Excel file.
 60    * to_csv(write_metadata=True).
 61        Exports the mass spectrum data to a CSV file.
 62    * to_json().
 63        Exports the mass spectrum data to a JSON string.
 64    * to_hdf().
 65        Exports the mass spectrum data to an HDF5 file.
 66    * parameters_to_toml().
 67        Converts the mass spectrum parameters to a TOML string.
 68    * parameters_to_json().
 69        Converts the mass spectrum parameters to a JSON string.
 70    * get_mass_spec_attrs(mass_spectrum).
 71        Returns the mass spectrum attributes as a dictionary.
 72    * get_all_used_atoms_in_order(mass_spectrum).
 73        Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
 74    * list_dict_to_list(mass_spectrum, is_hdf5=False).
 75        Returns the mass spectrum data as a list of dictionaries.
 76    * get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False).
 77        Returns the mass spectrum data as a list of dictionaries.
 78
 79    """
 80
 81    def __init__(self, out_file_path, mass_spectrum, output_type="excel"):
 82        Thread.__init__(self)
 83
 84        self.output_file = Path(out_file_path)
 85
 86        # 'excel', 'csv' or 'pandas'
 87        self.output_type = output_type
 88
 89        self.mass_spectrum = mass_spectrum
 90
 91        # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list
 92        self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum)
 93
 94        self._init_columns()
 95
 96    def _init_columns(self):
 97        """Initialize the columns for the mass spectrum output."""
 98        # column labels in order
 99        self.columns_label = [
100            "Index",
101            "m/z",
102            "Calibrated m/z",
103            "Calculated m/z",
104            "Peak Height",
105            "Peak Area",
106            "Resolving Power",
107            "S/N",
108            "Ion Charge",
109            "m/z Error (ppm)",
110            "m/z Error Score",
111            "Isotopologue Similarity",
112            "Confidence Score",
113            "DBE",
114            "O/C",
115            "H/C",
116            "Heteroatom Class",
117            "Ion Type",
118            "Adduct",
119            "Is Isotopologue",
120            "Mono Isotopic Index",
121            "Molecular Formula",
122        ]
123
124    @property
125    def output_type(self):
126        """Returns the output type of the mass spectrum."""
127        return self._output_type
128
129    @output_type.setter
130    def output_type(self, output_type):
131        output_types = ["excel", "csv", "pandas", "hdf5"]
132        if output_type in output_types:
133            self._output_type = output_type
134        else:
135            raise TypeError(
136                'Supported types are "excel", "csv" or "pandas", %s entered'
137                % output_type
138            )
139
140    def save(self):
141        """Save the mass spectrum data to the output file.
142
143        Raises
144        ------
145        ValueError
146            If the output type is not supported.
147        """
148
149        if self.output_type == "excel":
150            self.to_excel()
151        elif self.output_type == "csv":
152            self.to_csv()
153        elif self.output_type == "pandas":
154            self.to_pandas()
155        elif self.output_type == "hdf5":
156            self.to_hdf()
157        else:
158            raise ValueError(
159                "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'"
160                % self.output_type
161            )
162
163    def run(self):
164        """Run the export process.
165
166        This method is called when the thread starts.
167        It calls the save method to perform the export."""
168        self.save()
169
170    def get_pandas_df(self, additional_columns=None):
171        """Returns the mass spectrum data as a pandas DataFrame.
172
173        Parameters
174        ----------
175        additional_columns : list, optional
176            Additional columns to include in the DataFrame. Defaults to None.
177            Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'.
178
179        Returns
180        -------
181        DataFrame
182            The mass spectrum data as a pandas DataFrame.
183        """
184        if additional_columns is not None:
185            possible_additional_columns = [
186                "Aromaticity Index",
187                "NOSC",
188                "Aromaticity Index (modified)",
189            ]
190            if additional_columns:
191                for column in additional_columns:
192                    if column not in possible_additional_columns:
193                        raise ValueError("Invalid additional column: %s" % column)
194            columns = (
195                self.columns_label
196                + additional_columns
197                + self.get_all_used_atoms_in_order(self.mass_spectrum)
198            )
199        else:
200            columns = self.columns_label + self.get_all_used_atoms_in_order(
201                self.mass_spectrum
202            )
203        dict_data_list = self.get_list_dict_data(
204            self.mass_spectrum, additional_columns=additional_columns
205        )
206        df = DataFrame(dict_data_list, columns=columns)
207        df.name = self.output_file
208        return df
209
210    def write_settings(self, output_path, mass_spectrum):
211        """Writes the settings of the mass spectrum to a JSON file.
212
213        Parameters
214        ----------
215        output_path : str
216            The output file path.
217        mass_spectrum : MassSpectrum
218            The mass spectrum to export.
219        """
220
221        import json
222
223        dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum)
224
225        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum)
226        dict_setting["analyzer"] = mass_spectrum.analyzer
227        dict_setting["instrument_label"] = mass_spectrum.instrument_label
228        dict_setting["sample_name"] = mass_spectrum.sample_name
229
230        with open(
231            output_path.with_suffix(".json"),
232            "w",
233            encoding="utf8",
234        ) as outfile:
235            output = json.dumps(
236                dict_setting, sort_keys=True, indent=4, separators=(",", ": ")
237            )
238            outfile.write(output)
239
240    def to_pandas(self, write_metadata=True):
241        """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
242
243        Parameters
244        ----------
245        write_metadata : bool, optional
246            Whether to write the metadata to a JSON file. Defaults to True.
247        """
248
249        columns = self.columns_label + self.get_all_used_atoms_in_order(
250            self.mass_spectrum
251        )
252
253        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
254
255        df = DataFrame(dict_data_list, columns=columns)
256
257        df.to_pickle(self.output_file.with_suffix(".pkl"))
258
259        if write_metadata:
260            self.write_settings(self.output_file, self.mass_spectrum)
261
262    def to_excel(self, write_metadata=True):
263        """Exports the mass spectrum data to an Excel file.
264
265        Parameters
266        ----------
267        write_metadata : bool, optional
268            Whether to write the metadata to a JSON file. Defaults to True.
269        """
270
271        columns = self.columns_label + self.get_all_used_atoms_in_order(
272            self.mass_spectrum
273        )
274
275        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
276
277        df = DataFrame(dict_data_list, columns=columns)
278
279        df.to_excel(self.output_file.with_suffix(".xlsx"))
280
281        if write_metadata:
282            self.write_settings(self.output_file, self.mass_spectrum)
283
284    def to_csv(self, write_metadata=True):
285        """Exports the mass spectrum data to a CSV file.
286
287        Parameters
288        ----------
289        write_metadata : bool, optional
290            Whether to write the metadata to a JSON file. Defaults to True.
291        """
292
293        columns = self.columns_label + self.get_all_used_atoms_in_order(
294            self.mass_spectrum
295        )
296
297        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
298
299        import csv
300
301        try:
302            with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile:
303                writer = csv.DictWriter(csvfile, fieldnames=columns)
304                writer.writeheader()
305                for data in dict_data_list:
306                    writer.writerow(data)
307            if write_metadata:
308                self.write_settings(self.output_file, self.mass_spectrum)
309
310        except IOError as ioerror:
311            print(ioerror)
312
313    def to_json(self):
314        """Exports the mass spectrum data to a JSON string."""
315
316        columns = self.columns_label + self.get_all_used_atoms_in_order(
317            self.mass_spectrum
318        )
319
320        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
321
322        df = DataFrame(dict_data_list, columns=columns)
323
324        # for key, values in dict_data.items():
325        #    if not values: dict_data[key] = NaN
326
327        # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': '))
328        return df.to_json(orient="records")
329
330    def add_mass_spectrum_to_hdf5(
331        self,
332        hdf_handle,
333        mass_spectrum,
334        group_key,
335        mass_spectra_group=None,
336        export_raw=True,
337    ):
338        """Adds the mass spectrum data to an HDF5 file.
339
340        Parameters
341        ----------
342        hdf_handle : h5py.File
343            The HDF5 file handle.
344        mass_spectrum : MassSpectrum
345            The mass spectrum to add to the HDF5 file.
346        group_key : str
347            The group key (where to add the mass spectrum data within the HDF5 file).
348        mass_spectra_group : h5py.Group, optional
349            The mass spectra group. Defaults to None (no group, mass spectrum is added to the root).
350        export_raw : bool, optional
351            Whether to export the raw data. Defaults to True.
352            If False, only the processed data (peaks) is exported (essentially centroided data).
353        """
354        if mass_spectra_group is None:
355            # Check if the file has the necessary attributes and add them if not
356            # This assumes that if there is a mass_spectra_group, these attributes were already added to the file
357            if not hdf_handle.attrs.get("date_utc"):
358                timenow = str(
359                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
360                )
361                hdf_handle.attrs["date_utc"] = timenow
362                hdf_handle.attrs["file_name"] = mass_spectrum.filename.name
363                hdf_handle.attrs["data_structure"] = "mass_spectrum"
364                hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer
365                hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label
366                hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name
367
368        list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True)
369
370        dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum)
371
372        setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum)
373
374        columns_labels = json.dumps(
375            self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum),
376            sort_keys=False,
377            indent=4,
378            separators=(",", ": "),
379        )
380
381        group_key = group_key
382
383        if mass_spectra_group is not None:
384            hdf_handle = mass_spectra_group
385
386        if group_key not in hdf_handle.keys():
387            scan_group = hdf_handle.create_group(group_key)
388
389            # If there is raw data (from profile data) save it
390            if not mass_spectrum.is_centroid and export_raw:
391                mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32)
392
393                mz_abun_array[0] = mass_spectrum.abundance_profile
394                mz_abun_array[1] = mass_spectrum.mz_exp_profile
395
396                raw_ms_dataset = scan_group.create_dataset(
397                    "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
398                )
399                
400                # Add metadata to the raw_ms dataset when it exists
401                raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
402
403                if isinstance(mass_spectrum, MassSpecfromFreq):
404                    raw_ms_dataset.attrs["TransientSetting"] = json.dumps(
405                        setting_dicts.get("TransientSetting"),
406                        sort_keys=False,
407                        indent=4,
408                        separators=(",", ": "),
409                    )
410            else:
411                # For centroided data, store metadata directly on the scan group
412                scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
413                
414                if isinstance(mass_spectrum, MassSpecfromFreq):
415                    scan_group.attrs["TransientSetting"] = json.dumps(
416                        setting_dicts.get("TransientSetting"),
417                        sort_keys=False,
418                        indent=4,
419                        separators=(",", ": "),
420                    )
421
422        else:
423            scan_group = hdf_handle.get(group_key)
424
425        # if there is not processed data len = 0, otherwise len() will return next index
426        index_processed_data = str(len(scan_group.keys()))
427
428        timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
429
430        # Convert list_results to numpy array and optimize data types
431        array_results = np.array(list_results)
432        
433        # Apply data type optimization for numeric columns
434        if array_results.dtype == np.float64:
435            array_results = array_results.astype(np.float32)
436        elif array_results.dtype == np.int64:
437            array_results = array_results.astype(np.int32)
438        
439        processed_dset = scan_group.create_dataset(
440            index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
441        )
442
443        processed_dset.attrs["date_utc"] = timenow
444
445        processed_dset.attrs["ColumnsLabels"] = columns_labels
446
447        processed_dset.attrs["MoleculaSearchSetting"] = json.dumps(
448            setting_dicts.get("MoleculaSearch"),
449            sort_keys=False,
450            indent=4,
451            separators=(",", ": "),
452        )
453
454        processed_dset.attrs["MassSpecPeakSetting"] = json.dumps(
455            setting_dicts.get("MassSpecPeak"),
456            sort_keys=False,
457            indent=4,
458            separators=(",", ": "),
459        )
460
461        processed_dset.attrs["MassSpectrumSetting"] = json.dumps(
462            setting_dicts.get("MassSpectrum"),
463            sort_keys=False,
464            indent=4,
465            separators=(",", ": "),
466        )
467
468    def to_hdf(self):
469        """Exports the mass spectrum data to an HDF5 file."""
470
471        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
472            self.add_mass_spectrum_to_hdf5(
473                hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number)
474            )
475
476    def parameters_to_toml(self):
477        """Converts the mass spectrum parameters to a TOML string.
478
479        Returns
480        -------
481        str
482            The TOML string of the mass spectrum parameters.
483        """
484
485        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
486
487        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
488        dict_setting["analyzer"] = self.mass_spectrum.analyzer
489        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
490        dict_setting["sample_name"] = self.mass_spectrum.sample_name
491
492        output = toml.dumps(dict_setting)
493
494        return output
495
496    def parameters_to_json(self):
497        """Converts the mass spectrum parameters to a JSON string.
498
499        Returns
500        -------
501        str
502            The JSON string of the mass spectrum parameters.
503        """
504
505        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
506
507        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
508        dict_setting["analyzer"] = self.mass_spectrum.analyzer
509        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
510        dict_setting["sample_name"] = self.mass_spectrum.sample_name
511
512        output = json.dumps(dict_setting)
513
514        return output
515
516    def get_mass_spec_attrs(self, mass_spectrum):
517        """Returns the mass spectrum attributes as a dictionary.
518
519        Parameters
520        ----------
521        mass_spectrum : MassSpectrum
522            The mass spectrum to export.
523
524        Returns
525        -------
526        dict
527            The mass spectrum attributes.
528        """
529
530        dict_ms_attrs = {}
531        dict_ms_attrs["polarity"] = mass_spectrum.polarity
532        dict_ms_attrs["rt"] = mass_spectrum.retention_time
533        dict_ms_attrs["tic"] = mass_spectrum.tic
534        dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan
535        dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt
536        dict_ms_attrs["Aterm"] = mass_spectrum.Aterm
537        dict_ms_attrs["Bterm"] = mass_spectrum.Bterm
538        dict_ms_attrs["Cterm"] = mass_spectrum.Cterm
539        dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise
540        dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std
541
542        return dict_ms_attrs
543
544    def get_all_used_atoms_in_order(self, mass_spectrum):
545        """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
546
547        Parameters
548        ----------
549        mass_spectrum : MassSpectrum
550            The mass spectrum to export.
551
552        Returns
553        -------
554        list
555            The list of assigned atoms in the order specified by Atoms.atoms_order list.
556        """
557
558        atoms_in_order = Atoms.atoms_order
559        all_used_atoms = set()
560        if mass_spectrum:
561            for ms_peak in mass_spectrum:
562                if ms_peak:
563                    for m_formula in ms_peak:
564                        for atom in m_formula.atoms:
565                            all_used_atoms.add(atom)
566
567        def sort_method(atom):
568            return [atoms_in_order.index(atom)]
569
570        return sorted(all_used_atoms, key=sort_method)
571
572    def list_dict_to_list(self, mass_spectrum, is_hdf5=False):
573        """Returns the mass spectrum data as a list of dictionaries.
574
575        Parameters
576        ----------
577        mass_spectrum : MassSpectrum
578            The mass spectrum to export.
579        is_hdf5 : bool, optional
580            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
581
582        Returns
583        -------
584        list
585            The mass spectrum data as a list of dictionaries.
586        """
587
588        column_labels = self.columns_label + self.get_all_used_atoms_in_order(
589            mass_spectrum
590        )
591
592        dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5)
593
594        all_lines = []
595        for dict_res in dict_list:
596            result_line = [NaN] * len(column_labels)
597
598            for label, value in dict_res.items():
599                label_index = column_labels.index(label)
600                result_line[label_index] = value
601
602            all_lines.append(result_line)
603
604        return all_lines
605
606    def get_list_dict_data(
607        self,
608        mass_spectrum,
609        include_no_match=True,
610        include_isotopologues=True,
611        isotopologue_inline=True,
612        no_match_inline=False,
613        is_hdf5=False,
614        additional_columns=None,
615    ):
616        """Returns the mass spectrum data as a list of dictionaries.
617
618        Parameters
619        ----------
620        mass_spectrum : MassSpectrum
621            The mass spectrum to export.
622        include_no_match : bool, optional
623            Whether to include unassigned (no match) data. Defaults to True.
624        include_isotopologues : bool, optional
625            Whether to include isotopologues. Defaults to True.
626        isotopologue_inline : bool, optional
627            Whether to include isotopologues inline. Defaults to True.
628        no_match_inline : bool, optional
629            Whether to include unassigned (no match) data inline. Defaults to False.
630        is_hdf5 : bool, optional
631            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
632
633        Returns
634        -------
635        list
636            The mass spectrum data as a list of dictionaries.
637        """
638
639        dict_data_list = []
640
641        if is_hdf5:
642            encode = ".encode('utf-8')"
643        else:
644            encode = ""
645
646        def add_no_match_dict_data(index, ms_peak):
647            """
648            Export dictionary of mspeak info for unassigned (no match) data
649            """
650            dict_result = {
651                "Index": index,
652                "m/z": ms_peak._mz_exp,
653                "Calibrated m/z": ms_peak.mz_exp,
654                "Peak Height": ms_peak.abundance,
655                "Peak Area": ms_peak.area,
656                "Resolving Power": ms_peak.resolving_power,
657                "S/N": ms_peak.signal_to_noise,
658                "Ion Charge": ms_peak.ion_charge,
659                "Heteroatom Class": eval("Labels.unassigned{}".format(encode)),
660            }
661
662            dict_data_list.append(dict_result)
663
664        def add_match_dict_data(index, ms_peak, mformula, additional_columns=None):
665            """
666            Export dictionary of mspeak info for assigned (match) data
667            """
668            formula_dict = mformula.to_dict()
669
670            dict_result = {
671                "Index": index,
672                "m/z": ms_peak._mz_exp,
673                "Calibrated m/z": ms_peak.mz_exp,
674                "Calculated m/z": mformula.mz_calc,
675                "Peak Height": ms_peak.abundance,
676                "Peak Area": ms_peak.area,
677                "Resolving Power": ms_peak.resolving_power,
678                "S/N": ms_peak.signal_to_noise,
679                "Ion Charge": ms_peak.ion_charge,
680                "m/z Error (ppm)": mformula.mz_error,
681                "Confidence Score": mformula.confidence_score,
682                "Isotopologue Similarity": mformula.isotopologue_similarity,
683                "m/z Error Score": mformula.average_mz_error_score,
684                "DBE": mformula.dbe,
685                "Heteroatom Class": eval("mformula.class_label{}".format(encode)),
686                "H/C": mformula.H_C,
687                "O/C": mformula.O_C,
688                "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)),
689                "Is Isotopologue": int(mformula.is_isotopologue),
690                "Molecular Formula": eval("mformula.string{}".format(encode)),
691            }
692            if additional_columns is not None:
693                possible_dict = {
694                    "Aromaticity Index": mformula.A_I,
695                    "NOSC": mformula.nosc,
696                    "Aromaticity Index (modified)": mformula.A_I_mod,
697                }
698                for column in additional_columns:
699                    dict_result[column] = possible_dict.get(column)
700
701            if mformula.adduct_atom:
702                dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode))
703
704            if mformula.is_isotopologue:
705                dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic
706
707            if self.atoms_order_list is None:
708                atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum)
709            else:
710                atoms_order_list = self.atoms_order_list
711
712            for atom in atoms_order_list:
713                if atom in formula_dict.keys():
714                    dict_result[atom] = formula_dict.get(atom)
715
716            dict_data_list.append(dict_result)
717
718        score_methods = mass_spectrum.molecular_search_settings.score_methods
719        selected_score_method = (
720            mass_spectrum.molecular_search_settings.output_score_method
721        )
722
723        if selected_score_method in score_methods:
724            # temp set score method as the one chosen in the output
725            current_method = mass_spectrum.molecular_search_settings.score_method
726            mass_spectrum.molecular_search_settings.score_method = selected_score_method
727
728            for index, ms_peak in enumerate(mass_spectrum):
729                # print(ms_peak.mz_exp)
730
731                if ms_peak:
732                    m_formula = ms_peak.best_molecular_formula_candidate
733
734                    if m_formula:
735                        if not m_formula.is_isotopologue:
736                            add_match_dict_data(
737                                index,
738                                ms_peak,
739                                m_formula,
740                                additional_columns=additional_columns,
741                            )
742
743                            for (
744                                iso_mspeak_index,
745                                iso_mf_formula,
746                            ) in m_formula.mspeak_mf_isotopologues_indexes:
747                                iso_ms_peak = mass_spectrum[iso_mspeak_index]
748                                add_match_dict_data(
749                                    iso_mspeak_index,
750                                    iso_ms_peak,
751                                    iso_mf_formula,
752                                    additional_columns=additional_columns,
753                                )
754                else:
755                    if include_no_match and no_match_inline:
756                        add_no_match_dict_data(index, ms_peak)
757
758            if include_no_match and not no_match_inline:
759                for index, ms_peak in enumerate(mass_spectrum):
760                    if not ms_peak:
761                        add_no_match_dict_data(index, ms_peak)
762            # reset score method as the one chosen in the output
763            mass_spectrum.molecular_search_settings.score_method = current_method
764
765        else:
766            for index, ms_peak in enumerate(mass_spectrum):
767                # check if there is a molecular formula candidate for the msPeak
768
769                if ms_peak:
770                    # m_formula = ms_peak.molecular_formula_lowest_error
771                    for m_formula in ms_peak:
772                        if mass_spectrum.molecular_search_settings.output_min_score > 0:
773                            if (
774                                m_formula.confidence_score
775                                >= mass_spectrum.molecular_search_settings.output_min_score
776                            ):
777                                if m_formula.is_isotopologue:  # isotopologues inline
778                                    if include_isotopologues and isotopologue_inline:
779                                        add_match_dict_data(
780                                            index,
781                                            ms_peak,
782                                            m_formula,
783                                            additional_columns=additional_columns,
784                                        )
785                                else:
786                                    add_match_dict_data(
787                                        index,
788                                        ms_peak,
789                                        m_formula,
790                                        additional_columns=additional_columns,
791                                    )  # add monoisotopic peak
792
793                            # cutoff because of low score
794                            else:
795                                add_no_match_dict_data(index, ms_peak)
796
797                        else:
798                            if m_formula.is_isotopologue:  # isotopologues inline
799                                if include_isotopologues and isotopologue_inline:
800                                    add_match_dict_data(
801                                        index,
802                                        ms_peak,
803                                        m_formula,
804                                        additional_columns=additional_columns,
805                                    )
806                            else:
807                                add_match_dict_data(
808                                    index,
809                                    ms_peak,
810                                    m_formula,
811                                    additional_columns=additional_columns,
812                                )  # add monoisotopic peak
813                else:
814                    # include not_match
815                    if include_no_match and no_match_inline:
816                        add_no_match_dict_data(index, ms_peak)
817
818            if include_isotopologues and not isotopologue_inline:
819                for index, ms_peak in enumerate(mass_spectrum):
820                    for m_formula in ms_peak:
821                        if m_formula.is_isotopologue:
822                            if (
823                                m_formula.confidence_score
824                                >= mass_spectrum.molecular_search_settings.output_min_score
825                            ):
826                                add_match_dict_data(
827                                    index,
828                                    ms_peak,
829                                    m_formula,
830                                    additional_columns=additional_columns,
831                                )
832
833            if include_no_match and not no_match_inline:
834                for index, ms_peak in enumerate(mass_spectrum):
835                    if not ms_peak:
836                        add_no_match_dict_data(index, ms_peak)
837
838        # remove duplicated add_match data possibly introduced on the output_score_filter step
839        res = []
840        [res.append(x) for x in dict_data_list if x not in res]
841
842        return res

A class for exporting high-resolution mass spectra.

Parameters
  • out_file_path (str): The output file path.
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
  • output_type (str, optional): The type of output file. Defaults to 'excel'. Can be 'excel', 'csv', 'pandas' or 'hdf5'.
Attributes
  • output_file (Path): The output file path.
  • output_type (str): The type of output file.
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
  • atoms_order_list (list): The list of assigned atoms in the order specified by Atoms.atoms_order list.
  • columns_label (list): The column labels in order.
Methods
  • save(). Save the mass spectrum data to the output file.
  • run(). Run the export process.
  • get_pandas_df(). Returns the mass spectrum data as a pandas DataFrame.
  • write_settings(output_path, mass_spectrum). Writes the settings of the mass spectrum to a JSON file.
  • to_pandas(write_metadata=True). Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
  • to_excel(write_metadata=True). Exports the mass spectrum data to an Excel file.
  • to_csv(write_metadata=True). Exports the mass spectrum data to a CSV file.
  • to_json(). Exports the mass spectrum data to a JSON string.
  • to_hdf(). Exports the mass spectrum data to an HDF5 file.
  • parameters_to_toml(). Converts the mass spectrum parameters to a TOML string.
  • parameters_to_json(). Converts the mass spectrum parameters to a JSON string.
  • get_mass_spec_attrs(mass_spectrum). Returns the mass spectrum attributes as a dictionary.
  • get_all_used_atoms_in_order(mass_spectrum). Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
  • list_dict_to_list(mass_spectrum, is_hdf5=False). Returns the mass spectrum data as a list of dictionaries.
  • get_list_dict_data(mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False). Returns the mass spectrum data as a list of dictionaries.
HighResMassSpecExport(out_file_path, mass_spectrum, output_type='excel')
81    def __init__(self, out_file_path, mass_spectrum, output_type="excel"):
82        Thread.__init__(self)
83
84        self.output_file = Path(out_file_path)
85
86        # 'excel', 'csv' or 'pandas'
87        self.output_type = output_type
88
89        self.mass_spectrum = mass_spectrum
90
91        # collect all assigned atoms and order them accordingly to the Atoms.atoms_order list
92        self.atoms_order_list = self.get_all_used_atoms_in_order(self.mass_spectrum)
93
94        self._init_columns()

This constructor should always be called with keyword arguments. Arguments are:

group should be None; reserved for future extension when a ThreadGroup class is implemented.

target is the callable object to be invoked by the run() method. Defaults to None, meaning nothing is called.

name is the thread name. By default, a unique name is constructed of the form "Thread-N" where N is a small decimal number.

args is the argument tuple for the target invocation. Defaults to ().

kwargs is a dictionary of keyword arguments for the target invocation. Defaults to {}.

If a subclass overrides the constructor, it must make sure to invoke the base class constructor (Thread.__init__()) before doing anything else to the thread.

output_file
output_type

Returns the output type of the mass spectrum.

mass_spectrum
atoms_order_list
def save(self):
140    def save(self):
141        """Save the mass spectrum data to the output file.
142
143        Raises
144        ------
145        ValueError
146            If the output type is not supported.
147        """
148
149        if self.output_type == "excel":
150            self.to_excel()
151        elif self.output_type == "csv":
152            self.to_csv()
153        elif self.output_type == "pandas":
154            self.to_pandas()
155        elif self.output_type == "hdf5":
156            self.to_hdf()
157        else:
158            raise ValueError(
159                "Unkown output type: %s; it can be 'excel', 'csv' or 'pandas'"
160                % self.output_type
161            )

Save the mass spectrum data to the output file.

Raises
  • ValueError: If the output type is not supported.
def run(self):
163    def run(self):
164        """Run the export process.
165
166        This method is called when the thread starts.
167        It calls the save method to perform the export."""
168        self.save()

Run the export process.

This method is called when the thread starts. It calls the save method to perform the export.

def get_pandas_df(self, additional_columns=None):
170    def get_pandas_df(self, additional_columns=None):
171        """Returns the mass spectrum data as a pandas DataFrame.
172
173        Parameters
174        ----------
175        additional_columns : list, optional
176            Additional columns to include in the DataFrame. Defaults to None.
177            Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'.
178
179        Returns
180        -------
181        DataFrame
182            The mass spectrum data as a pandas DataFrame.
183        """
184        if additional_columns is not None:
185            possible_additional_columns = [
186                "Aromaticity Index",
187                "NOSC",
188                "Aromaticity Index (modified)",
189            ]
190            if additional_columns:
191                for column in additional_columns:
192                    if column not in possible_additional_columns:
193                        raise ValueError("Invalid additional column: %s" % column)
194            columns = (
195                self.columns_label
196                + additional_columns
197                + self.get_all_used_atoms_in_order(self.mass_spectrum)
198            )
199        else:
200            columns = self.columns_label + self.get_all_used_atoms_in_order(
201                self.mass_spectrum
202            )
203        dict_data_list = self.get_list_dict_data(
204            self.mass_spectrum, additional_columns=additional_columns
205        )
206        df = DataFrame(dict_data_list, columns=columns)
207        df.name = self.output_file
208        return df

Returns the mass spectrum data as a pandas DataFrame.

Parameters
  • additional_columns (list, optional): Additional columns to include in the DataFrame. Defaults to None. Suitable additional columns are: 'Aromaticity Index', 'NOSC', 'Aromaticity Index (modified)'.
Returns
  • DataFrame: The mass spectrum data as a pandas DataFrame.
def write_settings(self, output_path, mass_spectrum):
210    def write_settings(self, output_path, mass_spectrum):
211        """Writes the settings of the mass spectrum to a JSON file.
212
213        Parameters
214        ----------
215        output_path : str
216            The output file path.
217        mass_spectrum : MassSpectrum
218            The mass spectrum to export.
219        """
220
221        import json
222
223        dict_setting = parameter_to_dict.get_dict_data_ms(mass_spectrum)
224
225        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(mass_spectrum)
226        dict_setting["analyzer"] = mass_spectrum.analyzer
227        dict_setting["instrument_label"] = mass_spectrum.instrument_label
228        dict_setting["sample_name"] = mass_spectrum.sample_name
229
230        with open(
231            output_path.with_suffix(".json"),
232            "w",
233            encoding="utf8",
234        ) as outfile:
235            output = json.dumps(
236                dict_setting, sort_keys=True, indent=4, separators=(",", ": ")
237            )
238            outfile.write(output)

Writes the settings of the mass spectrum to a JSON file.

Parameters
  • output_path (str): The output file path.
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
def to_pandas(self, write_metadata=True):
240    def to_pandas(self, write_metadata=True):
241        """Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.
242
243        Parameters
244        ----------
245        write_metadata : bool, optional
246            Whether to write the metadata to a JSON file. Defaults to True.
247        """
248
249        columns = self.columns_label + self.get_all_used_atoms_in_order(
250            self.mass_spectrum
251        )
252
253        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
254
255        df = DataFrame(dict_data_list, columns=columns)
256
257        df.to_pickle(self.output_file.with_suffix(".pkl"))
258
259        if write_metadata:
260            self.write_settings(self.output_file, self.mass_spectrum)

Exports the mass spectrum data to a pandas DataFrame and saves it as a pickle file.

Parameters
  • write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
def to_excel(self, write_metadata=True):
262    def to_excel(self, write_metadata=True):
263        """Exports the mass spectrum data to an Excel file.
264
265        Parameters
266        ----------
267        write_metadata : bool, optional
268            Whether to write the metadata to a JSON file. Defaults to True.
269        """
270
271        columns = self.columns_label + self.get_all_used_atoms_in_order(
272            self.mass_spectrum
273        )
274
275        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
276
277        df = DataFrame(dict_data_list, columns=columns)
278
279        df.to_excel(self.output_file.with_suffix(".xlsx"))
280
281        if write_metadata:
282            self.write_settings(self.output_file, self.mass_spectrum)

Exports the mass spectrum data to an Excel file.

Parameters
  • write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
def to_csv(self, write_metadata=True):
284    def to_csv(self, write_metadata=True):
285        """Exports the mass spectrum data to a CSV file.
286
287        Parameters
288        ----------
289        write_metadata : bool, optional
290            Whether to write the metadata to a JSON file. Defaults to True.
291        """
292
293        columns = self.columns_label + self.get_all_used_atoms_in_order(
294            self.mass_spectrum
295        )
296
297        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
298
299        import csv
300
301        try:
302            with open(self.output_file.with_suffix(".csv"), "w", newline="") as csvfile:
303                writer = csv.DictWriter(csvfile, fieldnames=columns)
304                writer.writeheader()
305                for data in dict_data_list:
306                    writer.writerow(data)
307            if write_metadata:
308                self.write_settings(self.output_file, self.mass_spectrum)
309
310        except IOError as ioerror:
311            print(ioerror)

Exports the mass spectrum data to a CSV file.

Parameters
  • write_metadata (bool, optional): Whether to write the metadata to a JSON file. Defaults to True.
def to_json(self):
313    def to_json(self):
314        """Exports the mass spectrum data to a JSON string."""
315
316        columns = self.columns_label + self.get_all_used_atoms_in_order(
317            self.mass_spectrum
318        )
319
320        dict_data_list = self.get_list_dict_data(self.mass_spectrum)
321
322        df = DataFrame(dict_data_list, columns=columns)
323
324        # for key, values in dict_data.items():
325        #    if not values: dict_data[key] = NaN
326
327        # output = json.dumps(dict_data, sort_keys=True, indent=4, separators=(',', ': '))
328        return df.to_json(orient="records")

Exports the mass spectrum data to a JSON string.

def add_mass_spectrum_to_hdf5( self, hdf_handle, mass_spectrum, group_key, mass_spectra_group=None, export_raw=True):
330    def add_mass_spectrum_to_hdf5(
331        self,
332        hdf_handle,
333        mass_spectrum,
334        group_key,
335        mass_spectra_group=None,
336        export_raw=True,
337    ):
338        """Adds the mass spectrum data to an HDF5 file.
339
340        Parameters
341        ----------
342        hdf_handle : h5py.File
343            The HDF5 file handle.
344        mass_spectrum : MassSpectrum
345            The mass spectrum to add to the HDF5 file.
346        group_key : str
347            The group key (where to add the mass spectrum data within the HDF5 file).
348        mass_spectra_group : h5py.Group, optional
349            The mass spectra group. Defaults to None (no group, mass spectrum is added to the root).
350        export_raw : bool, optional
351            Whether to export the raw data. Defaults to True.
352            If False, only the processed data (peaks) is exported (essentially centroided data).
353        """
354        if mass_spectra_group is None:
355            # Check if the file has the necessary attributes and add them if not
356            # This assumes that if there is a mass_spectra_group, these attributes were already added to the file
357            if not hdf_handle.attrs.get("date_utc"):
358                timenow = str(
359                    datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z")
360                )
361                hdf_handle.attrs["date_utc"] = timenow
362                hdf_handle.attrs["file_name"] = mass_spectrum.filename.name
363                hdf_handle.attrs["data_structure"] = "mass_spectrum"
364                hdf_handle.attrs["analyzer"] = mass_spectrum.analyzer
365                hdf_handle.attrs["instrument_label"] = mass_spectrum.instrument_label
366                hdf_handle.attrs["sample_name"] = mass_spectrum.sample_name
367
368        list_results = self.list_dict_to_list(mass_spectrum, is_hdf5=True)
369
370        dict_ms_attrs = self.get_mass_spec_attrs(mass_spectrum)
371
372        setting_dicts = parameter_to_dict.get_dict_data_ms(mass_spectrum)
373
374        columns_labels = json.dumps(
375            self.columns_label + self.get_all_used_atoms_in_order(mass_spectrum),
376            sort_keys=False,
377            indent=4,
378            separators=(",", ": "),
379        )
380
381        group_key = group_key
382
383        if mass_spectra_group is not None:
384            hdf_handle = mass_spectra_group
385
386        if group_key not in hdf_handle.keys():
387            scan_group = hdf_handle.create_group(group_key)
388
389            # If there is raw data (from profile data) save it
390            if not mass_spectrum.is_centroid and export_raw:
391                mz_abun_array = empty(shape=(2, len(mass_spectrum.abundance_profile)), dtype=np.float32)
392
393                mz_abun_array[0] = mass_spectrum.abundance_profile
394                mz_abun_array[1] = mass_spectrum.mz_exp_profile
395
396                raw_ms_dataset = scan_group.create_dataset(
397                    "raw_ms", data=mz_abun_array, dtype="f4", compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
398                )
399                
400                # Add metadata to the raw_ms dataset when it exists
401                raw_ms_dataset.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
402
403                if isinstance(mass_spectrum, MassSpecfromFreq):
404                    raw_ms_dataset.attrs["TransientSetting"] = json.dumps(
405                        setting_dicts.get("TransientSetting"),
406                        sort_keys=False,
407                        indent=4,
408                        separators=(",", ": "),
409                    )
410            else:
411                # For centroided data, store metadata directly on the scan group
412                scan_group.attrs["MassSpecAttrs"] = json.dumps(dict_ms_attrs)
413                
414                if isinstance(mass_spectrum, MassSpecfromFreq):
415                    scan_group.attrs["TransientSetting"] = json.dumps(
416                        setting_dicts.get("TransientSetting"),
417                        sort_keys=False,
418                        indent=4,
419                        separators=(",", ": "),
420                    )
421
422        else:
423            scan_group = hdf_handle.get(group_key)
424
425        # if there is not processed data len = 0, otherwise len() will return next index
426        index_processed_data = str(len(scan_group.keys()))
427
428        timenow = str(datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S %Z"))
429
430        # Convert list_results to numpy array and optimize data types
431        array_results = np.array(list_results)
432        
433        # Apply data type optimization for numeric columns
434        if array_results.dtype == np.float64:
435            array_results = array_results.astype(np.float32)
436        elif array_results.dtype == np.int64:
437            array_results = array_results.astype(np.int32)
438        
439        processed_dset = scan_group.create_dataset(
440            index_processed_data, data=array_results, compression="gzip", compression_opts=9, chunks=True, shuffle=True, fletcher32=True
441        )
442
443        processed_dset.attrs["date_utc"] = timenow
444
445        processed_dset.attrs["ColumnsLabels"] = columns_labels
446
447        processed_dset.attrs["MoleculaSearchSetting"] = json.dumps(
448            setting_dicts.get("MoleculaSearch"),
449            sort_keys=False,
450            indent=4,
451            separators=(",", ": "),
452        )
453
454        processed_dset.attrs["MassSpecPeakSetting"] = json.dumps(
455            setting_dicts.get("MassSpecPeak"),
456            sort_keys=False,
457            indent=4,
458            separators=(",", ": "),
459        )
460
461        processed_dset.attrs["MassSpectrumSetting"] = json.dumps(
462            setting_dicts.get("MassSpectrum"),
463            sort_keys=False,
464            indent=4,
465            separators=(",", ": "),
466        )

Adds the mass spectrum data to an HDF5 file.

Parameters
  • hdf_handle (h5py.File): The HDF5 file handle.
  • mass_spectrum (MassSpectrum): The mass spectrum to add to the HDF5 file.
  • group_key (str): The group key (where to add the mass spectrum data within the HDF5 file).
  • mass_spectra_group (h5py.Group, optional): The mass spectra group. Defaults to None (no group, mass spectrum is added to the root).
  • export_raw (bool, optional): Whether to export the raw data. Defaults to True. If False, only the processed data (peaks) is exported (essentially centroided data).
def to_hdf(self):
468    def to_hdf(self):
469        """Exports the mass spectrum data to an HDF5 file."""
470
471        with h5py.File(self.output_file.with_suffix(".hdf5"), "a") as hdf_handle:
472            self.add_mass_spectrum_to_hdf5(
473                hdf_handle, self.mass_spectrum, str(self.mass_spectrum.scan_number)
474            )

Exports the mass spectrum data to an HDF5 file.

def parameters_to_toml(self):
476    def parameters_to_toml(self):
477        """Converts the mass spectrum parameters to a TOML string.
478
479        Returns
480        -------
481        str
482            The TOML string of the mass spectrum parameters.
483        """
484
485        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
486
487        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
488        dict_setting["analyzer"] = self.mass_spectrum.analyzer
489        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
490        dict_setting["sample_name"] = self.mass_spectrum.sample_name
491
492        output = toml.dumps(dict_setting)
493
494        return output

Converts the mass spectrum parameters to a TOML string.

Returns
  • str: The TOML string of the mass spectrum parameters.
def parameters_to_json(self):
496    def parameters_to_json(self):
497        """Converts the mass spectrum parameters to a JSON string.
498
499        Returns
500        -------
501        str
502            The JSON string of the mass spectrum parameters.
503        """
504
505        dict_setting = parameter_to_dict.get_dict_data_ms(self.mass_spectrum)
506
507        dict_setting["MassSpecAttrs"] = self.get_mass_spec_attrs(self.mass_spectrum)
508        dict_setting["analyzer"] = self.mass_spectrum.analyzer
509        dict_setting["instrument_label"] = self.mass_spectrum.instrument_label
510        dict_setting["sample_name"] = self.mass_spectrum.sample_name
511
512        output = json.dumps(dict_setting)
513
514        return output

Converts the mass spectrum parameters to a JSON string.

Returns
  • str: The JSON string of the mass spectrum parameters.
def get_mass_spec_attrs(self, mass_spectrum):
516    def get_mass_spec_attrs(self, mass_spectrum):
517        """Returns the mass spectrum attributes as a dictionary.
518
519        Parameters
520        ----------
521        mass_spectrum : MassSpectrum
522            The mass spectrum to export.
523
524        Returns
525        -------
526        dict
527            The mass spectrum attributes.
528        """
529
530        dict_ms_attrs = {}
531        dict_ms_attrs["polarity"] = mass_spectrum.polarity
532        dict_ms_attrs["rt"] = mass_spectrum.retention_time
533        dict_ms_attrs["tic"] = mass_spectrum.tic
534        dict_ms_attrs["mobility_scan"] = mass_spectrum.mobility_scan
535        dict_ms_attrs["mobility_rt"] = mass_spectrum.mobility_rt
536        dict_ms_attrs["Aterm"] = mass_spectrum.Aterm
537        dict_ms_attrs["Bterm"] = mass_spectrum.Bterm
538        dict_ms_attrs["Cterm"] = mass_spectrum.Cterm
539        dict_ms_attrs["baseline_noise"] = mass_spectrum.baseline_noise
540        dict_ms_attrs["baseline_noise_std"] = mass_spectrum.baseline_noise_std
541
542        return dict_ms_attrs

Returns the mass spectrum attributes as a dictionary.

Parameters
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
Returns
  • dict: The mass spectrum attributes.
def get_all_used_atoms_in_order(self, mass_spectrum):
544    def get_all_used_atoms_in_order(self, mass_spectrum):
545        """Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.
546
547        Parameters
548        ----------
549        mass_spectrum : MassSpectrum
550            The mass spectrum to export.
551
552        Returns
553        -------
554        list
555            The list of assigned atoms in the order specified by Atoms.atoms_order list.
556        """
557
558        atoms_in_order = Atoms.atoms_order
559        all_used_atoms = set()
560        if mass_spectrum:
561            for ms_peak in mass_spectrum:
562                if ms_peak:
563                    for m_formula in ms_peak:
564                        for atom in m_formula.atoms:
565                            all_used_atoms.add(atom)
566
567        def sort_method(atom):
568            return [atoms_in_order.index(atom)]
569
570        return sorted(all_used_atoms, key=sort_method)

Returns the list of assigned atoms in the order specified by Atoms.atoms_order list.

Parameters
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
Returns
  • list: The list of assigned atoms in the order specified by Atoms.atoms_order list.
def list_dict_to_list(self, mass_spectrum, is_hdf5=False):
572    def list_dict_to_list(self, mass_spectrum, is_hdf5=False):
573        """Returns the mass spectrum data as a list of dictionaries.
574
575        Parameters
576        ----------
577        mass_spectrum : MassSpectrum
578            The mass spectrum to export.
579        is_hdf5 : bool, optional
580            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
581
582        Returns
583        -------
584        list
585            The mass spectrum data as a list of dictionaries.
586        """
587
588        column_labels = self.columns_label + self.get_all_used_atoms_in_order(
589            mass_spectrum
590        )
591
592        dict_list = self.get_list_dict_data(mass_spectrum, is_hdf5=is_hdf5)
593
594        all_lines = []
595        for dict_res in dict_list:
596            result_line = [NaN] * len(column_labels)
597
598            for label, value in dict_res.items():
599                label_index = column_labels.index(label)
600                result_line[label_index] = value
601
602            all_lines.append(result_line)
603
604        return all_lines

Returns the mass spectrum data as a list of dictionaries.

Parameters
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
  • is_hdf5 (bool, optional): Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
Returns
  • list: The mass spectrum data as a list of dictionaries.
def get_list_dict_data( self, mass_spectrum, include_no_match=True, include_isotopologues=True, isotopologue_inline=True, no_match_inline=False, is_hdf5=False, additional_columns=None):
606    def get_list_dict_data(
607        self,
608        mass_spectrum,
609        include_no_match=True,
610        include_isotopologues=True,
611        isotopologue_inline=True,
612        no_match_inline=False,
613        is_hdf5=False,
614        additional_columns=None,
615    ):
616        """Returns the mass spectrum data as a list of dictionaries.
617
618        Parameters
619        ----------
620        mass_spectrum : MassSpectrum
621            The mass spectrum to export.
622        include_no_match : bool, optional
623            Whether to include unassigned (no match) data. Defaults to True.
624        include_isotopologues : bool, optional
625            Whether to include isotopologues. Defaults to True.
626        isotopologue_inline : bool, optional
627            Whether to include isotopologues inline. Defaults to True.
628        no_match_inline : bool, optional
629            Whether to include unassigned (no match) data inline. Defaults to False.
630        is_hdf5 : bool, optional
631            Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
632
633        Returns
634        -------
635        list
636            The mass spectrum data as a list of dictionaries.
637        """
638
639        dict_data_list = []
640
641        if is_hdf5:
642            encode = ".encode('utf-8')"
643        else:
644            encode = ""
645
646        def add_no_match_dict_data(index, ms_peak):
647            """
648            Export dictionary of mspeak info for unassigned (no match) data
649            """
650            dict_result = {
651                "Index": index,
652                "m/z": ms_peak._mz_exp,
653                "Calibrated m/z": ms_peak.mz_exp,
654                "Peak Height": ms_peak.abundance,
655                "Peak Area": ms_peak.area,
656                "Resolving Power": ms_peak.resolving_power,
657                "S/N": ms_peak.signal_to_noise,
658                "Ion Charge": ms_peak.ion_charge,
659                "Heteroatom Class": eval("Labels.unassigned{}".format(encode)),
660            }
661
662            dict_data_list.append(dict_result)
663
664        def add_match_dict_data(index, ms_peak, mformula, additional_columns=None):
665            """
666            Export dictionary of mspeak info for assigned (match) data
667            """
668            formula_dict = mformula.to_dict()
669
670            dict_result = {
671                "Index": index,
672                "m/z": ms_peak._mz_exp,
673                "Calibrated m/z": ms_peak.mz_exp,
674                "Calculated m/z": mformula.mz_calc,
675                "Peak Height": ms_peak.abundance,
676                "Peak Area": ms_peak.area,
677                "Resolving Power": ms_peak.resolving_power,
678                "S/N": ms_peak.signal_to_noise,
679                "Ion Charge": ms_peak.ion_charge,
680                "m/z Error (ppm)": mformula.mz_error,
681                "Confidence Score": mformula.confidence_score,
682                "Isotopologue Similarity": mformula.isotopologue_similarity,
683                "m/z Error Score": mformula.average_mz_error_score,
684                "DBE": mformula.dbe,
685                "Heteroatom Class": eval("mformula.class_label{}".format(encode)),
686                "H/C": mformula.H_C,
687                "O/C": mformula.O_C,
688                "Ion Type": eval("mformula.ion_type.lower(){}".format(encode)),
689                "Is Isotopologue": int(mformula.is_isotopologue),
690                "Molecular Formula": eval("mformula.string{}".format(encode)),
691            }
692            if additional_columns is not None:
693                possible_dict = {
694                    "Aromaticity Index": mformula.A_I,
695                    "NOSC": mformula.nosc,
696                    "Aromaticity Index (modified)": mformula.A_I_mod,
697                }
698                for column in additional_columns:
699                    dict_result[column] = possible_dict.get(column)
700
701            if mformula.adduct_atom:
702                dict_result["Adduct"] = eval("mformula.adduct_atom{}".format(encode))
703
704            if mformula.is_isotopologue:
705                dict_result["Mono Isotopic Index"] = mformula.mspeak_index_mono_isotopic
706
707            if self.atoms_order_list is None:
708                atoms_order_list = self.get_all_used_atoms_in_order(mass_spectrum)
709            else:
710                atoms_order_list = self.atoms_order_list
711
712            for atom in atoms_order_list:
713                if atom in formula_dict.keys():
714                    dict_result[atom] = formula_dict.get(atom)
715
716            dict_data_list.append(dict_result)
717
718        score_methods = mass_spectrum.molecular_search_settings.score_methods
719        selected_score_method = (
720            mass_spectrum.molecular_search_settings.output_score_method
721        )
722
723        if selected_score_method in score_methods:
724            # temp set score method as the one chosen in the output
725            current_method = mass_spectrum.molecular_search_settings.score_method
726            mass_spectrum.molecular_search_settings.score_method = selected_score_method
727
728            for index, ms_peak in enumerate(mass_spectrum):
729                # print(ms_peak.mz_exp)
730
731                if ms_peak:
732                    m_formula = ms_peak.best_molecular_formula_candidate
733
734                    if m_formula:
735                        if not m_formula.is_isotopologue:
736                            add_match_dict_data(
737                                index,
738                                ms_peak,
739                                m_formula,
740                                additional_columns=additional_columns,
741                            )
742
743                            for (
744                                iso_mspeak_index,
745                                iso_mf_formula,
746                            ) in m_formula.mspeak_mf_isotopologues_indexes:
747                                iso_ms_peak = mass_spectrum[iso_mspeak_index]
748                                add_match_dict_data(
749                                    iso_mspeak_index,
750                                    iso_ms_peak,
751                                    iso_mf_formula,
752                                    additional_columns=additional_columns,
753                                )
754                else:
755                    if include_no_match and no_match_inline:
756                        add_no_match_dict_data(index, ms_peak)
757
758            if include_no_match and not no_match_inline:
759                for index, ms_peak in enumerate(mass_spectrum):
760                    if not ms_peak:
761                        add_no_match_dict_data(index, ms_peak)
762            # reset score method as the one chosen in the output
763            mass_spectrum.molecular_search_settings.score_method = current_method
764
765        else:
766            for index, ms_peak in enumerate(mass_spectrum):
767                # check if there is a molecular formula candidate for the msPeak
768
769                if ms_peak:
770                    # m_formula = ms_peak.molecular_formula_lowest_error
771                    for m_formula in ms_peak:
772                        if mass_spectrum.molecular_search_settings.output_min_score > 0:
773                            if (
774                                m_formula.confidence_score
775                                >= mass_spectrum.molecular_search_settings.output_min_score
776                            ):
777                                if m_formula.is_isotopologue:  # isotopologues inline
778                                    if include_isotopologues and isotopologue_inline:
779                                        add_match_dict_data(
780                                            index,
781                                            ms_peak,
782                                            m_formula,
783                                            additional_columns=additional_columns,
784                                        )
785                                else:
786                                    add_match_dict_data(
787                                        index,
788                                        ms_peak,
789                                        m_formula,
790                                        additional_columns=additional_columns,
791                                    )  # add monoisotopic peak
792
793                            # cutoff because of low score
794                            else:
795                                add_no_match_dict_data(index, ms_peak)
796
797                        else:
798                            if m_formula.is_isotopologue:  # isotopologues inline
799                                if include_isotopologues and isotopologue_inline:
800                                    add_match_dict_data(
801                                        index,
802                                        ms_peak,
803                                        m_formula,
804                                        additional_columns=additional_columns,
805                                    )
806                            else:
807                                add_match_dict_data(
808                                    index,
809                                    ms_peak,
810                                    m_formula,
811                                    additional_columns=additional_columns,
812                                )  # add monoisotopic peak
813                else:
814                    # include not_match
815                    if include_no_match and no_match_inline:
816                        add_no_match_dict_data(index, ms_peak)
817
818            if include_isotopologues and not isotopologue_inline:
819                for index, ms_peak in enumerate(mass_spectrum):
820                    for m_formula in ms_peak:
821                        if m_formula.is_isotopologue:
822                            if (
823                                m_formula.confidence_score
824                                >= mass_spectrum.molecular_search_settings.output_min_score
825                            ):
826                                add_match_dict_data(
827                                    index,
828                                    ms_peak,
829                                    m_formula,
830                                    additional_columns=additional_columns,
831                                )
832
833            if include_no_match and not no_match_inline:
834                for index, ms_peak in enumerate(mass_spectrum):
835                    if not ms_peak:
836                        add_no_match_dict_data(index, ms_peak)
837
838        # remove duplicated add_match data possibly introduced on the output_score_filter step
839        res = []
840        [res.append(x) for x in dict_data_list if x not in res]
841
842        return res

Returns the mass spectrum data as a list of dictionaries.

Parameters
  • mass_spectrum (MassSpectrum): The mass spectrum to export.
  • include_no_match (bool, optional): Whether to include unassigned (no match) data. Defaults to True.
  • include_isotopologues (bool, optional): Whether to include isotopologues. Defaults to True.
  • isotopologue_inline (bool, optional): Whether to include isotopologues inline. Defaults to True.
  • no_match_inline (bool, optional): Whether to include unassigned (no match) data inline. Defaults to False.
  • is_hdf5 (bool, optional): Whether the mass spectrum is being exported to an HDF5 file. Defaults to False.
Returns
  • list: The mass spectrum data as a list of dictionaries.
Inherited Members
threading.Thread
start
join
name
ident
is_alive
daemon
isDaemon
setDaemon
getName
setName
native_id