corems.mass_spectra.factory.GC_Class

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Feb 13, 2020"
  3
  4
  5from collections.abc import Mapping
  6from pathlib import Path
  7import json
  8
  9from numpy import array
 10
 11
 12from corems.mass_spectra.calc.GC_Calc import GC_Calculations
 13from corems.mass_spectra.calc.GC_Deconvolution import MassDeconvolution
 14from corems.mass_spectra.calc import SignalProcessing as sp
 15
 16from corems.chroma_peak.factory.chroma_peak_classes import GCPeak
 17from corems.mass_spectra.output.export import LowResGCMSExport
 18from corems.encapsulation.factory.parameters import GCMSParameters
 19
 20
 21class GCMSBase(GC_Calculations, MassDeconvolution):
 22    """Base class for GC-MS data processing.
 23
 24    Parameters
 25    ----
 26    file_location : str, pathlib.Path, or s3path.S3Path
 27        Path object containing the file location.
 28    analyzer : str, optional
 29        Name of the analyzer. Defaults to 'Unknown'.
 30    instrument_label : str, optional
 31        Label of the instrument. Defaults to 'Unknown'.
 32    sample_name : str, optional
 33        Name of the sample. If not provided, it is derived from the file location.
 34
 35    Attributes
 36    ------------
 37    file_location : pathlib.Path
 38        Path object containing the file location.
 39    sample_name : str
 40        Name of the sample.
 41    analyzer : str
 42        Name of the analyzer.
 43    instrument_label : str
 44        Label of the instrument.
 45    gcpeaks : list
 46        List of GCPeak objects.
 47    ri_pairs_ref : None
 48        Reference retention index pairs.
 49    cal_file_path : None
 50        Calibration file path.
 51    _parameters : GCMSParameters
 52        GC-MS parameters.
 53    _retention_time_list : list
 54        List of retention times.
 55    _scans_number_list : list
 56        List of scan numbers.
 57    _tic_list : list
 58        List of total ion chromatogram values.
 59    _ms : dict
 60        Dictionary containing all mass spectra.
 61    _processed_tic : list
 62        List of processed total ion chromatogram values.
 63
 64    Methods
 65    -------
 66    * process_chromatogram(plot_res=False). Process the chromatogram.
 67    * plot_gc_peaks(ax=None, color='red'). Plot the GC peaks.
 68    """
 69
 70    def __init__(
 71        self,
 72        file_location,
 73        analyzer="Unknown",
 74        instrument_label="Unknown",
 75        sample_name=None,
 76    ):
 77        if isinstance(file_location, str):
 78            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 79            file_location = Path(file_location)
 80
 81        if not file_location.exists():
 82            raise FileExistsError("File does not exist: " + str(file_location))
 83
 84        self.file_location = file_location
 85
 86        if sample_name:
 87            self.sample_name = sample_name
 88        else:
 89            self.sample_name = file_location.stem
 90
 91        self.analyzer = analyzer
 92        self.instrument_label = instrument_label
 93        self._init_settings()
 94
 95        self._retention_time_list = []
 96        self._scans_number_list = []
 97        self._tic_list = []
 98
 99        # all scans
100        self._ms = {}
101
102        # after peak detection
103        self._processed_tic = []
104        self.gcpeaks = []
105
106        self.ri_pairs_ref = None
107        self.cal_file_path = None
108
109    def _init_settings(self):
110        """Initialize the settings for GC_Class.
111
112        This method initializes the settings for the GC_Class object using the GCMSParameters class.
113        """
114        self._parameters = GCMSParameters()
115
116    def __len__(self):
117        """Return the number of GC peaks in the GC_Class object."""
118        return len(self.gcpeaks)
119
120    def __getitem__(self, scan_number) -> GCPeak:
121        """Return the GCPeak with the given scan number."""
122        return self.gcpeaks[scan_number]
123
124    # def __iter__(self):
125
126    #     return iter(self.gcpeaks.values())
127
128    def process_chromatogram(self, plot_res=False):
129        """Process the chromatogram.
130
131        This method processes the chromatogram.
132
133        Parameters
134        ----------
135        plot_res : bool, optional
136            If True, plot the results. Defaults to False.
137        """
138
139        # tic = self.tic - self.baseline_detector(self.tic)
140
141        self._processed_tic = self.smooth_tic(self.tic)
142
143        for index, tic in enumerate(self._processed_tic):
144            self._ms[index]._processed_tic = tic
145
146        # self.second_derivative_threshold(self._processed_tic)
147
148        if self.chromatogram_settings.use_deconvolution:
149            self.run_deconvolution(plot_res=False)
150
151        else:
152            peaks_index = self.centroid_detector(
153                self._processed_tic, self.retention_time
154            )
155
156            for i in peaks_index:
157                apex_index = i[1]
158
159                gc_peak = GCPeak(self, self._ms[apex_index], i)
160
161                gc_peak.calc_area(self._processed_tic, 1)
162
163                self.gcpeaks.append(gc_peak)
164
165                # self.gcpeaks[self.scans_number[apex_index]] = gc_peak
166
167    def add_mass_spectrum(self, mass_spec):
168        """Add a mass spectrum to the GC-MS object.
169
170        This method adds a mass spectrum to the GC-MS object.
171
172        Parameters
173        ----------
174        mass_spec : MassSpectrum
175            Mass spectrum to be added.
176        """
177
178        self._ms[mass_spec.scan_number] = mass_spec
179
180    def set_tic_list_from_data(self):
181        """Set the total ion chromatogram list from the mass spectra data within the GC-MS data object."""
182
183        self.tic = [self._ms.get(i).tic for i in self.scans_number]
184
185        # self.set_tic_list([self._ms.get(i).get_sumed_signal_to_noise() for i in self.get_scans_number()])
186
187    def set_retention_time_from_data(self):
188        """Set the retention time list from the mass spectra data within the GC-MS data object."""
189
190        retention_time_list = []
191
192        for key_ms in sorted(self._ms.keys()):
193            retention_time_list.append(self._ms.get(key_ms).retention_time)
194
195        self.retention_time = retention_time_list
196
197        # self.set_retention_time_list(sorted(self._ms.keys()))
198
199    def set_scans_number_from_data(self):
200        """Set the scan number list from the mass spectra data within the GC-MS data object."""
201
202        self.scans_number = sorted(self._ms.keys())
203
204    @property
205    def parameters(self):
206        """GCMS Parameters"""
207        return self._parameters
208
209    @parameters.setter
210    def parameters(self, gcms_parameters_instance):
211        self._parameters = gcms_parameters_instance
212
213    # Note: maintaining `parameter` for backwards compatibility,
214    # but proper usage would reference `parameters` to conform
215    # to other classes.
216    @property
217    def parameter(self):
218        """GCMS Parameters"""
219        return self._parameters
220
221    @parameter.setter
222    def parameter(self, gcms_parameters_instance):
223        self._parameters = gcms_parameters_instance
224
225    @property
226    def molecular_search_settings(self):
227        """Molecular Search Settings"""
228        return self.parameters.molecular_search
229
230    @molecular_search_settings.setter
231    def molecular_search_settings(self, settings_class_instance):
232        self.parameters.molecular_search = settings_class_instance
233
234    @property
235    def chromatogram_settings(self):
236        """Chromatogram Settings"""
237        return self.parameters.gc_ms
238
239    @chromatogram_settings.setter
240    def chromatogram_settings(self, settings_class_instance):
241        self.parameters.gc_ms = settings_class_instance
242
243    @property
244    def scans_number(self):
245        """Scans Number"""
246        return self._scans_number_list
247
248    @property
249    def retention_time(self):
250        """Retention Time"""
251        return self._retention_time_list
252
253    @property
254    def processed_tic(self):
255        """Processed Total Ion Current"""
256        return self._processed_tic
257
258    @property
259    def tic(self):
260        """Total Ion Current"""
261        return self._tic_list
262
263    @property
264    def max_tic(self):
265        """Maximum Total Ion Current"""
266        return max([gc_peak.tic for gc_peak in self])
267
268    @property
269    def min_tic(self):
270        """Minimum Total Ion Current"""
271        return min([gc_peak.tic for gc_peak in self])
272
273    @property
274    def dynamic_range(self):
275        """Dynamic Range of the Total Ion Current"""
276        return self.max_tic / self.min_tic
277
278    @property
279    def matched_peaks(self):
280        """Matched Peaks"""
281        return [gc_peak for gc_peak in self if gc_peak]
282
283    @property
284    def sorted_gcpeaks(self):
285        """Sorted GC Peaks, by retention time"""
286        return sorted(self, key=lambda g: g.retention_time)
287
288    @property
289    def unique_metabolites(self):
290        """Unique Metabolites"""
291        metabolites = set()
292        for gc_peak in self:
293            if gc_peak:
294                for compound_obj in gc_peak:
295                    metabolites.add(compound_obj.name)
296
297        return metabolites
298
299    @property
300    def metabolites_data(self):
301        """Metabolites Data"""
302        metabolites = {}
303        for gc_peak in self:
304            if gc_peak:
305                for compound_obj in gc_peak:
306                    if compound_obj.name in metabolites.keys():
307                        current_score = metabolites[compound_obj.name][
308                            "highest_similarity_score"
309                        ]
310                        compound_score = compound_obj.spectral_similarity_score
311                        metabolites[compound_obj.name]["highest_similarity_score"] = (
312                            compound_score
313                            if compound_score > current_score
314                            else current_score
315                        )
316
317                    else:
318                        if compound_obj.metadata:
319                            metabolites[compound_obj.name] = {
320                                "name": compound_obj.name,
321                                "highest_similarity_score": compound_obj.spectral_similarity_score,
322                                "casno": compound_obj.metadata.cas,
323                                "kegg": compound_obj.metadata.kegg,
324                                "inchi": compound_obj.metadata.inchi,
325                                "inchi_key": compound_obj.metadata.inchikey,
326                                "chebi": compound_obj.metadata.chebi,
327                                "smiles": compound_obj.metadata.smiles,
328                            }
329                        else:
330                            metabolites[compound_obj.name] = {
331                                "name": compound_obj.name,
332                                "highest_similarity_score": compound_obj.spectral_similarity_score,
333                                "casno": "",
334                                "kegg": "",
335                                "inchi": "",
336                                "inchikey": "",
337                                "chebi": "",
338                                "smiles": "",
339                            }
340
341        return list(metabolites.values())
342
343    @property
344    def no_matched_peaks(self):
345        """Peaks with no Matched Metabolites"""
346        return [peak for peak in self if not peak]
347
348    @retention_time.setter
349    def retention_time(self, alist):
350        # self._retention_time_list = linspace(0, 80, num=len(self._scans_number_list))
351        self._retention_time_list = alist
352
353    @scans_number.setter
354    def scans_number(self, alist):
355        self._scans_number_list = alist
356
357    @tic.setter
358    def tic(self, alist):
359        self._tic_list = array(alist)
360
361    def plot_gc_peaks(self, ax=None, color="red"):  # pragma: no cover
362        """Plot the GC peaks.
363
364        This method plots the GC peaks.
365
366        Parameters
367        ----------
368        ax : matplotlib.axes.Axes, optional
369            Axes object to plot the GC peaks. Defaults to None.
370        color : str, optional
371            Color of the GC peaks. Defaults to 'red'.
372        """
373
374        import matplotlib.pyplot as plt
375
376        fig = plt.gcf()
377        if ax is None:
378            ax = plt.gca()
379
380        max_rts = [gc_peak.mass_spectrum.retention_time for gc_peak in self]
381        max_tics = [gc_peak.mass_spectrum.tic for gc_peak in self]
382
383        # min_rts = [self._ms[gc_peak.start_index].retention_time for gc_peak in self] + [self._ms[gc_peak.final_index].retention_time for gc_peak in self]
384        # min_tics = [self._ms[gc_peak.start_index].tic for gc_peak in self] + [self._ms[gc_peak.final_index].tic for gc_peak in self]
385        # sc = ax.scatter(min_rts, min_tics, color='yellow', linewidth=0, marker='v')
386
387        sc = ax.scatter(max_rts, max_tics, color=color, marker="v")
388
389        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
390
391        annot = ax.annotate(
392            "",
393            xy=(0, 0),
394            xytext=(20, 20),
395            textcoords="offset points",
396            bbox=dict(boxstyle="round", fc="w"),
397            arrowprops=dict(arrowstyle="->"),
398        )
399        annot.set_visible(False)
400        annot.get_bbox_patch().set_facecolor(("lightblue"))
401        annot.get_bbox_patch().set_alpha(0.8)
402
403        def update_annot(ind):
404            pos = sc.get_offsets()[ind["ind"][0]]
405            annot.xy = pos
406
407            text = "RT: {}\nRT Ref: {}\nRI: {}\nRI Ref: {}\nSimilarity Score: {}\nName: {}".format(
408                " ".join([str(round(self[n].retention_time, 2)) for n in ind["ind"]]),
409                " ".join(
410                    [
411                        str(
412                            round(self[n].highest_score_compound.retention_time, 2)
413                            if self[n].highest_score_compound
414                            else None
415                        )
416                        for n in ind["ind"]
417                    ]
418                ),
419                " ".join(
420                    [
421                        str(round(self[n].ri, 2) if self[n].ri else None)
422                        for n in ind["ind"]
423                    ]
424                ),
425                " ".join(
426                    [
427                        str(
428                            round(self[n].highest_score_compound.ri, 2)
429                            if self[n].highest_score_compound
430                            else None
431                        )
432                        for n in ind["ind"]
433                    ]
434                ),
435                " ".join(
436                    [
437                        str(
438                            round(self[n].highest_score_compound.similarity_score, 4)
439                            if self[n].highest_score_compound
440                            else None
441                        )
442                        for n in ind["ind"]
443                    ]
444                ),
445                " ".join(
446                    [
447                        str(
448                            self[n].highest_score_compound.name
449                            if self[n].highest_score_compound
450                            else None
451                        )
452                        for n in ind["ind"]
453                    ]
454                ),
455            )
456            annot.set_text(text)
457
458        def hover(event):
459            vis = annot.get_visible()
460            if event.inaxes == ax:
461                cont, ind = sc.contains(event)
462                if cont:
463                    update_annot(ind)
464                    annot.set_visible(True)
465                    fig.canvas.draw_idle()
466                else:
467                    if vis:
468                        annot.set_visible(False)
469                        fig.canvas.draw_idle()
470
471        fig.canvas.mpl_connect("motion_notify_event", hover)
472
473        return ax
474
475    def to_excel(
476        self, out_file_path, write_mode="ab", write_metadata=True, id_label="corems:"
477    ):
478        """Export the GC-MS data to an Excel file.
479
480        This method exports the GC-MS data to an Excel file.
481
482        Parameters
483        ----------
484        out_file_path : str, pathlib.Path, or s3path.S3Path
485            Path object containing the file location.
486        write_mode : str, optional
487            Write mode. Defaults to 'ab'.
488        write_metadata : bool, optional
489            If True, write the metadata. Defaults to True.
490        id_label : str, optional
491            Label of the ID. Defaults to 'corems:'.
492
493        """
494
495        if isinstance(out_file_path, str):
496            out_file_path = Path(out_file_path)
497
498        exportMS = LowResGCMSExport(out_file_path, self)
499        exportMS.to_excel(
500            id_label=id_label, write_mode=write_mode, write_metadata=write_metadata
501        )
502
503        return out_file_path.with_suffix(".xlsx")
504
505    def to_csv(
506        self,
507        out_file_path,
508        separate_output=False,
509        write_metadata=True,
510        id_label="corems:",
511    ):
512        """Export the GC-MS data to a CSV file.
513
514        Parameters
515        ----------
516        out_file_path : str, pathlib.Path, or s3path.S3Path
517            Path object containing the file location.
518        separate_output : bool, optional
519            If True, separate the output. Defaults to False.
520        write_metadata : bool, optional
521            If True, write the metadata. Defaults to True.
522
523        """
524
525        if isinstance(out_file_path, str):
526            out_file_path = Path(out_file_path)
527
528        exportMS = LowResGCMSExport(out_file_path, self)
529        exportMS.to_csv(
530            id_label=id_label,
531            separate_output=separate_output,
532            write_metadata=write_metadata,
533        )
534
535        return out_file_path.with_suffix(".csv")
536
537    def to_pandas(self, out_file_path, write_metadata=True, id_label="corems:"):
538        """Export the GC-MS data to a Pandas dataframe.
539
540        Parameters
541        ----------
542        out_file_path : str, pathlib.Path, or s3path.S3Path
543            Path object containing the file location.
544        write_metadata : bool, optional
545            If True, write the metadata. Defaults to True.
546        id_label : str, optional
547            Label of the ID. Defaults to 'corems:'.
548
549        """
550
551        if isinstance(out_file_path, str):
552            out_file_path = Path(out_file_path)
553        # pickle dataframe (pkl extension)
554        exportMS = LowResGCMSExport(out_file_path, self)
555        exportMS.to_pandas(id_label=id_label, write_metadata=write_metadata)
556
557        return out_file_path.with_suffix(".pkl")
558
559    def to_dataframe(self, id_label="corems:"):
560        """Export the GC-MS data to a Pandas dataframe.
561
562        Parameters
563        ----------
564        id_label : str, optional
565            Label of the ID. Defaults to 'corems:'.
566
567        """
568
569        # returns pandas dataframe
570        exportMS = LowResGCMSExport(self.sample_name, self)
571        return exportMS.get_pandas_df(id_label=id_label)
572
573    def processing_stats(self):
574        """Return the processing statistics."""
575
576        # returns json string
577        exportMS = LowResGCMSExport(self.sample_name, self)
578        return exportMS.get_data_stats(self)
579
580    def parameters_json(self, id_label="corems:", output_path=" "):
581        """Return the parameters in JSON format.
582
583        Parameters
584        ----------
585        id_label : str, optional
586            Label of the ID. Defaults to 'corems:'.
587        output_path : str, optional
588            Path object containing the file location. Defaults to " ".
589        """
590
591        # returns json string
592        exportMS = LowResGCMSExport(self.sample_name, self)
593        return exportMS.get_parameters_json(self, id_label, output_path)
594
595    def to_json(self, id_label="corems:"):
596        """Export the GC-MS data to a JSON file.
597
598        Parameters
599        ----------
600        id_label : str, optional
601            Label of the ID. Defaults to 'corems:'.
602
603        """
604
605        # returns pandas dataframe
606        exportMS = LowResGCMSExport(self.sample_name, self)
607        return exportMS.get_json(id_label=id_label)
608
609    def to_hdf(self, id_label="corems:"):
610        """Export the GC-MS data to a HDF file.
611
612        Parameters
613        ----------
614        id_label : str, optional
615            Label of the ID. Defaults to 'corems:'.
616
617        """
618
619        # returns pandas dataframe
620        exportMS = LowResGCMSExport(self.sample_name, self)
621        return exportMS.to_hdf(id_label=id_label)
622
623    def plot_chromatogram(self, ax=None, color="blue"):  # pragma: no cover
624        """Plot the chromatogram.
625
626        Parameters
627        ----------
628        ax : matplotlib.axes.Axes, optional
629            Axes object to plot the chromatogram. Defaults to None.
630        color : str, optional
631            Color of the chromatogram. Defaults to 'blue'.
632
633        """
634
635        import matplotlib.pyplot as plt
636
637        if ax is None:
638            ax = plt.gca()
639
640        ax.plot(self.retention_time, self.tic, color=color)
641        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
642
643        return ax
644
645    def plot_smoothed_chromatogram(self, ax=None, color="green"):  # pragma: no cover
646        """Plot the smoothed chromatogram.
647
648        Parameters
649        ----------
650        ax : matplotlib.axes.Axes, optional
651            Axes object to plot the smoothed chromatogram. Defaults to None.
652        color : str, optional
653            Color of the smoothed chromatogram. Defaults to 'green'.
654
655        """
656
657        import matplotlib.pyplot as plt
658
659        if ax is None:
660            ax = plt.gca()
661
662        ax.plot(self.retention_time, self.smooth_tic(self.tic), color=color)
663
664        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
665
666        return ax
667
668    def plot_detected_baseline(self, ax=None, color="blue"):  # pragma: no cover
669        """Plot the detected baseline.
670
671        Parameters
672        ----------
673        ax : matplotlib.axes.Axes, optional
674            Axes object to plot the detected baseline. Defaults to None.
675        color : str, optional
676            Color of the detected baseline. Defaults to 'blue'.
677
678        """
679
680        import matplotlib.pyplot as plt
681
682        if ax is None:
683            ax = plt.gca()
684
685        max_height = self.chromatogram_settings.peak_height_max_percent
686        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
687
688        baseline = sp.baseline_detector(
689            self.tic, self.retention_time, max_height, max_prominence
690        )
691        ax.plot(self.retention_time, color=color)
692        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
693
694        return ax
695
696    def plot_baseline_subtraction(self, ax=None, color="black"):  # pragma: no cover
697        """Plot the baseline subtraction.
698
699        Parameters
700        ----------
701        ax : matplotlib.axes.Axes, optional
702            Axes object to plot the baseline subtraction. Defaults to None.
703        color : str, optional
704            Color of the baseline subtraction. Defaults to 'black'.
705
706        """
707
708        import matplotlib.pyplot as plt
709
710        if ax is None:
711            ax = plt.gca()
712
713        max_height = self.chromatogram_settings.peak_height_max_percent
714
715        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
716
717        x = self.tic + sp.baseline_detector(
718            self.tic, self.retention_time, max_height, max_prominence
719        )
720
721        ax.plot(self.retention_time, x, color=color)
722
723        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
724
725        return ax
726
727    def peaks_rt_tic(self, json_string=False):
728        """Return the peaks, retention time, and total ion chromatogram.
729
730        Parameters
731        ----------
732        json_string : bool, optional
733            If True, return the peaks, retention time, and total ion chromatogram in JSON format. Defaults to False.
734
735        """
736
737        peaks_list = dict()
738
739        all_candidates_data = {}
740
741        all_peaks_data = {}
742
743        for gcms_peak in self.sorted_gcpeaks:
744            dict_data = {
745                "rt": gcms_peak.rt_list,
746                "tic": gcms_peak.tic_list,
747                "mz": gcms_peak.mass_spectrum.mz_exp.tolist(),
748                "abundance": gcms_peak.mass_spectrum.abundance.tolist(),
749                "candidate_names": gcms_peak.compound_names,
750            }
751
752            peaks_list[gcms_peak.retention_time] = dict_data
753
754            for compound in gcms_peak:
755                if compound.name not in all_candidates_data.keys():
756                    mz = array(compound.mz).tolist()
757                    abundance = array(compound.abundance).tolist()
758                    data = {"mz": mz, "abundance": abundance}
759                    all_candidates_data[compound.name] = data
760
761        all_peaks_data["peak_data"] = peaks_list
762        all_peaks_data["ref_data"] = all_candidates_data
763
764        if json_string:
765            return json.dumps(all_peaks_data)
766
767        else:
768            return all_peaks_data
769
770    def plot_processed_chromatogram(self, ax=None, color="black"):
771        """Plot the processed chromatogram.
772
773        Parameters
774        ----------
775        ax : matplotlib.axes.Axes, optional
776            Axes object to plot the processed chromatogram. Defaults to None.
777        color : str, optional
778            Color of the processed chromatogram. Defaults to 'black'.
779
780        """
781
782        import matplotlib.pyplot as plt
783
784        if ax is None:
785            ax = plt.gca()
786
787        ax.plot(self.retention_time, self.processed_tic, color=color)
788
789        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
790
791        return ax
 22class GCMSBase(GC_Calculations, MassDeconvolution):
 23    """Base class for GC-MS data processing.
 24
 25    Parameters
 26    ----
 27    file_location : str, pathlib.Path, or s3path.S3Path
 28        Path object containing the file location.
 29    analyzer : str, optional
 30        Name of the analyzer. Defaults to 'Unknown'.
 31    instrument_label : str, optional
 32        Label of the instrument. Defaults to 'Unknown'.
 33    sample_name : str, optional
 34        Name of the sample. If not provided, it is derived from the file location.
 35
 36    Attributes
 37    ------------
 38    file_location : pathlib.Path
 39        Path object containing the file location.
 40    sample_name : str
 41        Name of the sample.
 42    analyzer : str
 43        Name of the analyzer.
 44    instrument_label : str
 45        Label of the instrument.
 46    gcpeaks : list
 47        List of GCPeak objects.
 48    ri_pairs_ref : None
 49        Reference retention index pairs.
 50    cal_file_path : None
 51        Calibration file path.
 52    _parameters : GCMSParameters
 53        GC-MS parameters.
 54    _retention_time_list : list
 55        List of retention times.
 56    _scans_number_list : list
 57        List of scan numbers.
 58    _tic_list : list
 59        List of total ion chromatogram values.
 60    _ms : dict
 61        Dictionary containing all mass spectra.
 62    _processed_tic : list
 63        List of processed total ion chromatogram values.
 64
 65    Methods
 66    -------
 67    * process_chromatogram(plot_res=False). Process the chromatogram.
 68    * plot_gc_peaks(ax=None, color='red'). Plot the GC peaks.
 69    """
 70
 71    def __init__(
 72        self,
 73        file_location,
 74        analyzer="Unknown",
 75        instrument_label="Unknown",
 76        sample_name=None,
 77    ):
 78        if isinstance(file_location, str):
 79            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 80            file_location = Path(file_location)
 81
 82        if not file_location.exists():
 83            raise FileExistsError("File does not exist: " + str(file_location))
 84
 85        self.file_location = file_location
 86
 87        if sample_name:
 88            self.sample_name = sample_name
 89        else:
 90            self.sample_name = file_location.stem
 91
 92        self.analyzer = analyzer
 93        self.instrument_label = instrument_label
 94        self._init_settings()
 95
 96        self._retention_time_list = []
 97        self._scans_number_list = []
 98        self._tic_list = []
 99
100        # all scans
101        self._ms = {}
102
103        # after peak detection
104        self._processed_tic = []
105        self.gcpeaks = []
106
107        self.ri_pairs_ref = None
108        self.cal_file_path = None
109
110    def _init_settings(self):
111        """Initialize the settings for GC_Class.
112
113        This method initializes the settings for the GC_Class object using the GCMSParameters class.
114        """
115        self._parameters = GCMSParameters()
116
117    def __len__(self):
118        """Return the number of GC peaks in the GC_Class object."""
119        return len(self.gcpeaks)
120
121    def __getitem__(self, scan_number) -> GCPeak:
122        """Return the GCPeak with the given scan number."""
123        return self.gcpeaks[scan_number]
124
125    # def __iter__(self):
126
127    #     return iter(self.gcpeaks.values())
128
129    def process_chromatogram(self, plot_res=False):
130        """Process the chromatogram.
131
132        This method processes the chromatogram.
133
134        Parameters
135        ----------
136        plot_res : bool, optional
137            If True, plot the results. Defaults to False.
138        """
139
140        # tic = self.tic - self.baseline_detector(self.tic)
141
142        self._processed_tic = self.smooth_tic(self.tic)
143
144        for index, tic in enumerate(self._processed_tic):
145            self._ms[index]._processed_tic = tic
146
147        # self.second_derivative_threshold(self._processed_tic)
148
149        if self.chromatogram_settings.use_deconvolution:
150            self.run_deconvolution(plot_res=False)
151
152        else:
153            peaks_index = self.centroid_detector(
154                self._processed_tic, self.retention_time
155            )
156
157            for i in peaks_index:
158                apex_index = i[1]
159
160                gc_peak = GCPeak(self, self._ms[apex_index], i)
161
162                gc_peak.calc_area(self._processed_tic, 1)
163
164                self.gcpeaks.append(gc_peak)
165
166                # self.gcpeaks[self.scans_number[apex_index]] = gc_peak
167
168    def add_mass_spectrum(self, mass_spec):
169        """Add a mass spectrum to the GC-MS object.
170
171        This method adds a mass spectrum to the GC-MS object.
172
173        Parameters
174        ----------
175        mass_spec : MassSpectrum
176            Mass spectrum to be added.
177        """
178
179        self._ms[mass_spec.scan_number] = mass_spec
180
181    def set_tic_list_from_data(self):
182        """Set the total ion chromatogram list from the mass spectra data within the GC-MS data object."""
183
184        self.tic = [self._ms.get(i).tic for i in self.scans_number]
185
186        # self.set_tic_list([self._ms.get(i).get_sumed_signal_to_noise() for i in self.get_scans_number()])
187
188    def set_retention_time_from_data(self):
189        """Set the retention time list from the mass spectra data within the GC-MS data object."""
190
191        retention_time_list = []
192
193        for key_ms in sorted(self._ms.keys()):
194            retention_time_list.append(self._ms.get(key_ms).retention_time)
195
196        self.retention_time = retention_time_list
197
198        # self.set_retention_time_list(sorted(self._ms.keys()))
199
200    def set_scans_number_from_data(self):
201        """Set the scan number list from the mass spectra data within the GC-MS data object."""
202
203        self.scans_number = sorted(self._ms.keys())
204
205    @property
206    def parameters(self):
207        """GCMS Parameters"""
208        return self._parameters
209
210    @parameters.setter
211    def parameters(self, gcms_parameters_instance):
212        self._parameters = gcms_parameters_instance
213
214    # Note: maintaining `parameter` for backwards compatibility,
215    # but proper usage would reference `parameters` to conform
216    # to other classes.
217    @property
218    def parameter(self):
219        """GCMS Parameters"""
220        return self._parameters
221
222    @parameter.setter
223    def parameter(self, gcms_parameters_instance):
224        self._parameters = gcms_parameters_instance
225
226    @property
227    def molecular_search_settings(self):
228        """Molecular Search Settings"""
229        return self.parameters.molecular_search
230
231    @molecular_search_settings.setter
232    def molecular_search_settings(self, settings_class_instance):
233        self.parameters.molecular_search = settings_class_instance
234
235    @property
236    def chromatogram_settings(self):
237        """Chromatogram Settings"""
238        return self.parameters.gc_ms
239
240    @chromatogram_settings.setter
241    def chromatogram_settings(self, settings_class_instance):
242        self.parameters.gc_ms = settings_class_instance
243
244    @property
245    def scans_number(self):
246        """Scans Number"""
247        return self._scans_number_list
248
249    @property
250    def retention_time(self):
251        """Retention Time"""
252        return self._retention_time_list
253
254    @property
255    def processed_tic(self):
256        """Processed Total Ion Current"""
257        return self._processed_tic
258
259    @property
260    def tic(self):
261        """Total Ion Current"""
262        return self._tic_list
263
264    @property
265    def max_tic(self):
266        """Maximum Total Ion Current"""
267        return max([gc_peak.tic for gc_peak in self])
268
269    @property
270    def min_tic(self):
271        """Minimum Total Ion Current"""
272        return min([gc_peak.tic for gc_peak in self])
273
274    @property
275    def dynamic_range(self):
276        """Dynamic Range of the Total Ion Current"""
277        return self.max_tic / self.min_tic
278
279    @property
280    def matched_peaks(self):
281        """Matched Peaks"""
282        return [gc_peak for gc_peak in self if gc_peak]
283
284    @property
285    def sorted_gcpeaks(self):
286        """Sorted GC Peaks, by retention time"""
287        return sorted(self, key=lambda g: g.retention_time)
288
289    @property
290    def unique_metabolites(self):
291        """Unique Metabolites"""
292        metabolites = set()
293        for gc_peak in self:
294            if gc_peak:
295                for compound_obj in gc_peak:
296                    metabolites.add(compound_obj.name)
297
298        return metabolites
299
300    @property
301    def metabolites_data(self):
302        """Metabolites Data"""
303        metabolites = {}
304        for gc_peak in self:
305            if gc_peak:
306                for compound_obj in gc_peak:
307                    if compound_obj.name in metabolites.keys():
308                        current_score = metabolites[compound_obj.name][
309                            "highest_similarity_score"
310                        ]
311                        compound_score = compound_obj.spectral_similarity_score
312                        metabolites[compound_obj.name]["highest_similarity_score"] = (
313                            compound_score
314                            if compound_score > current_score
315                            else current_score
316                        )
317
318                    else:
319                        if compound_obj.metadata:
320                            metabolites[compound_obj.name] = {
321                                "name": compound_obj.name,
322                                "highest_similarity_score": compound_obj.spectral_similarity_score,
323                                "casno": compound_obj.metadata.cas,
324                                "kegg": compound_obj.metadata.kegg,
325                                "inchi": compound_obj.metadata.inchi,
326                                "inchi_key": compound_obj.metadata.inchikey,
327                                "chebi": compound_obj.metadata.chebi,
328                                "smiles": compound_obj.metadata.smiles,
329                            }
330                        else:
331                            metabolites[compound_obj.name] = {
332                                "name": compound_obj.name,
333                                "highest_similarity_score": compound_obj.spectral_similarity_score,
334                                "casno": "",
335                                "kegg": "",
336                                "inchi": "",
337                                "inchikey": "",
338                                "chebi": "",
339                                "smiles": "",
340                            }
341
342        return list(metabolites.values())
343
344    @property
345    def no_matched_peaks(self):
346        """Peaks with no Matched Metabolites"""
347        return [peak for peak in self if not peak]
348
349    @retention_time.setter
350    def retention_time(self, alist):
351        # self._retention_time_list = linspace(0, 80, num=len(self._scans_number_list))
352        self._retention_time_list = alist
353
354    @scans_number.setter
355    def scans_number(self, alist):
356        self._scans_number_list = alist
357
358    @tic.setter
359    def tic(self, alist):
360        self._tic_list = array(alist)
361
362    def plot_gc_peaks(self, ax=None, color="red"):  # pragma: no cover
363        """Plot the GC peaks.
364
365        This method plots the GC peaks.
366
367        Parameters
368        ----------
369        ax : matplotlib.axes.Axes, optional
370            Axes object to plot the GC peaks. Defaults to None.
371        color : str, optional
372            Color of the GC peaks. Defaults to 'red'.
373        """
374
375        import matplotlib.pyplot as plt
376
377        fig = plt.gcf()
378        if ax is None:
379            ax = plt.gca()
380
381        max_rts = [gc_peak.mass_spectrum.retention_time for gc_peak in self]
382        max_tics = [gc_peak.mass_spectrum.tic for gc_peak in self]
383
384        # min_rts = [self._ms[gc_peak.start_index].retention_time for gc_peak in self] + [self._ms[gc_peak.final_index].retention_time for gc_peak in self]
385        # min_tics = [self._ms[gc_peak.start_index].tic for gc_peak in self] + [self._ms[gc_peak.final_index].tic for gc_peak in self]
386        # sc = ax.scatter(min_rts, min_tics, color='yellow', linewidth=0, marker='v')
387
388        sc = ax.scatter(max_rts, max_tics, color=color, marker="v")
389
390        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
391
392        annot = ax.annotate(
393            "",
394            xy=(0, 0),
395            xytext=(20, 20),
396            textcoords="offset points",
397            bbox=dict(boxstyle="round", fc="w"),
398            arrowprops=dict(arrowstyle="->"),
399        )
400        annot.set_visible(False)
401        annot.get_bbox_patch().set_facecolor(("lightblue"))
402        annot.get_bbox_patch().set_alpha(0.8)
403
404        def update_annot(ind):
405            pos = sc.get_offsets()[ind["ind"][0]]
406            annot.xy = pos
407
408            text = "RT: {}\nRT Ref: {}\nRI: {}\nRI Ref: {}\nSimilarity Score: {}\nName: {}".format(
409                " ".join([str(round(self[n].retention_time, 2)) for n in ind["ind"]]),
410                " ".join(
411                    [
412                        str(
413                            round(self[n].highest_score_compound.retention_time, 2)
414                            if self[n].highest_score_compound
415                            else None
416                        )
417                        for n in ind["ind"]
418                    ]
419                ),
420                " ".join(
421                    [
422                        str(round(self[n].ri, 2) if self[n].ri else None)
423                        for n in ind["ind"]
424                    ]
425                ),
426                " ".join(
427                    [
428                        str(
429                            round(self[n].highest_score_compound.ri, 2)
430                            if self[n].highest_score_compound
431                            else None
432                        )
433                        for n in ind["ind"]
434                    ]
435                ),
436                " ".join(
437                    [
438                        str(
439                            round(self[n].highest_score_compound.similarity_score, 4)
440                            if self[n].highest_score_compound
441                            else None
442                        )
443                        for n in ind["ind"]
444                    ]
445                ),
446                " ".join(
447                    [
448                        str(
449                            self[n].highest_score_compound.name
450                            if self[n].highest_score_compound
451                            else None
452                        )
453                        for n in ind["ind"]
454                    ]
455                ),
456            )
457            annot.set_text(text)
458
459        def hover(event):
460            vis = annot.get_visible()
461            if event.inaxes == ax:
462                cont, ind = sc.contains(event)
463                if cont:
464                    update_annot(ind)
465                    annot.set_visible(True)
466                    fig.canvas.draw_idle()
467                else:
468                    if vis:
469                        annot.set_visible(False)
470                        fig.canvas.draw_idle()
471
472        fig.canvas.mpl_connect("motion_notify_event", hover)
473
474        return ax
475
476    def to_excel(
477        self, out_file_path, write_mode="ab", write_metadata=True, id_label="corems:"
478    ):
479        """Export the GC-MS data to an Excel file.
480
481        This method exports the GC-MS data to an Excel file.
482
483        Parameters
484        ----------
485        out_file_path : str, pathlib.Path, or s3path.S3Path
486            Path object containing the file location.
487        write_mode : str, optional
488            Write mode. Defaults to 'ab'.
489        write_metadata : bool, optional
490            If True, write the metadata. Defaults to True.
491        id_label : str, optional
492            Label of the ID. Defaults to 'corems:'.
493
494        """
495
496        if isinstance(out_file_path, str):
497            out_file_path = Path(out_file_path)
498
499        exportMS = LowResGCMSExport(out_file_path, self)
500        exportMS.to_excel(
501            id_label=id_label, write_mode=write_mode, write_metadata=write_metadata
502        )
503
504        return out_file_path.with_suffix(".xlsx")
505
506    def to_csv(
507        self,
508        out_file_path,
509        separate_output=False,
510        write_metadata=True,
511        id_label="corems:",
512    ):
513        """Export the GC-MS data to a CSV file.
514
515        Parameters
516        ----------
517        out_file_path : str, pathlib.Path, or s3path.S3Path
518            Path object containing the file location.
519        separate_output : bool, optional
520            If True, separate the output. Defaults to False.
521        write_metadata : bool, optional
522            If True, write the metadata. Defaults to True.
523
524        """
525
526        if isinstance(out_file_path, str):
527            out_file_path = Path(out_file_path)
528
529        exportMS = LowResGCMSExport(out_file_path, self)
530        exportMS.to_csv(
531            id_label=id_label,
532            separate_output=separate_output,
533            write_metadata=write_metadata,
534        )
535
536        return out_file_path.with_suffix(".csv")
537
538    def to_pandas(self, out_file_path, write_metadata=True, id_label="corems:"):
539        """Export the GC-MS data to a Pandas dataframe.
540
541        Parameters
542        ----------
543        out_file_path : str, pathlib.Path, or s3path.S3Path
544            Path object containing the file location.
545        write_metadata : bool, optional
546            If True, write the metadata. Defaults to True.
547        id_label : str, optional
548            Label of the ID. Defaults to 'corems:'.
549
550        """
551
552        if isinstance(out_file_path, str):
553            out_file_path = Path(out_file_path)
554        # pickle dataframe (pkl extension)
555        exportMS = LowResGCMSExport(out_file_path, self)
556        exportMS.to_pandas(id_label=id_label, write_metadata=write_metadata)
557
558        return out_file_path.with_suffix(".pkl")
559
560    def to_dataframe(self, id_label="corems:"):
561        """Export the GC-MS data to a Pandas dataframe.
562
563        Parameters
564        ----------
565        id_label : str, optional
566            Label of the ID. Defaults to 'corems:'.
567
568        """
569
570        # returns pandas dataframe
571        exportMS = LowResGCMSExport(self.sample_name, self)
572        return exportMS.get_pandas_df(id_label=id_label)
573
574    def processing_stats(self):
575        """Return the processing statistics."""
576
577        # returns json string
578        exportMS = LowResGCMSExport(self.sample_name, self)
579        return exportMS.get_data_stats(self)
580
581    def parameters_json(self, id_label="corems:", output_path=" "):
582        """Return the parameters in JSON format.
583
584        Parameters
585        ----------
586        id_label : str, optional
587            Label of the ID. Defaults to 'corems:'.
588        output_path : str, optional
589            Path object containing the file location. Defaults to " ".
590        """
591
592        # returns json string
593        exportMS = LowResGCMSExport(self.sample_name, self)
594        return exportMS.get_parameters_json(self, id_label, output_path)
595
596    def to_json(self, id_label="corems:"):
597        """Export the GC-MS data to a JSON file.
598
599        Parameters
600        ----------
601        id_label : str, optional
602            Label of the ID. Defaults to 'corems:'.
603
604        """
605
606        # returns pandas dataframe
607        exportMS = LowResGCMSExport(self.sample_name, self)
608        return exportMS.get_json(id_label=id_label)
609
610    def to_hdf(self, id_label="corems:"):
611        """Export the GC-MS data to a HDF file.
612
613        Parameters
614        ----------
615        id_label : str, optional
616            Label of the ID. Defaults to 'corems:'.
617
618        """
619
620        # returns pandas dataframe
621        exportMS = LowResGCMSExport(self.sample_name, self)
622        return exportMS.to_hdf(id_label=id_label)
623
624    def plot_chromatogram(self, ax=None, color="blue"):  # pragma: no cover
625        """Plot the chromatogram.
626
627        Parameters
628        ----------
629        ax : matplotlib.axes.Axes, optional
630            Axes object to plot the chromatogram. Defaults to None.
631        color : str, optional
632            Color of the chromatogram. Defaults to 'blue'.
633
634        """
635
636        import matplotlib.pyplot as plt
637
638        if ax is None:
639            ax = plt.gca()
640
641        ax.plot(self.retention_time, self.tic, color=color)
642        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
643
644        return ax
645
646    def plot_smoothed_chromatogram(self, ax=None, color="green"):  # pragma: no cover
647        """Plot the smoothed chromatogram.
648
649        Parameters
650        ----------
651        ax : matplotlib.axes.Axes, optional
652            Axes object to plot the smoothed chromatogram. Defaults to None.
653        color : str, optional
654            Color of the smoothed chromatogram. Defaults to 'green'.
655
656        """
657
658        import matplotlib.pyplot as plt
659
660        if ax is None:
661            ax = plt.gca()
662
663        ax.plot(self.retention_time, self.smooth_tic(self.tic), color=color)
664
665        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
666
667        return ax
668
669    def plot_detected_baseline(self, ax=None, color="blue"):  # pragma: no cover
670        """Plot the detected baseline.
671
672        Parameters
673        ----------
674        ax : matplotlib.axes.Axes, optional
675            Axes object to plot the detected baseline. Defaults to None.
676        color : str, optional
677            Color of the detected baseline. Defaults to 'blue'.
678
679        """
680
681        import matplotlib.pyplot as plt
682
683        if ax is None:
684            ax = plt.gca()
685
686        max_height = self.chromatogram_settings.peak_height_max_percent
687        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
688
689        baseline = sp.baseline_detector(
690            self.tic, self.retention_time, max_height, max_prominence
691        )
692        ax.plot(self.retention_time, color=color)
693        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
694
695        return ax
696
697    def plot_baseline_subtraction(self, ax=None, color="black"):  # pragma: no cover
698        """Plot the baseline subtraction.
699
700        Parameters
701        ----------
702        ax : matplotlib.axes.Axes, optional
703            Axes object to plot the baseline subtraction. Defaults to None.
704        color : str, optional
705            Color of the baseline subtraction. Defaults to 'black'.
706
707        """
708
709        import matplotlib.pyplot as plt
710
711        if ax is None:
712            ax = plt.gca()
713
714        max_height = self.chromatogram_settings.peak_height_max_percent
715
716        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
717
718        x = self.tic + sp.baseline_detector(
719            self.tic, self.retention_time, max_height, max_prominence
720        )
721
722        ax.plot(self.retention_time, x, color=color)
723
724        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
725
726        return ax
727
728    def peaks_rt_tic(self, json_string=False):
729        """Return the peaks, retention time, and total ion chromatogram.
730
731        Parameters
732        ----------
733        json_string : bool, optional
734            If True, return the peaks, retention time, and total ion chromatogram in JSON format. Defaults to False.
735
736        """
737
738        peaks_list = dict()
739
740        all_candidates_data = {}
741
742        all_peaks_data = {}
743
744        for gcms_peak in self.sorted_gcpeaks:
745            dict_data = {
746                "rt": gcms_peak.rt_list,
747                "tic": gcms_peak.tic_list,
748                "mz": gcms_peak.mass_spectrum.mz_exp.tolist(),
749                "abundance": gcms_peak.mass_spectrum.abundance.tolist(),
750                "candidate_names": gcms_peak.compound_names,
751            }
752
753            peaks_list[gcms_peak.retention_time] = dict_data
754
755            for compound in gcms_peak:
756                if compound.name not in all_candidates_data.keys():
757                    mz = array(compound.mz).tolist()
758                    abundance = array(compound.abundance).tolist()
759                    data = {"mz": mz, "abundance": abundance}
760                    all_candidates_data[compound.name] = data
761
762        all_peaks_data["peak_data"] = peaks_list
763        all_peaks_data["ref_data"] = all_candidates_data
764
765        if json_string:
766            return json.dumps(all_peaks_data)
767
768        else:
769            return all_peaks_data
770
771    def plot_processed_chromatogram(self, ax=None, color="black"):
772        """Plot the processed chromatogram.
773
774        Parameters
775        ----------
776        ax : matplotlib.axes.Axes, optional
777            Axes object to plot the processed chromatogram. Defaults to None.
778        color : str, optional
779            Color of the processed chromatogram. Defaults to 'black'.
780
781        """
782
783        import matplotlib.pyplot as plt
784
785        if ax is None:
786            ax = plt.gca()
787
788        ax.plot(self.retention_time, self.processed_tic, color=color)
789
790        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
791
792        return ax

Base class for GC-MS data processing.

Parameters
  • file_location (str, pathlib.Path, or s3path.S3Path): Path object containing the file location.
  • analyzer (str, optional): Name of the analyzer. Defaults to 'Unknown'.
  • instrument_label (str, optional): Label of the instrument. Defaults to 'Unknown'.
  • sample_name (str, optional): Name of the sample. If not provided, it is derived from the file location.
Attributes
  • file_location (pathlib.Path): Path object containing the file location.
  • sample_name (str): Name of the sample.
  • analyzer (str): Name of the analyzer.
  • instrument_label (str): Label of the instrument.
  • gcpeaks (list): List of GCPeak objects.
  • ri_pairs_ref (None): Reference retention index pairs.
  • cal_file_path (None): Calibration file path.
  • _parameters (GCMSParameters): GC-MS parameters.
  • _retention_time_list (list): List of retention times.
  • _scans_number_list (list): List of scan numbers.
  • _tic_list (list): List of total ion chromatogram values.
  • _ms (dict): Dictionary containing all mass spectra.
  • _processed_tic (list): List of processed total ion chromatogram values.
Methods
  • process_chromatogram(plot_res=False). Process the chromatogram.
  • plot_gc_peaks(ax=None, color='red'). Plot the GC peaks.
GCMSBase( file_location, analyzer='Unknown', instrument_label='Unknown', sample_name=None)
 71    def __init__(
 72        self,
 73        file_location,
 74        analyzer="Unknown",
 75        instrument_label="Unknown",
 76        sample_name=None,
 77    ):
 78        if isinstance(file_location, str):
 79            # if obj is a string it defaults to create a Path obj, pass the S3Path if needed
 80            file_location = Path(file_location)
 81
 82        if not file_location.exists():
 83            raise FileExistsError("File does not exist: " + str(file_location))
 84
 85        self.file_location = file_location
 86
 87        if sample_name:
 88            self.sample_name = sample_name
 89        else:
 90            self.sample_name = file_location.stem
 91
 92        self.analyzer = analyzer
 93        self.instrument_label = instrument_label
 94        self._init_settings()
 95
 96        self._retention_time_list = []
 97        self._scans_number_list = []
 98        self._tic_list = []
 99
100        # all scans
101        self._ms = {}
102
103        # after peak detection
104        self._processed_tic = []
105        self.gcpeaks = []
106
107        self.ri_pairs_ref = None
108        self.cal_file_path = None
file_location
analyzer
instrument_label
gcpeaks
ri_pairs_ref
cal_file_path
def process_chromatogram(self, plot_res=False):
129    def process_chromatogram(self, plot_res=False):
130        """Process the chromatogram.
131
132        This method processes the chromatogram.
133
134        Parameters
135        ----------
136        plot_res : bool, optional
137            If True, plot the results. Defaults to False.
138        """
139
140        # tic = self.tic - self.baseline_detector(self.tic)
141
142        self._processed_tic = self.smooth_tic(self.tic)
143
144        for index, tic in enumerate(self._processed_tic):
145            self._ms[index]._processed_tic = tic
146
147        # self.second_derivative_threshold(self._processed_tic)
148
149        if self.chromatogram_settings.use_deconvolution:
150            self.run_deconvolution(plot_res=False)
151
152        else:
153            peaks_index = self.centroid_detector(
154                self._processed_tic, self.retention_time
155            )
156
157            for i in peaks_index:
158                apex_index = i[1]
159
160                gc_peak = GCPeak(self, self._ms[apex_index], i)
161
162                gc_peak.calc_area(self._processed_tic, 1)
163
164                self.gcpeaks.append(gc_peak)
165
166                # self.gcpeaks[self.scans_number[apex_index]] = gc_peak

Process the chromatogram.

This method processes the chromatogram.

Parameters
  • plot_res (bool, optional): If True, plot the results. Defaults to False.
def add_mass_spectrum(self, mass_spec):
168    def add_mass_spectrum(self, mass_spec):
169        """Add a mass spectrum to the GC-MS object.
170
171        This method adds a mass spectrum to the GC-MS object.
172
173        Parameters
174        ----------
175        mass_spec : MassSpectrum
176            Mass spectrum to be added.
177        """
178
179        self._ms[mass_spec.scan_number] = mass_spec

Add a mass spectrum to the GC-MS object.

This method adds a mass spectrum to the GC-MS object.

Parameters
  • mass_spec (MassSpectrum): Mass spectrum to be added.
def set_tic_list_from_data(self):
181    def set_tic_list_from_data(self):
182        """Set the total ion chromatogram list from the mass spectra data within the GC-MS data object."""
183
184        self.tic = [self._ms.get(i).tic for i in self.scans_number]
185
186        # self.set_tic_list([self._ms.get(i).get_sumed_signal_to_noise() for i in self.get_scans_number()])

Set the total ion chromatogram list from the mass spectra data within the GC-MS data object.

def set_retention_time_from_data(self):
188    def set_retention_time_from_data(self):
189        """Set the retention time list from the mass spectra data within the GC-MS data object."""
190
191        retention_time_list = []
192
193        for key_ms in sorted(self._ms.keys()):
194            retention_time_list.append(self._ms.get(key_ms).retention_time)
195
196        self.retention_time = retention_time_list
197
198        # self.set_retention_time_list(sorted(self._ms.keys()))

Set the retention time list from the mass spectra data within the GC-MS data object.

def set_scans_number_from_data(self):
200    def set_scans_number_from_data(self):
201        """Set the scan number list from the mass spectra data within the GC-MS data object."""
202
203        self.scans_number = sorted(self._ms.keys())

Set the scan number list from the mass spectra data within the GC-MS data object.

parameters

GCMS Parameters

parameter

GCMS Parameters

molecular_search_settings

Molecular Search Settings

chromatogram_settings

Chromatogram Settings

scans_number

Scans Number

retention_time

Retention Time

processed_tic

Processed Total Ion Current

tic

Total Ion Current

max_tic

Maximum Total Ion Current

min_tic

Minimum Total Ion Current

dynamic_range

Dynamic Range of the Total Ion Current

matched_peaks

Matched Peaks

sorted_gcpeaks

Sorted GC Peaks, by retention time

unique_metabolites

Unique Metabolites

metabolites_data

Metabolites Data

no_matched_peaks

Peaks with no Matched Metabolites

def plot_gc_peaks(self, ax=None, color='red'):
362    def plot_gc_peaks(self, ax=None, color="red"):  # pragma: no cover
363        """Plot the GC peaks.
364
365        This method plots the GC peaks.
366
367        Parameters
368        ----------
369        ax : matplotlib.axes.Axes, optional
370            Axes object to plot the GC peaks. Defaults to None.
371        color : str, optional
372            Color of the GC peaks. Defaults to 'red'.
373        """
374
375        import matplotlib.pyplot as plt
376
377        fig = plt.gcf()
378        if ax is None:
379            ax = plt.gca()
380
381        max_rts = [gc_peak.mass_spectrum.retention_time for gc_peak in self]
382        max_tics = [gc_peak.mass_spectrum.tic for gc_peak in self]
383
384        # min_rts = [self._ms[gc_peak.start_index].retention_time for gc_peak in self] + [self._ms[gc_peak.final_index].retention_time for gc_peak in self]
385        # min_tics = [self._ms[gc_peak.start_index].tic for gc_peak in self] + [self._ms[gc_peak.final_index].tic for gc_peak in self]
386        # sc = ax.scatter(min_rts, min_tics, color='yellow', linewidth=0, marker='v')
387
388        sc = ax.scatter(max_rts, max_tics, color=color, marker="v")
389
390        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
391
392        annot = ax.annotate(
393            "",
394            xy=(0, 0),
395            xytext=(20, 20),
396            textcoords="offset points",
397            bbox=dict(boxstyle="round", fc="w"),
398            arrowprops=dict(arrowstyle="->"),
399        )
400        annot.set_visible(False)
401        annot.get_bbox_patch().set_facecolor(("lightblue"))
402        annot.get_bbox_patch().set_alpha(0.8)
403
404        def update_annot(ind):
405            pos = sc.get_offsets()[ind["ind"][0]]
406            annot.xy = pos
407
408            text = "RT: {}\nRT Ref: {}\nRI: {}\nRI Ref: {}\nSimilarity Score: {}\nName: {}".format(
409                " ".join([str(round(self[n].retention_time, 2)) for n in ind["ind"]]),
410                " ".join(
411                    [
412                        str(
413                            round(self[n].highest_score_compound.retention_time, 2)
414                            if self[n].highest_score_compound
415                            else None
416                        )
417                        for n in ind["ind"]
418                    ]
419                ),
420                " ".join(
421                    [
422                        str(round(self[n].ri, 2) if self[n].ri else None)
423                        for n in ind["ind"]
424                    ]
425                ),
426                " ".join(
427                    [
428                        str(
429                            round(self[n].highest_score_compound.ri, 2)
430                            if self[n].highest_score_compound
431                            else None
432                        )
433                        for n in ind["ind"]
434                    ]
435                ),
436                " ".join(
437                    [
438                        str(
439                            round(self[n].highest_score_compound.similarity_score, 4)
440                            if self[n].highest_score_compound
441                            else None
442                        )
443                        for n in ind["ind"]
444                    ]
445                ),
446                " ".join(
447                    [
448                        str(
449                            self[n].highest_score_compound.name
450                            if self[n].highest_score_compound
451                            else None
452                        )
453                        for n in ind["ind"]
454                    ]
455                ),
456            )
457            annot.set_text(text)
458
459        def hover(event):
460            vis = annot.get_visible()
461            if event.inaxes == ax:
462                cont, ind = sc.contains(event)
463                if cont:
464                    update_annot(ind)
465                    annot.set_visible(True)
466                    fig.canvas.draw_idle()
467                else:
468                    if vis:
469                        annot.set_visible(False)
470                        fig.canvas.draw_idle()
471
472        fig.canvas.mpl_connect("motion_notify_event", hover)
473
474        return ax

Plot the GC peaks.

This method plots the GC peaks.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the GC peaks. Defaults to None.
  • color (str, optional): Color of the GC peaks. Defaults to 'red'.
def to_excel( self, out_file_path, write_mode='ab', write_metadata=True, id_label='corems:'):
476    def to_excel(
477        self, out_file_path, write_mode="ab", write_metadata=True, id_label="corems:"
478    ):
479        """Export the GC-MS data to an Excel file.
480
481        This method exports the GC-MS data to an Excel file.
482
483        Parameters
484        ----------
485        out_file_path : str, pathlib.Path, or s3path.S3Path
486            Path object containing the file location.
487        write_mode : str, optional
488            Write mode. Defaults to 'ab'.
489        write_metadata : bool, optional
490            If True, write the metadata. Defaults to True.
491        id_label : str, optional
492            Label of the ID. Defaults to 'corems:'.
493
494        """
495
496        if isinstance(out_file_path, str):
497            out_file_path = Path(out_file_path)
498
499        exportMS = LowResGCMSExport(out_file_path, self)
500        exportMS.to_excel(
501            id_label=id_label, write_mode=write_mode, write_metadata=write_metadata
502        )
503
504        return out_file_path.with_suffix(".xlsx")

Export the GC-MS data to an Excel file.

This method exports the GC-MS data to an Excel file.

Parameters
  • out_file_path (str, pathlib.Path, or s3path.S3Path): Path object containing the file location.
  • write_mode (str, optional): Write mode. Defaults to 'ab'.
  • write_metadata (bool, optional): If True, write the metadata. Defaults to True.
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def to_csv( self, out_file_path, separate_output=False, write_metadata=True, id_label='corems:'):
506    def to_csv(
507        self,
508        out_file_path,
509        separate_output=False,
510        write_metadata=True,
511        id_label="corems:",
512    ):
513        """Export the GC-MS data to a CSV file.
514
515        Parameters
516        ----------
517        out_file_path : str, pathlib.Path, or s3path.S3Path
518            Path object containing the file location.
519        separate_output : bool, optional
520            If True, separate the output. Defaults to False.
521        write_metadata : bool, optional
522            If True, write the metadata. Defaults to True.
523
524        """
525
526        if isinstance(out_file_path, str):
527            out_file_path = Path(out_file_path)
528
529        exportMS = LowResGCMSExport(out_file_path, self)
530        exportMS.to_csv(
531            id_label=id_label,
532            separate_output=separate_output,
533            write_metadata=write_metadata,
534        )
535
536        return out_file_path.with_suffix(".csv")

Export the GC-MS data to a CSV file.

Parameters
  • out_file_path (str, pathlib.Path, or s3path.S3Path): Path object containing the file location.
  • separate_output (bool, optional): If True, separate the output. Defaults to False.
  • write_metadata (bool, optional): If True, write the metadata. Defaults to True.
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def to_pandas(self, out_file_path, write_metadata=True, id_label='corems:'):
538    def to_pandas(self, out_file_path, write_metadata=True, id_label="corems:"):
539        """Export the GC-MS data to a Pandas dataframe.
540
541        Parameters
542        ----------
543        out_file_path : str, pathlib.Path, or s3path.S3Path
544            Path object containing the file location.
545        write_metadata : bool, optional
546            If True, write the metadata. Defaults to True.
547        id_label : str, optional
548            Label of the ID. Defaults to 'corems:'.
549
550        """
551
552        if isinstance(out_file_path, str):
553            out_file_path = Path(out_file_path)
554        # pickle dataframe (pkl extension)
555        exportMS = LowResGCMSExport(out_file_path, self)
556        exportMS.to_pandas(id_label=id_label, write_metadata=write_metadata)
557
558        return out_file_path.with_suffix(".pkl")

Export the GC-MS data to a pickled Pandas dataframe (.pkl file).

Parameters
  • out_file_path (str, pathlib.Path, or s3path.S3Path): Path object containing the file location.
  • write_metadata (bool, optional): If True, write the metadata. Defaults to True.
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def to_dataframe(self, id_label='corems:'):
560    def to_dataframe(self, id_label="corems:"):
561        """Export the GC-MS data to a Pandas dataframe.
562
563        Parameters
564        ----------
565        id_label : str, optional
566            Label of the ID. Defaults to 'corems:'.
567
568        """
569
570        # returns pandas dataframe
571        exportMS = LowResGCMSExport(self.sample_name, self)
572        return exportMS.get_pandas_df(id_label=id_label)

Export the GC-MS data to a Pandas dataframe.

Parameters
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def processing_stats(self):
574    def processing_stats(self):
575        """Return the processing statistics."""
576
577        # returns json string
578        exportMS = LowResGCMSExport(self.sample_name, self)
579        return exportMS.get_data_stats(self)

Return the processing statistics.

def parameters_json(self, id_label='corems:', output_path=' '):
581    def parameters_json(self, id_label="corems:", output_path=" "):
582        """Return the parameters in JSON format.
583
584        Parameters
585        ----------
586        id_label : str, optional
587            Label of the ID. Defaults to 'corems:'.
588        output_path : str, optional
589            Path object containing the file location. Defaults to " ".
590        """
591
592        # returns json string
593        exportMS = LowResGCMSExport(self.sample_name, self)
594        return exportMS.get_parameters_json(self, id_label, output_path)

Return the parameters in JSON format.

Parameters
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
  • output_path (str, optional): Path object containing the file location. Defaults to " ".
def to_json(self, id_label='corems:'):
596    def to_json(self, id_label="corems:"):
597        """Export the GC-MS data to a JSON file.
598
599        Parameters
600        ----------
601        id_label : str, optional
602            Label of the ID. Defaults to 'corems:'.
603
604        """
605
606        # returns pandas dataframe
607        exportMS = LowResGCMSExport(self.sample_name, self)
608        return exportMS.get_json(id_label=id_label)

Export the GC-MS data to a JSON file.

Parameters
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def to_hdf(self, id_label='corems:'):
610    def to_hdf(self, id_label="corems:"):
611        """Export the GC-MS data to a HDF file.
612
613        Parameters
614        ----------
615        id_label : str, optional
616            Label of the ID. Defaults to 'corems:'.
617
618        """
619
620        # returns pandas dataframe
621        exportMS = LowResGCMSExport(self.sample_name, self)
622        return exportMS.to_hdf(id_label=id_label)

Export the GC-MS data to an HDF file.

Parameters
  • id_label (str, optional): Label of the ID. Defaults to 'corems:'.
def plot_chromatogram(self, ax=None, color='blue'):
624    def plot_chromatogram(self, ax=None, color="blue"):  # pragma: no cover
625        """Plot the chromatogram.
626
627        Parameters
628        ----------
629        ax : matplotlib.axes.Axes, optional
630            Axes object to plot the chromatogram. Defaults to None.
631        color : str, optional
632            Color of the chromatogram. Defaults to 'blue'.
633
634        """
635
636        import matplotlib.pyplot as plt
637
638        if ax is None:
639            ax = plt.gca()
640
641        ax.plot(self.retention_time, self.tic, color=color)
642        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
643
644        return ax

Plot the chromatogram.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the chromatogram. Defaults to None.
  • color (str, optional): Color of the chromatogram. Defaults to 'blue'.
def plot_smoothed_chromatogram(self, ax=None, color='green'):
646    def plot_smoothed_chromatogram(self, ax=None, color="green"):  # pragma: no cover
647        """Plot the smoothed chromatogram.
648
649        Parameters
650        ----------
651        ax : matplotlib.axes.Axes, optional
652            Axes object to plot the smoothed chromatogram. Defaults to None.
653        color : str, optional
654            Color of the smoothed chromatogram. Defaults to 'green'.
655
656        """
657
658        import matplotlib.pyplot as plt
659
660        if ax is None:
661            ax = plt.gca()
662
663        ax.plot(self.retention_time, self.smooth_tic(self.tic), color=color)
664
665        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
666
667        return ax

Plot the smoothed chromatogram.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the smoothed chromatogram. Defaults to None.
  • color (str, optional): Color of the smoothed chromatogram. Defaults to 'green'.
def plot_detected_baseline(self, ax=None, color='blue'):
669    def plot_detected_baseline(self, ax=None, color="blue"):  # pragma: no cover
670        """Plot the detected baseline.
671
672        Parameters
673        ----------
674        ax : matplotlib.axes.Axes, optional
675            Axes object to plot the detected baseline. Defaults to None.
676        color : str, optional
677            Color of the detected baseline. Defaults to 'blue'.
678
679        """
680
681        import matplotlib.pyplot as plt
682
683        if ax is None:
684            ax = plt.gca()
685
686        max_height = self.chromatogram_settings.peak_height_max_percent
687        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
688
689        baseline = sp.baseline_detector(
690            self.tic, self.retention_time, max_height, max_prominence
691        )
692        ax.plot(self.retention_time, color=color)
693        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
694
695        return ax

Plot the detected baseline.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the detected baseline. Defaults to None.
  • color (str, optional): Color of the detected baseline. Defaults to 'blue'.
def plot_baseline_subtraction(self, ax=None, color='black'):
697    def plot_baseline_subtraction(self, ax=None, color="black"):  # pragma: no cover
698        """Plot the baseline subtraction.
699
700        Parameters
701        ----------
702        ax : matplotlib.axes.Axes, optional
703            Axes object to plot the baseline subtraction. Defaults to None.
704        color : str, optional
705            Color of the baseline subtraction. Defaults to 'black'.
706
707        """
708
709        import matplotlib.pyplot as plt
710
711        if ax is None:
712            ax = plt.gca()
713
714        max_height = self.chromatogram_settings.peak_height_max_percent
715
716        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
717
718        x = self.tic + sp.baseline_detector(
719            self.tic, self.retention_time, max_height, max_prominence
720        )
721
722        ax.plot(self.retention_time, x, color=color)
723
724        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
725
726        return ax

Plot the baseline subtraction.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the baseline subtraction. Defaults to None.
  • color (str, optional): Color of the baseline subtraction. Defaults to 'black'.
def peaks_rt_tic(self, json_string=False):
728    def peaks_rt_tic(self, json_string=False):
729        """Return the peaks, retention time, and total ion chromatogram.
730
731        Parameters
732        ----------
733        json_string : bool, optional
734            If True, return the peaks, retention time, and total ion chromatogram in JSON format. Defaults to False.
735
736        """
737
738        peaks_list = dict()
739
740        all_candidates_data = {}
741
742        all_peaks_data = {}
743
744        for gcms_peak in self.sorted_gcpeaks:
745            dict_data = {
746                "rt": gcms_peak.rt_list,
747                "tic": gcms_peak.tic_list,
748                "mz": gcms_peak.mass_spectrum.mz_exp.tolist(),
749                "abundance": gcms_peak.mass_spectrum.abundance.tolist(),
750                "candidate_names": gcms_peak.compound_names,
751            }
752
753            peaks_list[gcms_peak.retention_time] = dict_data
754
755            for compound in gcms_peak:
756                if compound.name not in all_candidates_data.keys():
757                    mz = array(compound.mz).tolist()
758                    abundance = array(compound.abundance).tolist()
759                    data = {"mz": mz, "abundance": abundance}
760                    all_candidates_data[compound.name] = data
761
762        all_peaks_data["peak_data"] = peaks_list
763        all_peaks_data["ref_data"] = all_candidates_data
764
765        if json_string:
766            return json.dumps(all_peaks_data)
767
768        else:
769            return all_peaks_data

Return the peaks, retention time, and total ion chromatogram.

Parameters
  • json_string (bool, optional): If True, return the peaks, retention time, and total ion chromatogram in JSON format. Defaults to False.
def plot_processed_chromatogram(self, ax=None, color='black'):
771    def plot_processed_chromatogram(self, ax=None, color="black"):
772        """Plot the processed chromatogram.
773
774        Parameters
775        ----------
776        ax : matplotlib.axes.Axes, optional
777            Axes object to plot the processed chromatogram. Defaults to None.
778        color : str, optional
779            Color of the processed chromatogram. Defaults to 'black'.
780
781        """
782
783        import matplotlib.pyplot as plt
784
785        if ax is None:
786            ax = plt.gca()
787
788        ax.plot(self.retention_time, self.processed_tic, color=color)
789
790        ax.set(xlabel="Retention Time (s)", ylabel="Total Ion Chromatogram")
791
792        return ax

Plot the processed chromatogram.

Parameters
  • ax (matplotlib.axes.Axes, optional): Axes object to plot the processed chromatogram. Defaults to None.
  • color (str, optional): Color of the processed chromatogram. Defaults to 'black'.