corems.mass_spectrum.calc.AutoRecalibration

Created on March 23 2023

@author: Will Kew

Modules for automatic mass internal recalibration

  1# -*- coding: utf-8 -*-
  2"""
  3Created on March 23 2023
  4
  5@author: Will Kew
  6
  7Modules for automatic mass internal recalibration
  8"""
  9
 10from lmfit.models import GaussianModel
 11import seaborn as sns
 12import pandas as pd
 13import numpy as np
 14import matplotlib.pyplot as plt
 15from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas
 16import copy
 17
 18
 19class HighResRecalibration:
 20    """
 21    This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter
 22
 23    The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI)
 24    And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
 25    This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer
 26    It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)
 27
 28    Parameters
 29    ----------
 30    mass_spectrum : MassSpectrum
 31        CoreMS mass spectrum object
 32    plot : bool, optional
 33        Whether to plot the error distribution. The default is False.
 34    docker : bool, optional
 35        Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
 36    ppmFWHMprior : float, optional
 37        The FWHM of the prior distribution (ppm). The default is 3.
 38    ppmRangeprior : float, optional
 39        The range of the prior distribution (ppm). The default is 15.
 40
 41    Methods
 42    --------
 43    * determine_error_boundaries(). Determine the error boundaries for recalibration space.
 44
 45    Notes
 46    -----
 47    This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
 48    Possible future task is to make the base class copyable.
 49
 50    """
 51
 52    def __init__(
 53        self,
 54        mass_spectrum,
 55        plot: bool = False,
 56        docker: bool = True,
 57        ppmFWHMprior: float = 3,
 58        ppmRangeprior: float = 15,
 59    ):
 60        self.mass_spectrum = copy.deepcopy(mass_spectrum)
 61        self.plot = plot
 62        self.docker = docker
 63        self.ppmFWHMprior = ppmFWHMprior
 64        self.ppmRangeprior = ppmRangeprior
 65
 66    def set_uncal_settings(self):
 67        """Set uncalibrated formula search settings
 68
 69        This function serves the uncalibrated data (hence broad error tolerance)
 70        It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
 71
 72        This will not work for positive mode data, or for other ion types, or other expected elemental searches.
 73
 74        """
 75        # TODO rework this.
 76
 77        if self.docker:
 78            self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
 79        else:
 80            self.mass_spectrum.molecular_search_settings.url_database = None
 81        self.mass_spectrum.molecular_search_settings.error_method = None
 82        self.mass_spectrum.molecular_search_settings.score_method = "prob_score"
 83
 84        self.mass_spectrum.molecular_search_settings.min_ppm_error = (
 85            -1 * self.ppmRangeprior / 2
 86        )  # -7.5
 87        self.mass_spectrum.molecular_search_settings.max_ppm_error = (
 88            self.ppmRangeprior / 2
 89        )  # 7.5
 90
 91        self.mass_spectrum.molecular_search_settings.min_dbe = 0
 92        self.mass_spectrum.molecular_search_settings.max_dbe = 50
 93
 94        self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False
 95        self.mass_spectrum.molecular_search_settings.min_abun_error = -30
 96        self.mass_spectrum.molecular_search_settings.max_abun_error = 70
 97
 98        self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True
 99        self.mass_spectrum.molecular_search_settings.min_peaks_per_class = (
100            10  # default is 15
101        )
102
103        self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90)
104        self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200)
105        self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23)
106        self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0)
107        self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0)
108        self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0)
109
110        self.mass_spectrum.molecular_search_settings.isProtonated = True
111        self.mass_spectrum.molecular_search_settings.isRadical = False
112        self.mass_spectrum.molecular_search_settings.isAdduct = False
113
114    def positive_search_settings(self):
115        """Set the positive mode elemental search settings"""
116        self.mass_spectrum.molecular_search_settings.isProtonated = False
117        self.mass_spectrum.molecular_search_settings.isAdduct = True
118        self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"]
119
120    @staticmethod
121    def get_error_range(
122        errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False
123    ):
124        """Get the error range from the error distribution
125
126        Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
127
128        Parameters
129        ----------
130        errors : list
131            list of the errors of the assigned species (ppm)
132        ppmFWHMprior : float, optional
133            The FWHM of the prior distribution (ppm). The default is 3.
134        plot_logic : bool, optional
135            Whether to plot the error distribution. The default is False.
136
137        Returns
138        -------
139        mean_error : float
140            mean mass error of the Gaussian distribution (ppm)
141        fwhm_error : float
142            full width half max of the gaussian error distribution (ppm)
143        ppm_thresh : list
144            recommended thresholds for the recalibration parameters (ppm)
145            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
146
147        """
148        kde = sns.kdeplot(errors)
149
150        kde_data = kde.get_lines()[0].get_data()
151
152        tmpdf = pd.Series(index=kde_data[0], data=kde_data[1])
153        kde_apex_ppm = tmpdf.idxmax()
154        kde_apex_val = tmpdf.max()
155
156        plt.close(kde.figure)
157        plt.close("all")
158
159        lmmodel = GaussianModel()
160        lmpars = lmmodel.guess(kde_data[1], x=kde_data[0])
161        lmpars["sigma"].value = 2.3548 / ppmFWHMprior
162        lmpars["center"].value = kde_apex_ppm
163        lmpars["amplitude"].value = kde_apex_val
164        lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0])
165
166        if plot_logic:
167            fig, ax = plt.subplots(figsize=(8, 4))
168            lmout.plot_fit(
169                ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"}
170            )
171            ax.set_xlabel("$m/z$ Error (ppm)")
172            ax.set_ylabel("Density")
173            plt.legend(facecolor="white", framealpha=0)
174
175        mean_error = lmout.best_values["center"]
176        std_error = lmout.best_values["sigma"]
177        # FWHM from Sigma = approx. 2.355*sigma
178        # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
179        fwhm_error = std_error * np.sqrt(8 * np.log(2))
180
181        ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error]
182        return mean_error, fwhm_error, ppm_thresh
183
184    def determine_error_boundaries(self):
185        """Determine the error boundaries for recalibration space
186
187        This is the main function in this class
188        Sets the Molecular Formulas search settings, performs the initial formula search
189        Converts the data to a dataframe, and gets the error range
190        Returns the error thresholds.
191
192        Returns
193        -------
194        mean_error : float
195            mean mass error of the Gaussian distribution (ppm)
196        fwhm_error : float
197            full width half max of the gaussian error distribution (ppm)
198        ppm_thresh : list
199            recommended thresholds for the recalibration parameters (ppm)
200            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
201        """
202
203        # Set the search settings
204        self.set_uncal_settings()
205
206        # Set the positive mode settings
207        # To do - have user defineable settings?
208        if self.mass_spectrum.polarity == 1:
209            self.positive_search_settings()
210
211        # Search MFs
212        SearchMolecularFormulas(
213            self.mass_spectrum, first_hit=True
214        ).run_worker_mass_spectrum()
215
216        # Exporting to a DF is ~30x slower than just getting the errors, so this is fast.
217        errors = []
218        for mspeak in self.mass_spectrum.mspeaks:
219            if len(mspeak.molecular_formulas) > 0:
220                errors.append(mspeak.best_molecular_formula_candidate.mz_error)
221
222        # If there are NO assignments, it'll fail on the next step. Need to check for that
223        nassign = len(errors)
224        # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code
225        # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments
226        if nassign < 5:
227            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
228                print("fewer than 5 peaks assigned, cannot determine error range")
229            return np.nan, np.nan, [np.nan, np.nan]
230        else:
231            mean_error, fwhm_error, ppm_thresh = self.get_error_range(
232                errors, self.ppmFWHMprior, self.plot
233            )
234            return mean_error, fwhm_error, ppm_thresh
class HighResRecalibration:
 20class HighResRecalibration:
 21    """
 22    This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter
 23
 24    The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI)
 25    And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
 26    This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer
 27    It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)
 28
 29    Parameters
 30    ----------
 31    mass_spectrum : MassSpectrum
 32        CoreMS mass spectrum object
 33    plot : bool, optional
 34        Whether to plot the error distribution. The default is False.
 35    docker : bool, optional
 36        Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
 37    ppmFWHMprior : float, optional
 38        The FWHM of the prior distribution (ppm). The default is 3.
 39    ppmRangeprior : float, optional
 40        The range of the prior distribution (ppm). The default is 15.
 41
 42    Methods
 43    --------
 44    * determine_error_boundaries(). Determine the error boundaries for recalibration space.
 45
 46    Notes
 47    -----
 48    This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
 49    Possible future task is to make the base class copyable.
 50
 51    """
 52
 53    def __init__(
 54        self,
 55        mass_spectrum,
 56        plot: bool = False,
 57        docker: bool = True,
 58        ppmFWHMprior: float = 3,
 59        ppmRangeprior: float = 15,
 60    ):
 61        self.mass_spectrum = copy.deepcopy(mass_spectrum)
 62        self.plot = plot
 63        self.docker = docker
 64        self.ppmFWHMprior = ppmFWHMprior
 65        self.ppmRangeprior = ppmRangeprior
 66
 67    def set_uncal_settings(self):
 68        """Set uncalibrated formula search settings
 69
 70        This function serves the uncalibrated data (hence broad error tolerance)
 71        It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
 72
 73        This will not work for positive mode data, or for other ion types, or other expected elemental searches.
 74
 75        """
 76        # TODO rework this.
 77
 78        if self.docker:
 79            self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
 80        else:
 81            self.mass_spectrum.molecular_search_settings.url_database = None
 82        self.mass_spectrum.molecular_search_settings.error_method = None
 83        self.mass_spectrum.molecular_search_settings.score_method = "prob_score"
 84
 85        self.mass_spectrum.molecular_search_settings.min_ppm_error = (
 86            -1 * self.ppmRangeprior / 2
 87        )  # -7.5
 88        self.mass_spectrum.molecular_search_settings.max_ppm_error = (
 89            self.ppmRangeprior / 2
 90        )  # 7.5
 91
 92        self.mass_spectrum.molecular_search_settings.min_dbe = 0
 93        self.mass_spectrum.molecular_search_settings.max_dbe = 50
 94
 95        self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False
 96        self.mass_spectrum.molecular_search_settings.min_abun_error = -30
 97        self.mass_spectrum.molecular_search_settings.max_abun_error = 70
 98
 99        self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True
100        self.mass_spectrum.molecular_search_settings.min_peaks_per_class = (
101            10  # default is 15
102        )
103
104        self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90)
105        self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200)
106        self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23)
107        self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0)
108        self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0)
109        self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0)
110
111        self.mass_spectrum.molecular_search_settings.isProtonated = True
112        self.mass_spectrum.molecular_search_settings.isRadical = False
113        self.mass_spectrum.molecular_search_settings.isAdduct = False
114
115    def positive_search_settings(self):
116        """Set the positive mode elemental search settings"""
117        self.mass_spectrum.molecular_search_settings.isProtonated = False
118        self.mass_spectrum.molecular_search_settings.isAdduct = True
119        self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"]
120
121    @staticmethod
122    def get_error_range(
123        errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False
124    ):
125        """Get the error range from the error distribution
126
127        Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
128
129        Parameters
130        ----------
131        errors : list
132            list of the errors of the assigned species (ppm)
133        ppmFWHMprior : float, optional
134            The FWHM of the prior distribution (ppm). The default is 3.
135        plot_logic : bool, optional
136            Whether to plot the error distribution. The default is False.
137
138        Returns
139        -------
140        mean_error : float
141            mean mass error of the Gaussian distribution (ppm)
142        fwhm_error : float
143            full width half max of the gaussian error distribution (ppm)
144        ppm_thresh : list
145            recommended thresholds for the recalibration parameters (ppm)
146            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
147
148        """
149        kde = sns.kdeplot(errors)
150
151        kde_data = kde.get_lines()[0].get_data()
152
153        tmpdf = pd.Series(index=kde_data[0], data=kde_data[1])
154        kde_apex_ppm = tmpdf.idxmax()
155        kde_apex_val = tmpdf.max()
156
157        plt.close(kde.figure)
158        plt.close("all")
159
160        lmmodel = GaussianModel()
161        lmpars = lmmodel.guess(kde_data[1], x=kde_data[0])
162        lmpars["sigma"].value = 2.3548 / ppmFWHMprior
163        lmpars["center"].value = kde_apex_ppm
164        lmpars["amplitude"].value = kde_apex_val
165        lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0])
166
167        if plot_logic:
168            fig, ax = plt.subplots(figsize=(8, 4))
169            lmout.plot_fit(
170                ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"}
171            )
172            ax.set_xlabel("$m/z$ Error (ppm)")
173            ax.set_ylabel("Density")
174            plt.legend(facecolor="white", framealpha=0)
175
176        mean_error = lmout.best_values["center"]
177        std_error = lmout.best_values["sigma"]
178        # FWHM from Sigma = approx. 2.355*sigma
179        # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
180        fwhm_error = std_error * np.sqrt(8 * np.log(2))
181
182        ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error]
183        return mean_error, fwhm_error, ppm_thresh
184
185    def determine_error_boundaries(self):
186        """Determine the error boundaries for recalibration space
187
188        This is the main function in this class
189        Sets the Molecular Formulas search settings, performs the initial formula search
190        Converts the data to a dataframe, and gets the error range
191        Returns the error thresholds.
192
193        Returns
194        -------
195        mean_error : float
196            mean mass error of the Gaussian distribution (ppm)
197        fwhm_error : float
198            full width half max of the gaussian error distribution (ppm)
199        ppm_thresh : list
200            recommended thresholds for the recalibration parameters (ppm)
201            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
202        """
203
204        # Set the search settings
205        self.set_uncal_settings()
206
207        # Set the positive mode settings
208        # To do - have user defineable settings?
209        if self.mass_spectrum.polarity == 1:
210            self.positive_search_settings()
211
212        # Search MFs
213        SearchMolecularFormulas(
214            self.mass_spectrum, first_hit=True
215        ).run_worker_mass_spectrum()
216
217        # Exporting to a DF is ~30x slower than just getting the errors, so this is fast.
218        errors = []
219        for mspeak in self.mass_spectrum.mspeaks:
220            if len(mspeak.molecular_formulas) > 0:
221                errors.append(mspeak.best_molecular_formula_candidate.mz_error)
222
223        # If there are NO assignments, it'll fail on the next step. Need to check for that
224        nassign = len(errors)
225        # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code
226        # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments
227        if nassign < 5:
228            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
229                print("fewer than 5 peaks assigned, cannot determine error range")
230            return np.nan, np.nan, [np.nan, np.nan]
231        else:
232            mean_error, fwhm_error, ppm_thresh = self.get_error_range(
233                errors, self.ppmFWHMprior, self.plot
234            )
235            return mean_error, fwhm_error, ppm_thresh

This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter

The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI) And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range. This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)

Parameters
  • mass_spectrum (MassSpectrum): CoreMS mass spectrum object
  • plot (bool, optional): Whether to plot the error distribution. The default is False.
  • docker (bool, optional): Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
  • ppmFWHMprior (float, optional): The FWHM of the prior distribution (ppm). The default is 3.
  • ppmRangeprior (float, optional): The range of the prior distribution (ppm). The default is 15.
Methods
  • determine_error_boundaries(). Determine the error boundaries for recalibration space.
Notes

This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments. Possible future task is to make the base class copyable.

HighResRecalibration( mass_spectrum, plot: bool = False, docker: bool = True, ppmFWHMprior: float = 3, ppmRangeprior: float = 15)
53    def __init__(
54        self,
55        mass_spectrum,
56        plot: bool = False,
57        docker: bool = True,
58        ppmFWHMprior: float = 3,
59        ppmRangeprior: float = 15,
60    ):
61        self.mass_spectrum = copy.deepcopy(mass_spectrum)
62        self.plot = plot
63        self.docker = docker
64        self.ppmFWHMprior = ppmFWHMprior
65        self.ppmRangeprior = ppmRangeprior
mass_spectrum
plot
docker
ppmFWHMprior
ppmRangeprior
def set_uncal_settings(self):
 67    def set_uncal_settings(self):
 68        """Set uncalibrated formula search settings
 69
 70        This function serves the uncalibrated data (hence broad error tolerance)
 71        It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
 72
 73        This will not work for positive mode data, or for other ion types, or other expected elemental searches.
 74
 75        """
 76        # TODO rework this.
 77
 78        if self.docker:
 79            self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
 80        else:
 81            self.mass_spectrum.molecular_search_settings.url_database = None
 82        self.mass_spectrum.molecular_search_settings.error_method = None
 83        self.mass_spectrum.molecular_search_settings.score_method = "prob_score"
 84
 85        self.mass_spectrum.molecular_search_settings.min_ppm_error = (
 86            -1 * self.ppmRangeprior / 2
 87        )  # -7.5
 88        self.mass_spectrum.molecular_search_settings.max_ppm_error = (
 89            self.ppmRangeprior / 2
 90        )  # 7.5
 91
 92        self.mass_spectrum.molecular_search_settings.min_dbe = 0
 93        self.mass_spectrum.molecular_search_settings.max_dbe = 50
 94
 95        self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False
 96        self.mass_spectrum.molecular_search_settings.min_abun_error = -30
 97        self.mass_spectrum.molecular_search_settings.max_abun_error = 70
 98
 99        self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True
100        self.mass_spectrum.molecular_search_settings.min_peaks_per_class = (
101            10  # default is 15
102        )
103
104        self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90)
105        self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200)
106        self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23)
107        self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0)
108        self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0)
109        self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0)
110
111        self.mass_spectrum.molecular_search_settings.isProtonated = True
112        self.mass_spectrum.molecular_search_settings.isRadical = False
113        self.mass_spectrum.molecular_search_settings.isAdduct = False

Set uncalibrated formula search settings

This function serves the uncalibrated data (hence broad error tolerance) It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode

This will not work for positive mode data, or for other ion types, or other expected elemental searches.

def positive_search_settings(self):
115    def positive_search_settings(self):
116        """Set the positive mode elemental search settings"""
117        self.mass_spectrum.molecular_search_settings.isProtonated = False
118        self.mass_spectrum.molecular_search_settings.isAdduct = True
119        self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"]

Set the positive mode elemental search settings

@staticmethod
def get_error_range(errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False):
121    @staticmethod
122    def get_error_range(
123        errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False
124    ):
125        """Get the error range from the error distribution
126
127        Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
128
129        Parameters
130        ----------
131        errors : list
132            list of the errors of the assigned species (ppm)
133        ppmFWHMprior : float, optional
134            The FWHM of the prior distribution (ppm). The default is 3.
135        plot_logic : bool, optional
136            Whether to plot the error distribution. The default is False.
137
138        Returns
139        -------
140        mean_error : float
141            mean mass error of the Gaussian distribution (ppm)
142        fwhm_error : float
143            full width half max of the gaussian error distribution (ppm)
144        ppm_thresh : list
145            recommended thresholds for the recalibration parameters (ppm)
146            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
147
148        """
149        kde = sns.kdeplot(errors)
150
151        kde_data = kde.get_lines()[0].get_data()
152
153        tmpdf = pd.Series(index=kde_data[0], data=kde_data[1])
154        kde_apex_ppm = tmpdf.idxmax()
155        kde_apex_val = tmpdf.max()
156
157        plt.close(kde.figure)
158        plt.close("all")
159
160        lmmodel = GaussianModel()
161        lmpars = lmmodel.guess(kde_data[1], x=kde_data[0])
162        lmpars["sigma"].value = 2.3548 / ppmFWHMprior
163        lmpars["center"].value = kde_apex_ppm
164        lmpars["amplitude"].value = kde_apex_val
165        lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0])
166
167        if plot_logic:
168            fig, ax = plt.subplots(figsize=(8, 4))
169            lmout.plot_fit(
170                ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"}
171            )
172            ax.set_xlabel("$m/z$ Error (ppm)")
173            ax.set_ylabel("Density")
174            plt.legend(facecolor="white", framealpha=0)
175
176        mean_error = lmout.best_values["center"]
177        std_error = lmout.best_values["sigma"]
178        # FWHM from Sigma = approx. 2.355*sigma
179        # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
180        fwhm_error = std_error * np.sqrt(8 * np.log(2))
181
182        ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error]
183        return mean_error, fwhm_error, ppm_thresh

Get the error range from the error distribution

Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.

Parameters
  • errors (list): list of the errors of the assigned species (ppm)
  • ppmFWHMprior (float, optional): The FWHM of the prior distribution (ppm). The default is 3.
  • plot_logic (bool, optional): Whether to plot the error distribution. The default is False.
Returns
  • mean_error (float): mean mass error of the Gaussian distribution (ppm)
  • fwhm_error (float): full width half max of the gaussian error distribution (ppm)
  • ppm_thresh (list): recommended thresholds for the recalibration parameters (ppm) Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
def determine_error_boundaries(self):
185    def determine_error_boundaries(self):
186        """Determine the error boundaries for recalibration space
187
188        This is the main function in this class
189        Sets the Molecular Formulas search settings, performs the initial formula search
190        Converts the data to a dataframe, and gets the error range
191        Returns the error thresholds.
192
193        Returns
194        -------
195        mean_error : float
196            mean mass error of the Gaussian distribution (ppm)
197        fwhm_error : float
198            full width half max of the gaussian error distribution (ppm)
199        ppm_thresh : list
200            recommended thresholds for the recalibration parameters (ppm)
201            Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
202        """
203
204        # Set the search settings
205        self.set_uncal_settings()
206
207        # Set the positive mode settings
208        # To do - have user defineable settings?
209        if self.mass_spectrum.polarity == 1:
210            self.positive_search_settings()
211
212        # Search MFs
213        SearchMolecularFormulas(
214            self.mass_spectrum, first_hit=True
215        ).run_worker_mass_spectrum()
216
217        # Exporting to a DF is ~30x slower than just getting the errors, so this is fast.
218        errors = []
219        for mspeak in self.mass_spectrum.mspeaks:
220            if len(mspeak.molecular_formulas) > 0:
221                errors.append(mspeak.best_molecular_formula_candidate.mz_error)
222
223        # If there are NO assignments, it'll fail on the next step. Need to check for that
224        nassign = len(errors)
225        # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code
226        # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments
227        if nassign < 5:
228            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
229                print("fewer than 5 peaks assigned, cannot determine error range")
230            return np.nan, np.nan, [np.nan, np.nan]
231        else:
232            mean_error, fwhm_error, ppm_thresh = self.get_error_range(
233                errors, self.ppmFWHMprior, self.plot
234            )
235            return mean_error, fwhm_error, ppm_thresh

Determine the error boundaries for recalibration space

This is the main function in this class Sets the Molecular Formulas search settings, performs the initial formula search Converts the data to a dataframe, and gets the error range Returns the error thresholds.

Returns
  • mean_error (float): mean mass error of the Gaussian distribution (ppm)
  • fwhm_error (float): full width half max of the gaussian error distribution (ppm)
  • ppm_thresh (list): recommended thresholds for the recalibration parameters (ppm) Consists of [mean_error-fwhm_error,mean_error+fwhm_error]