corems.mass_spectrum.calc.AutoRecalibration
Created on March 23 2023
@author: Will Kew
Modules for automatic mass internal recalibration
1# -*- coding: utf-8 -*- 2""" 3Created on March 23 2023 4 5@author: Will Kew 6 7Modules for automatic mass internal recalibration 8""" 9 10from lmfit.models import GaussianModel 11import seaborn as sns 12import pandas as pd 13import numpy as np 14import matplotlib.pyplot as plt 15from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas 16import copy 17 18 19class HighResRecalibration: 20 """ 21 This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter 22 23 The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI) 24 And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range. 25 This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer 26 It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm) 27 28 Parameters 29 ---------- 30 mass_spectrum : MassSpectrum 31 CoreMS mass spectrum object 32 plot : bool, optional 33 Whether to plot the error distribution. The default is False. 34 docker : bool, optional 35 Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database. 36 ppmFWHMprior : float, optional 37 The FWHM of the prior distribution (ppm). The default is 3. 38 ppmRangeprior : float, optional 39 The range of the prior distribution (ppm). The default is 15. 40 41 Methods 42 -------- 43 * determine_error_boundaries(). Determine the error boundaries for recalibration space. 44 45 Notes 46 ----- 47 This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments. 48 Possible future task is to make the base class copyable. 49 50 """ 51 52 def __init__( 53 self, 54 mass_spectrum, 55 plot: bool = False, 56 docker: bool = True, 57 ppmFWHMprior: float = 3, 58 ppmRangeprior: float = 15, 59 ): 60 self.mass_spectrum = copy.deepcopy(mass_spectrum) 61 self.plot = plot 62 self.docker = docker 63 self.ppmFWHMprior = ppmFWHMprior 64 self.ppmRangeprior = ppmRangeprior 65 66 def set_uncal_settings(self): 67 """Set uncalibrated formula search settings 68 69 This function serves the uncalibrated data (hence broad error tolerance) 70 It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode 71 72 This will not work for positive mode data, or for other ion types, or other expected elemental searches. 73 74 """ 75 # TODO rework this. 76 77 if self.docker: 78 self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp" 79 else: 80 self.mass_spectrum.molecular_search_settings.url_database = None 81 self.mass_spectrum.molecular_search_settings.error_method = None 82 self.mass_spectrum.molecular_search_settings.score_method = "prob_score" 83 84 self.mass_spectrum.molecular_search_settings.min_ppm_error = ( 85 -1 * self.ppmRangeprior / 2 86 ) # -7.5 87 self.mass_spectrum.molecular_search_settings.max_ppm_error = ( 88 self.ppmRangeprior / 2 89 ) # 7.5 90 91 self.mass_spectrum.molecular_search_settings.min_dbe = 0 92 self.mass_spectrum.molecular_search_settings.max_dbe = 50 93 94 self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False 95 self.mass_spectrum.molecular_search_settings.min_abun_error = -30 96 self.mass_spectrum.molecular_search_settings.max_abun_error = 70 97 98 self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True 99 self.mass_spectrum.molecular_search_settings.min_peaks_per_class = ( 100 10 # default is 15 101 ) 102 103 self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90) 104 self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200) 105 self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23) 106 self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0) 107 self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0) 108 self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0) 109 110 self.mass_spectrum.molecular_search_settings.isProtonated = True 111 self.mass_spectrum.molecular_search_settings.isRadical = False 112 self.mass_spectrum.molecular_search_settings.isAdduct = False 113 114 def positive_search_settings(self): 115 """Set the positive mode elemental search settings""" 116 self.mass_spectrum.molecular_search_settings.isProtonated = False 117 self.mass_spectrum.molecular_search_settings.isAdduct = True 118 self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"] 119 120 @staticmethod 121 def get_error_range( 122 errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False 123 ): 124 """Get the error range from the error distribution 125 126 Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species. 127 128 Parameters 129 ---------- 130 errors : list 131 list of the errors of the assigned species (ppm) 132 ppmFWHMprior : float, optional 133 The FWHM of the prior distribution (ppm). The default is 3. 134 plot_logic : bool, optional 135 Whether to plot the error distribution. The default is False. 136 137 Returns 138 ------- 139 mean_error : float 140 mean mass error of the Gaussian distribution (ppm) 141 fwhm_error : float 142 full width half max of the gaussian error distribution (ppm) 143 ppm_thresh : list 144 recommended thresholds for the recalibration parameters (ppm) 145 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 146 147 """ 148 kde = sns.kdeplot(errors) 149 150 kde_data = kde.get_lines()[0].get_data() 151 152 tmpdf = pd.Series(index=kde_data[0], data=kde_data[1]) 153 kde_apex_ppm = tmpdf.idxmax() 154 kde_apex_val = tmpdf.max() 155 156 plt.close(kde.figure) 157 plt.close("all") 158 159 lmmodel = GaussianModel() 160 lmpars = lmmodel.guess(kde_data[1], x=kde_data[0]) 161 lmpars["sigma"].value = 2.3548 / ppmFWHMprior 162 lmpars["center"].value = kde_apex_ppm 163 lmpars["amplitude"].value = kde_apex_val 164 lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0]) 165 166 if plot_logic: 167 fig, ax = plt.subplots(figsize=(8, 4)) 168 lmout.plot_fit( 169 ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"} 170 ) 171 ax.set_xlabel("$m/z$ Error (ppm)") 172 ax.set_ylabel("Density") 173 plt.legend(facecolor="white", framealpha=0) 174 175 mean_error = lmout.best_values["center"] 176 std_error = lmout.best_values["sigma"] 177 # FWHM from Sigma = approx. 2.355*sigma 178 # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error 179 fwhm_error = std_error * np.sqrt(8 * np.log(2)) 180 181 ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error] 182 return mean_error, fwhm_error, ppm_thresh 183 184 def determine_error_boundaries(self): 185 """Determine the error boundaries for recalibration space 186 187 This is the main function in this class 188 Sets the Molecular Formulas search settings, performs the initial formula search 189 Converts the data to a dataframe, and gets the error range 190 Returns the error thresholds. 191 192 Returns 193 ------- 194 mean_error : float 195 mean mass error of the Gaussian distribution (ppm) 196 fwhm_error : float 197 full width half max of the gaussian error distribution (ppm) 198 ppm_thresh : list 199 recommended thresholds for the recalibration parameters (ppm) 200 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 201 """ 202 203 # Set the search settings 204 self.set_uncal_settings() 205 206 # Set the positive mode settings 207 # To do - have user defineable settings? 208 if self.mass_spectrum.polarity == 1: 209 self.positive_search_settings() 210 211 # Search MFs 212 SearchMolecularFormulas( 213 self.mass_spectrum, first_hit=True 214 ).run_worker_mass_spectrum() 215 216 # Exporting to a DF is ~30x slower than just getting the errors, so this is fast. 217 errors = [] 218 for mspeak in self.mass_spectrum.mspeaks: 219 if len(mspeak.molecular_formulas) > 0: 220 errors.append(mspeak.best_molecular_formula_candidate.mz_error) 221 222 # If there are NO assignments, it'll fail on the next step. Need to check for that 223 nassign = len(errors) 224 # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code 225 # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments 226 if nassign < 5: 227 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 228 print("fewer than 5 peaks assigned, cannot determine error range") 229 return np.nan, np.nan, [np.nan, np.nan] 230 else: 231 mean_error, fwhm_error, ppm_thresh = self.get_error_range( 232 errors, self.ppmFWHMprior, self.plot 233 ) 234 return mean_error, fwhm_error, ppm_thresh
20class HighResRecalibration: 21 """ 22 This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter 23 24 The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI) 25 And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range. 26 This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer 27 It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm) 28 29 Parameters 30 ---------- 31 mass_spectrum : MassSpectrum 32 CoreMS mass spectrum object 33 plot : bool, optional 34 Whether to plot the error distribution. The default is False. 35 docker : bool, optional 36 Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database. 37 ppmFWHMprior : float, optional 38 The FWHM of the prior distribution (ppm). The default is 3. 39 ppmRangeprior : float, optional 40 The range of the prior distribution (ppm). The default is 15. 41 42 Methods 43 -------- 44 * determine_error_boundaries(). Determine the error boundaries for recalibration space. 45 46 Notes 47 ----- 48 This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments. 49 Possible future task is to make the base class copyable. 50 51 """ 52 53 def __init__( 54 self, 55 mass_spectrum, 56 plot: bool = False, 57 docker: bool = True, 58 ppmFWHMprior: float = 3, 59 ppmRangeprior: float = 15, 60 ): 61 self.mass_spectrum = copy.deepcopy(mass_spectrum) 62 self.plot = plot 63 self.docker = docker 64 self.ppmFWHMprior = ppmFWHMprior 65 self.ppmRangeprior = ppmRangeprior 66 67 def set_uncal_settings(self): 68 """Set uncalibrated formula search settings 69 70 This function serves the uncalibrated data (hence broad error tolerance) 71 It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode 72 73 This will not work for positive mode data, or for other ion types, or other expected elemental searches. 74 75 """ 76 # TODO rework this. 77 78 if self.docker: 79 self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp" 80 else: 81 self.mass_spectrum.molecular_search_settings.url_database = None 82 self.mass_spectrum.molecular_search_settings.error_method = None 83 self.mass_spectrum.molecular_search_settings.score_method = "prob_score" 84 85 self.mass_spectrum.molecular_search_settings.min_ppm_error = ( 86 -1 * self.ppmRangeprior / 2 87 ) # -7.5 88 self.mass_spectrum.molecular_search_settings.max_ppm_error = ( 89 self.ppmRangeprior / 2 90 ) # 7.5 91 92 self.mass_spectrum.molecular_search_settings.min_dbe = 0 93 self.mass_spectrum.molecular_search_settings.max_dbe = 50 94 95 self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False 96 self.mass_spectrum.molecular_search_settings.min_abun_error = -30 97 self.mass_spectrum.molecular_search_settings.max_abun_error = 70 98 99 self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True 100 self.mass_spectrum.molecular_search_settings.min_peaks_per_class = ( 101 10 # default is 15 102 ) 103 104 self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90) 105 self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200) 106 self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23) 107 self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0) 108 self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0) 109 self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0) 110 111 self.mass_spectrum.molecular_search_settings.isProtonated = True 112 self.mass_spectrum.molecular_search_settings.isRadical = False 113 self.mass_spectrum.molecular_search_settings.isAdduct = False 114 115 def positive_search_settings(self): 116 """Set the positive mode elemental search settings""" 117 self.mass_spectrum.molecular_search_settings.isProtonated = False 118 self.mass_spectrum.molecular_search_settings.isAdduct = True 119 self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"] 120 121 @staticmethod 122 def get_error_range( 123 errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False 124 ): 125 """Get the error range from the error distribution 126 127 Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species. 128 129 Parameters 130 ---------- 131 errors : list 132 list of the errors of the assigned species (ppm) 133 ppmFWHMprior : float, optional 134 The FWHM of the prior distribution (ppm). The default is 3. 135 plot_logic : bool, optional 136 Whether to plot the error distribution. The default is False. 137 138 Returns 139 ------- 140 mean_error : float 141 mean mass error of the Gaussian distribution (ppm) 142 fwhm_error : float 143 full width half max of the gaussian error distribution (ppm) 144 ppm_thresh : list 145 recommended thresholds for the recalibration parameters (ppm) 146 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 147 148 """ 149 kde = sns.kdeplot(errors) 150 151 kde_data = kde.get_lines()[0].get_data() 152 153 tmpdf = pd.Series(index=kde_data[0], data=kde_data[1]) 154 kde_apex_ppm = tmpdf.idxmax() 155 kde_apex_val = tmpdf.max() 156 157 plt.close(kde.figure) 158 plt.close("all") 159 160 lmmodel = GaussianModel() 161 lmpars = lmmodel.guess(kde_data[1], x=kde_data[0]) 162 lmpars["sigma"].value = 2.3548 / ppmFWHMprior 163 lmpars["center"].value = kde_apex_ppm 164 lmpars["amplitude"].value = kde_apex_val 165 lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0]) 166 167 if plot_logic: 168 fig, ax = plt.subplots(figsize=(8, 4)) 169 lmout.plot_fit( 170 ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"} 171 ) 172 ax.set_xlabel("$m/z$ Error (ppm)") 173 ax.set_ylabel("Density") 174 plt.legend(facecolor="white", framealpha=0) 175 176 mean_error = lmout.best_values["center"] 177 std_error = lmout.best_values["sigma"] 178 # FWHM from Sigma = approx. 2.355*sigma 179 # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error 180 fwhm_error = std_error * np.sqrt(8 * np.log(2)) 181 182 ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error] 183 return mean_error, fwhm_error, ppm_thresh 184 185 def determine_error_boundaries(self): 186 """Determine the error boundaries for recalibration space 187 188 This is the main function in this class 189 Sets the Molecular Formulas search settings, performs the initial formula search 190 Converts the data to a dataframe, and gets the error range 191 Returns the error thresholds. 192 193 Returns 194 ------- 195 mean_error : float 196 mean mass error of the Gaussian distribution (ppm) 197 fwhm_error : float 198 full width half max of the gaussian error distribution (ppm) 199 ppm_thresh : list 200 recommended thresholds for the recalibration parameters (ppm) 201 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 202 """ 203 204 # Set the search settings 205 self.set_uncal_settings() 206 207 # Set the positive mode settings 208 # To do - have user defineable settings? 209 if self.mass_spectrum.polarity == 1: 210 self.positive_search_settings() 211 212 # Search MFs 213 SearchMolecularFormulas( 214 self.mass_spectrum, first_hit=True 215 ).run_worker_mass_spectrum() 216 217 # Exporting to a DF is ~30x slower than just getting the errors, so this is fast. 218 errors = [] 219 for mspeak in self.mass_spectrum.mspeaks: 220 if len(mspeak.molecular_formulas) > 0: 221 errors.append(mspeak.best_molecular_formula_candidate.mz_error) 222 223 # If there are NO assignments, it'll fail on the next step. Need to check for that 224 nassign = len(errors) 225 # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code 226 # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments 227 if nassign < 5: 228 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 229 print("fewer than 5 peaks assigned, cannot determine error range") 230 return np.nan, np.nan, [np.nan, np.nan] 231 else: 232 mean_error, fwhm_error, ppm_thresh = self.get_error_range( 233 errors, self.ppmFWHMprior, self.plot 234 ) 235 return mean_error, fwhm_error, ppm_thresh
This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter
The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI) And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range. This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)
Parameters
- mass_spectrum (MassSpectrum): CoreMS mass spectrum object
- plot (bool, optional): Whether to plot the error distribution. The default is False.
- docker (bool, optional): Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
- ppmFWHMprior (float, optional): The FWHM of the prior distribution (ppm). The default is 3.
- ppmRangeprior (float, optional): The range of the prior distribution (ppm). The default is 15.
Methods
- determine_error_boundaries(). Determine the error boundaries for recalibration space.
Notes
This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments. Possible future task is to make the base class copyable.
53 def __init__( 54 self, 55 mass_spectrum, 56 plot: bool = False, 57 docker: bool = True, 58 ppmFWHMprior: float = 3, 59 ppmRangeprior: float = 15, 60 ): 61 self.mass_spectrum = copy.deepcopy(mass_spectrum) 62 self.plot = plot 63 self.docker = docker 64 self.ppmFWHMprior = ppmFWHMprior 65 self.ppmRangeprior = ppmRangeprior
67 def set_uncal_settings(self): 68 """Set uncalibrated formula search settings 69 70 This function serves the uncalibrated data (hence broad error tolerance) 71 It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode 72 73 This will not work for positive mode data, or for other ion types, or other expected elemental searches. 74 75 """ 76 # TODO rework this. 77 78 if self.docker: 79 self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp" 80 else: 81 self.mass_spectrum.molecular_search_settings.url_database = None 82 self.mass_spectrum.molecular_search_settings.error_method = None 83 self.mass_spectrum.molecular_search_settings.score_method = "prob_score" 84 85 self.mass_spectrum.molecular_search_settings.min_ppm_error = ( 86 -1 * self.ppmRangeprior / 2 87 ) # -7.5 88 self.mass_spectrum.molecular_search_settings.max_ppm_error = ( 89 self.ppmRangeprior / 2 90 ) # 7.5 91 92 self.mass_spectrum.molecular_search_settings.min_dbe = 0 93 self.mass_spectrum.molecular_search_settings.max_dbe = 50 94 95 self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False 96 self.mass_spectrum.molecular_search_settings.min_abun_error = -30 97 self.mass_spectrum.molecular_search_settings.max_abun_error = 70 98 99 self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True 100 self.mass_spectrum.molecular_search_settings.min_peaks_per_class = ( 101 10 # default is 15 102 ) 103 104 self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90) 105 self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200) 106 self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23) 107 self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0) 108 self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0) 109 self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0) 110 111 self.mass_spectrum.molecular_search_settings.isProtonated = True 112 self.mass_spectrum.molecular_search_settings.isRadical = False 113 self.mass_spectrum.molecular_search_settings.isAdduct = False
Set uncalibrated formula search settings
This function serves the uncalibrated data (hence broad error tolerance) It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
This will not work for positive mode data, or for other ion types, or other expected elemental searches.
115 def positive_search_settings(self): 116 """Set the positive mode elemental search settings""" 117 self.mass_spectrum.molecular_search_settings.isProtonated = False 118 self.mass_spectrum.molecular_search_settings.isAdduct = True 119 self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"]
Set the positive mode elemental search settings
121 @staticmethod 122 def get_error_range( 123 errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False 124 ): 125 """Get the error range from the error distribution 126 127 Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species. 128 129 Parameters 130 ---------- 131 errors : list 132 list of the errors of the assigned species (ppm) 133 ppmFWHMprior : float, optional 134 The FWHM of the prior distribution (ppm). The default is 3. 135 plot_logic : bool, optional 136 Whether to plot the error distribution. The default is False. 137 138 Returns 139 ------- 140 mean_error : float 141 mean mass error of the Gaussian distribution (ppm) 142 fwhm_error : float 143 full width half max of the gaussian error distribution (ppm) 144 ppm_thresh : list 145 recommended thresholds for the recalibration parameters (ppm) 146 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 147 148 """ 149 kde = sns.kdeplot(errors) 150 151 kde_data = kde.get_lines()[0].get_data() 152 153 tmpdf = pd.Series(index=kde_data[0], data=kde_data[1]) 154 kde_apex_ppm = tmpdf.idxmax() 155 kde_apex_val = tmpdf.max() 156 157 plt.close(kde.figure) 158 plt.close("all") 159 160 lmmodel = GaussianModel() 161 lmpars = lmmodel.guess(kde_data[1], x=kde_data[0]) 162 lmpars["sigma"].value = 2.3548 / ppmFWHMprior 163 lmpars["center"].value = kde_apex_ppm 164 lmpars["amplitude"].value = kde_apex_val 165 lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0]) 166 167 if plot_logic: 168 fig, ax = plt.subplots(figsize=(8, 4)) 169 lmout.plot_fit( 170 ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"} 171 ) 172 ax.set_xlabel("$m/z$ Error (ppm)") 173 ax.set_ylabel("Density") 174 plt.legend(facecolor="white", framealpha=0) 175 176 mean_error = lmout.best_values["center"] 177 std_error = lmout.best_values["sigma"] 178 # FWHM from Sigma = approx. 2.355*sigma 179 # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error 180 fwhm_error = std_error * np.sqrt(8 * np.log(2)) 181 182 ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error] 183 return mean_error, fwhm_error, ppm_thresh
Get the error range from the error distribution
Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
Parameters
- errors (list): list of the errors of the assigned species (ppm)
- ppmFWHMprior (float, optional): The FWHM of the prior distribution (ppm). The default is 3.
- plot_logic (bool, optional): Whether to plot the error distribution. The default is False.
Returns
- mean_error (float): mean mass error of the Gaussian distribution (ppm)
- fwhm_error (float): full width half max of the gaussian error distribution (ppm)
- ppm_thresh (list): recommended thresholds for the recalibration parameters (ppm) Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
185 def determine_error_boundaries(self): 186 """Determine the error boundaries for recalibration space 187 188 This is the main function in this class 189 Sets the Molecular Formulas search settings, performs the initial formula search 190 Converts the data to a dataframe, and gets the error range 191 Returns the error thresholds. 192 193 Returns 194 ------- 195 mean_error : float 196 mean mass error of the Gaussian distribution (ppm) 197 fwhm_error : float 198 full width half max of the gaussian error distribution (ppm) 199 ppm_thresh : list 200 recommended thresholds for the recalibration parameters (ppm) 201 Consists of [mean_error-fwhm_error,mean_error+fwhm_error] 202 """ 203 204 # Set the search settings 205 self.set_uncal_settings() 206 207 # Set the positive mode settings 208 # To do - have user defineable settings? 209 if self.mass_spectrum.polarity == 1: 210 self.positive_search_settings() 211 212 # Search MFs 213 SearchMolecularFormulas( 214 self.mass_spectrum, first_hit=True 215 ).run_worker_mass_spectrum() 216 217 # Exporting to a DF is ~30x slower than just getting the errors, so this is fast. 218 errors = [] 219 for mspeak in self.mass_spectrum.mspeaks: 220 if len(mspeak.molecular_formulas) > 0: 221 errors.append(mspeak.best_molecular_formula_candidate.mz_error) 222 223 # If there are NO assignments, it'll fail on the next step. Need to check for that 224 nassign = len(errors) 225 # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code 226 # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments 227 if nassign < 5: 228 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 229 print("fewer than 5 peaks assigned, cannot determine error range") 230 return np.nan, np.nan, [np.nan, np.nan] 231 else: 232 mean_error, fwhm_error, ppm_thresh = self.get_error_range( 233 errors, self.ppmFWHMprior, self.plot 234 ) 235 return mean_error, fwhm_error, ppm_thresh
Determine the error boundaries for recalibration space
This is the main function in this class Sets the Molecular Formulas search settings, performs the initial formula search Converts the data to a dataframe, and gets the error range Returns the error thresholds.
Returns
- mean_error (float): mean mass error of the Gaussian distribution (ppm)
- fwhm_error (float): full width half max of the gaussian error distribution (ppm)
- ppm_thresh (list): recommended thresholds for the recalibration parameters (ppm) Consists of [mean_error-fwhm_error,mean_error+fwhm_error]