corems.mass_spectrum.calc.NoiseCalc

  1import warnings
  2from typing import Tuple
  3
  4from numpy import average, histogram, hstack, inf, isnan, log10, median, nan, std, where
  5
  6from corems import chunks
  7
  8# from matplotlib import pyplot
  9__author__ = "Yuri E. Corilo"
 10__date__ = "Jun 27, 2019"
 11
 12
 13class NoiseThresholdCalc:
 14    """Class for noise threshold calculation.
 15
 16    Parameters
 17    ----------
 18    mass_spectrum : MassSpectrum
 19        The mass spectrum object.
 20    settings : MSParameters
 21        The mass spectrum parameters object.
 22    is_centroid : bool
 23        Flag indicating whether the mass spectrum is centroid or profile.
 24    baseline_noise : float
 25        The baseline noise.
 26    baseline_noise_std : float
 27        The baseline noise standard deviation.
 28    max_signal_to_noise : float
 29        The maximum signal to noise.
 30    max_abundance : float
 31        The maximum abundance.
 32    abundance : np.array
 33        The abundance array.
 34    abundance_profile : np.array
 35        The abundance profile array.
 36    mz_exp : np.array
 37        The experimental m/z array.
 38    mz_exp_profile : np.array
 39        The experimental m/z profile array.
 40
 41    Attributes
 42    ----------
 43    None
 44
 45    Methods
 46    -------
 47    * get_noise_threshold(). Get the noise threshold.
 48    * cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions.
 49    * get_noise_average(ymincentroid).
 50        Get the average noise and standard deviation.
 51    * get_abundance_minima_centroid(abun_cut)
 52        Get the abundance minima for centroid data.
 53    * run_log_noise_threshold_calc().
 54        Run the log noise threshold calculation.
 55    * run_noise_threshold_calc().
 56        Run the noise threshold calculation.
 57    """
 58
 59    def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
 60        """Get the noise threshold.
 61
 62        Returns
 63        -------
 64        Tuple[Tuple[float, float], Tuple[float, float]]
 65            A tuple containing the m/z and abundance noise thresholds.
 66            (min_mz, max_mz), (noise_threshold, noise_threshold)
 67        """
 68
 69        if self.is_centroid:
 70            x = min(self.mz_exp), max((self.mz_exp))
 71
 72            if self.settings.noise_threshold_method == "minima":
 73                abundance_threshold = self.baseline_noise + (
 74                    self.settings.noise_threshold_min_std * self.baseline_noise_std
 75                )
 76                y = (abundance_threshold, abundance_threshold)
 77
 78            elif self.settings.noise_threshold_method == "signal_noise":
 79                normalized_threshold = (
 80                    self.max_abundance * self.settings.noise_threshold_min_s2n
 81                ) / self.max_signal_to_noise
 82                y = (normalized_threshold, normalized_threshold)
 83
 84            elif self.settings.noise_threshold_method == "relative_abundance":
 85                normalized_threshold = (
 86                    max(self.abundance) / 100
 87                ) * self.settings.noise_threshold_min_relative_abundance
 88                y = (normalized_threshold, normalized_threshold)
 89
 90            elif self.settings.noise_threshold_method == "absolute_abundance":
 91                normalized_threshold = (
 92                    self.abundance * self.settings.noise_threshold_absolute_abundance
 93                )
 94                y = (normalized_threshold, normalized_threshold)
 95            # log noise method not tested for centroid data
 96            else:
 97                raise Exception(
 98                    "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class"
 99                    % self.settings.noise_threshold_method
100                )
101
102            return x, y
103
104        else:
105            if self.baseline_noise and self.baseline_noise_std:
106                x = (self.mz_exp_profile.min(), self.mz_exp_profile.max())
107                y = (self.baseline_noise_std, self.baseline_noise_std)
108
109                if self.settings.noise_threshold_method == "minima":
110                    # print(self.settings.noise_threshold_min_std)
111                    abundance_threshold = self.baseline_noise + (
112                        self.settings.noise_threshold_min_std * self.baseline_noise_std
113                    )
114
115                    y = (abundance_threshold, abundance_threshold)
116
117                elif self.settings.noise_threshold_method == "signal_noise":
118                    max_sn = self.abundance_profile.max() / self.baseline_noise_std
119
120                    normalized_threshold = (
121                        self.abundance_profile.max()
122                        * self.settings.noise_threshold_min_s2n
123                    ) / max_sn
124                    y = (normalized_threshold, normalized_threshold)
125
126                elif self.settings.noise_threshold_method == "relative_abundance":
127                    normalized_threshold = (
128                        self.abundance_profile.max() / 100
129                    ) * self.settings.noise_threshold_min_relative_abundance
130                    y = (normalized_threshold, normalized_threshold)
131
132                elif self.settings.noise_threshold_method == "absolute_abundance":
133                    normalized_threshold = (
134                        self.settings.noise_threshold_absolute_abundance
135                    )
136                    y = (normalized_threshold, normalized_threshold)
137
138                elif self.settings.noise_threshold_method == "log":
139                    normalized_threshold = (
140                        self.settings.noise_threshold_log_nsigma
141                        * self.baseline_noise_std
142                    )
143                    y = (normalized_threshold, normalized_threshold)
144
145                else:
146                    raise Exception(
147                        "%s method was not implemented, \
148                        please refer to corems.mass_spectrum.calc.NoiseCalc Class"
149                        % self.settings.noise_threshold_method
150                    )
151
152                return x, y
153
154            else:
155                warnings.warn(
156                    "Noise Baseline and Noise std not specified,\
157                    defaulting to 0,0 run process_mass_spec() ?"
158                )
159                return (0, 0), (0, 0)
160
161    def cut_mz_domain_noise(self):
162        """Cut the m/z domain to the noise threshold regions.
163
164        Returns
165        -------
166        Tuple[np.array, np.array]
167            A tuple containing the m/z and abundance arrays of the truncated spectrum region.
168        """
169        min_mz_whole_ms = self.mz_exp_profile.min()
170        max_mz_whole_ms = self.mz_exp_profile.max()
171
172        if self.settings.noise_threshold_method == "minima":
173            # this calculation is taking too long (about 2 seconds)
174            number_average_molecular_weight = self.weight_average_molecular_weight(
175                profile=True
176            )
177
178            # +-200 is a guess for testing only, it needs adjustment for each type of analysis
179            # need to check min mz here or it will break
180            min_mz_noise = number_average_molecular_weight - 100
181            # need to check max mz here or it will break
182            max_mz_noise = number_average_molecular_weight + 100
183
184        else:
185            min_mz_noise = self.settings.noise_min_mz
186            max_mz_noise = self.settings.noise_max_mz
187
188        if min_mz_noise < min_mz_whole_ms:
189            min_mz_noise = min_mz_whole_ms
190
191        if max_mz_noise > max_mz_whole_ms:
192            max_mz_noise = max_mz_whole_ms
193
194        # print(min_mz_noise, max_mz_noise)
195        low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0]
196        # print(self.mz_exp_profile[low_mz_index])
197        # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise))
198
199        high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1]
200
201        # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise))
202
203        if high_mz_index > low_mz_index:
204            # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index])
205            # pyplot.show()
206            return self.mz_exp_profile[
207                high_mz_index:low_mz_index
208            ], self.abundance_profile[low_mz_index:high_mz_index]
209        else:
210            # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index])
211            # pyplot.show()
212            return self.mz_exp_profile[
213                high_mz_index:low_mz_index
214            ], self.abundance_profile[high_mz_index:low_mz_index]
215
216    def get_noise_average(self, ymincentroid):
217        """Get the average noise and standard deviation.
218
219        Parameters
220        ----------
221        ymincentroid : np.array
222            The ymincentroid array.
223
224        Returns
225        -------
226        Tuple[float, float]
227            A tuple containing the average noise and standard deviation.
228
229        """
230        # assumes noise to be gaussian and estimate noise level by
231        # calculating the valley.
232
233        auto = True if self.settings.noise_threshold_method == "minima" else False
234
235        average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid)
236
237        s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std()
238
239        return average_noise, s_deviation
240
241    def get_abundance_minima_centroid(self, abun_cut):
242        """Get the abundance minima for centroid data.
243
244        Parameters
245        ----------
246        abun_cut : np.array
247            The abundance cut array.
248
249        Returns
250        -------
251        np.array
252            The abundance minima array.
253        """
254        maximum = self.abundance_profile.max()
255        threshold_min = maximum * 1.00
256
257        y = -abun_cut
258
259        dy = y[1:] - y[:-1]
260        """replaces NaN for Infinity"""
261        indices_nan = where(isnan(y))[0]
262
263        if indices_nan.size:
264            y[indices_nan] = inf
265            dy[where(isnan(dy))[0]] = inf
266
267        indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0]
268
269        if indices.size and threshold_min is not None:
270            indices = indices[abun_cut[indices] <= threshold_min]
271
272        return abun_cut[indices]
273
274    def run_log_noise_threshold_calc(self):
275        """Run the log noise threshold calculation.
276
277
278        Returns
279        -------
280        Tuple[float, float]
281            A tuple containing the average noise and standard deviation.
282
283        Notes
284        --------
285        Method for estimating the noise based on decimal log of all the data point
286
287        Idea is that you calculate a histogram of of the log10(abundance) values.
288        The maximum of the histogram == the standard deviation of the noise.
289
290
291        For aFT data it is a gaussian distribution of noise - not implemented here!
292        For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463.
293
294
295        See the publication cited above for the derivation of this.
296
297        References
298        --------
299        1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316
300
301        """
302
303        if self.is_centroid:
304            raise Exception("log noise Not tested for centroid data")
305        else:
306            # cut the spectrum to ROI
307            mz_cut, abundance_cut = self.cut_mz_domain_noise()
308            # If there are 0 values, the log will fail
309            # But we may have negative values for aFT data, so we check if 0 exists
310            # Need to make a copy of the abundance cut values so we dont overwrite it....
311            tmp_abundance = abundance_cut.copy()
312            if 0 in tmp_abundance:
313                tmp_abundance[tmp_abundance == 0] = nan
314                tmp_abundance = tmp_abundance[~isnan(tmp_abundance)]
315                # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined.
316                # Hard to generalise - needs more investigation.
317
318            # calculate a histogram of the log10 of the abundance data
319            hist_values = histogram(
320                log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins
321            )
322            # find the apex of this histogram
323            maxvalidx = where(hist_values[0] == max(hist_values[0]))
324            # get the value of this apex (note - still in log10 units)
325            log_sigma = hist_values[1][maxvalidx]
326            # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry
327            if len(log_sigma) > 1:
328                log_sigma = average(log_sigma)
329            ## To do : check if aFT or mFT and adjust method
330            noise_mid = 10**log_sigma
331            noise_1std = (
332                noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor
333            )  # for mFT 0.463
334            return float(noise_mid), float(noise_1std)
335
336    def run_noise_threshold_calc(self):
337        """Runs noise threshold calculation (not log based method)
338
339        Returns
340        -------
341        Tuple[float, float]
342            A tuple containing the average noise and standard deviation.
343
344        """
345        if self.is_centroid:
346            # calculates noise_baseline and noise_std
347            # needed to run auto noise threshold mode
348            # it is not used for signal to noise nor
349            # relative abudance methods
350            abundances_chunks = chunks(self.abundance, 50)
351            each_min_abund = [min(x) for x in abundances_chunks]
352
353            return average(each_min_abund), std(each_min_abund)
354
355        else:
356            mz_cut, abundance_cut = self.cut_mz_domain_noise()
357
358            if self.settings.noise_threshold_method == "minima":
359                yminima = self.get_abundance_minima_centroid(abundance_cut)
360
361                return self.get_noise_average(yminima)
362
363            else:
364                # pyplot.show()
365                return self.get_noise_average(abundance_cut)
class NoiseThresholdCalc:
 14class NoiseThresholdCalc:
 15    """Class for noise threshold calculation.
 16
 17    Parameters
 18    ----------
 19    mass_spectrum : MassSpectrum
 20        The mass spectrum object.
 21    settings : MSParameters
 22        The mass spectrum parameters object.
 23    is_centroid : bool
 24        Flag indicating whether the mass spectrum is centroid or profile.
 25    baseline_noise : float
 26        The baseline noise.
 27    baseline_noise_std : float
 28        The baseline noise standard deviation.
 29    max_signal_to_noise : float
 30        The maximum signal to noise.
 31    max_abundance : float
 32        The maximum abundance.
 33    abundance : np.array
 34        The abundance array.
 35    abundance_profile : np.array
 36        The abundance profile array.
 37    mz_exp : np.array
 38        The experimental m/z array.
 39    mz_exp_profile : np.array
 40        The experimental m/z profile array.
 41
 42    Attributes
 43    ----------
 44    None
 45
 46    Methods
 47    -------
 48    * get_noise_threshold(). Get the noise threshold.
 49    * cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions.
 50    * get_noise_average(ymincentroid).
 51        Get the average noise and standard deviation.
 52    * get_abundance_minima_centroid(abun_cut)
 53        Get the abundance minima for centroid data.
 54    * run_log_noise_threshold_calc().
 55        Run the log noise threshold calculation.
 56    * run_noise_threshold_calc().
 57        Run the noise threshold calculation.
 58    """
 59
 60    def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
 61        """Get the noise threshold.
 62
 63        Returns
 64        -------
 65        Tuple[Tuple[float, float], Tuple[float, float]]
 66            A tuple containing the m/z and abundance noise thresholds.
 67            (min_mz, max_mz), (noise_threshold, noise_threshold)
 68        """
 69
 70        if self.is_centroid:
 71            x = min(self.mz_exp), max((self.mz_exp))
 72
 73            if self.settings.noise_threshold_method == "minima":
 74                abundance_threshold = self.baseline_noise + (
 75                    self.settings.noise_threshold_min_std * self.baseline_noise_std
 76                )
 77                y = (abundance_threshold, abundance_threshold)
 78
 79            elif self.settings.noise_threshold_method == "signal_noise":
 80                normalized_threshold = (
 81                    self.max_abundance * self.settings.noise_threshold_min_s2n
 82                ) / self.max_signal_to_noise
 83                y = (normalized_threshold, normalized_threshold)
 84
 85            elif self.settings.noise_threshold_method == "relative_abundance":
 86                normalized_threshold = (
 87                    max(self.abundance) / 100
 88                ) * self.settings.noise_threshold_min_relative_abundance
 89                y = (normalized_threshold, normalized_threshold)
 90
 91            elif self.settings.noise_threshold_method == "absolute_abundance":
 92                normalized_threshold = (
 93                    self.abundance * self.settings.noise_threshold_absolute_abundance
 94                )
 95                y = (normalized_threshold, normalized_threshold)
 96            # log noise method not tested for centroid data
 97            else:
 98                raise Exception(
 99                    "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class"
100                    % self.settings.noise_threshold_method
101                )
102
103            return x, y
104
105        else:
106            if self.baseline_noise and self.baseline_noise_std:
107                x = (self.mz_exp_profile.min(), self.mz_exp_profile.max())
108                y = (self.baseline_noise_std, self.baseline_noise_std)
109
110                if self.settings.noise_threshold_method == "minima":
111                    # print(self.settings.noise_threshold_min_std)
112                    abundance_threshold = self.baseline_noise + (
113                        self.settings.noise_threshold_min_std * self.baseline_noise_std
114                    )
115
116                    y = (abundance_threshold, abundance_threshold)
117
118                elif self.settings.noise_threshold_method == "signal_noise":
119                    max_sn = self.abundance_profile.max() / self.baseline_noise_std
120
121                    normalized_threshold = (
122                        self.abundance_profile.max()
123                        * self.settings.noise_threshold_min_s2n
124                    ) / max_sn
125                    y = (normalized_threshold, normalized_threshold)
126
127                elif self.settings.noise_threshold_method == "relative_abundance":
128                    normalized_threshold = (
129                        self.abundance_profile.max() / 100
130                    ) * self.settings.noise_threshold_min_relative_abundance
131                    y = (normalized_threshold, normalized_threshold)
132
133                elif self.settings.noise_threshold_method == "absolute_abundance":
134                    normalized_threshold = (
135                        self.settings.noise_threshold_absolute_abundance
136                    )
137                    y = (normalized_threshold, normalized_threshold)
138
139                elif self.settings.noise_threshold_method == "log":
140                    normalized_threshold = (
141                        self.settings.noise_threshold_log_nsigma
142                        * self.baseline_noise_std
143                    )
144                    y = (normalized_threshold, normalized_threshold)
145
146                else:
147                    raise Exception(
148                        "%s method was not implemented, \
149                        please refer to corems.mass_spectrum.calc.NoiseCalc Class"
150                        % self.settings.noise_threshold_method
151                    )
152
153                return x, y
154
155            else:
156                warnings.warn(
157                    "Noise Baseline and Noise std not specified,\
158                    defaulting to 0,0 run process_mass_spec() ?"
159                )
160                return (0, 0), (0, 0)
161
162    def cut_mz_domain_noise(self):
163        """Cut the m/z domain to the noise threshold regions.
164
165        Returns
166        -------
167        Tuple[np.array, np.array]
168            A tuple containing the m/z and abundance arrays of the truncated spectrum region.
169        """
170        min_mz_whole_ms = self.mz_exp_profile.min()
171        max_mz_whole_ms = self.mz_exp_profile.max()
172
173        if self.settings.noise_threshold_method == "minima":
174            # this calculation is taking too long (about 2 seconds)
175            number_average_molecular_weight = self.weight_average_molecular_weight(
176                profile=True
177            )
178
179            # +-200 is a guess for testing only, it needs adjustment for each type of analysis
180            # need to check min mz here or it will break
181            min_mz_noise = number_average_molecular_weight - 100
182            # need to check max mz here or it will break
183            max_mz_noise = number_average_molecular_weight + 100
184
185        else:
186            min_mz_noise = self.settings.noise_min_mz
187            max_mz_noise = self.settings.noise_max_mz
188
189        if min_mz_noise < min_mz_whole_ms:
190            min_mz_noise = min_mz_whole_ms
191
192        if max_mz_noise > max_mz_whole_ms:
193            max_mz_noise = max_mz_whole_ms
194
195        # print(min_mz_noise, max_mz_noise)
196        low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0]
197        # print(self.mz_exp_profile[low_mz_index])
198        # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise))
199
200        high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1]
201
202        # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise))
203
204        if high_mz_index > low_mz_index:
205            # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index])
206            # pyplot.show()
207            return self.mz_exp_profile[
208                high_mz_index:low_mz_index
209            ], self.abundance_profile[low_mz_index:high_mz_index]
210        else:
211            # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index])
212            # pyplot.show()
213            return self.mz_exp_profile[
214                high_mz_index:low_mz_index
215            ], self.abundance_profile[high_mz_index:low_mz_index]
216
217    def get_noise_average(self, ymincentroid):
218        """Get the average noise and standard deviation.
219
220        Parameters
221        ----------
222        ymincentroid : np.array
223            The ymincentroid array.
224
225        Returns
226        -------
227        Tuple[float, float]
228            A tuple containing the average noise and standard deviation.
229
230        """
231        # assumes noise to be gaussian and estimate noise level by
232        # calculating the valley.
233
234        auto = True if self.settings.noise_threshold_method == "minima" else False
235
236        average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid)
237
238        s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std()
239
240        return average_noise, s_deviation
241
242    def get_abundance_minima_centroid(self, abun_cut):
243        """Get the abundance minima for centroid data.
244
245        Parameters
246        ----------
247        abun_cut : np.array
248            The abundance cut array.
249
250        Returns
251        -------
252        np.array
253            The abundance minima array.
254        """
255        maximum = self.abundance_profile.max()
256        threshold_min = maximum * 1.00
257
258        y = -abun_cut
259
260        dy = y[1:] - y[:-1]
261        """replaces NaN for Infinity"""
262        indices_nan = where(isnan(y))[0]
263
264        if indices_nan.size:
265            y[indices_nan] = inf
266            dy[where(isnan(dy))[0]] = inf
267
268        indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0]
269
270        if indices.size and threshold_min is not None:
271            indices = indices[abun_cut[indices] <= threshold_min]
272
273        return abun_cut[indices]
274
275    def run_log_noise_threshold_calc(self):
276        """Run the log noise threshold calculation.
277
278
279        Returns
280        -------
281        Tuple[float, float]
282            A tuple containing the average noise and standard deviation.
283
284        Notes
285        --------
286        Method for estimating the noise based on decimal log of all the data point
287
288        Idea is that you calculate a histogram of of the log10(abundance) values.
289        The maximum of the histogram == the standard deviation of the noise.
290
291
292        For aFT data it is a gaussian distribution of noise - not implemented here!
293        For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463.
294
295
296        See the publication cited above for the derivation of this.
297
298        References
299        --------
300        1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316
301
302        """
303
304        if self.is_centroid:
305            raise Exception("log noise Not tested for centroid data")
306        else:
307            # cut the spectrum to ROI
308            mz_cut, abundance_cut = self.cut_mz_domain_noise()
309            # If there are 0 values, the log will fail
310            # But we may have negative values for aFT data, so we check if 0 exists
311            # Need to make a copy of the abundance cut values so we dont overwrite it....
312            tmp_abundance = abundance_cut.copy()
313            if 0 in tmp_abundance:
314                tmp_abundance[tmp_abundance == 0] = nan
315                tmp_abundance = tmp_abundance[~isnan(tmp_abundance)]
316                # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined.
317                # Hard to generalise - needs more investigation.
318
319            # calculate a histogram of the log10 of the abundance data
320            hist_values = histogram(
321                log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins
322            )
323            # find the apex of this histogram
324            maxvalidx = where(hist_values[0] == max(hist_values[0]))
325            # get the value of this apex (note - still in log10 units)
326            log_sigma = hist_values[1][maxvalidx]
327            # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry
328            if len(log_sigma) > 1:
329                log_sigma = average(log_sigma)
330            ## To do : check if aFT or mFT and adjust method
331            noise_mid = 10**log_sigma
332            noise_1std = (
333                noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor
334            )  # for mFT 0.463
335            return float(noise_mid), float(noise_1std)
336
337    def run_noise_threshold_calc(self):
338        """Runs noise threshold calculation (not log based method)
339
340        Returns
341        -------
342        Tuple[float, float]
343            A tuple containing the average noise and standard deviation.
344
345        """
346        if self.is_centroid:
347            # calculates noise_baseline and noise_std
348            # needed to run auto noise threshold mode
349            # it is not used for signal to noise nor
350            # relative abudance methods
351            abundances_chunks = chunks(self.abundance, 50)
352            each_min_abund = [min(x) for x in abundances_chunks]
353
354            return average(each_min_abund), std(each_min_abund)
355
356        else:
357            mz_cut, abundance_cut = self.cut_mz_domain_noise()
358
359            if self.settings.noise_threshold_method == "minima":
360                yminima = self.get_abundance_minima_centroid(abundance_cut)
361
362                return self.get_noise_average(yminima)
363
364            else:
365                # pyplot.show()
366                return self.get_noise_average(abundance_cut)

Class for noise threshold calculation.

Parameters
  • mass_spectrum (MassSpectrum): The mass spectrum object.
  • settings (MSParameters): The mass spectrum parameters object.
  • is_centroid (bool): Flag indicating whether the mass spectrum is centroid or profile.
  • baseline_noise (float): The baseline noise.
  • baseline_noise_std (float): The baseline noise standard deviation.
  • max_signal_to_noise (float): The maximum signal to noise.
  • max_abundance (float): The maximum abundance.
  • abundance (np.array): The abundance array.
  • abundance_profile (np.array): The abundance profile array.
  • mz_exp (np.array): The experimental m/z array.
  • mz_exp_profile (np.array): The experimental m/z profile array.
Attributes
  • None
Methods
  • get_noise_threshold(). Get the noise threshold.
  • cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions.
  • get_noise_average(ymincentroid). Get the average noise and standard deviation.
  • get_abundance_minima_centroid(abun_cut) Get the abundance minima for centroid data.
  • run_log_noise_threshold_calc(). Run the log noise threshold calculation.
  • run_noise_threshold_calc(). Run the noise threshold calculation.
def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
 60    def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
 61        """Get the noise threshold.
 62
 63        Returns
 64        -------
 65        Tuple[Tuple[float, float], Tuple[float, float]]
 66            A tuple containing the m/z and abundance noise thresholds.
 67            (min_mz, max_mz), (noise_threshold, noise_threshold)
 68        """
 69
 70        if self.is_centroid:
 71            x = min(self.mz_exp), max((self.mz_exp))
 72
 73            if self.settings.noise_threshold_method == "minima":
 74                abundance_threshold = self.baseline_noise + (
 75                    self.settings.noise_threshold_min_std * self.baseline_noise_std
 76                )
 77                y = (abundance_threshold, abundance_threshold)
 78
 79            elif self.settings.noise_threshold_method == "signal_noise":
 80                normalized_threshold = (
 81                    self.max_abundance * self.settings.noise_threshold_min_s2n
 82                ) / self.max_signal_to_noise
 83                y = (normalized_threshold, normalized_threshold)
 84
 85            elif self.settings.noise_threshold_method == "relative_abundance":
 86                normalized_threshold = (
 87                    max(self.abundance) / 100
 88                ) * self.settings.noise_threshold_min_relative_abundance
 89                y = (normalized_threshold, normalized_threshold)
 90
 91            elif self.settings.noise_threshold_method == "absolute_abundance":
 92                normalized_threshold = (
 93                    self.abundance * self.settings.noise_threshold_absolute_abundance
 94                )
 95                y = (normalized_threshold, normalized_threshold)
 96            # log noise method not tested for centroid data
 97            else:
 98                raise Exception(
 99                    "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class"
100                    % self.settings.noise_threshold_method
101                )
102
103            return x, y
104
105        else:
106            if self.baseline_noise and self.baseline_noise_std:
107                x = (self.mz_exp_profile.min(), self.mz_exp_profile.max())
108                y = (self.baseline_noise_std, self.baseline_noise_std)
109
110                if self.settings.noise_threshold_method == "minima":
111                    # print(self.settings.noise_threshold_min_std)
112                    abundance_threshold = self.baseline_noise + (
113                        self.settings.noise_threshold_min_std * self.baseline_noise_std
114                    )
115
116                    y = (abundance_threshold, abundance_threshold)
117
118                elif self.settings.noise_threshold_method == "signal_noise":
119                    max_sn = self.abundance_profile.max() / self.baseline_noise_std
120
121                    normalized_threshold = (
122                        self.abundance_profile.max()
123                        * self.settings.noise_threshold_min_s2n
124                    ) / max_sn
125                    y = (normalized_threshold, normalized_threshold)
126
127                elif self.settings.noise_threshold_method == "relative_abundance":
128                    normalized_threshold = (
129                        self.abundance_profile.max() / 100
130                    ) * self.settings.noise_threshold_min_relative_abundance
131                    y = (normalized_threshold, normalized_threshold)
132
133                elif self.settings.noise_threshold_method == "absolute_abundance":
134                    normalized_threshold = (
135                        self.settings.noise_threshold_absolute_abundance
136                    )
137                    y = (normalized_threshold, normalized_threshold)
138
139                elif self.settings.noise_threshold_method == "log":
140                    normalized_threshold = (
141                        self.settings.noise_threshold_log_nsigma
142                        * self.baseline_noise_std
143                    )
144                    y = (normalized_threshold, normalized_threshold)
145
146                else:
147                    raise Exception(
148                        "%s method was not implemented, \
149                        please refer to corems.mass_spectrum.calc.NoiseCalc Class"
150                        % self.settings.noise_threshold_method
151                    )
152
153                return x, y
154
155            else:
156                warnings.warn(
157                    "Noise Baseline and Noise std not specified,\
158                    defaulting to 0,0 run process_mass_spec() ?"
159                )
160                return (0, 0), (0, 0)

Get the noise threshold.

Returns
  • Tuple[Tuple[float, float], Tuple[float, float]]: A tuple containing the m/z and abundance noise thresholds. (min_mz, max_mz), (noise_threshold, noise_threshold)
def cut_mz_domain_noise(self):
162    def cut_mz_domain_noise(self):
163        """Cut the m/z domain to the noise threshold regions.
164
165        Returns
166        -------
167        Tuple[np.array, np.array]
168            A tuple containing the m/z and abundance arrays of the truncated spectrum region.
169        """
170        min_mz_whole_ms = self.mz_exp_profile.min()
171        max_mz_whole_ms = self.mz_exp_profile.max()
172
173        if self.settings.noise_threshold_method == "minima":
174            # this calculation is taking too long (about 2 seconds)
175            number_average_molecular_weight = self.weight_average_molecular_weight(
176                profile=True
177            )
178
179            # +-200 is a guess for testing only, it needs adjustment for each type of analysis
180            # need to check min mz here or it will break
181            min_mz_noise = number_average_molecular_weight - 100
182            # need to check max mz here or it will break
183            max_mz_noise = number_average_molecular_weight + 100
184
185        else:
186            min_mz_noise = self.settings.noise_min_mz
187            max_mz_noise = self.settings.noise_max_mz
188
189        if min_mz_noise < min_mz_whole_ms:
190            min_mz_noise = min_mz_whole_ms
191
192        if max_mz_noise > max_mz_whole_ms:
193            max_mz_noise = max_mz_whole_ms
194
195        # print(min_mz_noise, max_mz_noise)
196        low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0]
197        # print(self.mz_exp_profile[low_mz_index])
198        # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise))
199
200        high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1]
201
202        # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise))
203
204        if high_mz_index > low_mz_index:
205            # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index])
206            # pyplot.show()
207            return self.mz_exp_profile[
208                high_mz_index:low_mz_index
209            ], self.abundance_profile[low_mz_index:high_mz_index]
210        else:
211            # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index])
212            # pyplot.show()
213            return self.mz_exp_profile[
214                high_mz_index:low_mz_index
215            ], self.abundance_profile[high_mz_index:low_mz_index]

Cut the m/z domain to the noise threshold regions.

Returns
  • Tuple[np.array, np.array]: A tuple containing the m/z and abundance arrays of the truncated spectrum region.
def get_noise_average(self, ymincentroid):
217    def get_noise_average(self, ymincentroid):
218        """Get the average noise and standard deviation.
219
220        Parameters
221        ----------
222        ymincentroid : np.array
223            The ymincentroid array.
224
225        Returns
226        -------
227        Tuple[float, float]
228            A tuple containing the average noise and standard deviation.
229
230        """
231        # assumes noise to be gaussian and estimate noise level by
232        # calculating the valley.
233
234        auto = True if self.settings.noise_threshold_method == "minima" else False
235
236        average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid)
237
238        s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std()
239
240        return average_noise, s_deviation

Get the average noise and standard deviation.

Parameters
  • ymincentroid (np.array): The ymincentroid array.
Returns
  • Tuple[float, float]: A tuple containing the average noise and standard deviation.
def get_abundance_minima_centroid(self, abun_cut):
242    def get_abundance_minima_centroid(self, abun_cut):
243        """Get the abundance minima for centroid data.
244
245        Parameters
246        ----------
247        abun_cut : np.array
248            The abundance cut array.
249
250        Returns
251        -------
252        np.array
253            The abundance minima array.
254        """
255        maximum = self.abundance_profile.max()
256        threshold_min = maximum * 1.00
257
258        y = -abun_cut
259
260        dy = y[1:] - y[:-1]
261        """replaces NaN for Infinity"""
262        indices_nan = where(isnan(y))[0]
263
264        if indices_nan.size:
265            y[indices_nan] = inf
266            dy[where(isnan(dy))[0]] = inf
267
268        indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0]
269
270        if indices.size and threshold_min is not None:
271            indices = indices[abun_cut[indices] <= threshold_min]
272
273        return abun_cut[indices]

Get the abundance minima for centroid data.

Parameters
  • abun_cut (np.array): The abundance cut array.
Returns
  • np.array: The abundance minima array.
def run_log_noise_threshold_calc(self):
275    def run_log_noise_threshold_calc(self):
276        """Run the log noise threshold calculation.
277
278
279        Returns
280        -------
281        Tuple[float, float]
282            A tuple containing the average noise and standard deviation.
283
284        Notes
285        --------
286        Method for estimating the noise based on decimal log of all the data point
287
288        Idea is that you calculate a histogram of of the log10(abundance) values.
289        The maximum of the histogram == the standard deviation of the noise.
290
291
292        For aFT data it is a gaussian distribution of noise - not implemented here!
293        For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463.
294
295
296        See the publication cited above for the derivation of this.
297
298        References
299        --------
300        1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316
301
302        """
303
304        if self.is_centroid:
305            raise Exception("log noise Not tested for centroid data")
306        else:
307            # cut the spectrum to ROI
308            mz_cut, abundance_cut = self.cut_mz_domain_noise()
309            # If there are 0 values, the log will fail
310            # But we may have negative values for aFT data, so we check if 0 exists
311            # Need to make a copy of the abundance cut values so we dont overwrite it....
312            tmp_abundance = abundance_cut.copy()
313            if 0 in tmp_abundance:
314                tmp_abundance[tmp_abundance == 0] = nan
315                tmp_abundance = tmp_abundance[~isnan(tmp_abundance)]
316                # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined.
317                # Hard to generalise - needs more investigation.
318
319            # calculate a histogram of the log10 of the abundance data
320            hist_values = histogram(
321                log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins
322            )
323            # find the apex of this histogram
324            maxvalidx = where(hist_values[0] == max(hist_values[0]))
325            # get the value of this apex (note - still in log10 units)
326            log_sigma = hist_values[1][maxvalidx]
327            # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry
328            if len(log_sigma) > 1:
329                log_sigma = average(log_sigma)
330            ## To do : check if aFT or mFT and adjust method
331            noise_mid = 10**log_sigma
332            noise_1std = (
333                noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor
334            )  # for mFT 0.463
335            return float(noise_mid), float(noise_1std)

Run the log noise threshold calculation.

Returns
  • Tuple[float, float]: A tuple containing the average noise and standard deviation.
Notes

Method for estimating the noise based on decimal log of all the data point

Idea is that you calculate a histogram of of the log10(abundance) values. The maximum of the histogram == the standard deviation of the noise.

For aFT data it is a gaussian distribution of noise - not implemented here! For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463.

See the publication cited above for the derivation of this.

References
  1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316
def run_noise_threshold_calc(self):
337    def run_noise_threshold_calc(self):
338        """Runs noise threshold calculation (not log based method)
339
340        Returns
341        -------
342        Tuple[float, float]
343            A tuple containing the average noise and standard deviation.
344
345        """
346        if self.is_centroid:
347            # calculates noise_baseline and noise_std
348            # needed to run auto noise threshold mode
349            # it is not used for signal to noise nor
350            # relative abudance methods
351            abundances_chunks = chunks(self.abundance, 50)
352            each_min_abund = [min(x) for x in abundances_chunks]
353
354            return average(each_min_abund), std(each_min_abund)
355
356        else:
357            mz_cut, abundance_cut = self.cut_mz_domain_noise()
358
359            if self.settings.noise_threshold_method == "minima":
360                yminima = self.get_abundance_minima_centroid(abundance_cut)
361
362                return self.get_noise_average(yminima)
363
364            else:
365                # pyplot.show()
366                return self.get_noise_average(abundance_cut)

Runs noise threshold calculation (not log based method)

Returns
  • Tuple[float, float]: A tuple containing the average noise and standard deviation.