corems.mass_spectrum.calc.NoiseCalc
1import warnings 2from typing import Tuple 3 4from numpy import average, histogram, hstack, inf, isnan, log10, median, nan, std, where 5 6from corems import chunks 7 8# from matplotlib import pyplot 9__author__ = "Yuri E. Corilo" 10__date__ = "Jun 27, 2019" 11 12 13class NoiseThresholdCalc: 14 """Class for noise threshold calculation. 15 16 Parameters 17 ---------- 18 mass_spectrum : MassSpectrum 19 The mass spectrum object. 20 settings : MSParameters 21 The mass spectrum parameters object. 22 is_centroid : bool 23 Flag indicating whether the mass spectrum is centroid or profile. 24 baseline_noise : float 25 The baseline noise. 26 baseline_noise_std : float 27 The baseline noise standard deviation. 28 max_signal_to_noise : float 29 The maximum signal to noise. 30 max_abundance : float 31 The maximum abundance. 32 abundance : np.array 33 The abundance array. 34 abundance_profile : np.array 35 The abundance profile array. 36 mz_exp : np.array 37 The experimental m/z array. 38 mz_exp_profile : np.array 39 The experimental m/z profile array. 40 41 Attributes 42 ---------- 43 None 44 45 Methods 46 ------- 47 * get_noise_threshold(). Get the noise threshold. 48 * cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions. 49 * get_noise_average(ymincentroid). 50 Get the average noise and standard deviation. 51 * get_abundance_minima_centroid(abun_cut) 52 Get the abundance minima for centroid data. 53 * run_log_noise_threshold_calc(). 54 Run the log noise threshold calculation. 55 * run_noise_threshold_calc(). 56 Run the noise threshold calculation. 57 """ 58 59 def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]: 60 """Get the noise threshold. 61 62 Returns 63 ------- 64 Tuple[Tuple[float, float], Tuple[float, float]] 65 A tuple containing the m/z and abundance noise thresholds. 66 (min_mz, max_mz), (noise_threshold, noise_threshold) 67 """ 68 69 if self.is_centroid: 70 x = min(self.mz_exp), max((self.mz_exp)) 71 72 if self.settings.noise_threshold_method == "minima": 73 abundance_threshold = self.baseline_noise + ( 74 self.settings.noise_threshold_min_std * self.baseline_noise_std 75 ) 76 y = (abundance_threshold, abundance_threshold) 77 78 elif self.settings.noise_threshold_method == "signal_noise": 79 normalized_threshold = ( 80 self.max_abundance * self.settings.noise_threshold_min_s2n 81 ) / self.max_signal_to_noise 82 y = (normalized_threshold, normalized_threshold) 83 84 elif self.settings.noise_threshold_method == "relative_abundance": 85 normalized_threshold = ( 86 max(self.abundance) / 100 87 ) * self.settings.noise_threshold_min_relative_abundance 88 y = (normalized_threshold, normalized_threshold) 89 90 elif self.settings.noise_threshold_method == "absolute_abundance": 91 normalized_threshold = ( 92 self.abundance * self.settings.noise_threshold_absolute_abundance 93 ) 94 y = (normalized_threshold, normalized_threshold) 95 # log noise method not tested for centroid data 96 else: 97 raise Exception( 98 "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class" 99 % self.settings.noise_threshold_method 100 ) 101 102 return x, y 103 104 else: 105 if self.baseline_noise and self.baseline_noise_std: 106 x = (self.mz_exp_profile.min(), self.mz_exp_profile.max()) 107 y = (self.baseline_noise_std, self.baseline_noise_std) 108 109 if self.settings.noise_threshold_method == "minima": 110 # print(self.settings.noise_threshold_min_std) 111 abundance_threshold = self.baseline_noise + ( 112 self.settings.noise_threshold_min_std * self.baseline_noise_std 113 ) 114 115 y = (abundance_threshold, abundance_threshold) 116 117 elif self.settings.noise_threshold_method == "signal_noise": 118 max_sn = self.abundance_profile.max() / self.baseline_noise_std 119 120 normalized_threshold = ( 121 self.abundance_profile.max() 122 * self.settings.noise_threshold_min_s2n 123 ) / max_sn 124 y = (normalized_threshold, normalized_threshold) 125 126 elif self.settings.noise_threshold_method == "relative_abundance": 127 normalized_threshold = ( 128 self.abundance_profile.max() / 100 129 ) * self.settings.noise_threshold_min_relative_abundance 130 y = (normalized_threshold, normalized_threshold) 131 132 elif self.settings.noise_threshold_method == "absolute_abundance": 133 normalized_threshold = ( 134 self.settings.noise_threshold_absolute_abundance 135 ) 136 y = (normalized_threshold, normalized_threshold) 137 138 elif self.settings.noise_threshold_method == "log": 139 normalized_threshold = ( 140 self.settings.noise_threshold_log_nsigma 141 * self.baseline_noise_std 142 ) 143 y = (normalized_threshold, normalized_threshold) 144 145 else: 146 raise Exception( 147 "%s method was not implemented, \ 148 please refer to corems.mass_spectrum.calc.NoiseCalc Class" 149 % self.settings.noise_threshold_method 150 ) 151 152 return x, y 153 154 else: 155 warnings.warn( 156 "Noise Baseline and Noise std not specified,\ 157 defaulting to 0,0 run process_mass_spec() ?" 158 ) 159 return (0, 0), (0, 0) 160 161 def cut_mz_domain_noise(self): 162 """Cut the m/z domain to the noise threshold regions. 163 164 Returns 165 ------- 166 Tuple[np.array, np.array] 167 A tuple containing the m/z and abundance arrays of the truncated spectrum region. 168 """ 169 min_mz_whole_ms = self.mz_exp_profile.min() 170 max_mz_whole_ms = self.mz_exp_profile.max() 171 172 if self.settings.noise_threshold_method == "minima": 173 # this calculation is taking too long (about 2 seconds) 174 number_average_molecular_weight = self.weight_average_molecular_weight( 175 profile=True 176 ) 177 178 # +-200 is a guess for testing only, it needs adjustment for each type of analysis 179 # need to check min mz here or it will break 180 min_mz_noise = number_average_molecular_weight - 100 181 # need to check max mz here or it will break 182 max_mz_noise = number_average_molecular_weight + 100 183 184 else: 185 min_mz_noise = self.settings.noise_min_mz 186 max_mz_noise = self.settings.noise_max_mz 187 188 if min_mz_noise < min_mz_whole_ms: 189 min_mz_noise = min_mz_whole_ms 190 191 if max_mz_noise > max_mz_whole_ms: 192 max_mz_noise = max_mz_whole_ms 193 194 # print(min_mz_noise, max_mz_noise) 195 low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0] 196 # print(self.mz_exp_profile[low_mz_index]) 197 # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise)) 198 199 high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1] 200 201 # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise)) 202 203 if high_mz_index > low_mz_index: 204 # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index]) 205 # pyplot.show() 206 return self.mz_exp_profile[ 207 high_mz_index:low_mz_index 208 ], self.abundance_profile[low_mz_index:high_mz_index] 209 else: 210 # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index]) 211 # pyplot.show() 212 return self.mz_exp_profile[ 213 high_mz_index:low_mz_index 214 ], self.abundance_profile[high_mz_index:low_mz_index] 215 216 def get_noise_average(self, ymincentroid): 217 """Get the average noise and standard deviation. 218 219 Parameters 220 ---------- 221 ymincentroid : np.array 222 The ymincentroid array. 223 224 Returns 225 ------- 226 Tuple[float, float] 227 A tuple containing the average noise and standard deviation. 228 229 """ 230 # assumes noise to be gaussian and estimate noise level by 231 # calculating the valley. 232 233 auto = True if self.settings.noise_threshold_method == "minima" else False 234 235 average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid) 236 237 s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std() 238 239 return average_noise, s_deviation 240 241 def get_abundance_minima_centroid(self, abun_cut): 242 """Get the abundance minima for centroid data. 243 244 Parameters 245 ---------- 246 abun_cut : np.array 247 The abundance cut array. 248 249 Returns 250 ------- 251 np.array 252 The abundance minima array. 253 """ 254 maximum = self.abundance_profile.max() 255 threshold_min = maximum * 1.00 256 257 y = -abun_cut 258 259 dy = y[1:] - y[:-1] 260 """replaces NaN for Infinity""" 261 indices_nan = where(isnan(y))[0] 262 263 if indices_nan.size: 264 y[indices_nan] = inf 265 dy[where(isnan(dy))[0]] = inf 266 267 indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0] 268 269 if indices.size and threshold_min is not None: 270 indices = indices[abun_cut[indices] <= threshold_min] 271 272 return abun_cut[indices] 273 274 def run_log_noise_threshold_calc(self): 275 """Run the log noise threshold calculation. 276 277 278 Returns 279 ------- 280 Tuple[float, float] 281 A tuple containing the average noise and standard deviation. 282 283 Notes 284 -------- 285 Method for estimating the noise based on decimal log of all the data point 286 287 Idea is that you calculate a histogram of of the log10(abundance) values. 288 The maximum of the histogram == the standard deviation of the noise. 289 290 291 For aFT data it is a gaussian distribution of noise - not implemented here! 292 For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463. 293 294 295 See the publication cited above for the derivation of this. 296 297 References 298 -------- 299 1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316 300 301 """ 302 303 if self.is_centroid: 304 raise Exception("log noise Not tested for centroid data") 305 else: 306 # cut the spectrum to ROI 307 mz_cut, abundance_cut = self.cut_mz_domain_noise() 308 # If there are 0 values, the log will fail 309 # But we may have negative values for aFT data, so we check if 0 exists 310 # Need to make a copy of the abundance cut values so we dont overwrite it.... 311 tmp_abundance = abundance_cut.copy() 312 if 0 in tmp_abundance: 313 tmp_abundance[tmp_abundance == 0] = nan 314 tmp_abundance = tmp_abundance[~isnan(tmp_abundance)] 315 # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined. 316 # Hard to generalise - needs more investigation. 317 318 # calculate a histogram of the log10 of the abundance data 319 hist_values = histogram( 320 log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins 321 ) 322 # find the apex of this histogram 323 maxvalidx = where(hist_values[0] == max(hist_values[0])) 324 # get the value of this apex (note - still in log10 units) 325 log_sigma = hist_values[1][maxvalidx] 326 # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry 327 if len(log_sigma) > 1: 328 log_sigma = average(log_sigma) 329 ## To do : check if aFT or mFT and adjust method 330 noise_mid = 10**log_sigma 331 noise_1std = ( 332 noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor 333 ) # for mFT 0.463 334 return float(noise_mid), float(noise_1std) 335 336 def run_noise_threshold_calc(self): 337 """Runs noise threshold calculation (not log based method) 338 339 Returns 340 ------- 341 Tuple[float, float] 342 A tuple containing the average noise and standard deviation. 343 344 """ 345 if self.is_centroid: 346 # calculates noise_baseline and noise_std 347 # needed to run auto noise threshold mode 348 # it is not used for signal to noise nor 349 # relative abudance methods 350 abundances_chunks = chunks(self.abundance, 50) 351 each_min_abund = [min(x) for x in abundances_chunks] 352 353 return average(each_min_abund), std(each_min_abund) 354 355 else: 356 mz_cut, abundance_cut = self.cut_mz_domain_noise() 357 358 if self.settings.noise_threshold_method == "minima": 359 yminima = self.get_abundance_minima_centroid(abundance_cut) 360 361 return self.get_noise_average(yminima) 362 363 else: 364 # pyplot.show() 365 return self.get_noise_average(abundance_cut)
class
NoiseThresholdCalc:
14class NoiseThresholdCalc: 15 """Class for noise threshold calculation. 16 17 Parameters 18 ---------- 19 mass_spectrum : MassSpectrum 20 The mass spectrum object. 21 settings : MSParameters 22 The mass spectrum parameters object. 23 is_centroid : bool 24 Flag indicating whether the mass spectrum is centroid or profile. 25 baseline_noise : float 26 The baseline noise. 27 baseline_noise_std : float 28 The baseline noise standard deviation. 29 max_signal_to_noise : float 30 The maximum signal to noise. 31 max_abundance : float 32 The maximum abundance. 33 abundance : np.array 34 The abundance array. 35 abundance_profile : np.array 36 The abundance profile array. 37 mz_exp : np.array 38 The experimental m/z array. 39 mz_exp_profile : np.array 40 The experimental m/z profile array. 41 42 Attributes 43 ---------- 44 None 45 46 Methods 47 ------- 48 * get_noise_threshold(). Get the noise threshold. 49 * cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions. 50 * get_noise_average(ymincentroid). 51 Get the average noise and standard deviation. 52 * get_abundance_minima_centroid(abun_cut) 53 Get the abundance minima for centroid data. 54 * run_log_noise_threshold_calc(). 55 Run the log noise threshold calculation. 56 * run_noise_threshold_calc(). 57 Run the noise threshold calculation. 58 """ 59 60 def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]: 61 """Get the noise threshold. 62 63 Returns 64 ------- 65 Tuple[Tuple[float, float], Tuple[float, float]] 66 A tuple containing the m/z and abundance noise thresholds. 67 (min_mz, max_mz), (noise_threshold, noise_threshold) 68 """ 69 70 if self.is_centroid: 71 x = min(self.mz_exp), max((self.mz_exp)) 72 73 if self.settings.noise_threshold_method == "minima": 74 abundance_threshold = self.baseline_noise + ( 75 self.settings.noise_threshold_min_std * self.baseline_noise_std 76 ) 77 y = (abundance_threshold, abundance_threshold) 78 79 elif self.settings.noise_threshold_method == "signal_noise": 80 normalized_threshold = ( 81 self.max_abundance * self.settings.noise_threshold_min_s2n 82 ) / self.max_signal_to_noise 83 y = (normalized_threshold, normalized_threshold) 84 85 elif self.settings.noise_threshold_method == "relative_abundance": 86 normalized_threshold = ( 87 max(self.abundance) / 100 88 ) * self.settings.noise_threshold_min_relative_abundance 89 y = (normalized_threshold, normalized_threshold) 90 91 elif self.settings.noise_threshold_method == "absolute_abundance": 92 normalized_threshold = ( 93 self.abundance * self.settings.noise_threshold_absolute_abundance 94 ) 95 y = (normalized_threshold, normalized_threshold) 96 # log noise method not tested for centroid data 97 else: 98 raise Exception( 99 "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class" 100 % self.settings.noise_threshold_method 101 ) 102 103 return x, y 104 105 else: 106 if self.baseline_noise and self.baseline_noise_std: 107 x = (self.mz_exp_profile.min(), self.mz_exp_profile.max()) 108 y = (self.baseline_noise_std, self.baseline_noise_std) 109 110 if self.settings.noise_threshold_method == "minima": 111 # print(self.settings.noise_threshold_min_std) 112 abundance_threshold = self.baseline_noise + ( 113 self.settings.noise_threshold_min_std * self.baseline_noise_std 114 ) 115 116 y = (abundance_threshold, abundance_threshold) 117 118 elif self.settings.noise_threshold_method == "signal_noise": 119 max_sn = self.abundance_profile.max() / self.baseline_noise_std 120 121 normalized_threshold = ( 122 self.abundance_profile.max() 123 * self.settings.noise_threshold_min_s2n 124 ) / max_sn 125 y = (normalized_threshold, normalized_threshold) 126 127 elif self.settings.noise_threshold_method == "relative_abundance": 128 normalized_threshold = ( 129 self.abundance_profile.max() / 100 130 ) * self.settings.noise_threshold_min_relative_abundance 131 y = (normalized_threshold, normalized_threshold) 132 133 elif self.settings.noise_threshold_method == "absolute_abundance": 134 normalized_threshold = ( 135 self.settings.noise_threshold_absolute_abundance 136 ) 137 y = (normalized_threshold, normalized_threshold) 138 139 elif self.settings.noise_threshold_method == "log": 140 normalized_threshold = ( 141 self.settings.noise_threshold_log_nsigma 142 * self.baseline_noise_std 143 ) 144 y = (normalized_threshold, normalized_threshold) 145 146 else: 147 raise Exception( 148 "%s method was not implemented, \ 149 please refer to corems.mass_spectrum.calc.NoiseCalc Class" 150 % self.settings.noise_threshold_method 151 ) 152 153 return x, y 154 155 else: 156 warnings.warn( 157 "Noise Baseline and Noise std not specified,\ 158 defaulting to 0,0 run process_mass_spec() ?" 159 ) 160 return (0, 0), (0, 0) 161 162 def cut_mz_domain_noise(self): 163 """Cut the m/z domain to the noise threshold regions. 164 165 Returns 166 ------- 167 Tuple[np.array, np.array] 168 A tuple containing the m/z and abundance arrays of the truncated spectrum region. 169 """ 170 min_mz_whole_ms = self.mz_exp_profile.min() 171 max_mz_whole_ms = self.mz_exp_profile.max() 172 173 if self.settings.noise_threshold_method == "minima": 174 # this calculation is taking too long (about 2 seconds) 175 number_average_molecular_weight = self.weight_average_molecular_weight( 176 profile=True 177 ) 178 179 # +-200 is a guess for testing only, it needs adjustment for each type of analysis 180 # need to check min mz here or it will break 181 min_mz_noise = number_average_molecular_weight - 100 182 # need to check max mz here or it will break 183 max_mz_noise = number_average_molecular_weight + 100 184 185 else: 186 min_mz_noise = self.settings.noise_min_mz 187 max_mz_noise = self.settings.noise_max_mz 188 189 if min_mz_noise < min_mz_whole_ms: 190 min_mz_noise = min_mz_whole_ms 191 192 if max_mz_noise > max_mz_whole_ms: 193 max_mz_noise = max_mz_whole_ms 194 195 # print(min_mz_noise, max_mz_noise) 196 low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0] 197 # print(self.mz_exp_profile[low_mz_index]) 198 # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise)) 199 200 high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1] 201 202 # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise)) 203 204 if high_mz_index > low_mz_index: 205 # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index]) 206 # pyplot.show() 207 return self.mz_exp_profile[ 208 high_mz_index:low_mz_index 209 ], self.abundance_profile[low_mz_index:high_mz_index] 210 else: 211 # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index]) 212 # pyplot.show() 213 return self.mz_exp_profile[ 214 high_mz_index:low_mz_index 215 ], self.abundance_profile[high_mz_index:low_mz_index] 216 217 def get_noise_average(self, ymincentroid): 218 """Get the average noise and standard deviation. 219 220 Parameters 221 ---------- 222 ymincentroid : np.array 223 The ymincentroid array. 224 225 Returns 226 ------- 227 Tuple[float, float] 228 A tuple containing the average noise and standard deviation. 229 230 """ 231 # assumes noise to be gaussian and estimate noise level by 232 # calculating the valley. 233 234 auto = True if self.settings.noise_threshold_method == "minima" else False 235 236 average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid) 237 238 s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std() 239 240 return average_noise, s_deviation 241 242 def get_abundance_minima_centroid(self, abun_cut): 243 """Get the abundance minima for centroid data. 244 245 Parameters 246 ---------- 247 abun_cut : np.array 248 The abundance cut array. 249 250 Returns 251 ------- 252 np.array 253 The abundance minima array. 254 """ 255 maximum = self.abundance_profile.max() 256 threshold_min = maximum * 1.00 257 258 y = -abun_cut 259 260 dy = y[1:] - y[:-1] 261 """replaces NaN for Infinity""" 262 indices_nan = where(isnan(y))[0] 263 264 if indices_nan.size: 265 y[indices_nan] = inf 266 dy[where(isnan(dy))[0]] = inf 267 268 indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0] 269 270 if indices.size and threshold_min is not None: 271 indices = indices[abun_cut[indices] <= threshold_min] 272 273 return abun_cut[indices] 274 275 def run_log_noise_threshold_calc(self): 276 """Run the log noise threshold calculation. 277 278 279 Returns 280 ------- 281 Tuple[float, float] 282 A tuple containing the average noise and standard deviation. 283 284 Notes 285 -------- 286 Method for estimating the noise based on decimal log of all the data point 287 288 Idea is that you calculate a histogram of of the log10(abundance) values. 289 The maximum of the histogram == the standard deviation of the noise. 290 291 292 For aFT data it is a gaussian distribution of noise - not implemented here! 293 For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463. 294 295 296 See the publication cited above for the derivation of this. 297 298 References 299 -------- 300 1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316 301 302 """ 303 304 if self.is_centroid: 305 raise Exception("log noise Not tested for centroid data") 306 else: 307 # cut the spectrum to ROI 308 mz_cut, abundance_cut = self.cut_mz_domain_noise() 309 # If there are 0 values, the log will fail 310 # But we may have negative values for aFT data, so we check if 0 exists 311 # Need to make a copy of the abundance cut values so we dont overwrite it.... 312 tmp_abundance = abundance_cut.copy() 313 if 0 in tmp_abundance: 314 tmp_abundance[tmp_abundance == 0] = nan 315 tmp_abundance = tmp_abundance[~isnan(tmp_abundance)] 316 # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined. 317 # Hard to generalise - needs more investigation. 318 319 # calculate a histogram of the log10 of the abundance data 320 hist_values = histogram( 321 log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins 322 ) 323 # find the apex of this histogram 324 maxvalidx = where(hist_values[0] == max(hist_values[0])) 325 # get the value of this apex (note - still in log10 units) 326 log_sigma = hist_values[1][maxvalidx] 327 # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry 328 if len(log_sigma) > 1: 329 log_sigma = average(log_sigma) 330 ## To do : check if aFT or mFT and adjust method 331 noise_mid = 10**log_sigma 332 noise_1std = ( 333 noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor 334 ) # for mFT 0.463 335 return float(noise_mid), float(noise_1std) 336 337 def run_noise_threshold_calc(self): 338 """Runs noise threshold calculation (not log based method) 339 340 Returns 341 ------- 342 Tuple[float, float] 343 A tuple containing the average noise and standard deviation. 344 345 """ 346 if self.is_centroid: 347 # calculates noise_baseline and noise_std 348 # needed to run auto noise threshold mode 349 # it is not used for signal to noise nor 350 # relative abudance methods 351 abundances_chunks = chunks(self.abundance, 50) 352 each_min_abund = [min(x) for x in abundances_chunks] 353 354 return average(each_min_abund), std(each_min_abund) 355 356 else: 357 mz_cut, abundance_cut = self.cut_mz_domain_noise() 358 359 if self.settings.noise_threshold_method == "minima": 360 yminima = self.get_abundance_minima_centroid(abundance_cut) 361 362 return self.get_noise_average(yminima) 363 364 else: 365 # pyplot.show() 366 return self.get_noise_average(abundance_cut)
Class for noise threshold calculation.
Parameters
- mass_spectrum (MassSpectrum): The mass spectrum object.
- settings (MSParameters): The mass spectrum parameters object.
- is_centroid (bool): Flag indicating whether the mass spectrum is centroid or profile.
- baseline_noise (float): The baseline noise.
- baseline_noise_std (float): The baseline noise standard deviation.
- max_signal_to_noise (float): The maximum signal to noise.
- max_abundance (float): The maximum abundance.
- abundance (np.array): The abundance array.
- abundance_profile (np.array): The abundance profile array.
- mz_exp (np.array): The experimental m/z array.
- mz_exp_profile (np.array): The experimental m/z profile array.
Attributes
- None
Methods
- get_noise_threshold(). Get the noise threshold.
- cut_mz_domain_noise(). Cut the m/z domain to the noise threshold regions.
- get_noise_average(ymincentroid). Get the average noise and standard deviation.
- get_abundance_minima_centroid(abun_cut) Get the abundance minima for centroid data.
- run_log_noise_threshold_calc(). Run the log noise threshold calculation.
- run_noise_threshold_calc(). Run the noise threshold calculation.
def
get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
60 def get_noise_threshold(self) -> Tuple[Tuple[float, float], Tuple[float, float]]: 61 """Get the noise threshold. 62 63 Returns 64 ------- 65 Tuple[Tuple[float, float], Tuple[float, float]] 66 A tuple containing the m/z and abundance noise thresholds. 67 (min_mz, max_mz), (noise_threshold, noise_threshold) 68 """ 69 70 if self.is_centroid: 71 x = min(self.mz_exp), max((self.mz_exp)) 72 73 if self.settings.noise_threshold_method == "minima": 74 abundance_threshold = self.baseline_noise + ( 75 self.settings.noise_threshold_min_std * self.baseline_noise_std 76 ) 77 y = (abundance_threshold, abundance_threshold) 78 79 elif self.settings.noise_threshold_method == "signal_noise": 80 normalized_threshold = ( 81 self.max_abundance * self.settings.noise_threshold_min_s2n 82 ) / self.max_signal_to_noise 83 y = (normalized_threshold, normalized_threshold) 84 85 elif self.settings.noise_threshold_method == "relative_abundance": 86 normalized_threshold = ( 87 max(self.abundance) / 100 88 ) * self.settings.noise_threshold_min_relative_abundance 89 y = (normalized_threshold, normalized_threshold) 90 91 elif self.settings.noise_threshold_method == "absolute_abundance": 92 normalized_threshold = ( 93 self.abundance * self.settings.noise_threshold_absolute_abundance 94 ) 95 y = (normalized_threshold, normalized_threshold) 96 # log noise method not tested for centroid data 97 else: 98 raise Exception( 99 "%s method was not implemented, please refer to corems.mass_spectrum.calc.NoiseCalc Class" 100 % self.settings.noise_threshold_method 101 ) 102 103 return x, y 104 105 else: 106 if self.baseline_noise and self.baseline_noise_std: 107 x = (self.mz_exp_profile.min(), self.mz_exp_profile.max()) 108 y = (self.baseline_noise_std, self.baseline_noise_std) 109 110 if self.settings.noise_threshold_method == "minima": 111 # print(self.settings.noise_threshold_min_std) 112 abundance_threshold = self.baseline_noise + ( 113 self.settings.noise_threshold_min_std * self.baseline_noise_std 114 ) 115 116 y = (abundance_threshold, abundance_threshold) 117 118 elif self.settings.noise_threshold_method == "signal_noise": 119 max_sn = self.abundance_profile.max() / self.baseline_noise_std 120 121 normalized_threshold = ( 122 self.abundance_profile.max() 123 * self.settings.noise_threshold_min_s2n 124 ) / max_sn 125 y = (normalized_threshold, normalized_threshold) 126 127 elif self.settings.noise_threshold_method == "relative_abundance": 128 normalized_threshold = ( 129 self.abundance_profile.max() / 100 130 ) * self.settings.noise_threshold_min_relative_abundance 131 y = (normalized_threshold, normalized_threshold) 132 133 elif self.settings.noise_threshold_method == "absolute_abundance": 134 normalized_threshold = ( 135 self.settings.noise_threshold_absolute_abundance 136 ) 137 y = (normalized_threshold, normalized_threshold) 138 139 elif self.settings.noise_threshold_method == "log": 140 normalized_threshold = ( 141 self.settings.noise_threshold_log_nsigma 142 * self.baseline_noise_std 143 ) 144 y = (normalized_threshold, normalized_threshold) 145 146 else: 147 raise Exception( 148 "%s method was not implemented, \ 149 please refer to corems.mass_spectrum.calc.NoiseCalc Class" 150 % self.settings.noise_threshold_method 151 ) 152 153 return x, y 154 155 else: 156 warnings.warn( 157 "Noise Baseline and Noise std not specified,\ 158 defaulting to 0,0 run process_mass_spec() ?" 159 ) 160 return (0, 0), (0, 0)
Get the noise threshold.
Returns
- Tuple[Tuple[float, float], Tuple[float, float]]: A tuple containing the m/z and abundance noise thresholds. (min_mz, max_mz), (noise_threshold, noise_threshold)
def
cut_mz_domain_noise(self):
162 def cut_mz_domain_noise(self): 163 """Cut the m/z domain to the noise threshold regions. 164 165 Returns 166 ------- 167 Tuple[np.array, np.array] 168 A tuple containing the m/z and abundance arrays of the truncated spectrum region. 169 """ 170 min_mz_whole_ms = self.mz_exp_profile.min() 171 max_mz_whole_ms = self.mz_exp_profile.max() 172 173 if self.settings.noise_threshold_method == "minima": 174 # this calculation is taking too long (about 2 seconds) 175 number_average_molecular_weight = self.weight_average_molecular_weight( 176 profile=True 177 ) 178 179 # +-200 is a guess for testing only, it needs adjustment for each type of analysis 180 # need to check min mz here or it will break 181 min_mz_noise = number_average_molecular_weight - 100 182 # need to check max mz here or it will break 183 max_mz_noise = number_average_molecular_weight + 100 184 185 else: 186 min_mz_noise = self.settings.noise_min_mz 187 max_mz_noise = self.settings.noise_max_mz 188 189 if min_mz_noise < min_mz_whole_ms: 190 min_mz_noise = min_mz_whole_ms 191 192 if max_mz_noise > max_mz_whole_ms: 193 max_mz_noise = max_mz_whole_ms 194 195 # print(min_mz_noise, max_mz_noise) 196 low_mz_index = where(self.mz_exp_profile >= min_mz_noise)[0][0] 197 # print(self.mz_exp_profile[low_mz_index]) 198 # low_mz_index = (argmax(self.mz_exp_profile <= min_mz_noise)) 199 200 high_mz_index = where(self.mz_exp_profile <= max_mz_noise)[-1][-1] 201 202 # high_mz_index = (argmax(self.mz_exp_profile <= max_mz_noise)) 203 204 if high_mz_index > low_mz_index: 205 # pyplot.plot(self.mz_exp_profile[low_mz_index:high_mz_index], self.abundance_profile[low_mz_index:high_mz_index]) 206 # pyplot.show() 207 return self.mz_exp_profile[ 208 high_mz_index:low_mz_index 209 ], self.abundance_profile[low_mz_index:high_mz_index] 210 else: 211 # pyplot.plot(self.mz_exp_profile[high_mz_index:low_mz_index], self.abundance_profile[high_mz_index:low_mz_index]) 212 # pyplot.show() 213 return self.mz_exp_profile[ 214 high_mz_index:low_mz_index 215 ], self.abundance_profile[high_mz_index:low_mz_index]
Cut the m/z domain to the noise threshold regions.
Returns
- Tuple[np.array, np.array]: A tuple containing the m/z and abundance arrays of the truncated spectrum region.
def
get_noise_average(self, ymincentroid):
217 def get_noise_average(self, ymincentroid): 218 """Get the average noise and standard deviation. 219 220 Parameters 221 ---------- 222 ymincentroid : np.array 223 The ymincentroid array. 224 225 Returns 226 ------- 227 Tuple[float, float] 228 A tuple containing the average noise and standard deviation. 229 230 """ 231 # assumes noise to be gaussian and estimate noise level by 232 # calculating the valley. 233 234 auto = True if self.settings.noise_threshold_method == "minima" else False 235 236 average_noise = median((ymincentroid)) * 2 if auto else median(ymincentroid) 237 238 s_deviation = ymincentroid.std() * 3 if auto else ymincentroid.std() 239 240 return average_noise, s_deviation
Get the average noise and standard deviation.
Parameters
- ymincentroid (np.array): The ymincentroid array.
Returns
- Tuple[float, float]: A tuple containing the average noise and standard deviation.
def
get_abundance_minima_centroid(self, abun_cut):
242 def get_abundance_minima_centroid(self, abun_cut): 243 """Get the abundance minima for centroid data. 244 245 Parameters 246 ---------- 247 abun_cut : np.array 248 The abundance cut array. 249 250 Returns 251 ------- 252 np.array 253 The abundance minima array. 254 """ 255 maximum = self.abundance_profile.max() 256 threshold_min = maximum * 1.00 257 258 y = -abun_cut 259 260 dy = y[1:] - y[:-1] 261 """replaces NaN for Infinity""" 262 indices_nan = where(isnan(y))[0] 263 264 if indices_nan.size: 265 y[indices_nan] = inf 266 dy[where(isnan(dy))[0]] = inf 267 268 indices = where((hstack((dy, 0)) < 0) & (hstack((0, dy)) > 0))[0] 269 270 if indices.size and threshold_min is not None: 271 indices = indices[abun_cut[indices] <= threshold_min] 272 273 return abun_cut[indices]
Get the abundance minima for centroid data.
Parameters
- abun_cut (np.array): The abundance cut array.
Returns
- np.array: The abundance minima array.
def
run_log_noise_threshold_calc(self):
275 def run_log_noise_threshold_calc(self): 276 """Run the log noise threshold calculation. 277 278 279 Returns 280 ------- 281 Tuple[float, float] 282 A tuple containing the average noise and standard deviation. 283 284 Notes 285 -------- 286 Method for estimating the noise based on decimal log of all the data point 287 288 Idea is that you calculate a histogram of of the log10(abundance) values. 289 The maximum of the histogram == the standard deviation of the noise. 290 291 292 For aFT data it is a gaussian distribution of noise - not implemented here! 293 For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463. 294 295 296 See the publication cited above for the derivation of this. 297 298 References 299 -------- 300 1. dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316 301 302 """ 303 304 if self.is_centroid: 305 raise Exception("log noise Not tested for centroid data") 306 else: 307 # cut the spectrum to ROI 308 mz_cut, abundance_cut = self.cut_mz_domain_noise() 309 # If there are 0 values, the log will fail 310 # But we may have negative values for aFT data, so we check if 0 exists 311 # Need to make a copy of the abundance cut values so we dont overwrite it.... 312 tmp_abundance = abundance_cut.copy() 313 if 0 in tmp_abundance: 314 tmp_abundance[tmp_abundance == 0] = nan 315 tmp_abundance = tmp_abundance[~isnan(tmp_abundance)] 316 # It seems there are edge cases of sparse but high S/N data where the wrong values may be determined. 317 # Hard to generalise - needs more investigation. 318 319 # calculate a histogram of the log10 of the abundance data 320 hist_values = histogram( 321 log10(tmp_abundance), bins=self.settings.noise_threshold_log_nsigma_bins 322 ) 323 # find the apex of this histogram 324 maxvalidx = where(hist_values[0] == max(hist_values[0])) 325 # get the value of this apex (note - still in log10 units) 326 log_sigma = hist_values[1][maxvalidx] 327 # If the histogram had more than one maximum frequency bin, we need to reduce that to one entry 328 if len(log_sigma) > 1: 329 log_sigma = average(log_sigma) 330 ## To do : check if aFT or mFT and adjust method 331 noise_mid = 10**log_sigma 332 noise_1std = ( 333 noise_mid * self.settings.noise_threshold_log_nsigma_corr_factor 334 ) # for mFT 0.463 335 return float(noise_mid), float(noise_1std)
Run the log noise threshold calculation.
Returns
- Tuple[float, float]: A tuple containing the average noise and standard deviation.
Notes
Method for estimating the noise based on decimal log of all the data point
Idea is that you calculate a histogram of of the log10(abundance) values. The maximum of the histogram == the standard deviation of the noise.
For aFT data it is a gaussian distribution of noise - not implemented here! For mFT data it is a Rayleigh distribution, and the value is actually 10^(abu_max)*0.463.
See the publication cited above for the derivation of this.
References
- dx.doi.org/10.1021/ac403278t | Anal. Chem. 2014, 86, 3308−3316
def
run_noise_threshold_calc(self):
337 def run_noise_threshold_calc(self): 338 """Runs noise threshold calculation (not log based method) 339 340 Returns 341 ------- 342 Tuple[float, float] 343 A tuple containing the average noise and standard deviation. 344 345 """ 346 if self.is_centroid: 347 # calculates noise_baseline and noise_std 348 # needed to run auto noise threshold mode 349 # it is not used for signal to noise nor 350 # relative abudance methods 351 abundances_chunks = chunks(self.abundance, 50) 352 each_min_abund = [min(x) for x in abundances_chunks] 353 354 return average(each_min_abund), std(each_min_abund) 355 356 else: 357 mz_cut, abundance_cut = self.cut_mz_domain_noise() 358 359 if self.settings.noise_threshold_method == "minima": 360 yminima = self.get_abundance_minima_centroid(abundance_cut) 361 362 return self.get_noise_average(yminima) 363 364 else: 365 # pyplot.show() 366 return self.get_noise_average(abundance_cut)
Runs noise threshold calculation (not log based method)
Returns
- Tuple[float, float]: A tuple containing the average noise and standard deviation.