corems.ms_peak.calc.MSPeakCalc
1__author__ = "Yuri E. Corilo" 2__date__ = "Jun 04, 2019" 3 4import warnings 5 6import pyswarm 7from lmfit import models 8from numpy import ( 9 ceil, 10 exp, 11 flip, 12 floor, 13 linspace, 14 log, 15 nan, 16 pi, 17 poly1d, 18 polyfit, 19 rint, 20 sqrt, 21 square, 22 trapz, 23) 24 25from corems.encapsulation.constant import Atoms 26from corems.encapsulation.factory.parameters import MSParameters 27 28 29class MSPeakCalculation: 30 """Class to perform calculations on MSPeak objects. 31 32 This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models. 33 34 Parameters 35 ---------- 36 None 37 38 Attributes 39 ---------- 40 _ms_parent : MSParent 41 The parent MSParent object associated with the MSPeakCalculation object. 42 mz_exp : float 43 The experimental m/z value of the peak. 44 peak_left_index : int 45 The start scan index of the peak. 46 peak_right_index : int 47 The final scan index of the peak. 48 resolving_power : float 49 The resolving power of the peak. 50 51 Methods 52 ------- 53 * _calc_kmd(dict_base). 54 Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula. 55 * calc_area(). 56 Calculate the peak area using numpy's trapezoidal fit. 57 * fit_peak(mz_extend=6, delta_rp=0, model='Gaussian'). 58 Perform lineshape analysis on a peak using lmfit module. 59 * voigt_pso(w, r, yoff, width, loc, a). 60 Calculate the Voigt function for particle swarm optimization (PSO) fitting. 61 * objective_pso(x, w, u). 62 Calculate the objective function for PSO fitting. 63 * minimize_pso(lower, upper, w, u). 64 Minimize the objective function using the particle swarm optimization algorithm. 65 * fit_peak_pso(mz_extend=6, upsample_multiplier=5). 66 Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting. 67 * voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 68 [Legacy] Perform voigt lineshape analysis on a peak. 69 * pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5). 70 [Legacy] Perform pseudovoigt lineshape analysis on a peak. 71 * lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 72 [Legacy] Perform lorentz lineshape analysis on a peak. 73 * gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 74 [Legacy] Perform gaussian lineshape analysis on a peak. 75 * get_mz_domain(oversample_multiplier, mz_overlay). 76 [Legacy] Resample/interpolate datapoints for lineshape analysis. 77 * number_possible_assignments(). 78 Return the number of possible molecular formula assignments for the peak. 79 * molecular_formula_lowest_error(). 80 Return the molecular formula with the smallest absolute mz error. 81 * molecular_formula_highest_prob_score(). 82 Return the molecular formula with the highest confidence score. 83 * molecular_formula_earth_filter(lowest_error=True). 84 Filter molecular formula using the 'Earth' filter. 85 * molecular_formula_water_filter(lowest_error=True). 86 Filter molecular formula using the 'Water' filter. 87 * molecular_formula_air_filter(lowest_error=True). 88 Filter molecular formula using the 'Air' filter. 89 * cia_score_S_P_error(). 90 Compound Identification Algorithm SP Error - Assignment Filter. 91 * cia_score_N_S_P_error(). 92 Compound Identification Algorithm NSP Error - Assignment Filter. 93 94 """ 95 96 def _calc_kmd(self, dict_base): 97 """Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula 98 99 Parameters 100 ---------- 101 dict_base : dict 102 dictionary with the base formula to be used in the calculation 103 Default is CH2, e.g. 104 dict_base = {"C": 1, "H": 2} 105 """ 106 107 if self._ms_parent: 108 # msPeak obj does have a ms object parent 109 kendrick_rounding_method = ( 110 self._ms_parent.mspeaks_settings.kendrick_rounding_method 111 ) # rounding method can be one of floor, ceil or round 112 # msPeak obj does not have a ms object parent 113 else: 114 kendrick_rounding_method = MSParameters.ms_peak.kendrick_rounding_method 115 116 mass = 0 117 for atom in dict_base.keys(): 118 mass += Atoms.atomic_masses.get(atom) * dict_base.get(atom) 119 120 kendrick_mass = (int(mass) / mass) * self.mz_exp 121 122 if kendrick_rounding_method == "ceil": 123 nominal_km = ceil(kendrick_mass) 124 125 elif kendrick_rounding_method == "round": 126 nominal_km = rint(kendrick_mass) 127 128 elif kendrick_rounding_method == "floor": 129 nominal_km = floor(kendrick_mass) 130 131 else: 132 raise Exception( 133 "%s method was not implemented, please refer to corems.ms_peak.calc.MSPeakCalc Class" 134 % kendrick_rounding_method 135 ) 136 137 kmd = nominal_km - kendrick_mass 138 139 # kmd = (nominal_km - km) * 1 140 # kmd = round(kmd,0) 141 142 return kmd, kendrick_mass, nominal_km 143 144 def calc_area(self): 145 """Calculate the peak area using numpy's trapezoidal fit 146 147 uses provided mz_domain to accurately integrate areas independent of digital resolution 148 149 Returns 150 ------- 151 float 152 peak area 153 """ 154 if self.peak_right_index > self.peak_left_index: 155 yy = self._ms_parent.abundance_profile[ 156 self.peak_left_index : self.peak_right_index 157 ] 158 xx = self._ms_parent.mz_exp_profile[ 159 self.peak_left_index : self.peak_right_index 160 ] 161 # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first 162 if xx[0] > xx[-1]: 163 xx = flip(xx) 164 yy = flip(yy) 165 return float(trapz(yy, xx)) 166 167 else: 168 warnings.warn( 169 "Peak Area Calculation for m/z {} has failed".format(self.mz_exp) 170 ) 171 return nan 172 173 def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"): 174 """Lineshape analysis on a peak using lmfit module. 175 176 Model and fit peak lineshape by defined function - using lmfit module 177 Does not oversample/resample/interpolate data points 178 Better to go back to time domain and perform more zero filling - if possible. 179 180 Parameters 181 ---------- 182 mz_extend : int 183 extra points left and right of peak definition to include in fitting 184 delta_rp : float 185 delta resolving power to add to resolving power 186 model : str 187 Type of lineshape model to use. 188 Models allowed: Gaussian, Lorentz, Voigt 189 190 Returns 191 ----- 192 mz_domain : ndarray 193 x-axis domain for fit 194 fit_peak : lmfit object 195 fit results object from lmfit module 196 197 Notes 198 ----- 199 Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module 200 mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit. 201 Takes about 10ms per peak 202 """ 203 start_index = ( 204 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 205 ) 206 final_index = ( 207 self.peak_right_index + mz_extend 208 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 209 else self.peak_right_index 210 ) 211 212 # check if MSPeak contains the resolving power info 213 if self.resolving_power: 214 # full width half maximum distance 215 self.fwhm = self.mz_exp / (self.resolving_power + delta_rp) 216 217 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 218 abundance_domain = self._ms_parent.abundance_profile[ 219 start_index:final_index 220 ] 221 222 if model == "Gaussian": 223 # stardard deviation 224 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 225 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 226 model = models.GaussianModel() 227 params = model.make_params( 228 center=self.mz_exp, amplitude=amplitude, sigma=sigma 229 ) 230 231 elif model == "Lorentz": 232 # stardard deviation 233 sigma = self.fwhm / 2 234 amplitude = sigma * pi * self.abundance 235 model = models.LorentzianModel() 236 params = model.make_params( 237 center=self.mz_exp, amplitude=amplitude, sigma=sigma 238 ) 239 240 elif model == "Voigt": 241 # stardard deviation 242 sigma = self.fwhm / 3.6013 243 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 244 model = models.VoigtModel() 245 params = model.make_params( 246 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 247 ) 248 else: 249 raise LookupError("model lineshape not known or defined") 250 251 # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object 252 fit_peak = model.fit(abundance_domain, params=params, x=mz_domain) 253 return mz_domain, fit_peak 254 255 else: 256 raise LookupError( 257 "resolving power is not defined, try to use set_max_resolving_power()" 258 ) 259 260 def voigt_pso(self, w, r, yoff, width, loc, a): 261 """Voigt function for particle swarm optimisation (PSO) fitting 262 263 From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py. 264 Calculates a Voigt function over w based on the relevant properties of the distribution. 265 266 Parameters 267 ---------- 268 w : ndarray 269 Array over which the Voigt function will be evaluated. 270 r : float 271 Ratio between the Guassian and Lorentzian functions. 272 yoff : float 273 Y-offset of the Voigt function. 274 width : float 275 The width of the Voigt function. 276 loc : float 277 Center of the Voigt function. 278 a : float 279 Area of the Voigt function. 280 Returns 281 ------- 282 V : ndarray 283 Array defining the Voigt function over w. 284 285 References 286 ---------- 287 1. https://github.com/pnnl/nmrfit 288 289 Notes 290 ----- 291 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 292 293 """ 294 # Lorentzian component 295 L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2) 296 297 # Gaussian component 298 G = ( 299 (2 / width) 300 * sqrt(log(2) / pi) 301 * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2)) 302 ) 303 304 # Voigt body 305 V = (yoff + a) * (r * L + (1 - r) * G) 306 307 return V 308 309 def objective_pso(self, x, w, u): 310 """Objective function for particle swarm optimisation (PSO) fitting 311 312 The objective function used to fit supplied data. Evaluates sum of squared differences between the fit and the data. 313 314 Parameters 315 ---------- 316 x : list of floats 317 Parameter vector. 318 w : ndarray 319 Array of frequency data. 320 u : ndarray 321 Array of data to be fit. 322 323 Returns 324 ------- 325 rmse : float 326 Root mean square error between the data and fit. 327 328 References 329 ---------- 330 1. https://github.com/pnnl/nmrfit 331 332 """ 333 # global parameters 334 r, width, loc, a = x 335 yoff = 0 336 337 # calculate fit for V 338 V_fit = self.voigt_pso(w, r, yoff, width, loc, a) 339 340 # real component RMSE 341 rmse = sqrt(square((u - V_fit)).mean(axis=None)) 342 343 # return the total RMSE 344 return rmse 345 346 def minimize_pso(self, lower, upper, w, u): 347 """Minimization function for particle swarm optimisation (PSO) fitting 348 349 Minimizes the objective function using the particle swarm optimization algorithm. 350 Minimization function based on defined parameters 351 352 353 Parameters 354 ---------- 355 lower : list of floats 356 Lower bounds for the parameters. 357 upper : list of floats 358 Upper bounds for the parameters. 359 w : ndarray 360 Array of frequency data. 361 u : ndarray 362 Array of data to be fit. 363 364 Notes 365 ----- 366 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 367 Current parameters take ~2 seconds per peak. 368 369 370 References 371 ---------- 372 1. https://github.com/pnnl/nmrfit 373 374 """ 375 # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters. 376 # TODO - Refactor PSO fitting into its own class? 377 378 xopt, fopt = pyswarm.pso( 379 self.objective_pso, 380 lower, 381 upper, 382 args=(w, u), 383 swarmsize=1000, 384 maxiter=5000, 385 omega=-0.2134, 386 phip=-0.3344, 387 phig=2.3259, 388 ) 389 return xopt, fopt 390 391 def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5): 392 """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting 393 394 Function to fit a Voigt peakshape using particle swarm optimisation (PSO). 395 Should return better results than lmfit, but much more computationally expensive 396 397 Parameters 398 ---------- 399 mz_extend : int, optional 400 extra points left and right of peak definition to include in fitting. Defaults to 6. 401 upsample_multiplier : int, optional 402 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5. 403 404 Returns 405 ------- 406 xopt : array 407 variables describing the voigt function. 408 G/L ratio, width (fwhm), apex (x-axis), area. 409 y-axis offset is fixed at 0 410 fopt : float 411 objective score (rmse) 412 psfit : array 413 recalculated y values based on function and optimised fit 414 psfit_hdp : tuple of arrays 415 0 - linspace x-axis upsampled grid 416 1 - recalculated y values based on function and upsampled x-axis grid 417 Does not change results, but aids in visualisation of the 'true' voigt lineshape 418 419 Notes 420 ----- 421 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 422 """ 423 # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc) 424 # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0. 425 # Thermo data is noise reduced by also noise subtracted, so starts at 0 426 # Absorption mode/phased data will have positive and negative components and may not be baseline corrected 427 428 start_index = ( 429 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 430 ) 431 final_index = ( 432 self.peak_right_index + mz_extend 433 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 434 else self.peak_right_index 435 ) 436 437 # check if MSPeak contains the resolving power info 438 if self.resolving_power: 439 # full width half maximum distance 440 self.fwhm = self.mz_exp / (self.resolving_power) 441 442 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 443 abundance_domain = self._ms_parent.abundance_profile[ 444 start_index:final_index 445 ] 446 lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0] 447 upper = [ 448 1, 449 self.fwhm * 1.2, 450 (self.mz_exp + 0.0005), 451 self.abundance / self.signal_to_noise, 452 ] 453 xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain) 454 455 psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3]) 456 psfit_hdp_x = linspace( 457 min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier 458 ) 459 psfit_hdp = self.voigt_pso( 460 psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3] 461 ) 462 return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp) 463 else: 464 raise LookupError( 465 "resolving power is not defined, try to use set_max_resolving_power()" 466 ) 467 468 def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 469 """[Legacy] Voigt lineshape analysis function 470 Legacy function for voigt lineshape analysis 471 472 Parameters 473 ---------- 474 oversample_multiplier : int 475 factor to increase x-axis points by for simulation of fitted lineshape function 476 delta_rp : float 477 delta resolving power to add to resolving power 478 mz_overlay : int 479 extra points left and right of peak definition to include in fitting 480 481 Returns 482 ------- 483 mz_domain : ndarray 484 x-axis domain for fit 485 calc_abundance : ndarray 486 calculated abundance profile based on voigt function 487 """ 488 489 if self.resolving_power: 490 # full width half maximum distance 491 self.fwhm = self.mz_exp / ( 492 self.resolving_power + delta_rp 493 ) # self.resolving_power) 494 495 # stardart deviation 496 sigma = self.fwhm / 3.6013 497 498 # half width baseline distance 499 500 # mz_domain = linspace(self.mz_exp - hw_base_distance, 501 # self.mz_exp + hw_base_distance, datapoint) 502 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 503 504 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 505 506 # TODO derive amplitude 507 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 508 509 model = models.VoigtModel() 510 511 params = model.make_params( 512 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 513 ) 514 515 calc_abundance = model.eval(params=params, x=mz_domain) 516 517 return mz_domain, calc_abundance 518 519 else: 520 raise LookupError( 521 "resolving power is not defined, try to use set_max_resolving_power()" 522 ) 523 524 def pseudovoigt( 525 self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5 526 ): 527 """[Legacy] pseudovoigt lineshape function 528 529 Legacy function for pseudovoigt lineshape analysis. 530 Note - Code may not be functional currently. 531 532 Parameters 533 ---------- 534 oversample_multiplier : int, optional 535 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1. 536 delta_rp : float, optional 537 delta resolving power to add to resolving power. Defaults to 0. 538 mz_overlay : int, optional 539 extra points left and right of peak definition to include in fitting. Defaults to 1. 540 fraction : float, optional 541 fraction of gaussian component in pseudovoigt function. Defaults to 0.5. 542 543 """ 544 if self.resolving_power: 545 # full width half maximum distance 546 self.fwhm = self.mz_exp / ( 547 self.resolving_power + delta_rp 548 ) # self.resolving_power) 549 550 # stardart deviation 551 sigma = self.fwhm / 2 552 553 # half width baseline distance 554 555 # mz_domain = linspace(self.mz_exp - hw_base_distance, 556 # self.mz_exp + hw_base_distance, datapoint) 557 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 558 559 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 560 model = models.PseudoVoigtModel() 561 562 # TODO derive amplitude 563 gamma = sigma 564 565 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 566 amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / ( 567 (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma) 568 ) 569 570 params = model.make_params(center=self.mz_exp, sigma=sigma) 571 572 calc_abundance = model.eval(params=params, x=mz_domain) 573 574 return mz_domain, calc_abundance 575 576 else: 577 raise LookupError( 578 "resolving power is not defined, try to use set_max_resolving_power()" 579 ) 580 581 def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 582 """[Legacy] Lorentz lineshape analysis function 583 584 Legacy function for lorentz lineshape analysis 585 586 Parameters 587 ---------- 588 oversample_multiplier : int 589 factor to increase x-axis points by for simulation of fitted lineshape function 590 delta_rp : float 591 delta resolving power to add to resolving power 592 mz_overlay : int 593 extra points left and right of peak definition to include in fitting 594 595 Returns 596 ------- 597 mz_domain : ndarray 598 x-axis domain for fit 599 calc_abundance : ndarray 600 calculated abundance profile based on lorentz function 601 602 """ 603 if self.resolving_power: 604 # full width half maximum distance 605 self.fwhm = self.mz_exp / ( 606 self.resolving_power + delta_rp 607 ) # self.resolving_power) 608 609 # stardart deviation 610 sigma = self.fwhm / 2 611 612 # half width baseline distance 613 hw_base_distance = 8 * sigma 614 615 # mz_domain = linspace(self.mz_exp - hw_base_distance, 616 # self.mz_exp + hw_base_distance, datapoint) 617 618 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 619 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 620 model = models.LorentzianModel() 621 622 amplitude = sigma * pi * self.abundance 623 624 params = model.make_params( 625 center=self.mz_exp, amplitude=amplitude, sigma=sigma 626 ) 627 628 calc_abundance = model.eval(params=params, x=mz_domain) 629 630 return mz_domain, calc_abundance 631 632 else: 633 raise LookupError( 634 "resolving power is not defined, try to use set_max_resolving_power()" 635 ) 636 637 def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 638 """[Legacy] Gaussian lineshape analysis function 639 Legacy gaussian lineshape analysis function 640 641 Parameters 642 ---------- 643 oversample_multiplier : int 644 factor to increase x-axis points by for simulation of fitted lineshape function 645 delta_rp : float 646 delta resolving power to add to resolving power 647 mz_overlay : int 648 extra points left and right of peak definition to include in fitting 649 650 Returns 651 ------- 652 mz_domain : ndarray 653 x-axis domain for fit 654 calc_abundance : ndarray 655 calculated abundance profile based on gaussian function 656 657 658 """ 659 660 # check if MSPeak contains the resolving power info 661 if self.resolving_power: 662 # full width half maximum distance 663 self.fwhm = self.mz_exp / ( 664 self.resolving_power + delta_rp 665 ) # self.resolving_power) 666 667 # stardart deviation 668 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 669 670 # half width baseline distance 671 # hw_base_distance = (3.2 * s) 672 673 # match_loz_factor = 3 674 675 # n_d = hw_base_distance * match_loz_factor 676 677 # mz_domain = linspace( 678 # self.mz_exp - n_d, self.mz_exp + n_d, datapoint) 679 680 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 681 682 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 683 684 # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s) 685 686 model = models.GaussianModel() 687 688 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 689 690 params = model.make_params( 691 center=self.mz_exp, amplitude=amplitude, sigma=sigma 692 ) 693 694 calc_abundance = model.eval(params=params, x=mz_domain) 695 696 return mz_domain, calc_abundance 697 698 else: 699 raise LookupError( 700 "resolving power is not defined, try to use set_max_resolving_power()" 701 ) 702 703 def get_mz_domain(self, oversample_multiplier, mz_overlay): 704 """[Legacy] function to resample/interpolate datapoints for lineshape analysis 705 706 This code is used for the legacy line fitting functions and not recommended. 707 Legacy function to support expanding mz domain for legacy lineshape functions 708 709 Parameters 710 ---------- 711 oversample_multiplier : int 712 factor to increase x-axis points by for simulation of fitted lineshape function 713 mz_overlay : int 714 extra points left and right of peak definition to include in fitting 715 716 Returns 717 ------- 718 mz_domain : ndarray 719 x-axis domain for fit 720 721 """ 722 start_index = ( 723 self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0 724 ) 725 final_index = ( 726 self.peak_right_index + mz_overlay 727 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 728 else self.peak_right_index 729 ) 730 731 if oversample_multiplier == 1: 732 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 733 734 else: 735 # we assume a linear correlation for m/z and datapoits 736 # which is only true if the m/z range in narrow (within 1 m/z unit) 737 # this is not true for a wide m/z range 738 739 indexes = range(start_index, final_index + 1) 740 mz = self._ms_parent.mz_exp_profile[indexes] 741 pol = poly1d(polyfit(indexes, mz, 1)) 742 oversampled_indexes = linspace( 743 start_index, 744 final_index, 745 (final_index - start_index) * oversample_multiplier, 746 ) 747 mz_domain = pol(oversampled_indexes) 748 749 return mz_domain 750 751 @property 752 def number_possible_assignments( 753 self, 754 ): 755 return len(self.molecular_formulas) 756 757 def molecular_formula_lowest_error(self): 758 """Return the molecular formula with the smallest absolute mz error""" 759 760 return min(self.molecular_formulas, key=lambda m: abs(m.mz_error)) 761 762 def molecular_formula_highest_prob_score(self): 763 """Return the molecular formula with the highest confidence score score""" 764 765 return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score)) 766 767 def molecular_formula_earth_filter(self, lowest_error=True): 768 """Filter molecular formula using the 'Earth' filter 769 770 This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments. 771 Earth Filter: 772 O > 0 AND N <= 3 AND P <= 2 AND 3P <= O 773 774 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter. 775 Otherwise, it will return all Earth-filter compliant formulas. 776 777 Parameters 778 ---------- 779 lowest_error : bool, optional. 780 Return only the lowest error formula which also fits the Earth filter. 781 If False, return all Earth-filter compliant formulas. Default is True. 782 783 Returns 784 ------- 785 list 786 List of molecular formula objects which fit the Earth filter 787 788 References 789 ---------- 790 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 791 Anal. Chem. 2017, 89, 23, 12659–12665 792 doi: 10.1021/acs.analchem.7b03318 793 """ 794 795 candidates = list( 796 filter( 797 lambda mf: mf.get("O") > 0 798 and mf.get("N") <= 3 799 and mf.get("P") <= 2 800 and (3 * mf.get("P")) <= mf.get("O"), 801 self.molecular_formulas, 802 ) 803 ) 804 if len(candidates) > 0: 805 if lowest_error: 806 return min(candidates, key=lambda m: abs(m.mz_error)) 807 else: 808 return candidates 809 else: 810 return candidates 811 812 def molecular_formula_water_filter(self, lowest_error=True): 813 """Filter molecular formula using the 'Water' filter 814 815 This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments. 816 Water Filter: 817 O > 0 AND N <= 3 AND S <= 2 AND P <= 2 818 819 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter. 820 Otherwise, it will return all Water-filter compliant formulas. 821 822 Parameters 823 ---------- 824 lowest_error : bool, optional 825 Return only the lowest error formula which also fits the Water filter. 826 If False, return all Water-filter compliant formulas. Defaults to 2 827 828 Returns 829 ------- 830 list 831 List of molecular formula objects which fit the Water filter 832 833 References 834 ---------- 835 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 836 Anal. Chem. 2017, 89, 23, 12659–12665 837 doi: 10.1021/acs.analchem.7b03318 838 """ 839 840 candidates = list( 841 filter( 842 lambda mf: mf.get("O") > 0 843 and mf.get("N") <= 3 844 and mf.get("S") <= 2 845 and mf.get("P") <= 2, 846 self.molecular_formulas, 847 ) 848 ) 849 if len(candidates) > 0: 850 if lowest_error: 851 return min(candidates, key=lambda m: abs(m.mz_error)) 852 else: 853 return candidates 854 else: 855 return candidates 856 857 def molecular_formula_air_filter(self, lowest_error=True): 858 """Filter molecular formula using the 'Air' filter 859 860 This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments. 861 Air Filter: 862 O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O 863 864 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter. 865 Otherwise, it will return all Air-filter compliant formulas. 866 867 Parameters 868 ---------- 869 lowest_error : bool, optional 870 Return only the lowest error formula which also fits the Air filter. 871 If False, return all Air-filter compliant formulas. Defaults to True. 872 873 Returns 874 ------- 875 list 876 List of molecular formula objects which fit the Air filter 877 878 References 879 ---------- 880 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 881 Anal. Chem. 2017, 89, 23, 12659–12665 882 doi: 10.1021/acs.analchem.7b03318 883 """ 884 885 candidates = list( 886 filter( 887 lambda mf: mf.get("O") > 0 888 and mf.get("N") <= 2 889 and mf.get("S") <= 1 890 and mf.get("P") == 0 891 and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"), 892 self.molecular_formulas, 893 ) 894 ) 895 896 if len(candidates) > 0: 897 if lowest_error: 898 return min(candidates, key=lambda m: abs(m.mz_error)) 899 else: 900 return candidates 901 else: 902 return candidates 903 904 def cia_score_S_P_error(self): 905 """Compound Identification Algorithm SP Error - Assignment Filter 906 907 This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments. 908 909 It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset. 910 911 Returns 912 ------- 913 MolecularFormula 914 A single molecular formula which fits the rules of the CIA SP Error filter 915 916 917 References 918 ---------- 919 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 920 Anal. Chem. 2006, 78, 13, 4363–4373 921 doi: 10.1021/ac0600306 922 """ 923 # case EFormulaScore.HAcap: 924 925 lowest_S_P_mf = min( 926 self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P") 927 ) 928 lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P") 929 930 list_same_s_p = list( 931 filter( 932 lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count, 933 self.molecular_formulas, 934 ) 935 ) 936 937 # check if list is not empty 938 if list_same_s_p: 939 return min(list_same_s_p, key=lambda m: abs(m.mz_error)) 940 941 else: 942 return lowest_S_P_mf 943 944 def cia_score_N_S_P_error(self): 945 """Compound Identification Algorithm NSP Error - Assignment Filter 946 947 This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments. 948 949 It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset. 950 951 Returns 952 ------- 953 MolecularFormula 954 A single molecular formula which fits the rules of the CIA NSP Error filter 955 956 References 957 ---------- 958 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 959 Anal. Chem. 2006, 78, 13, 4363–4373 960 doi: 10.1021/ac0600306 961 962 Raises 963 ------- 964 Exception 965 If no molecular formula are associated with mass spectrum peak. 966 """ 967 # case EFormulaScore.HAcap: 968 if self.molecular_formulas: 969 lowest_N_S_P_mf = min( 970 self.molecular_formulas, 971 key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"), 972 ) 973 lowest_N_S_P_count = ( 974 lowest_N_S_P_mf.get("N") 975 + lowest_N_S_P_mf.get("S") 976 + lowest_N_S_P_mf.get("P") 977 ) 978 979 list_same_N_S_P = list( 980 filter( 981 lambda mf: mf.get("N") + mf.get("S") + mf.get("P") 982 == lowest_N_S_P_count, 983 self.molecular_formulas, 984 ) 985 ) 986 987 if list_same_N_S_P: 988 SP_filtered_list = list( 989 filter( 990 lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1), 991 list_same_N_S_P, 992 ) 993 ) 994 995 if SP_filtered_list: 996 return min(SP_filtered_list, key=lambda m: abs(m.mz_error)) 997 998 else: 999 return min(list_same_N_S_P, key=lambda m: abs(m.mz_error)) 1000 1001 else: 1002 return lowest_N_S_P_mf 1003 else: 1004 raise Exception( 1005 "No molecular formula associated with the mass spectrum peak at m/z: %.6f" 1006 % self.mz_exp 1007 )
30class MSPeakCalculation: 31 """Class to perform calculations on MSPeak objects. 32 33 This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models. 34 35 Parameters 36 ---------- 37 None 38 39 Attributes 40 ---------- 41 _ms_parent : MSParent 42 The parent MSParent object associated with the MSPeakCalculation object. 43 mz_exp : float 44 The experimental m/z value of the peak. 45 peak_left_index : int 46 The start scan index of the peak. 47 peak_right_index : int 48 The final scan index of the peak. 49 resolving_power : float 50 The resolving power of the peak. 51 52 Methods 53 ------- 54 * _calc_kmd(dict_base). 55 Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula. 56 * calc_area(). 57 Calculate the peak area using numpy's trapezoidal fit. 58 * fit_peak(mz_extend=6, delta_rp=0, model='Gaussian'). 59 Perform lineshape analysis on a peak using lmfit module. 60 * voigt_pso(w, r, yoff, width, loc, a). 61 Calculate the Voigt function for particle swarm optimization (PSO) fitting. 62 * objective_pso(x, w, u). 63 Calculate the objective function for PSO fitting. 64 * minimize_pso(lower, upper, w, u). 65 Minimize the objective function using the particle swarm optimization algorithm. 66 * fit_peak_pso(mz_extend=6, upsample_multiplier=5). 67 Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting. 68 * voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 69 [Legacy] Perform voigt lineshape analysis on a peak. 70 * pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5). 71 [Legacy] Perform pseudovoigt lineshape analysis on a peak. 72 * lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 73 [Legacy] Perform lorentz lineshape analysis on a peak. 74 * gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1). 75 [Legacy] Perform gaussian lineshape analysis on a peak. 76 * get_mz_domain(oversample_multiplier, mz_overlay). 77 [Legacy] Resample/interpolate datapoints for lineshape analysis. 78 * number_possible_assignments(). 79 Return the number of possible molecular formula assignments for the peak. 80 * molecular_formula_lowest_error(). 81 Return the molecular formula with the smallest absolute mz error. 82 * molecular_formula_highest_prob_score(). 83 Return the molecular formula with the highest confidence score. 84 * molecular_formula_earth_filter(lowest_error=True). 85 Filter molecular formula using the 'Earth' filter. 86 * molecular_formula_water_filter(lowest_error=True). 87 Filter molecular formula using the 'Water' filter. 88 * molecular_formula_air_filter(lowest_error=True). 89 Filter molecular formula using the 'Air' filter. 90 * cia_score_S_P_error(). 91 Compound Identification Algorithm SP Error - Assignment Filter. 92 * cia_score_N_S_P_error(). 93 Compound Identification Algorithm NSP Error - Assignment Filter. 94 95 """ 96 97 def _calc_kmd(self, dict_base): 98 """Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula 99 100 Parameters 101 ---------- 102 dict_base : dict 103 dictionary with the base formula to be used in the calculation 104 Default is CH2, e.g. 105 dict_base = {"C": 1, "H": 2} 106 """ 107 108 if self._ms_parent: 109 # msPeak obj does have a ms object parent 110 kendrick_rounding_method = ( 111 self._ms_parent.mspeaks_settings.kendrick_rounding_method 112 ) # rounding method can be one of floor, ceil or round 113 # msPeak obj does not have a ms object parent 114 else: 115 kendrick_rounding_method = MSParameters.ms_peak.kendrick_rounding_method 116 117 mass = 0 118 for atom in dict_base.keys(): 119 mass += Atoms.atomic_masses.get(atom) * dict_base.get(atom) 120 121 kendrick_mass = (int(mass) / mass) * self.mz_exp 122 123 if kendrick_rounding_method == "ceil": 124 nominal_km = ceil(kendrick_mass) 125 126 elif kendrick_rounding_method == "round": 127 nominal_km = rint(kendrick_mass) 128 129 elif kendrick_rounding_method == "floor": 130 nominal_km = floor(kendrick_mass) 131 132 else: 133 raise Exception( 134 "%s method was not implemented, please refer to corems.ms_peak.calc.MSPeakCalc Class" 135 % kendrick_rounding_method 136 ) 137 138 kmd = nominal_km - kendrick_mass 139 140 # kmd = (nominal_km - km) * 1 141 # kmd = round(kmd,0) 142 143 return kmd, kendrick_mass, nominal_km 144 145 def calc_area(self): 146 """Calculate the peak area using numpy's trapezoidal fit 147 148 uses provided mz_domain to accurately integrate areas independent of digital resolution 149 150 Returns 151 ------- 152 float 153 peak area 154 """ 155 if self.peak_right_index > self.peak_left_index: 156 yy = self._ms_parent.abundance_profile[ 157 self.peak_left_index : self.peak_right_index 158 ] 159 xx = self._ms_parent.mz_exp_profile[ 160 self.peak_left_index : self.peak_right_index 161 ] 162 # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first 163 if xx[0] > xx[-1]: 164 xx = flip(xx) 165 yy = flip(yy) 166 return float(trapz(yy, xx)) 167 168 else: 169 warnings.warn( 170 "Peak Area Calculation for m/z {} has failed".format(self.mz_exp) 171 ) 172 return nan 173 174 def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"): 175 """Lineshape analysis on a peak using lmfit module. 176 177 Model and fit peak lineshape by defined function - using lmfit module 178 Does not oversample/resample/interpolate data points 179 Better to go back to time domain and perform more zero filling - if possible. 180 181 Parameters 182 ---------- 183 mz_extend : int 184 extra points left and right of peak definition to include in fitting 185 delta_rp : float 186 delta resolving power to add to resolving power 187 model : str 188 Type of lineshape model to use. 189 Models allowed: Gaussian, Lorentz, Voigt 190 191 Returns 192 ----- 193 mz_domain : ndarray 194 x-axis domain for fit 195 fit_peak : lmfit object 196 fit results object from lmfit module 197 198 Notes 199 ----- 200 Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module 201 mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit. 202 Takes about 10ms per peak 203 """ 204 start_index = ( 205 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 206 ) 207 final_index = ( 208 self.peak_right_index + mz_extend 209 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 210 else self.peak_right_index 211 ) 212 213 # check if MSPeak contains the resolving power info 214 if self.resolving_power: 215 # full width half maximum distance 216 self.fwhm = self.mz_exp / (self.resolving_power + delta_rp) 217 218 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 219 abundance_domain = self._ms_parent.abundance_profile[ 220 start_index:final_index 221 ] 222 223 if model == "Gaussian": 224 # stardard deviation 225 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 226 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 227 model = models.GaussianModel() 228 params = model.make_params( 229 center=self.mz_exp, amplitude=amplitude, sigma=sigma 230 ) 231 232 elif model == "Lorentz": 233 # stardard deviation 234 sigma = self.fwhm / 2 235 amplitude = sigma * pi * self.abundance 236 model = models.LorentzianModel() 237 params = model.make_params( 238 center=self.mz_exp, amplitude=amplitude, sigma=sigma 239 ) 240 241 elif model == "Voigt": 242 # stardard deviation 243 sigma = self.fwhm / 3.6013 244 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 245 model = models.VoigtModel() 246 params = model.make_params( 247 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 248 ) 249 else: 250 raise LookupError("model lineshape not known or defined") 251 252 # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object 253 fit_peak = model.fit(abundance_domain, params=params, x=mz_domain) 254 return mz_domain, fit_peak 255 256 else: 257 raise LookupError( 258 "resolving power is not defined, try to use set_max_resolving_power()" 259 ) 260 261 def voigt_pso(self, w, r, yoff, width, loc, a): 262 """Voigt function for particle swarm optimisation (PSO) fitting 263 264 From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py. 265 Calculates a Voigt function over w based on the relevant properties of the distribution. 266 267 Parameters 268 ---------- 269 w : ndarray 270 Array over which the Voigt function will be evaluated. 271 r : float 272 Ratio between the Guassian and Lorentzian functions. 273 yoff : float 274 Y-offset of the Voigt function. 275 width : float 276 The width of the Voigt function. 277 loc : float 278 Center of the Voigt function. 279 a : float 280 Area of the Voigt function. 281 Returns 282 ------- 283 V : ndarray 284 Array defining the Voigt function over w. 285 286 References 287 ---------- 288 1. https://github.com/pnnl/nmrfit 289 290 Notes 291 ----- 292 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 293 294 """ 295 # Lorentzian component 296 L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2) 297 298 # Gaussian component 299 G = ( 300 (2 / width) 301 * sqrt(log(2) / pi) 302 * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2)) 303 ) 304 305 # Voigt body 306 V = (yoff + a) * (r * L + (1 - r) * G) 307 308 return V 309 310 def objective_pso(self, x, w, u): 311 """Objective function for particle swarm optimisation (PSO) fitting 312 313 The objective function used to fit supplied data. Evaluates sum of squared differences between the fit and the data. 314 315 Parameters 316 ---------- 317 x : list of floats 318 Parameter vector. 319 w : ndarray 320 Array of frequency data. 321 u : ndarray 322 Array of data to be fit. 323 324 Returns 325 ------- 326 rmse : float 327 Root mean square error between the data and fit. 328 329 References 330 ---------- 331 1. https://github.com/pnnl/nmrfit 332 333 """ 334 # global parameters 335 r, width, loc, a = x 336 yoff = 0 337 338 # calculate fit for V 339 V_fit = self.voigt_pso(w, r, yoff, width, loc, a) 340 341 # real component RMSE 342 rmse = sqrt(square((u - V_fit)).mean(axis=None)) 343 344 # return the total RMSE 345 return rmse 346 347 def minimize_pso(self, lower, upper, w, u): 348 """Minimization function for particle swarm optimisation (PSO) fitting 349 350 Minimizes the objective function using the particle swarm optimization algorithm. 351 Minimization function based on defined parameters 352 353 354 Parameters 355 ---------- 356 lower : list of floats 357 Lower bounds for the parameters. 358 upper : list of floats 359 Upper bounds for the parameters. 360 w : ndarray 361 Array of frequency data. 362 u : ndarray 363 Array of data to be fit. 364 365 Notes 366 ----- 367 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 368 Current parameters take ~2 seconds per peak. 369 370 371 References 372 ---------- 373 1. https://github.com/pnnl/nmrfit 374 375 """ 376 # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters. 377 # TODO - Refactor PSO fitting into its own class? 378 379 xopt, fopt = pyswarm.pso( 380 self.objective_pso, 381 lower, 382 upper, 383 args=(w, u), 384 swarmsize=1000, 385 maxiter=5000, 386 omega=-0.2134, 387 phip=-0.3344, 388 phig=2.3259, 389 ) 390 return xopt, fopt 391 392 def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5): 393 """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting 394 395 Function to fit a Voigt peakshape using particle swarm optimisation (PSO). 396 Should return better results than lmfit, but much more computationally expensive 397 398 Parameters 399 ---------- 400 mz_extend : int, optional 401 extra points left and right of peak definition to include in fitting. Defaults to 6. 402 upsample_multiplier : int, optional 403 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5. 404 405 Returns 406 ------- 407 xopt : array 408 variables describing the voigt function. 409 G/L ratio, width (fwhm), apex (x-axis), area. 410 y-axis offset is fixed at 0 411 fopt : float 412 objective score (rmse) 413 psfit : array 414 recalculated y values based on function and optimised fit 415 psfit_hdp : tuple of arrays 416 0 - linspace x-axis upsampled grid 417 1 - recalculated y values based on function and upsampled x-axis grid 418 Does not change results, but aids in visualisation of the 'true' voigt lineshape 419 420 Notes 421 ----- 422 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 423 """ 424 # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc) 425 # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0. 426 # Thermo data is noise reduced by also noise subtracted, so starts at 0 427 # Absorption mode/phased data will have positive and negative components and may not be baseline corrected 428 429 start_index = ( 430 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 431 ) 432 final_index = ( 433 self.peak_right_index + mz_extend 434 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 435 else self.peak_right_index 436 ) 437 438 # check if MSPeak contains the resolving power info 439 if self.resolving_power: 440 # full width half maximum distance 441 self.fwhm = self.mz_exp / (self.resolving_power) 442 443 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 444 abundance_domain = self._ms_parent.abundance_profile[ 445 start_index:final_index 446 ] 447 lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0] 448 upper = [ 449 1, 450 self.fwhm * 1.2, 451 (self.mz_exp + 0.0005), 452 self.abundance / self.signal_to_noise, 453 ] 454 xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain) 455 456 psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3]) 457 psfit_hdp_x = linspace( 458 min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier 459 ) 460 psfit_hdp = self.voigt_pso( 461 psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3] 462 ) 463 return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp) 464 else: 465 raise LookupError( 466 "resolving power is not defined, try to use set_max_resolving_power()" 467 ) 468 469 def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 470 """[Legacy] Voigt lineshape analysis function 471 Legacy function for voigt lineshape analysis 472 473 Parameters 474 ---------- 475 oversample_multiplier : int 476 factor to increase x-axis points by for simulation of fitted lineshape function 477 delta_rp : float 478 delta resolving power to add to resolving power 479 mz_overlay : int 480 extra points left and right of peak definition to include in fitting 481 482 Returns 483 ------- 484 mz_domain : ndarray 485 x-axis domain for fit 486 calc_abundance : ndarray 487 calculated abundance profile based on voigt function 488 """ 489 490 if self.resolving_power: 491 # full width half maximum distance 492 self.fwhm = self.mz_exp / ( 493 self.resolving_power + delta_rp 494 ) # self.resolving_power) 495 496 # stardart deviation 497 sigma = self.fwhm / 3.6013 498 499 # half width baseline distance 500 501 # mz_domain = linspace(self.mz_exp - hw_base_distance, 502 # self.mz_exp + hw_base_distance, datapoint) 503 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 504 505 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 506 507 # TODO derive amplitude 508 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 509 510 model = models.VoigtModel() 511 512 params = model.make_params( 513 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 514 ) 515 516 calc_abundance = model.eval(params=params, x=mz_domain) 517 518 return mz_domain, calc_abundance 519 520 else: 521 raise LookupError( 522 "resolving power is not defined, try to use set_max_resolving_power()" 523 ) 524 525 def pseudovoigt( 526 self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5 527 ): 528 """[Legacy] pseudovoigt lineshape function 529 530 Legacy function for pseudovoigt lineshape analysis. 531 Note - Code may not be functional currently. 532 533 Parameters 534 ---------- 535 oversample_multiplier : int, optional 536 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1. 537 delta_rp : float, optional 538 delta resolving power to add to resolving power. Defaults to 0. 539 mz_overlay : int, optional 540 extra points left and right of peak definition to include in fitting. Defaults to 1. 541 fraction : float, optional 542 fraction of gaussian component in pseudovoigt function. Defaults to 0.5. 543 544 """ 545 if self.resolving_power: 546 # full width half maximum distance 547 self.fwhm = self.mz_exp / ( 548 self.resolving_power + delta_rp 549 ) # self.resolving_power) 550 551 # stardart deviation 552 sigma = self.fwhm / 2 553 554 # half width baseline distance 555 556 # mz_domain = linspace(self.mz_exp - hw_base_distance, 557 # self.mz_exp + hw_base_distance, datapoint) 558 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 559 560 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 561 model = models.PseudoVoigtModel() 562 563 # TODO derive amplitude 564 gamma = sigma 565 566 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 567 amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / ( 568 (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma) 569 ) 570 571 params = model.make_params(center=self.mz_exp, sigma=sigma) 572 573 calc_abundance = model.eval(params=params, x=mz_domain) 574 575 return mz_domain, calc_abundance 576 577 else: 578 raise LookupError( 579 "resolving power is not defined, try to use set_max_resolving_power()" 580 ) 581 582 def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 583 """[Legacy] Lorentz lineshape analysis function 584 585 Legacy function for lorentz lineshape analysis 586 587 Parameters 588 ---------- 589 oversample_multiplier : int 590 factor to increase x-axis points by for simulation of fitted lineshape function 591 delta_rp : float 592 delta resolving power to add to resolving power 593 mz_overlay : int 594 extra points left and right of peak definition to include in fitting 595 596 Returns 597 ------- 598 mz_domain : ndarray 599 x-axis domain for fit 600 calc_abundance : ndarray 601 calculated abundance profile based on lorentz function 602 603 """ 604 if self.resolving_power: 605 # full width half maximum distance 606 self.fwhm = self.mz_exp / ( 607 self.resolving_power + delta_rp 608 ) # self.resolving_power) 609 610 # stardart deviation 611 sigma = self.fwhm / 2 612 613 # half width baseline distance 614 hw_base_distance = 8 * sigma 615 616 # mz_domain = linspace(self.mz_exp - hw_base_distance, 617 # self.mz_exp + hw_base_distance, datapoint) 618 619 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 620 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 621 model = models.LorentzianModel() 622 623 amplitude = sigma * pi * self.abundance 624 625 params = model.make_params( 626 center=self.mz_exp, amplitude=amplitude, sigma=sigma 627 ) 628 629 calc_abundance = model.eval(params=params, x=mz_domain) 630 631 return mz_domain, calc_abundance 632 633 else: 634 raise LookupError( 635 "resolving power is not defined, try to use set_max_resolving_power()" 636 ) 637 638 def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 639 """[Legacy] Gaussian lineshape analysis function 640 Legacy gaussian lineshape analysis function 641 642 Parameters 643 ---------- 644 oversample_multiplier : int 645 factor to increase x-axis points by for simulation of fitted lineshape function 646 delta_rp : float 647 delta resolving power to add to resolving power 648 mz_overlay : int 649 extra points left and right of peak definition to include in fitting 650 651 Returns 652 ------- 653 mz_domain : ndarray 654 x-axis domain for fit 655 calc_abundance : ndarray 656 calculated abundance profile based on gaussian function 657 658 659 """ 660 661 # check if MSPeak contains the resolving power info 662 if self.resolving_power: 663 # full width half maximum distance 664 self.fwhm = self.mz_exp / ( 665 self.resolving_power + delta_rp 666 ) # self.resolving_power) 667 668 # stardart deviation 669 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 670 671 # half width baseline distance 672 # hw_base_distance = (3.2 * s) 673 674 # match_loz_factor = 3 675 676 # n_d = hw_base_distance * match_loz_factor 677 678 # mz_domain = linspace( 679 # self.mz_exp - n_d, self.mz_exp + n_d, datapoint) 680 681 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 682 683 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 684 685 # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s) 686 687 model = models.GaussianModel() 688 689 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 690 691 params = model.make_params( 692 center=self.mz_exp, amplitude=amplitude, sigma=sigma 693 ) 694 695 calc_abundance = model.eval(params=params, x=mz_domain) 696 697 return mz_domain, calc_abundance 698 699 else: 700 raise LookupError( 701 "resolving power is not defined, try to use set_max_resolving_power()" 702 ) 703 704 def get_mz_domain(self, oversample_multiplier, mz_overlay): 705 """[Legacy] function to resample/interpolate datapoints for lineshape analysis 706 707 This code is used for the legacy line fitting functions and not recommended. 708 Legacy function to support expanding mz domain for legacy lineshape functions 709 710 Parameters 711 ---------- 712 oversample_multiplier : int 713 factor to increase x-axis points by for simulation of fitted lineshape function 714 mz_overlay : int 715 extra points left and right of peak definition to include in fitting 716 717 Returns 718 ------- 719 mz_domain : ndarray 720 x-axis domain for fit 721 722 """ 723 start_index = ( 724 self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0 725 ) 726 final_index = ( 727 self.peak_right_index + mz_overlay 728 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 729 else self.peak_right_index 730 ) 731 732 if oversample_multiplier == 1: 733 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 734 735 else: 736 # we assume a linear correlation for m/z and datapoits 737 # which is only true if the m/z range in narrow (within 1 m/z unit) 738 # this is not true for a wide m/z range 739 740 indexes = range(start_index, final_index + 1) 741 mz = self._ms_parent.mz_exp_profile[indexes] 742 pol = poly1d(polyfit(indexes, mz, 1)) 743 oversampled_indexes = linspace( 744 start_index, 745 final_index, 746 (final_index - start_index) * oversample_multiplier, 747 ) 748 mz_domain = pol(oversampled_indexes) 749 750 return mz_domain 751 752 @property 753 def number_possible_assignments( 754 self, 755 ): 756 return len(self.molecular_formulas) 757 758 def molecular_formula_lowest_error(self): 759 """Return the molecular formula with the smallest absolute mz error""" 760 761 return min(self.molecular_formulas, key=lambda m: abs(m.mz_error)) 762 763 def molecular_formula_highest_prob_score(self): 764 """Return the molecular formula with the highest confidence score score""" 765 766 return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score)) 767 768 def molecular_formula_earth_filter(self, lowest_error=True): 769 """Filter molecular formula using the 'Earth' filter 770 771 This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments. 772 Earth Filter: 773 O > 0 AND N <= 3 AND P <= 2 AND 3P <= O 774 775 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter. 776 Otherwise, it will return all Earth-filter compliant formulas. 777 778 Parameters 779 ---------- 780 lowest_error : bool, optional. 781 Return only the lowest error formula which also fits the Earth filter. 782 If False, return all Earth-filter compliant formulas. Default is True. 783 784 Returns 785 ------- 786 list 787 List of molecular formula objects which fit the Earth filter 788 789 References 790 ---------- 791 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 792 Anal. Chem. 2017, 89, 23, 12659–12665 793 doi: 10.1021/acs.analchem.7b03318 794 """ 795 796 candidates = list( 797 filter( 798 lambda mf: mf.get("O") > 0 799 and mf.get("N") <= 3 800 and mf.get("P") <= 2 801 and (3 * mf.get("P")) <= mf.get("O"), 802 self.molecular_formulas, 803 ) 804 ) 805 if len(candidates) > 0: 806 if lowest_error: 807 return min(candidates, key=lambda m: abs(m.mz_error)) 808 else: 809 return candidates 810 else: 811 return candidates 812 813 def molecular_formula_water_filter(self, lowest_error=True): 814 """Filter molecular formula using the 'Water' filter 815 816 This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments. 817 Water Filter: 818 O > 0 AND N <= 3 AND S <= 2 AND P <= 2 819 820 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter. 821 Otherwise, it will return all Water-filter compliant formulas. 822 823 Parameters 824 ---------- 825 lowest_error : bool, optional 826 Return only the lowest error formula which also fits the Water filter. 827 If False, return all Water-filter compliant formulas. Defaults to 2 828 829 Returns 830 ------- 831 list 832 List of molecular formula objects which fit the Water filter 833 834 References 835 ---------- 836 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 837 Anal. Chem. 2017, 89, 23, 12659–12665 838 doi: 10.1021/acs.analchem.7b03318 839 """ 840 841 candidates = list( 842 filter( 843 lambda mf: mf.get("O") > 0 844 and mf.get("N") <= 3 845 and mf.get("S") <= 2 846 and mf.get("P") <= 2, 847 self.molecular_formulas, 848 ) 849 ) 850 if len(candidates) > 0: 851 if lowest_error: 852 return min(candidates, key=lambda m: abs(m.mz_error)) 853 else: 854 return candidates 855 else: 856 return candidates 857 858 def molecular_formula_air_filter(self, lowest_error=True): 859 """Filter molecular formula using the 'Air' filter 860 861 This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments. 862 Air Filter: 863 O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O 864 865 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter. 866 Otherwise, it will return all Air-filter compliant formulas. 867 868 Parameters 869 ---------- 870 lowest_error : bool, optional 871 Return only the lowest error formula which also fits the Air filter. 872 If False, return all Air-filter compliant formulas. Defaults to True. 873 874 Returns 875 ------- 876 list 877 List of molecular formula objects which fit the Air filter 878 879 References 880 ---------- 881 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 882 Anal. Chem. 2017, 89, 23, 12659–12665 883 doi: 10.1021/acs.analchem.7b03318 884 """ 885 886 candidates = list( 887 filter( 888 lambda mf: mf.get("O") > 0 889 and mf.get("N") <= 2 890 and mf.get("S") <= 1 891 and mf.get("P") == 0 892 and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"), 893 self.molecular_formulas, 894 ) 895 ) 896 897 if len(candidates) > 0: 898 if lowest_error: 899 return min(candidates, key=lambda m: abs(m.mz_error)) 900 else: 901 return candidates 902 else: 903 return candidates 904 905 def cia_score_S_P_error(self): 906 """Compound Identification Algorithm SP Error - Assignment Filter 907 908 This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments. 909 910 It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset. 911 912 Returns 913 ------- 914 MolecularFormula 915 A single molecular formula which fits the rules of the CIA SP Error filter 916 917 918 References 919 ---------- 920 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 921 Anal. Chem. 2006, 78, 13, 4363–4373 922 doi: 10.1021/ac0600306 923 """ 924 # case EFormulaScore.HAcap: 925 926 lowest_S_P_mf = min( 927 self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P") 928 ) 929 lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P") 930 931 list_same_s_p = list( 932 filter( 933 lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count, 934 self.molecular_formulas, 935 ) 936 ) 937 938 # check if list is not empty 939 if list_same_s_p: 940 return min(list_same_s_p, key=lambda m: abs(m.mz_error)) 941 942 else: 943 return lowest_S_P_mf 944 945 def cia_score_N_S_P_error(self): 946 """Compound Identification Algorithm NSP Error - Assignment Filter 947 948 This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments. 949 950 It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset. 951 952 Returns 953 ------- 954 MolecularFormula 955 A single molecular formula which fits the rules of the CIA NSP Error filter 956 957 References 958 ---------- 959 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 960 Anal. Chem. 2006, 78, 13, 4363–4373 961 doi: 10.1021/ac0600306 962 963 Raises 964 ------- 965 Exception 966 If no molecular formula are associated with mass spectrum peak. 967 """ 968 # case EFormulaScore.HAcap: 969 if self.molecular_formulas: 970 lowest_N_S_P_mf = min( 971 self.molecular_formulas, 972 key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"), 973 ) 974 lowest_N_S_P_count = ( 975 lowest_N_S_P_mf.get("N") 976 + lowest_N_S_P_mf.get("S") 977 + lowest_N_S_P_mf.get("P") 978 ) 979 980 list_same_N_S_P = list( 981 filter( 982 lambda mf: mf.get("N") + mf.get("S") + mf.get("P") 983 == lowest_N_S_P_count, 984 self.molecular_formulas, 985 ) 986 ) 987 988 if list_same_N_S_P: 989 SP_filtered_list = list( 990 filter( 991 lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1), 992 list_same_N_S_P, 993 ) 994 ) 995 996 if SP_filtered_list: 997 return min(SP_filtered_list, key=lambda m: abs(m.mz_error)) 998 999 else: 1000 return min(list_same_N_S_P, key=lambda m: abs(m.mz_error)) 1001 1002 else: 1003 return lowest_N_S_P_mf 1004 else: 1005 raise Exception( 1006 "No molecular formula associated with the mass spectrum peak at m/z: %.6f" 1007 % self.mz_exp 1008 )
Class to perform calculations on MSPeak objects.
This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models.
Parameters
- None
Attributes
- _ms_parent (MSParent): The parent MSParent object associated with the MSPeakCalculation object.
- mz_exp (float): The experimental m/z value of the peak.
- peak_left_index (int): The start scan index of the peak.
- peak_right_index (int): The final scan index of the peak.
- resolving_power (float): The resolving power of the peak.
Methods
- _calc_kmd(dict_base). Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula.
- calc_area(). Calculate the peak area using numpy's trapezoidal fit.
- fit_peak(mz_extend=6, delta_rp=0, model='Gaussian'). Perform lineshape analysis on a peak using lmfit module.
- voigt_pso(w, r, yoff, width, loc, a). Calculate the Voigt function for particle swarm optimization (PSO) fitting.
- objective_pso(x, w, u). Calculate the objective function for PSO fitting.
- minimize_pso(lower, upper, w, u). Minimize the objective function using the particle swarm optimization algorithm.
- fit_peak_pso(mz_extend=6, upsample_multiplier=5). Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting.
- voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform voigt lineshape analysis on a peak.
- pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5). [Legacy] Perform pseudovoigt lineshape analysis on a peak.
- lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform lorentz lineshape analysis on a peak.
- gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform gaussian lineshape analysis on a peak.
- get_mz_domain(oversample_multiplier, mz_overlay). [Legacy] Resample/interpolate datapoints for lineshape analysis.
- number_possible_assignments(). Return the number of possible molecular formula assignments for the peak.
- molecular_formula_lowest_error(). Return the molecular formula with the smallest absolute mz error.
- molecular_formula_highest_prob_score(). Return the molecular formula with the highest confidence score.
- molecular_formula_earth_filter(lowest_error=True). Filter molecular formula using the 'Earth' filter.
- molecular_formula_water_filter(lowest_error=True). Filter molecular formula using the 'Water' filter.
- molecular_formula_air_filter(lowest_error=True). Filter molecular formula using the 'Air' filter.
- cia_score_S_P_error(). Compound Identification Algorithm SP Error - Assignment Filter.
- cia_score_N_S_P_error(). Compound Identification Algorithm NSP Error - Assignment Filter.
145 def calc_area(self): 146 """Calculate the peak area using numpy's trapezoidal fit 147 148 uses provided mz_domain to accurately integrate areas independent of digital resolution 149 150 Returns 151 ------- 152 float 153 peak area 154 """ 155 if self.peak_right_index > self.peak_left_index: 156 yy = self._ms_parent.abundance_profile[ 157 self.peak_left_index : self.peak_right_index 158 ] 159 xx = self._ms_parent.mz_exp_profile[ 160 self.peak_left_index : self.peak_right_index 161 ] 162 # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first 163 if xx[0] > xx[-1]: 164 xx = flip(xx) 165 yy = flip(yy) 166 return float(trapz(yy, xx)) 167 168 else: 169 warnings.warn( 170 "Peak Area Calculation for m/z {} has failed".format(self.mz_exp) 171 ) 172 return nan
Calculate the peak area using numpy's trapezoidal fit
uses provided mz_domain to accurately integrate areas independent of digital resolution
Returns
- float: peak area
174 def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"): 175 """Lineshape analysis on a peak using lmfit module. 176 177 Model and fit peak lineshape by defined function - using lmfit module 178 Does not oversample/resample/interpolate data points 179 Better to go back to time domain and perform more zero filling - if possible. 180 181 Parameters 182 ---------- 183 mz_extend : int 184 extra points left and right of peak definition to include in fitting 185 delta_rp : float 186 delta resolving power to add to resolving power 187 model : str 188 Type of lineshape model to use. 189 Models allowed: Gaussian, Lorentz, Voigt 190 191 Returns 192 ----- 193 mz_domain : ndarray 194 x-axis domain for fit 195 fit_peak : lmfit object 196 fit results object from lmfit module 197 198 Notes 199 ----- 200 Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module 201 mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit. 202 Takes about 10ms per peak 203 """ 204 start_index = ( 205 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 206 ) 207 final_index = ( 208 self.peak_right_index + mz_extend 209 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 210 else self.peak_right_index 211 ) 212 213 # check if MSPeak contains the resolving power info 214 if self.resolving_power: 215 # full width half maximum distance 216 self.fwhm = self.mz_exp / (self.resolving_power + delta_rp) 217 218 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 219 abundance_domain = self._ms_parent.abundance_profile[ 220 start_index:final_index 221 ] 222 223 if model == "Gaussian": 224 # stardard deviation 225 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 226 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 227 model = models.GaussianModel() 228 params = model.make_params( 229 center=self.mz_exp, amplitude=amplitude, sigma=sigma 230 ) 231 232 elif model == "Lorentz": 233 # stardard deviation 234 sigma = self.fwhm / 2 235 amplitude = sigma * pi * self.abundance 236 model = models.LorentzianModel() 237 params = model.make_params( 238 center=self.mz_exp, amplitude=amplitude, sigma=sigma 239 ) 240 241 elif model == "Voigt": 242 # stardard deviation 243 sigma = self.fwhm / 3.6013 244 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 245 model = models.VoigtModel() 246 params = model.make_params( 247 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 248 ) 249 else: 250 raise LookupError("model lineshape not known or defined") 251 252 # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object 253 fit_peak = model.fit(abundance_domain, params=params, x=mz_domain) 254 return mz_domain, fit_peak 255 256 else: 257 raise LookupError( 258 "resolving power is not defined, try to use set_max_resolving_power()" 259 )
Lineshape analysis on a peak using lmfit module.
Model and fit peak lineshape by defined function - using lmfit module Does not oversample/resample/interpolate data points Better to go back to time domain and perform more zero filling - if possible.
Parameters
- mz_extend (int): extra points left and right of peak definition to include in fitting
- delta_rp (float): delta resolving power to add to resolving power
- model (str): Type of lineshape model to use. Models allowed: Gaussian, Lorentz, Voigt
Returns
- mz_domain (ndarray): x-axis domain for fit
- fit_peak (lmfit object): fit results object from lmfit module
Notes
Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit. Takes about 10ms per peak
261 def voigt_pso(self, w, r, yoff, width, loc, a): 262 """Voigt function for particle swarm optimisation (PSO) fitting 263 264 From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py. 265 Calculates a Voigt function over w based on the relevant properties of the distribution. 266 267 Parameters 268 ---------- 269 w : ndarray 270 Array over which the Voigt function will be evaluated. 271 r : float 272 Ratio between the Guassian and Lorentzian functions. 273 yoff : float 274 Y-offset of the Voigt function. 275 width : float 276 The width of the Voigt function. 277 loc : float 278 Center of the Voigt function. 279 a : float 280 Area of the Voigt function. 281 Returns 282 ------- 283 V : ndarray 284 Array defining the Voigt function over w. 285 286 References 287 ---------- 288 1. https://github.com/pnnl/nmrfit 289 290 Notes 291 ----- 292 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 293 294 """ 295 # Lorentzian component 296 L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2) 297 298 # Gaussian component 299 G = ( 300 (2 / width) 301 * sqrt(log(2) / pi) 302 * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2)) 303 ) 304 305 # Voigt body 306 V = (yoff + a) * (r * L + (1 - r) * G) 307 308 return V
Voigt function for particle swarm optimisation (PSO) fitting
From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py. Calculates a Voigt function over w based on the relevant properties of the distribution.
Parameters
- w (ndarray): Array over which the Voigt function will be evaluated.
- r (float): Ratio between the Guassian and Lorentzian functions.
- yoff (float): Y-offset of the Voigt function.
- width (float): The width of the Voigt function.
- loc (float): Center of the Voigt function.
- a (float): Area of the Voigt function.
Returns
- V (ndarray): Array defining the Voigt function over w.
References
Notes
Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
310 def objective_pso(self, x, w, u): 311 """Objective function for particle swarm optimisation (PSO) fitting 312 313 The objective function used to fit supplied data. Evaluates sum of squared differences between the fit and the data. 314 315 Parameters 316 ---------- 317 x : list of floats 318 Parameter vector. 319 w : ndarray 320 Array of frequency data. 321 u : ndarray 322 Array of data to be fit. 323 324 Returns 325 ------- 326 rmse : float 327 Root mean square error between the data and fit. 328 329 References 330 ---------- 331 1. https://github.com/pnnl/nmrfit 332 333 """ 334 # global parameters 335 r, width, loc, a = x 336 yoff = 0 337 338 # calculate fit for V 339 V_fit = self.voigt_pso(w, r, yoff, width, loc, a) 340 341 # real component RMSE 342 rmse = sqrt(square((u - V_fit)).mean(axis=None)) 343 344 # return the total RMSE 345 return rmse
Objective function for particle swarm optimisation (PSO) fitting
The objective function used to fit supplied data. Evaluates sum of squared differences between the fit and the data.
Parameters
- x (list of floats): Parameter vector.
- w (ndarray): Array of frequency data.
- u (ndarray): Array of data to be fit.
Returns
- rmse (float): Root mean square error between the data and fit.
References
347 def minimize_pso(self, lower, upper, w, u): 348 """Minimization function for particle swarm optimisation (PSO) fitting 349 350 Minimizes the objective function using the particle swarm optimization algorithm. 351 Minimization function based on defined parameters 352 353 354 Parameters 355 ---------- 356 lower : list of floats 357 Lower bounds for the parameters. 358 upper : list of floats 359 Upper bounds for the parameters. 360 w : ndarray 361 Array of frequency data. 362 u : ndarray 363 Array of data to be fit. 364 365 Notes 366 ----- 367 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 368 Current parameters take ~2 seconds per peak. 369 370 371 References 372 ---------- 373 1. https://github.com/pnnl/nmrfit 374 375 """ 376 # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters. 377 # TODO - Refactor PSO fitting into its own class? 378 379 xopt, fopt = pyswarm.pso( 380 self.objective_pso, 381 lower, 382 upper, 383 args=(w, u), 384 swarmsize=1000, 385 maxiter=5000, 386 omega=-0.2134, 387 phip=-0.3344, 388 phig=2.3259, 389 ) 390 return xopt, fopt
Minimization function for particle swarm optimisation (PSO) fitting
Minimizes the objective function using the particle swarm optimization algorithm. Minimization function based on defined parameters
Parameters
- lower (list of floats): Lower bounds for the parameters.
- upper (list of floats): Upper bounds for the parameters.
- w (ndarray): Array of frequency data.
- u (ndarray): Array of data to be fit.
Notes
Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. Current parameters take ~2 seconds per peak.
References
392 def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5): 393 """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting 394 395 Function to fit a Voigt peakshape using particle swarm optimisation (PSO). 396 Should return better results than lmfit, but much more computationally expensive 397 398 Parameters 399 ---------- 400 mz_extend : int, optional 401 extra points left and right of peak definition to include in fitting. Defaults to 6. 402 upsample_multiplier : int, optional 403 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5. 404 405 Returns 406 ------- 407 xopt : array 408 variables describing the voigt function. 409 G/L ratio, width (fwhm), apex (x-axis), area. 410 y-axis offset is fixed at 0 411 fopt : float 412 objective score (rmse) 413 psfit : array 414 recalculated y values based on function and optimised fit 415 psfit_hdp : tuple of arrays 416 0 - linspace x-axis upsampled grid 417 1 - recalculated y values based on function and upsampled x-axis grid 418 Does not change results, but aids in visualisation of the 'true' voigt lineshape 419 420 Notes 421 ----- 422 Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. 423 """ 424 # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc) 425 # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0. 426 # Thermo data is noise reduced by also noise subtracted, so starts at 0 427 # Absorption mode/phased data will have positive and negative components and may not be baseline corrected 428 429 start_index = ( 430 self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0 431 ) 432 final_index = ( 433 self.peak_right_index + mz_extend 434 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 435 else self.peak_right_index 436 ) 437 438 # check if MSPeak contains the resolving power info 439 if self.resolving_power: 440 # full width half maximum distance 441 self.fwhm = self.mz_exp / (self.resolving_power) 442 443 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 444 abundance_domain = self._ms_parent.abundance_profile[ 445 start_index:final_index 446 ] 447 lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0] 448 upper = [ 449 1, 450 self.fwhm * 1.2, 451 (self.mz_exp + 0.0005), 452 self.abundance / self.signal_to_noise, 453 ] 454 xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain) 455 456 psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3]) 457 psfit_hdp_x = linspace( 458 min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier 459 ) 460 psfit_hdp = self.voigt_pso( 461 psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3] 462 ) 463 return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp) 464 else: 465 raise LookupError( 466 "resolving power is not defined, try to use set_max_resolving_power()" 467 )
Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting
Function to fit a Voigt peakshape using particle swarm optimisation (PSO). Should return better results than lmfit, but much more computationally expensive
Parameters
- mz_extend (int, optional): extra points left and right of peak definition to include in fitting. Defaults to 6.
- upsample_multiplier (int, optional): factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5.
Returns
- xopt (array): variables describing the voigt function. G/L ratio, width (fwhm), apex (x-axis), area. y-axis offset is fixed at 0
- fopt (float): objective score (rmse)
- psfit (array): recalculated y values based on function and optimised fit
- psfit_hdp (tuple of arrays): 0 - linspace x-axis upsampled grid 1 - recalculated y values based on function and upsampled x-axis grid Does not change results, but aids in visualisation of the 'true' voigt lineshape
Notes
Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
469 def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 470 """[Legacy] Voigt lineshape analysis function 471 Legacy function for voigt lineshape analysis 472 473 Parameters 474 ---------- 475 oversample_multiplier : int 476 factor to increase x-axis points by for simulation of fitted lineshape function 477 delta_rp : float 478 delta resolving power to add to resolving power 479 mz_overlay : int 480 extra points left and right of peak definition to include in fitting 481 482 Returns 483 ------- 484 mz_domain : ndarray 485 x-axis domain for fit 486 calc_abundance : ndarray 487 calculated abundance profile based on voigt function 488 """ 489 490 if self.resolving_power: 491 # full width half maximum distance 492 self.fwhm = self.mz_exp / ( 493 self.resolving_power + delta_rp 494 ) # self.resolving_power) 495 496 # stardart deviation 497 sigma = self.fwhm / 3.6013 498 499 # half width baseline distance 500 501 # mz_domain = linspace(self.mz_exp - hw_base_distance, 502 # self.mz_exp + hw_base_distance, datapoint) 503 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 504 505 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 506 507 # TODO derive amplitude 508 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 509 510 model = models.VoigtModel() 511 512 params = model.make_params( 513 center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma 514 ) 515 516 calc_abundance = model.eval(params=params, x=mz_domain) 517 518 return mz_domain, calc_abundance 519 520 else: 521 raise LookupError( 522 "resolving power is not defined, try to use set_max_resolving_power()" 523 )
[Legacy] Voigt lineshape analysis function Legacy function for voigt lineshape analysis
Parameters
- oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
- delta_rp (float): delta resolving power to add to resolving power
- mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
- mz_domain (ndarray): x-axis domain for fit
- calc_abundance (ndarray): calculated abundance profile based on voigt function
525 def pseudovoigt( 526 self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5 527 ): 528 """[Legacy] pseudovoigt lineshape function 529 530 Legacy function for pseudovoigt lineshape analysis. 531 Note - Code may not be functional currently. 532 533 Parameters 534 ---------- 535 oversample_multiplier : int, optional 536 factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1. 537 delta_rp : float, optional 538 delta resolving power to add to resolving power. Defaults to 0. 539 mz_overlay : int, optional 540 extra points left and right of peak definition to include in fitting. Defaults to 1. 541 fraction : float, optional 542 fraction of gaussian component in pseudovoigt function. Defaults to 0.5. 543 544 """ 545 if self.resolving_power: 546 # full width half maximum distance 547 self.fwhm = self.mz_exp / ( 548 self.resolving_power + delta_rp 549 ) # self.resolving_power) 550 551 # stardart deviation 552 sigma = self.fwhm / 2 553 554 # half width baseline distance 555 556 # mz_domain = linspace(self.mz_exp - hw_base_distance, 557 # self.mz_exp + hw_base_distance, datapoint) 558 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 559 560 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 561 model = models.PseudoVoigtModel() 562 563 # TODO derive amplitude 564 gamma = sigma 565 566 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 567 amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / ( 568 (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma) 569 ) 570 571 params = model.make_params(center=self.mz_exp, sigma=sigma) 572 573 calc_abundance = model.eval(params=params, x=mz_domain) 574 575 return mz_domain, calc_abundance 576 577 else: 578 raise LookupError( 579 "resolving power is not defined, try to use set_max_resolving_power()" 580 )
[Legacy] pseudovoigt lineshape function
Legacy function for pseudovoigt lineshape analysis. Note - Code may not be functional currently.
Parameters
- oversample_multiplier (int, optional): factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1.
- delta_rp (float, optional): delta resolving power to add to resolving power. Defaults to 0.
- mz_overlay (int, optional): extra points left and right of peak definition to include in fitting. Defaults to 1.
- fraction (float, optional): fraction of gaussian component in pseudovoigt function. Defaults to 0.5.
582 def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 583 """[Legacy] Lorentz lineshape analysis function 584 585 Legacy function for lorentz lineshape analysis 586 587 Parameters 588 ---------- 589 oversample_multiplier : int 590 factor to increase x-axis points by for simulation of fitted lineshape function 591 delta_rp : float 592 delta resolving power to add to resolving power 593 mz_overlay : int 594 extra points left and right of peak definition to include in fitting 595 596 Returns 597 ------- 598 mz_domain : ndarray 599 x-axis domain for fit 600 calc_abundance : ndarray 601 calculated abundance profile based on lorentz function 602 603 """ 604 if self.resolving_power: 605 # full width half maximum distance 606 self.fwhm = self.mz_exp / ( 607 self.resolving_power + delta_rp 608 ) # self.resolving_power) 609 610 # stardart deviation 611 sigma = self.fwhm / 2 612 613 # half width baseline distance 614 hw_base_distance = 8 * sigma 615 616 # mz_domain = linspace(self.mz_exp - hw_base_distance, 617 # self.mz_exp + hw_base_distance, datapoint) 618 619 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 620 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 621 model = models.LorentzianModel() 622 623 amplitude = sigma * pi * self.abundance 624 625 params = model.make_params( 626 center=self.mz_exp, amplitude=amplitude, sigma=sigma 627 ) 628 629 calc_abundance = model.eval(params=params, x=mz_domain) 630 631 return mz_domain, calc_abundance 632 633 else: 634 raise LookupError( 635 "resolving power is not defined, try to use set_max_resolving_power()" 636 )
[Legacy] Lorentz lineshape analysis function
Legacy function for lorentz lineshape analysis
Parameters
- oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
- delta_rp (float): delta resolving power to add to resolving power
- mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
- mz_domain (ndarray): x-axis domain for fit
- calc_abundance (ndarray): calculated abundance profile based on lorentz function
638 def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1): 639 """[Legacy] Gaussian lineshape analysis function 640 Legacy gaussian lineshape analysis function 641 642 Parameters 643 ---------- 644 oversample_multiplier : int 645 factor to increase x-axis points by for simulation of fitted lineshape function 646 delta_rp : float 647 delta resolving power to add to resolving power 648 mz_overlay : int 649 extra points left and right of peak definition to include in fitting 650 651 Returns 652 ------- 653 mz_domain : ndarray 654 x-axis domain for fit 655 calc_abundance : ndarray 656 calculated abundance profile based on gaussian function 657 658 659 """ 660 661 # check if MSPeak contains the resolving power info 662 if self.resolving_power: 663 # full width half maximum distance 664 self.fwhm = self.mz_exp / ( 665 self.resolving_power + delta_rp 666 ) # self.resolving_power) 667 668 # stardart deviation 669 sigma = self.fwhm / (2 * sqrt(2 * log(2))) 670 671 # half width baseline distance 672 # hw_base_distance = (3.2 * s) 673 674 # match_loz_factor = 3 675 676 # n_d = hw_base_distance * match_loz_factor 677 678 # mz_domain = linspace( 679 # self.mz_exp - n_d, self.mz_exp + n_d, datapoint) 680 681 mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay) 682 683 # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) ) 684 685 # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s) 686 687 model = models.GaussianModel() 688 689 amplitude = (sqrt(2 * pi) * sigma) * self.abundance 690 691 params = model.make_params( 692 center=self.mz_exp, amplitude=amplitude, sigma=sigma 693 ) 694 695 calc_abundance = model.eval(params=params, x=mz_domain) 696 697 return mz_domain, calc_abundance 698 699 else: 700 raise LookupError( 701 "resolving power is not defined, try to use set_max_resolving_power()" 702 )
[Legacy] Gaussian lineshape analysis function Legacy gaussian lineshape analysis function
Parameters
- oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
- delta_rp (float): delta resolving power to add to resolving power
- mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
- mz_domain (ndarray): x-axis domain for fit
- calc_abundance (ndarray): calculated abundance profile based on gaussian function
704 def get_mz_domain(self, oversample_multiplier, mz_overlay): 705 """[Legacy] function to resample/interpolate datapoints for lineshape analysis 706 707 This code is used for the legacy line fitting functions and not recommended. 708 Legacy function to support expanding mz domain for legacy lineshape functions 709 710 Parameters 711 ---------- 712 oversample_multiplier : int 713 factor to increase x-axis points by for simulation of fitted lineshape function 714 mz_overlay : int 715 extra points left and right of peak definition to include in fitting 716 717 Returns 718 ------- 719 mz_domain : ndarray 720 x-axis domain for fit 721 722 """ 723 start_index = ( 724 self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0 725 ) 726 final_index = ( 727 self.peak_right_index + mz_overlay 728 if not self.peak_right_index == len(self._ms_parent.mz_exp_profile) 729 else self.peak_right_index 730 ) 731 732 if oversample_multiplier == 1: 733 mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index] 734 735 else: 736 # we assume a linear correlation for m/z and datapoits 737 # which is only true if the m/z range in narrow (within 1 m/z unit) 738 # this is not true for a wide m/z range 739 740 indexes = range(start_index, final_index + 1) 741 mz = self._ms_parent.mz_exp_profile[indexes] 742 pol = poly1d(polyfit(indexes, mz, 1)) 743 oversampled_indexes = linspace( 744 start_index, 745 final_index, 746 (final_index - start_index) * oversample_multiplier, 747 ) 748 mz_domain = pol(oversampled_indexes) 749 750 return mz_domain
[Legacy] function to resample/interpolate datapoints for lineshape analysis
This code is used for the legacy line fitting functions and not recommended. Legacy function to support expanding mz domain for legacy lineshape functions
Parameters
- oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
- mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
- mz_domain (ndarray): x-axis domain for fit
758 def molecular_formula_lowest_error(self): 759 """Return the molecular formula with the smallest absolute mz error""" 760 761 return min(self.molecular_formulas, key=lambda m: abs(m.mz_error))
Return the molecular formula with the smallest absolute mz error
763 def molecular_formula_highest_prob_score(self): 764 """Return the molecular formula with the highest confidence score score""" 765 766 return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score))
Return the molecular formula with the highest confidence score score
768 def molecular_formula_earth_filter(self, lowest_error=True): 769 """Filter molecular formula using the 'Earth' filter 770 771 This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments. 772 Earth Filter: 773 O > 0 AND N <= 3 AND P <= 2 AND 3P <= O 774 775 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter. 776 Otherwise, it will return all Earth-filter compliant formulas. 777 778 Parameters 779 ---------- 780 lowest_error : bool, optional. 781 Return only the lowest error formula which also fits the Earth filter. 782 If False, return all Earth-filter compliant formulas. Default is True. 783 784 Returns 785 ------- 786 list 787 List of molecular formula objects which fit the Earth filter 788 789 References 790 ---------- 791 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 792 Anal. Chem. 2017, 89, 23, 12659–12665 793 doi: 10.1021/acs.analchem.7b03318 794 """ 795 796 candidates = list( 797 filter( 798 lambda mf: mf.get("O") > 0 799 and mf.get("N") <= 3 800 and mf.get("P") <= 2 801 and (3 * mf.get("P")) <= mf.get("O"), 802 self.molecular_formulas, 803 ) 804 ) 805 if len(candidates) > 0: 806 if lowest_error: 807 return min(candidates, key=lambda m: abs(m.mz_error)) 808 else: 809 return candidates 810 else: 811 return candidates
Filter molecular formula using the 'Earth' filter
This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments. Earth Filter: O > 0 AND N <= 3 AND P <= 2 AND 3P <= O
If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter. Otherwise, it will return all Earth-filter compliant formulas.
Parameters
- lowest_error (bool, optional.): Return only the lowest error formula which also fits the Earth filter. If False, return all Earth-filter compliant formulas. Default is True.
Returns
- list: List of molecular formula objects which fit the Earth filter
References
- Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
813 def molecular_formula_water_filter(self, lowest_error=True): 814 """Filter molecular formula using the 'Water' filter 815 816 This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments. 817 Water Filter: 818 O > 0 AND N <= 3 AND S <= 2 AND P <= 2 819 820 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter. 821 Otherwise, it will return all Water-filter compliant formulas. 822 823 Parameters 824 ---------- 825 lowest_error : bool, optional 826 Return only the lowest error formula which also fits the Water filter. 827 If False, return all Water-filter compliant formulas. Defaults to 2 828 829 Returns 830 ------- 831 list 832 List of molecular formula objects which fit the Water filter 833 834 References 835 ---------- 836 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 837 Anal. Chem. 2017, 89, 23, 12659–12665 838 doi: 10.1021/acs.analchem.7b03318 839 """ 840 841 candidates = list( 842 filter( 843 lambda mf: mf.get("O") > 0 844 and mf.get("N") <= 3 845 and mf.get("S") <= 2 846 and mf.get("P") <= 2, 847 self.molecular_formulas, 848 ) 849 ) 850 if len(candidates) > 0: 851 if lowest_error: 852 return min(candidates, key=lambda m: abs(m.mz_error)) 853 else: 854 return candidates 855 else: 856 return candidates
Filter molecular formula using the 'Water' filter
This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments. Water Filter: O > 0 AND N <= 3 AND S <= 2 AND P <= 2
If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter. Otherwise, it will return all Water-filter compliant formulas.
Parameters
- lowest_error (bool, optional): Return only the lowest error formula which also fits the Water filter. If False, return all Water-filter compliant formulas. Defaults to 2
Returns
- list: List of molecular formula objects which fit the Water filter
References
- Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
858 def molecular_formula_air_filter(self, lowest_error=True): 859 """Filter molecular formula using the 'Air' filter 860 861 This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments. 862 Air Filter: 863 O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O 864 865 If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter. 866 Otherwise, it will return all Air-filter compliant formulas. 867 868 Parameters 869 ---------- 870 lowest_error : bool, optional 871 Return only the lowest error formula which also fits the Air filter. 872 If False, return all Air-filter compliant formulas. Defaults to True. 873 874 Returns 875 ------- 876 list 877 List of molecular formula objects which fit the Air filter 878 879 References 880 ---------- 881 1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" 882 Anal. Chem. 2017, 89, 23, 12659–12665 883 doi: 10.1021/acs.analchem.7b03318 884 """ 885 886 candidates = list( 887 filter( 888 lambda mf: mf.get("O") > 0 889 and mf.get("N") <= 2 890 and mf.get("S") <= 1 891 and mf.get("P") == 0 892 and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"), 893 self.molecular_formulas, 894 ) 895 ) 896 897 if len(candidates) > 0: 898 if lowest_error: 899 return min(candidates, key=lambda m: abs(m.mz_error)) 900 else: 901 return candidates 902 else: 903 return candidates
Filter molecular formula using the 'Air' filter
This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments. Air Filter: O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O
If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter. Otherwise, it will return all Air-filter compliant formulas.
Parameters
- lowest_error (bool, optional): Return only the lowest error formula which also fits the Air filter. If False, return all Air-filter compliant formulas. Defaults to True.
Returns
- list: List of molecular formula objects which fit the Air filter
References
- Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
905 def cia_score_S_P_error(self): 906 """Compound Identification Algorithm SP Error - Assignment Filter 907 908 This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments. 909 910 It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset. 911 912 Returns 913 ------- 914 MolecularFormula 915 A single molecular formula which fits the rules of the CIA SP Error filter 916 917 918 References 919 ---------- 920 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 921 Anal. Chem. 2006, 78, 13, 4363–4373 922 doi: 10.1021/ac0600306 923 """ 924 # case EFormulaScore.HAcap: 925 926 lowest_S_P_mf = min( 927 self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P") 928 ) 929 lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P") 930 931 list_same_s_p = list( 932 filter( 933 lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count, 934 self.molecular_formulas, 935 ) 936 ) 937 938 # check if list is not empty 939 if list_same_s_p: 940 return min(list_same_s_p, key=lambda m: abs(m.mz_error)) 941 942 else: 943 return lowest_S_P_mf
Compound Identification Algorithm SP Error - Assignment Filter
This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments.
It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset.
Returns
- MolecularFormula: A single molecular formula which fits the rules of the CIA SP Error filter
References
- Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" Anal. Chem. 2006, 78, 13, 4363–4373 doi: 10.1021/ac0600306
945 def cia_score_N_S_P_error(self): 946 """Compound Identification Algorithm NSP Error - Assignment Filter 947 948 This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments. 949 950 It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset. 951 952 Returns 953 ------- 954 MolecularFormula 955 A single molecular formula which fits the rules of the CIA NSP Error filter 956 957 References 958 ---------- 959 1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" 960 Anal. Chem. 2006, 78, 13, 4363–4373 961 doi: 10.1021/ac0600306 962 963 Raises 964 ------- 965 Exception 966 If no molecular formula are associated with mass spectrum peak. 967 """ 968 # case EFormulaScore.HAcap: 969 if self.molecular_formulas: 970 lowest_N_S_P_mf = min( 971 self.molecular_formulas, 972 key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"), 973 ) 974 lowest_N_S_P_count = ( 975 lowest_N_S_P_mf.get("N") 976 + lowest_N_S_P_mf.get("S") 977 + lowest_N_S_P_mf.get("P") 978 ) 979 980 list_same_N_S_P = list( 981 filter( 982 lambda mf: mf.get("N") + mf.get("S") + mf.get("P") 983 == lowest_N_S_P_count, 984 self.molecular_formulas, 985 ) 986 ) 987 988 if list_same_N_S_P: 989 SP_filtered_list = list( 990 filter( 991 lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1), 992 list_same_N_S_P, 993 ) 994 ) 995 996 if SP_filtered_list: 997 return min(SP_filtered_list, key=lambda m: abs(m.mz_error)) 998 999 else: 1000 return min(list_same_N_S_P, key=lambda m: abs(m.mz_error)) 1001 1002 else: 1003 return lowest_N_S_P_mf 1004 else: 1005 raise Exception( 1006 "No molecular formula associated with the mass spectrum peak at m/z: %.6f" 1007 % self.mz_exp 1008 )
Compound Identification Algorithm NSP Error - Assignment Filter
This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments.
It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset.
Returns
- MolecularFormula: A single molecular formula which fits the rules of the CIA NSP Error filter
References
- Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" Anal. Chem. 2006, 78, 13, 4363–4373 doi: 10.1021/ac0600306
Raises
- Exception: If no molecular formula are associated with mass spectrum peak.