corems.ms_peak.calc.MSPeakCalc

   1__author__ = "Yuri E. Corilo"
   2__date__ = "Jun 04, 2019"
   3
   4import warnings
   5
   6import pyswarm
   7from lmfit import models
   8from numpy import (
   9    ceil,
  10    exp,
  11    flip,
  12    floor,
  13    linspace,
  14    log,
  15    nan,
  16    pi,
  17    poly1d,
  18    polyfit,
  19    rint,
  20    sqrt,
  21    square,
  22    trapz,
  23)
  24
  25from corems.encapsulation.constant import Atoms
  26from corems.encapsulation.factory.parameters import MSParameters
  27
  28
  29class MSPeakCalculation:
  30    """Class to perform calculations on MSPeak objects.
  31
  32    This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models.
  33
  34    Parameters
  35    ----------
  36    None
  37
  38    Attributes
  39    ----------
  40    _ms_parent : MSParent
  41        The parent MSParent object associated with the MSPeakCalculation object.
  42    mz_exp : float
  43        The experimental m/z value of the peak.
  44    peak_left_index : int
  45        The start scan index of the peak.
  46    peak_right_index : int
  47        The final scan index of the peak.
  48    resolving_power : float
  49        The resolving power of the peak.
  50
  51    Methods
  52    -------
  53    * _calc_kmd(dict_base).
  54        Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula.
  55    * calc_area().
  56        Calculate the peak area using numpy's trapezoidal fit.
  57    * fit_peak(mz_extend=6, delta_rp=0, model='Gaussian').
  58        Perform lineshape analysis on a peak using lmfit module.
  59    * voigt_pso(w, r, yoff, width, loc, a).
  60        Calculate the Voigt function for particle swarm optimization (PSO) fitting.
  61    * objective_pso(x, w, u).
  62        Calculate the objective function for PSO fitting.
  63    * minimize_pso(lower, upper, w, u).
  64        Minimize the objective function using the particle swarm optimization algorithm.
  65    * fit_peak_pso(mz_extend=6, upsample_multiplier=5).
  66        Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting.
  67    * voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  68        [Legacy] Perform voigt lineshape analysis on a peak.
  69    * pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5).
  70        [Legacy] Perform pseudovoigt lineshape analysis on a peak.
  71    * lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  72        [Legacy] Perform lorentz lineshape analysis on a peak.
  73    * gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  74        [Legacy] Perform gaussian lineshape analysis on a peak.
  75    * get_mz_domain(oversample_multiplier, mz_overlay).
  76        [Legacy] Resample/interpolate datapoints for lineshape analysis.
  77    * number_possible_assignments().
  78        Return the number of possible molecular formula assignments for the peak.
  79    * molecular_formula_lowest_error().
  80        Return the molecular formula with the smallest absolute mz error.
  81    * molecular_formula_highest_prob_score().
  82        Return the molecular formula with the highest confidence score.
  83    * molecular_formula_earth_filter(lowest_error=True).
  84        Filter molecular formula using the 'Earth' filter.
  85    * molecular_formula_water_filter(lowest_error=True).
  86        Filter molecular formula using the 'Water' filter.
  87    * molecular_formula_air_filter(lowest_error=True).
  88        Filter molecular formula using the 'Air' filter.
  89    * cia_score_S_P_error().
  90        Compound Identification Algorithm SP Error - Assignment Filter.
  91    * cia_score_N_S_P_error().
  92        Compound Identification Algorithm NSP Error - Assignment Filter.
  93
  94    """
  95
  96    def _calc_kmd(self, dict_base):
  97        """Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula
  98
  99        Parameters
 100        ----------
 101        dict_base : dict
 102            dictionary with the base formula to be used in the calculation
 103            Default is CH2, e.g.
 104                dict_base = {"C": 1, "H": 2}
 105        """
 106
 107        if self._ms_parent:
 108            # msPeak obj does have a ms object parent
 109            kendrick_rounding_method = (
 110                self._ms_parent.mspeaks_settings.kendrick_rounding_method
 111            )  # rounding method can be one of floor, ceil or round
 112            # msPeak obj does not have a ms object parent
 113        else:
 114            kendrick_rounding_method = MSParameters.ms_peak.kendrick_rounding_method
 115
 116        mass = 0
 117        for atom in dict_base.keys():
 118            mass += Atoms.atomic_masses.get(atom) * dict_base.get(atom)
 119
 120        kendrick_mass = (int(mass) / mass) * self.mz_exp
 121
 122        if kendrick_rounding_method == "ceil":
 123            nominal_km = ceil(kendrick_mass)
 124
 125        elif kendrick_rounding_method == "round":
 126            nominal_km = rint(kendrick_mass)
 127
 128        elif kendrick_rounding_method == "floor":
 129            nominal_km = floor(kendrick_mass)
 130
 131        else:
 132            raise Exception(
 133                "%s method was not implemented, please refer to corems.ms_peak.calc.MSPeakCalc Class"
 134                % kendrick_rounding_method
 135            )
 136
 137        kmd = nominal_km - kendrick_mass
 138
 139        # kmd = (nominal_km - km) * 1
 140        # kmd = round(kmd,0)
 141
 142        return kmd, kendrick_mass, nominal_km
 143
 144    def calc_area(self):
 145        """Calculate the peak area using numpy's trapezoidal fit
 146
 147        uses provided mz_domain to accurately integrate areas independent of digital resolution
 148
 149        Returns
 150        -------
 151        float
 152            peak area
 153        """
 154        if self.peak_right_index > self.peak_left_index:
 155            yy = self._ms_parent.abundance_profile[
 156                self.peak_left_index : self.peak_right_index
 157            ]
 158            xx = self._ms_parent.mz_exp_profile[
 159                self.peak_left_index : self.peak_right_index
 160            ]
 161            # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first
 162            if xx[0] > xx[-1]:
 163                xx = flip(xx)
 164                yy = flip(yy)
 165            return float(trapz(yy, xx))
 166
 167        else:
 168            warnings.warn(
 169                "Peak Area Calculation for m/z {} has failed".format(self.mz_exp)
 170            )
 171            return nan
 172
 173    def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"):
 174        """Lineshape analysis on a peak using lmfit module.
 175
 176        Model and fit peak lineshape by defined function - using lmfit module
 177        Does not oversample/resample/interpolate data points
 178        Better to go back to time domain and perform more zero filling - if possible.
 179
 180        Parameters
 181        ----------
 182        mz_extend : int
 183            extra points left and right of peak definition to include in fitting
 184        delta_rp : float
 185            delta resolving power to add to resolving power
 186        model : str
 187            Type of lineshape model to use.
 188            Models allowed: Gaussian, Lorentz, Voigt
 189
 190        Returns
 191        -----
 192        mz_domain : ndarray
 193            x-axis domain for fit
 194        fit_peak : lmfit object
 195            fit results object from lmfit module
 196
 197        Notes
 198        -----
 199        Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module
 200        mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit.
 201        Takes about 10ms per peak
 202        """
 203        start_index = (
 204            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
 205        )
 206        final_index = (
 207            self.peak_right_index + mz_extend
 208            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 209            else self.peak_right_index
 210        )
 211
 212        # check if MSPeak contains the resolving power info
 213        if self.resolving_power:
 214            # full width half maximum distance
 215            self.fwhm = self.mz_exp / (self.resolving_power + delta_rp)
 216
 217            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 218            abundance_domain = self._ms_parent.abundance_profile[
 219                start_index:final_index
 220            ]
 221
 222            if model == "Gaussian":
 223                # stardard deviation
 224                sigma = self.fwhm / (2 * sqrt(2 * log(2)))
 225                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 226                model = models.GaussianModel()
 227                params = model.make_params(
 228                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
 229                )
 230
 231            elif model == "Lorentz":
 232                # stardard deviation
 233                sigma = self.fwhm / 2
 234                amplitude = sigma * pi * self.abundance
 235                model = models.LorentzianModel()
 236                params = model.make_params(
 237                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
 238                )
 239
 240            elif model == "Voigt":
 241                # stardard deviation
 242                sigma = self.fwhm / 3.6013
 243                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 244                model = models.VoigtModel()
 245                params = model.make_params(
 246                    center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
 247                )
 248            else:
 249                raise LookupError("model lineshape not known or defined")
 250
 251            # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object
 252            fit_peak = model.fit(abundance_domain, params=params, x=mz_domain)
 253            return mz_domain, fit_peak
 254
 255        else:
 256            raise LookupError(
 257                "resolving power is not defined, try to use set_max_resolving_power()"
 258            )
 259
 260    def voigt_pso(self, w, r, yoff, width, loc, a):
 261        """Voigt function for particle swarm optimisation (PSO) fitting
 262
 263        From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py.
 264        Calculates a Voigt function over w based on the relevant properties of the distribution.
 265
 266        Parameters
 267        ----------
 268        w : ndarray
 269            Array over which the Voigt function will be evaluated.
 270        r : float
 271            Ratio between the Guassian and Lorentzian functions.
 272        yoff : float
 273            Y-offset of the Voigt function.
 274        width : float
 275            The width of the Voigt function.
 276        loc : float
 277            Center of the Voigt function.
 278        a : float
 279            Area of the Voigt function.
 280        Returns
 281        -------
 282        V : ndarray
 283            Array defining the Voigt function over w.
 284
 285        References
 286        ----------
 287        1. https://github.com/pnnl/nmrfit
 288
 289        Notes
 290        -----
 291        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 292
 293        """
 294        # Lorentzian component
 295        L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2)
 296
 297        # Gaussian component
 298        G = (
 299            (2 / width)
 300            * sqrt(log(2) / pi)
 301            * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2))
 302        )
 303
 304        # Voigt body
 305        V = (yoff + a) * (r * L + (1 - r) * G)
 306
 307        return V
 308
 309    def objective_pso(self, x, w, u):
 310        """Objective function for particle swarm optimisation (PSO) fitting
 311
 312        The objective function used to fit supplied data.  Evaluates sum of squared differences between the fit and the data.
 313
 314        Parameters
 315        ----------
 316        x : list of floats
 317            Parameter vector.
 318        w : ndarray
 319            Array of frequency data.
 320        u : ndarray
 321            Array of data to be fit.
 322
 323        Returns
 324        -------
 325        rmse : float
 326            Root mean square error between the data and fit.
 327
 328        References
 329        ----------
 330        1. https://github.com/pnnl/nmrfit
 331
 332        """
 333        # global parameters
 334        r, width, loc, a = x
 335        yoff = 0
 336
 337        # calculate fit for V
 338        V_fit = self.voigt_pso(w, r, yoff, width, loc, a)
 339
 340        # real component RMSE
 341        rmse = sqrt(square((u - V_fit)).mean(axis=None))
 342
 343        # return the total RMSE
 344        return rmse
 345
 346    def minimize_pso(self, lower, upper, w, u):
 347        """Minimization function for particle swarm optimisation (PSO) fitting
 348
 349        Minimizes the objective function using the particle swarm optimization algorithm.
 350        Minimization function based on defined parameters
 351
 352
 353        Parameters
 354        ----------
 355        lower : list of floats
 356            Lower bounds for the parameters.
 357        upper : list of floats
 358            Upper bounds for the parameters.
 359        w : ndarray
 360            Array of frequency data.
 361        u : ndarray
 362            Array of data to be fit.
 363
 364        Notes
 365        -----
 366        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 367        Current parameters take ~2 seconds per peak.
 368
 369
 370        References
 371        ----------
 372        1. https://github.com/pnnl/nmrfit
 373
 374        """
 375        # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters.
 376        # TODO - Refactor PSO fitting into its own class?
 377
 378        xopt, fopt = pyswarm.pso(
 379            self.objective_pso,
 380            lower,
 381            upper,
 382            args=(w, u),
 383            swarmsize=1000,
 384            maxiter=5000,
 385            omega=-0.2134,
 386            phip=-0.3344,
 387            phig=2.3259,
 388        )
 389        return xopt, fopt
 390
 391    def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5):
 392        """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting
 393
 394        Function to fit a Voigt peakshape using particle swarm optimisation (PSO).
 395        Should return better results than lmfit, but much more computationally expensive
 396
 397        Parameters
 398        ----------
 399        mz_extend : int, optional
 400            extra points left and right of peak definition to include in fitting. Defaults to 6.
 401        upsample_multiplier : int, optional
 402            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5.
 403
 404        Returns
 405        -------
 406        xopt : array
 407            variables describing the voigt function.
 408            G/L ratio, width (fwhm), apex (x-axis), area.
 409            y-axis offset is fixed at 0
 410        fopt : float
 411            objective score (rmse)
 412        psfit : array
 413            recalculated y values based on function and optimised fit
 414        psfit_hdp : tuple of arrays
 415            0 - linspace x-axis upsampled grid
 416            1 - recalculated y values based on function and upsampled x-axis grid
 417            Does not change results, but aids in visualisation of the 'true' voigt lineshape
 418
 419        Notes
 420        -----
 421        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 422        """
 423        # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc)
 424        # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0.
 425        # Thermo data is noise reduced by also noise subtracted, so starts at 0
 426        # Absorption mode/phased data will have positive and negative components and may not be baseline corrected
 427
 428        start_index = (
 429            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
 430        )
 431        final_index = (
 432            self.peak_right_index + mz_extend
 433            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 434            else self.peak_right_index
 435        )
 436
 437        # check if MSPeak contains the resolving power info
 438        if self.resolving_power:
 439            # full width half maximum distance
 440            self.fwhm = self.mz_exp / (self.resolving_power)
 441
 442            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 443            abundance_domain = self._ms_parent.abundance_profile[
 444                start_index:final_index
 445            ]
 446            lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0]
 447            upper = [
 448                1,
 449                self.fwhm * 1.2,
 450                (self.mz_exp + 0.0005),
 451                self.abundance / self.signal_to_noise,
 452            ]
 453            xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain)
 454
 455            psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3])
 456            psfit_hdp_x = linspace(
 457                min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier
 458            )
 459            psfit_hdp = self.voigt_pso(
 460                psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3]
 461            )
 462            return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp)
 463        else:
 464            raise LookupError(
 465                "resolving power is not defined, try to use set_max_resolving_power()"
 466            )
 467
 468    def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 469        """[Legacy] Voigt lineshape analysis function
 470        Legacy function for voigt lineshape analysis
 471
 472        Parameters
 473        ----------
 474        oversample_multiplier : int
 475            factor to increase x-axis points by for simulation of fitted lineshape function
 476        delta_rp : float
 477            delta resolving power to add to resolving power
 478        mz_overlay : int
 479            extra points left and right of peak definition to include in fitting
 480
 481        Returns
 482        -------
 483        mz_domain : ndarray
 484            x-axis domain for fit
 485        calc_abundance : ndarray
 486            calculated abundance profile based on voigt function
 487        """
 488
 489        if self.resolving_power:
 490            # full width half maximum distance
 491            self.fwhm = self.mz_exp / (
 492                self.resolving_power + delta_rp
 493            )  # self.resolving_power)
 494
 495            # stardart deviation
 496            sigma = self.fwhm / 3.6013
 497
 498            # half width baseline distance
 499
 500            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 501            #                     self.mz_exp + hw_base_distance, datapoint)
 502            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 503
 504            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 505
 506            # TODO derive amplitude
 507            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 508
 509            model = models.VoigtModel()
 510
 511            params = model.make_params(
 512                center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
 513            )
 514
 515            calc_abundance = model.eval(params=params, x=mz_domain)
 516
 517            return mz_domain, calc_abundance
 518
 519        else:
 520            raise LookupError(
 521                "resolving power is not defined, try to use set_max_resolving_power()"
 522            )
 523
 524    def pseudovoigt(
 525        self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5
 526    ):
 527        """[Legacy] pseudovoigt lineshape function
 528
 529        Legacy function for pseudovoigt lineshape analysis.
 530        Note - Code may not be functional currently.
 531
 532        Parameters
 533        ----------
 534        oversample_multiplier : int, optional
 535            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1.
 536        delta_rp : float, optional
 537            delta resolving power to add to resolving power. Defaults to 0.
 538        mz_overlay : int, optional
 539            extra points left and right of peak definition to include in fitting. Defaults to 1.
 540        fraction : float, optional
 541            fraction of gaussian component in pseudovoigt function. Defaults to 0.5.
 542
 543        """
 544        if self.resolving_power:
 545            # full width half maximum distance
 546            self.fwhm = self.mz_exp / (
 547                self.resolving_power + delta_rp
 548            )  # self.resolving_power)
 549
 550            # stardart deviation
 551            sigma = self.fwhm / 2
 552
 553            # half width baseline distance
 554
 555            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 556            #                     self.mz_exp + hw_base_distance, datapoint)
 557            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 558
 559            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 560            model = models.PseudoVoigtModel()
 561
 562            # TODO derive amplitude
 563            gamma = sigma
 564
 565            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 566            amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / (
 567                (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma)
 568            )
 569
 570            params = model.make_params(center=self.mz_exp, sigma=sigma)
 571
 572            calc_abundance = model.eval(params=params, x=mz_domain)
 573
 574            return mz_domain, calc_abundance
 575
 576        else:
 577            raise LookupError(
 578                "resolving power is not defined, try to use set_max_resolving_power()"
 579            )
 580
 581    def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 582        """[Legacy] Lorentz lineshape analysis function
 583
 584        Legacy function for lorentz lineshape analysis
 585
 586        Parameters
 587        ----------
 588        oversample_multiplier : int
 589            factor to increase x-axis points by for simulation of fitted lineshape function
 590        delta_rp : float
 591            delta resolving power to add to resolving power
 592        mz_overlay : int
 593            extra points left and right of peak definition to include in fitting
 594
 595        Returns
 596        -------
 597        mz_domain : ndarray
 598            x-axis domain for fit
 599        calc_abundance : ndarray
 600            calculated abundance profile based on lorentz function
 601
 602        """
 603        if self.resolving_power:
 604            # full width half maximum distance
 605            self.fwhm = self.mz_exp / (
 606                self.resolving_power + delta_rp
 607            )  # self.resolving_power)
 608
 609            # stardart deviation
 610            sigma = self.fwhm / 2
 611
 612            # half width baseline distance
 613            hw_base_distance = 8 * sigma
 614
 615            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 616            #                     self.mz_exp + hw_base_distance, datapoint)
 617
 618            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 619            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 620            model = models.LorentzianModel()
 621
 622            amplitude = sigma * pi * self.abundance
 623
 624            params = model.make_params(
 625                center=self.mz_exp, amplitude=amplitude, sigma=sigma
 626            )
 627
 628            calc_abundance = model.eval(params=params, x=mz_domain)
 629
 630            return mz_domain, calc_abundance
 631
 632        else:
 633            raise LookupError(
 634                "resolving power is not defined, try to use set_max_resolving_power()"
 635            )
 636
 637    def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 638        """[Legacy] Gaussian lineshape analysis function
 639        Legacy gaussian lineshape analysis function
 640
 641        Parameters
 642        ----------
 643        oversample_multiplier : int
 644            factor to increase x-axis points by for simulation of fitted lineshape function
 645        delta_rp : float
 646            delta resolving power to add to resolving power
 647        mz_overlay : int
 648            extra points left and right of peak definition to include in fitting
 649
 650        Returns
 651        -------
 652        mz_domain : ndarray
 653            x-axis domain for fit
 654        calc_abundance : ndarray
 655            calculated abundance profile based on gaussian function
 656
 657
 658        """
 659
 660        # check if MSPeak contains the resolving power info
 661        if self.resolving_power:
 662            # full width half maximum distance
 663            self.fwhm = self.mz_exp / (
 664                self.resolving_power + delta_rp
 665            )  # self.resolving_power)
 666
 667            # stardart deviation
 668            sigma = self.fwhm / (2 * sqrt(2 * log(2)))
 669
 670            # half width baseline distance
 671            # hw_base_distance = (3.2 * s)
 672
 673            # match_loz_factor = 3
 674
 675            # n_d = hw_base_distance * match_loz_factor
 676
 677            # mz_domain = linspace(
 678            #    self.mz_exp - n_d, self.mz_exp + n_d, datapoint)
 679
 680            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 681
 682            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 683
 684            # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s)
 685
 686            model = models.GaussianModel()
 687
 688            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 689
 690            params = model.make_params(
 691                center=self.mz_exp, amplitude=amplitude, sigma=sigma
 692            )
 693
 694            calc_abundance = model.eval(params=params, x=mz_domain)
 695
 696            return mz_domain, calc_abundance
 697
 698        else:
 699            raise LookupError(
 700                "resolving power is not defined, try to use set_max_resolving_power()"
 701            )
 702
 703    def get_mz_domain(self, oversample_multiplier, mz_overlay):
 704        """[Legacy] function to resample/interpolate datapoints for lineshape analysis
 705
 706        This code is used for the legacy line fitting functions and not recommended.
 707        Legacy function to support expanding mz domain for legacy lineshape functions
 708
 709        Parameters
 710        ----------
 711        oversample_multiplier : int
 712            factor to increase x-axis points by for simulation of fitted lineshape function
 713        mz_overlay : int
 714            extra points left and right of peak definition to include in fitting
 715
 716        Returns
 717        -------
 718        mz_domain : ndarray
 719            x-axis domain for fit
 720
 721        """
 722        start_index = (
 723            self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0
 724        )
 725        final_index = (
 726            self.peak_right_index + mz_overlay
 727            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 728            else self.peak_right_index
 729        )
 730
 731        if oversample_multiplier == 1:
 732            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 733
 734        else:
 735            # we assume a linear correlation for m/z and datapoits
 736            # which is only true if the m/z range in narrow (within 1 m/z unit)
 737            # this is not true for a wide m/z range
 738
 739            indexes = range(start_index, final_index + 1)
 740            mz = self._ms_parent.mz_exp_profile[indexes]
 741            pol = poly1d(polyfit(indexes, mz, 1))
 742            oversampled_indexes = linspace(
 743                start_index,
 744                final_index,
 745                (final_index - start_index) * oversample_multiplier,
 746            )
 747            mz_domain = pol(oversampled_indexes)
 748
 749        return mz_domain
 750
 751    @property
 752    def number_possible_assignments(
 753        self,
 754    ):
 755        return len(self.molecular_formulas)
 756
 757    def molecular_formula_lowest_error(self):
 758        """Return the molecular formula with the smallest absolute mz error"""
 759
 760        return min(self.molecular_formulas, key=lambda m: abs(m.mz_error))
 761
 762    def molecular_formula_highest_prob_score(self):
 763        """Return the molecular formula with the highest confidence score score"""
 764
 765        return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score))
 766
 767    def molecular_formula_earth_filter(self, lowest_error=True):
 768        """Filter molecular formula using the 'Earth' filter
 769
 770        This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments.
 771        Earth Filter:
 772            O > 0 AND N <= 3 AND P <= 2 AND 3P <= O
 773
 774        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter.
 775        Otherwise, it will return all Earth-filter compliant formulas.
 776
 777        Parameters
 778        ----------
 779        lowest_error : bool, optional.
 780            Return only the lowest error formula which also fits the Earth filter.
 781            If False, return all Earth-filter compliant formulas. Default is True.
 782
 783        Returns
 784        -------
 785        list
 786            List of molecular formula objects which fit the Earth filter
 787
 788        References
 789        ----------
 790        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 791            Anal. Chem. 2017, 89, 23, 12659–12665
 792            doi: 10.1021/acs.analchem.7b03318
 793        """
 794
 795        candidates = list(
 796            filter(
 797                lambda mf: mf.get("O") > 0
 798                and mf.get("N") <= 3
 799                and mf.get("P") <= 2
 800                and (3 * mf.get("P")) <= mf.get("O"),
 801                self.molecular_formulas,
 802            )
 803        )
 804        if len(candidates) > 0:
 805            if lowest_error:
 806                return min(candidates, key=lambda m: abs(m.mz_error))
 807            else:
 808                return candidates
 809        else:
 810            return candidates
 811
 812    def molecular_formula_water_filter(self, lowest_error=True):
 813        """Filter molecular formula using the 'Water' filter
 814
 815        This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments.
 816        Water Filter:
 817            O > 0 AND N <= 3 AND S <= 2 AND P <= 2
 818
 819        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter.
 820        Otherwise, it will return all Water-filter compliant formulas.
 821
 822        Parameters
 823        ----------
 824        lowest_error : bool, optional
 825            Return only the lowest error formula which also fits the Water filter.
 826            If False, return all Water-filter compliant formulas. Defaults to 2
 827
 828        Returns
 829        -------
 830        list
 831            List of molecular formula objects which fit the Water filter
 832
 833        References
 834        ----------
 835        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 836            Anal. Chem. 2017, 89, 23, 12659–12665
 837            doi: 10.1021/acs.analchem.7b03318
 838        """
 839
 840        candidates = list(
 841            filter(
 842                lambda mf: mf.get("O") > 0
 843                and mf.get("N") <= 3
 844                and mf.get("S") <= 2
 845                and mf.get("P") <= 2,
 846                self.molecular_formulas,
 847            )
 848        )
 849        if len(candidates) > 0:
 850            if lowest_error:
 851                return min(candidates, key=lambda m: abs(m.mz_error))
 852            else:
 853                return candidates
 854        else:
 855            return candidates
 856
 857    def molecular_formula_air_filter(self, lowest_error=True):
 858        """Filter molecular formula using the 'Air' filter
 859
 860        This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments.
 861        Air Filter:
 862            O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O
 863
 864        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter.
 865        Otherwise, it will return all Air-filter compliant formulas.
 866
 867        Parameters
 868        ----------
 869        lowest_error : bool, optional
 870            Return only the lowest error formula which also fits the Air filter.
 871            If False, return all Air-filter compliant formulas. Defaults to True.
 872
 873        Returns
 874        -------
 875        list
 876            List of molecular formula objects which fit the Air filter
 877
 878        References
 879        ----------
 880        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 881            Anal. Chem. 2017, 89, 23, 12659–12665
 882            doi: 10.1021/acs.analchem.7b03318
 883        """
 884
 885        candidates = list(
 886            filter(
 887                lambda mf: mf.get("O") > 0
 888                and mf.get("N") <= 2
 889                and mf.get("S") <= 1
 890                and mf.get("P") == 0
 891                and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"),
 892                self.molecular_formulas,
 893            )
 894        )
 895
 896        if len(candidates) > 0:
 897            if lowest_error:
 898                return min(candidates, key=lambda m: abs(m.mz_error))
 899            else:
 900                return candidates
 901        else:
 902            return candidates
 903
 904    def cia_score_S_P_error(self):
 905        """Compound Identification Algorithm SP Error - Assignment Filter
 906
 907        This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments.
 908
 909        It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset.
 910
 911        Returns
 912        -------
 913        MolecularFormula
 914            A single molecular formula which fits the rules of the CIA SP Error filter
 915
 916
 917        References
 918        ----------
 919        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
 920            Anal. Chem. 2006, 78, 13, 4363–4373
 921            doi: 10.1021/ac0600306
 922        """
 923        # case EFormulaScore.HAcap:
 924
 925        lowest_S_P_mf = min(
 926            self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P")
 927        )
 928        lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P")
 929
 930        list_same_s_p = list(
 931            filter(
 932                lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count,
 933                self.molecular_formulas,
 934            )
 935        )
 936
 937        # check if list is not empty
 938        if list_same_s_p:
 939            return min(list_same_s_p, key=lambda m: abs(m.mz_error))
 940
 941        else:
 942            return lowest_S_P_mf
 943
 944    def cia_score_N_S_P_error(self):
 945        """Compound Identification Algorithm NSP Error - Assignment Filter
 946
 947        This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments.
 948
 949        It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset.
 950
 951        Returns
 952        -------
 953        MolecularFormula
 954            A single molecular formula which fits the rules of the CIA NSP Error filter
 955
 956        References
 957        ----------
 958        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
 959            Anal. Chem. 2006, 78, 13, 4363–4373
 960            doi: 10.1021/ac0600306
 961
 962        Raises
 963        -------
 964        Exception
 965            If no molecular formula are associated with mass spectrum peak.
 966        """
 967        # case EFormulaScore.HAcap:
 968        if self.molecular_formulas:
 969            lowest_N_S_P_mf = min(
 970                self.molecular_formulas,
 971                key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"),
 972            )
 973            lowest_N_S_P_count = (
 974                lowest_N_S_P_mf.get("N")
 975                + lowest_N_S_P_mf.get("S")
 976                + lowest_N_S_P_mf.get("P")
 977            )
 978
 979            list_same_N_S_P = list(
 980                filter(
 981                    lambda mf: mf.get("N") + mf.get("S") + mf.get("P")
 982                    == lowest_N_S_P_count,
 983                    self.molecular_formulas,
 984                )
 985            )
 986
 987            if list_same_N_S_P:
 988                SP_filtered_list = list(
 989                    filter(
 990                        lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1),
 991                        list_same_N_S_P,
 992                    )
 993                )
 994
 995                if SP_filtered_list:
 996                    return min(SP_filtered_list, key=lambda m: abs(m.mz_error))
 997
 998                else:
 999                    return min(list_same_N_S_P, key=lambda m: abs(m.mz_error))
1000
1001            else:
1002                return lowest_N_S_P_mf
1003        else:
1004            raise Exception(
1005                "No molecular formula associated with the mass spectrum peak at m/z: %.6f"
1006                % self.mz_exp
1007            )
class MSPeakCalculation:
  30class MSPeakCalculation:
  31    """Class to perform calculations on MSPeak objects.
  32
  33    This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models.
  34
  35    Parameters
  36    ----------
  37    None
  38
  39    Attributes
  40    ----------
  41    _ms_parent : MSParent
  42        The parent MSParent object associated with the MSPeakCalculation object.
  43    mz_exp : float
  44        The experimental m/z value of the peak.
  45    peak_left_index : int
  46        The start scan index of the peak.
  47    peak_right_index : int
  48        The final scan index of the peak.
  49    resolving_power : float
  50        The resolving power of the peak.
  51
  52    Methods
  53    -------
  54    * _calc_kmd(dict_base).
  55        Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula.
  56    * calc_area().
  57        Calculate the peak area using numpy's trapezoidal fit.
  58    * fit_peak(mz_extend=6, delta_rp=0, model='Gaussian').
  59        Perform lineshape analysis on a peak using lmfit module.
  60    * voigt_pso(w, r, yoff, width, loc, a).
  61        Calculate the Voigt function for particle swarm optimization (PSO) fitting.
  62    * objective_pso(x, w, u).
  63        Calculate the objective function for PSO fitting.
  64    * minimize_pso(lower, upper, w, u).
  65        Minimize the objective function using the particle swarm optimization algorithm.
  66    * fit_peak_pso(mz_extend=6, upsample_multiplier=5).
  67        Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting.
  68    * voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  69        [Legacy] Perform voigt lineshape analysis on a peak.
  70    * pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5).
  71        [Legacy] Perform pseudovoigt lineshape analysis on a peak.
  72    * lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  73        [Legacy] Perform lorentz lineshape analysis on a peak.
  74    * gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1).
  75        [Legacy] Perform gaussian lineshape analysis on a peak.
  76    * get_mz_domain(oversample_multiplier, mz_overlay).
  77        [Legacy] Resample/interpolate datapoints for lineshape analysis.
  78    * number_possible_assignments().
  79        Return the number of possible molecular formula assignments for the peak.
  80    * molecular_formula_lowest_error().
  81        Return the molecular formula with the smallest absolute mz error.
  82    * molecular_formula_highest_prob_score().
  83        Return the molecular formula with the highest confidence score.
  84    * molecular_formula_earth_filter(lowest_error=True).
  85        Filter molecular formula using the 'Earth' filter.
  86    * molecular_formula_water_filter(lowest_error=True).
  87        Filter molecular formula using the 'Water' filter.
  88    * molecular_formula_air_filter(lowest_error=True).
  89        Filter molecular formula using the 'Air' filter.
  90    * cia_score_S_P_error().
  91        Compound Identification Algorithm SP Error - Assignment Filter.
  92    * cia_score_N_S_P_error().
  93        Compound Identification Algorithm NSP Error - Assignment Filter.
  94
  95    """
  96
  97    def _calc_kmd(self, dict_base):
  98        """Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula
  99
 100        Parameters
 101        ----------
 102        dict_base : dict
 103            dictionary with the base formula to be used in the calculation
 104            Default is CH2, e.g.
 105                dict_base = {"C": 1, "H": 2}
 106        """
 107
 108        if self._ms_parent:
 109            # msPeak obj does have a ms object parent
 110            kendrick_rounding_method = (
 111                self._ms_parent.mspeaks_settings.kendrick_rounding_method
 112            )  # rounding method can be one of floor, ceil or round
 113            # msPeak obj does not have a ms object parent
 114        else:
 115            kendrick_rounding_method = MSParameters.ms_peak.kendrick_rounding_method
 116
 117        mass = 0
 118        for atom in dict_base.keys():
 119            mass += Atoms.atomic_masses.get(atom) * dict_base.get(atom)
 120
 121        kendrick_mass = (int(mass) / mass) * self.mz_exp
 122
 123        if kendrick_rounding_method == "ceil":
 124            nominal_km = ceil(kendrick_mass)
 125
 126        elif kendrick_rounding_method == "round":
 127            nominal_km = rint(kendrick_mass)
 128
 129        elif kendrick_rounding_method == "floor":
 130            nominal_km = floor(kendrick_mass)
 131
 132        else:
 133            raise Exception(
 134                "%s method was not implemented, please refer to corems.ms_peak.calc.MSPeakCalc Class"
 135                % kendrick_rounding_method
 136            )
 137
 138        kmd = nominal_km - kendrick_mass
 139
 140        # kmd = (nominal_km - km) * 1
 141        # kmd = round(kmd,0)
 142
 143        return kmd, kendrick_mass, nominal_km
 144
 145    def calc_area(self):
 146        """Calculate the peak area using numpy's trapezoidal fit
 147
 148        uses provided mz_domain to accurately integrate areas independent of digital resolution
 149
 150        Returns
 151        -------
 152        float
 153            peak area
 154        """
 155        if self.peak_right_index > self.peak_left_index:
 156            yy = self._ms_parent.abundance_profile[
 157                self.peak_left_index : self.peak_right_index
 158            ]
 159            xx = self._ms_parent.mz_exp_profile[
 160                self.peak_left_index : self.peak_right_index
 161            ]
 162            # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first
 163            if xx[0] > xx[-1]:
 164                xx = flip(xx)
 165                yy = flip(yy)
 166            return float(trapz(yy, xx))
 167
 168        else:
 169            warnings.warn(
 170                "Peak Area Calculation for m/z {} has failed".format(self.mz_exp)
 171            )
 172            return nan
 173
 174    def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"):
 175        """Lineshape analysis on a peak using lmfit module.
 176
 177        Model and fit peak lineshape by defined function - using lmfit module
 178        Does not oversample/resample/interpolate data points
 179        Better to go back to time domain and perform more zero filling - if possible.
 180
 181        Parameters
 182        ----------
 183        mz_extend : int
 184            extra points left and right of peak definition to include in fitting
 185        delta_rp : float
 186            delta resolving power to add to resolving power
 187        model : str
 188            Type of lineshape model to use.
 189            Models allowed: Gaussian, Lorentz, Voigt
 190
 191        Returns
 192        -----
 193        mz_domain : ndarray
 194            x-axis domain for fit
 195        fit_peak : lmfit object
 196            fit results object from lmfit module
 197
 198        Notes
 199        -----
 200        Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module
 201        mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit.
 202        Takes about 10ms per peak
 203        """
 204        start_index = (
 205            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
 206        )
 207        final_index = (
 208            self.peak_right_index + mz_extend
 209            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 210            else self.peak_right_index
 211        )
 212
 213        # check if MSPeak contains the resolving power info
 214        if self.resolving_power:
 215            # full width half maximum distance
 216            self.fwhm = self.mz_exp / (self.resolving_power + delta_rp)
 217
 218            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 219            abundance_domain = self._ms_parent.abundance_profile[
 220                start_index:final_index
 221            ]
 222
 223            if model == "Gaussian":
 224                # stardard deviation
 225                sigma = self.fwhm / (2 * sqrt(2 * log(2)))
 226                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 227                model = models.GaussianModel()
 228                params = model.make_params(
 229                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
 230                )
 231
 232            elif model == "Lorentz":
 233                # stardard deviation
 234                sigma = self.fwhm / 2
 235                amplitude = sigma * pi * self.abundance
 236                model = models.LorentzianModel()
 237                params = model.make_params(
 238                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
 239                )
 240
 241            elif model == "Voigt":
 242                # stardard deviation
 243                sigma = self.fwhm / 3.6013
 244                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 245                model = models.VoigtModel()
 246                params = model.make_params(
 247                    center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
 248                )
 249            else:
 250                raise LookupError("model lineshape not known or defined")
 251
 252            # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object
 253            fit_peak = model.fit(abundance_domain, params=params, x=mz_domain)
 254            return mz_domain, fit_peak
 255
 256        else:
 257            raise LookupError(
 258                "resolving power is not defined, try to use set_max_resolving_power()"
 259            )
 260
 261    def voigt_pso(self, w, r, yoff, width, loc, a):
 262        """Voigt function for particle swarm optimisation (PSO) fitting
 263
 264        From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py.
 265        Calculates a Voigt function over w based on the relevant properties of the distribution.
 266
 267        Parameters
 268        ----------
 269        w : ndarray
 270            Array over which the Voigt function will be evaluated.
 271        r : float
 272            Ratio between the Guassian and Lorentzian functions.
 273        yoff : float
 274            Y-offset of the Voigt function.
 275        width : float
 276            The width of the Voigt function.
 277        loc : float
 278            Center of the Voigt function.
 279        a : float
 280            Area of the Voigt function.
 281        Returns
 282        -------
 283        V : ndarray
 284            Array defining the Voigt function over w.
 285
 286        References
 287        ----------
 288        1. https://github.com/pnnl/nmrfit
 289
 290        Notes
 291        -----
 292        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 293
 294        """
 295        # Lorentzian component
 296        L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2)
 297
 298        # Gaussian component
 299        G = (
 300            (2 / width)
 301            * sqrt(log(2) / pi)
 302            * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2))
 303        )
 304
 305        # Voigt body
 306        V = (yoff + a) * (r * L + (1 - r) * G)
 307
 308        return V
 309
 310    def objective_pso(self, x, w, u):
 311        """Objective function for particle swarm optimisation (PSO) fitting
 312
 313        The objective function used to fit supplied data.  Evaluates sum of squared differences between the fit and the data.
 314
 315        Parameters
 316        ----------
 317        x : list of floats
 318            Parameter vector.
 319        w : ndarray
 320            Array of frequency data.
 321        u : ndarray
 322            Array of data to be fit.
 323
 324        Returns
 325        -------
 326        rmse : float
 327            Root mean square error between the data and fit.
 328
 329        References
 330        ----------
 331        1. https://github.com/pnnl/nmrfit
 332
 333        """
 334        # global parameters
 335        r, width, loc, a = x
 336        yoff = 0
 337
 338        # calculate fit for V
 339        V_fit = self.voigt_pso(w, r, yoff, width, loc, a)
 340
 341        # real component RMSE
 342        rmse = sqrt(square((u - V_fit)).mean(axis=None))
 343
 344        # return the total RMSE
 345        return rmse
 346
 347    def minimize_pso(self, lower, upper, w, u):
 348        """Minimization function for particle swarm optimisation (PSO) fitting
 349
 350        Minimizes the objective function using the particle swarm optimization algorithm.
 351        Minimization function based on defined parameters
 352
 353
 354        Parameters
 355        ----------
 356        lower : list of floats
 357            Lower bounds for the parameters.
 358        upper : list of floats
 359            Upper bounds for the parameters.
 360        w : ndarray
 361            Array of frequency data.
 362        u : ndarray
 363            Array of data to be fit.
 364
 365        Notes
 366        -----
 367        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 368        Current parameters take ~2 seconds per peak.
 369
 370
 371        References
 372        ----------
 373        1. https://github.com/pnnl/nmrfit
 374
 375        """
 376        # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters.
 377        # TODO - Refactor PSO fitting into its own class?
 378
 379        xopt, fopt = pyswarm.pso(
 380            self.objective_pso,
 381            lower,
 382            upper,
 383            args=(w, u),
 384            swarmsize=1000,
 385            maxiter=5000,
 386            omega=-0.2134,
 387            phip=-0.3344,
 388            phig=2.3259,
 389        )
 390        return xopt, fopt
 391
 392    def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5):
 393        """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting
 394
 395        Function to fit a Voigt peakshape using particle swarm optimisation (PSO).
 396        Should return better results than lmfit, but much more computationally expensive
 397
 398        Parameters
 399        ----------
 400        mz_extend : int, optional
 401            extra points left and right of peak definition to include in fitting. Defaults to 6.
 402        upsample_multiplier : int, optional
 403            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5.
 404
 405        Returns
 406        -------
 407        xopt : array
 408            variables describing the voigt function.
 409            G/L ratio, width (fwhm), apex (x-axis), area.
 410            y-axis offset is fixed at 0
 411        fopt : float
 412            objective score (rmse)
 413        psfit : array
 414            recalculated y values based on function and optimised fit
 415        psfit_hdp : tuple of arrays
 416            0 - linspace x-axis upsampled grid
 417            1 - recalculated y values based on function and upsampled x-axis grid
 418            Does not change results, but aids in visualisation of the 'true' voigt lineshape
 419
 420        Notes
 421        -----
 422        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
 423        """
 424        # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc)
 425        # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0.
 426        # Thermo data is noise reduced by also noise subtracted, so starts at 0
 427        # Absorption mode/phased data will have positive and negative components and may not be baseline corrected
 428
 429        start_index = (
 430            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
 431        )
 432        final_index = (
 433            self.peak_right_index + mz_extend
 434            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 435            else self.peak_right_index
 436        )
 437
 438        # check if MSPeak contains the resolving power info
 439        if self.resolving_power:
 440            # full width half maximum distance
 441            self.fwhm = self.mz_exp / (self.resolving_power)
 442
 443            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 444            abundance_domain = self._ms_parent.abundance_profile[
 445                start_index:final_index
 446            ]
 447            lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0]
 448            upper = [
 449                1,
 450                self.fwhm * 1.2,
 451                (self.mz_exp + 0.0005),
 452                self.abundance / self.signal_to_noise,
 453            ]
 454            xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain)
 455
 456            psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3])
 457            psfit_hdp_x = linspace(
 458                min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier
 459            )
 460            psfit_hdp = self.voigt_pso(
 461                psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3]
 462            )
 463            return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp)
 464        else:
 465            raise LookupError(
 466                "resolving power is not defined, try to use set_max_resolving_power()"
 467            )
 468
 469    def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 470        """[Legacy] Voigt lineshape analysis function
 471        Legacy function for voigt lineshape analysis
 472
 473        Parameters
 474        ----------
 475        oversample_multiplier : int
 476            factor to increase x-axis points by for simulation of fitted lineshape function
 477        delta_rp : float
 478            delta resolving power to add to resolving power
 479        mz_overlay : int
 480            extra points left and right of peak definition to include in fitting
 481
 482        Returns
 483        -------
 484        mz_domain : ndarray
 485            x-axis domain for fit
 486        calc_abundance : ndarray
 487            calculated abundance profile based on voigt function
 488        """
 489
 490        if self.resolving_power:
 491            # full width half maximum distance
 492            self.fwhm = self.mz_exp / (
 493                self.resolving_power + delta_rp
 494            )  # self.resolving_power)
 495
 496            # stardart deviation
 497            sigma = self.fwhm / 3.6013
 498
 499            # half width baseline distance
 500
 501            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 502            #                     self.mz_exp + hw_base_distance, datapoint)
 503            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 504
 505            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 506
 507            # TODO derive amplitude
 508            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 509
 510            model = models.VoigtModel()
 511
 512            params = model.make_params(
 513                center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
 514            )
 515
 516            calc_abundance = model.eval(params=params, x=mz_domain)
 517
 518            return mz_domain, calc_abundance
 519
 520        else:
 521            raise LookupError(
 522                "resolving power is not defined, try to use set_max_resolving_power()"
 523            )
 524
 525    def pseudovoigt(
 526        self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5
 527    ):
 528        """[Legacy] pseudovoigt lineshape function
 529
 530        Legacy function for pseudovoigt lineshape analysis.
 531        Note - Code may not be functional currently.
 532
 533        Parameters
 534        ----------
 535        oversample_multiplier : int, optional
 536            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1.
 537        delta_rp : float, optional
 538            delta resolving power to add to resolving power. Defaults to 0.
 539        mz_overlay : int, optional
 540            extra points left and right of peak definition to include in fitting. Defaults to 1.
 541        fraction : float, optional
 542            fraction of gaussian component in pseudovoigt function. Defaults to 0.5.
 543
 544        """
 545        if self.resolving_power:
 546            # full width half maximum distance
 547            self.fwhm = self.mz_exp / (
 548                self.resolving_power + delta_rp
 549            )  # self.resolving_power)
 550
 551            # stardart deviation
 552            sigma = self.fwhm / 2
 553
 554            # half width baseline distance
 555
 556            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 557            #                     self.mz_exp + hw_base_distance, datapoint)
 558            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 559
 560            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 561            model = models.PseudoVoigtModel()
 562
 563            # TODO derive amplitude
 564            gamma = sigma
 565
 566            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 567            amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / (
 568                (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma)
 569            )
 570
 571            params = model.make_params(center=self.mz_exp, sigma=sigma)
 572
 573            calc_abundance = model.eval(params=params, x=mz_domain)
 574
 575            return mz_domain, calc_abundance
 576
 577        else:
 578            raise LookupError(
 579                "resolving power is not defined, try to use set_max_resolving_power()"
 580            )
 581
 582    def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 583        """[Legacy] Lorentz lineshape analysis function
 584
 585        Legacy function for lorentz lineshape analysis
 586
 587        Parameters
 588        ----------
 589        oversample_multiplier : int
 590            factor to increase x-axis points by for simulation of fitted lineshape function
 591        delta_rp : float
 592            delta resolving power to add to resolving power
 593        mz_overlay : int
 594            extra points left and right of peak definition to include in fitting
 595
 596        Returns
 597        -------
 598        mz_domain : ndarray
 599            x-axis domain for fit
 600        calc_abundance : ndarray
 601            calculated abundance profile based on lorentz function
 602
 603        """
 604        if self.resolving_power:
 605            # full width half maximum distance
 606            self.fwhm = self.mz_exp / (
 607                self.resolving_power + delta_rp
 608            )  # self.resolving_power)
 609
 610            # stardart deviation
 611            sigma = self.fwhm / 2
 612
 613            # half width baseline distance
 614            hw_base_distance = 8 * sigma
 615
 616            # mz_domain = linspace(self.mz_exp - hw_base_distance,
 617            #                     self.mz_exp + hw_base_distance, datapoint)
 618
 619            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 620            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 621            model = models.LorentzianModel()
 622
 623            amplitude = sigma * pi * self.abundance
 624
 625            params = model.make_params(
 626                center=self.mz_exp, amplitude=amplitude, sigma=sigma
 627            )
 628
 629            calc_abundance = model.eval(params=params, x=mz_domain)
 630
 631            return mz_domain, calc_abundance
 632
 633        else:
 634            raise LookupError(
 635                "resolving power is not defined, try to use set_max_resolving_power()"
 636            )
 637
 638    def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
 639        """[Legacy] Gaussian lineshape analysis function
 640        Legacy gaussian lineshape analysis function
 641
 642        Parameters
 643        ----------
 644        oversample_multiplier : int
 645            factor to increase x-axis points by for simulation of fitted lineshape function
 646        delta_rp : float
 647            delta resolving power to add to resolving power
 648        mz_overlay : int
 649            extra points left and right of peak definition to include in fitting
 650
 651        Returns
 652        -------
 653        mz_domain : ndarray
 654            x-axis domain for fit
 655        calc_abundance : ndarray
 656            calculated abundance profile based on gaussian function
 657
 658
 659        """
 660
 661        # check if MSPeak contains the resolving power info
 662        if self.resolving_power:
 663            # full width half maximum distance
 664            self.fwhm = self.mz_exp / (
 665                self.resolving_power + delta_rp
 666            )  # self.resolving_power)
 667
 668            # stardart deviation
 669            sigma = self.fwhm / (2 * sqrt(2 * log(2)))
 670
 671            # half width baseline distance
 672            # hw_base_distance = (3.2 * s)
 673
 674            # match_loz_factor = 3
 675
 676            # n_d = hw_base_distance * match_loz_factor
 677
 678            # mz_domain = linspace(
 679            #    self.mz_exp - n_d, self.mz_exp + n_d, datapoint)
 680
 681            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
 682
 683            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
 684
 685            # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s)
 686
 687            model = models.GaussianModel()
 688
 689            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
 690
 691            params = model.make_params(
 692                center=self.mz_exp, amplitude=amplitude, sigma=sigma
 693            )
 694
 695            calc_abundance = model.eval(params=params, x=mz_domain)
 696
 697            return mz_domain, calc_abundance
 698
 699        else:
 700            raise LookupError(
 701                "resolving power is not defined, try to use set_max_resolving_power()"
 702            )
 703
 704    def get_mz_domain(self, oversample_multiplier, mz_overlay):
 705        """[Legacy] function to resample/interpolate datapoints for lineshape analysis
 706
 707        This code is used for the legacy line fitting functions and not recommended.
 708        Legacy function to support expanding mz domain for legacy lineshape functions
 709
 710        Parameters
 711        ----------
 712        oversample_multiplier : int
 713            factor to increase x-axis points by for simulation of fitted lineshape function
 714        mz_overlay : int
 715            extra points left and right of peak definition to include in fitting
 716
 717        Returns
 718        -------
 719        mz_domain : ndarray
 720            x-axis domain for fit
 721
 722        """
 723        start_index = (
 724            self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0
 725        )
 726        final_index = (
 727            self.peak_right_index + mz_overlay
 728            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
 729            else self.peak_right_index
 730        )
 731
 732        if oversample_multiplier == 1:
 733            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
 734
 735        else:
 736            # we assume a linear correlation for m/z and datapoits
 737            # which is only true if the m/z range in narrow (within 1 m/z unit)
 738            # this is not true for a wide m/z range
 739
 740            indexes = range(start_index, final_index + 1)
 741            mz = self._ms_parent.mz_exp_profile[indexes]
 742            pol = poly1d(polyfit(indexes, mz, 1))
 743            oversampled_indexes = linspace(
 744                start_index,
 745                final_index,
 746                (final_index - start_index) * oversample_multiplier,
 747            )
 748            mz_domain = pol(oversampled_indexes)
 749
 750        return mz_domain
 751
 752    @property
 753    def number_possible_assignments(
 754        self,
 755    ):
 756        return len(self.molecular_formulas)
 757
 758    def molecular_formula_lowest_error(self):
 759        """Return the molecular formula with the smallest absolute mz error"""
 760
 761        return min(self.molecular_formulas, key=lambda m: abs(m.mz_error))
 762
 763    def molecular_formula_highest_prob_score(self):
 764        """Return the molecular formula with the highest confidence score score"""
 765
 766        return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score))
 767
 768    def molecular_formula_earth_filter(self, lowest_error=True):
 769        """Filter molecular formula using the 'Earth' filter
 770
 771        This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments.
 772        Earth Filter:
 773            O > 0 AND N <= 3 AND P <= 2 AND 3P <= O
 774
 775        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter.
 776        Otherwise, it will return all Earth-filter compliant formulas.
 777
 778        Parameters
 779        ----------
 780        lowest_error : bool, optional.
 781            Return only the lowest error formula which also fits the Earth filter.
 782            If False, return all Earth-filter compliant formulas. Default is True.
 783
 784        Returns
 785        -------
 786        list
 787            List of molecular formula objects which fit the Earth filter
 788
 789        References
 790        ----------
 791        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 792            Anal. Chem. 2017, 89, 23, 12659–12665
 793            doi: 10.1021/acs.analchem.7b03318
 794        """
 795
 796        candidates = list(
 797            filter(
 798                lambda mf: mf.get("O") > 0
 799                and mf.get("N") <= 3
 800                and mf.get("P") <= 2
 801                and (3 * mf.get("P")) <= mf.get("O"),
 802                self.molecular_formulas,
 803            )
 804        )
 805        if len(candidates) > 0:
 806            if lowest_error:
 807                return min(candidates, key=lambda m: abs(m.mz_error))
 808            else:
 809                return candidates
 810        else:
 811            return candidates
 812
 813    def molecular_formula_water_filter(self, lowest_error=True):
 814        """Filter molecular formula using the 'Water' filter
 815
 816        This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments.
 817        Water Filter:
 818            O > 0 AND N <= 3 AND S <= 2 AND P <= 2
 819
 820        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter.
 821        Otherwise, it will return all Water-filter compliant formulas.
 822
 823        Parameters
 824        ----------
 825        lowest_error : bool, optional
 826            Return only the lowest error formula which also fits the Water filter.
 827            If False, return all Water-filter compliant formulas. Defaults to 2
 828
 829        Returns
 830        -------
 831        list
 832            List of molecular formula objects which fit the Water filter
 833
 834        References
 835        ----------
 836        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 837            Anal. Chem. 2017, 89, 23, 12659–12665
 838            doi: 10.1021/acs.analchem.7b03318
 839        """
 840
 841        candidates = list(
 842            filter(
 843                lambda mf: mf.get("O") > 0
 844                and mf.get("N") <= 3
 845                and mf.get("S") <= 2
 846                and mf.get("P") <= 2,
 847                self.molecular_formulas,
 848            )
 849        )
 850        if len(candidates) > 0:
 851            if lowest_error:
 852                return min(candidates, key=lambda m: abs(m.mz_error))
 853            else:
 854                return candidates
 855        else:
 856            return candidates
 857
 858    def molecular_formula_air_filter(self, lowest_error=True):
 859        """Filter molecular formula using the 'Air' filter
 860
 861        This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments.
 862        Air Filter:
 863            O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O
 864
 865        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter.
 866        Otherwise, it will return all Air-filter compliant formulas.
 867
 868        Parameters
 869        ----------
 870        lowest_error : bool, optional
 871            Return only the lowest error formula which also fits the Air filter.
 872            If False, return all Air-filter compliant formulas. Defaults to True.
 873
 874        Returns
 875        -------
 876        list
 877            List of molecular formula objects which fit the Air filter
 878
 879        References
 880        ----------
 881        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
 882            Anal. Chem. 2017, 89, 23, 12659–12665
 883            doi: 10.1021/acs.analchem.7b03318
 884        """
 885
 886        candidates = list(
 887            filter(
 888                lambda mf: mf.get("O") > 0
 889                and mf.get("N") <= 2
 890                and mf.get("S") <= 1
 891                and mf.get("P") == 0
 892                and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"),
 893                self.molecular_formulas,
 894            )
 895        )
 896
 897        if len(candidates) > 0:
 898            if lowest_error:
 899                return min(candidates, key=lambda m: abs(m.mz_error))
 900            else:
 901                return candidates
 902        else:
 903            return candidates
 904
 905    def cia_score_S_P_error(self):
 906        """Compound Identification Algorithm SP Error - Assignment Filter
 907
 908        This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments.
 909
 910        It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset.
 911
 912        Returns
 913        -------
 914        MolecularFormula
 915            A single molecular formula which fits the rules of the CIA SP Error filter
 916
 917
 918        References
 919        ----------
 920        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
 921            Anal. Chem. 2006, 78, 13, 4363–4373
 922            doi: 10.1021/ac0600306
 923        """
 924        # case EFormulaScore.HAcap:
 925
 926        lowest_S_P_mf = min(
 927            self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P")
 928        )
 929        lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P")
 930
 931        list_same_s_p = list(
 932            filter(
 933                lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count,
 934                self.molecular_formulas,
 935            )
 936        )
 937
 938        # check if list is not empty
 939        if list_same_s_p:
 940            return min(list_same_s_p, key=lambda m: abs(m.mz_error))
 941
 942        else:
 943            return lowest_S_P_mf
 944
 945    def cia_score_N_S_P_error(self):
 946        """Compound Identification Algorithm NSP Error - Assignment Filter
 947
 948        This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments.
 949
 950        It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset.
 951
 952        Returns
 953        -------
 954        MolecularFormula
 955            A single molecular formula which fits the rules of the CIA NSP Error filter
 956
 957        References
 958        ----------
 959        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
 960            Anal. Chem. 2006, 78, 13, 4363–4373
 961            doi: 10.1021/ac0600306
 962
 963        Raises
 964        -------
 965        Exception
 966            If no molecular formula are associated with mass spectrum peak.
 967        """
 968        # case EFormulaScore.HAcap:
 969        if self.molecular_formulas:
 970            lowest_N_S_P_mf = min(
 971                self.molecular_formulas,
 972                key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"),
 973            )
 974            lowest_N_S_P_count = (
 975                lowest_N_S_P_mf.get("N")
 976                + lowest_N_S_P_mf.get("S")
 977                + lowest_N_S_P_mf.get("P")
 978            )
 979
 980            list_same_N_S_P = list(
 981                filter(
 982                    lambda mf: mf.get("N") + mf.get("S") + mf.get("P")
 983                    == lowest_N_S_P_count,
 984                    self.molecular_formulas,
 985                )
 986            )
 987
 988            if list_same_N_S_P:
 989                SP_filtered_list = list(
 990                    filter(
 991                        lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1),
 992                        list_same_N_S_P,
 993                    )
 994                )
 995
 996                if SP_filtered_list:
 997                    return min(SP_filtered_list, key=lambda m: abs(m.mz_error))
 998
 999                else:
1000                    return min(list_same_N_S_P, key=lambda m: abs(m.mz_error))
1001
1002            else:
1003                return lowest_N_S_P_mf
1004        else:
1005            raise Exception(
1006                "No molecular formula associated with the mass spectrum peak at m/z: %.6f"
1007                % self.mz_exp
1008            )

Class to perform calculations on MSPeak objects.

This class provides methods to perform various calculations on MSPeak objects, such as calculating Kendrick Mass Defect (KMD) and Kendrick Mass (KM), calculating peak area, and fitting peak lineshape using different models.

Parameters
  • None
Attributes
  • _ms_parent (MSParent): The parent MSParent object associated with the MSPeakCalculation object.
  • mz_exp (float): The experimental m/z value of the peak.
  • peak_left_index (int): The start scan index of the peak.
  • peak_right_index (int): The final scan index of the peak.
  • resolving_power (float): The resolving power of the peak.
Methods
  • _calc_kmd(dict_base). Calculate the Kendrick Mass Defect (KMD) and Kendrick Mass (KM) for a given base formula.
  • calc_area(). Calculate the peak area using numpy's trapezoidal fit.
  • fit_peak(mz_extend=6, delta_rp=0, model='Gaussian'). Perform lineshape analysis on a peak using lmfit module.
  • voigt_pso(w, r, yoff, width, loc, a). Calculate the Voigt function for particle swarm optimization (PSO) fitting.
  • objective_pso(x, w, u). Calculate the objective function for PSO fitting.
  • minimize_pso(lower, upper, w, u). Minimize the objective function using the particle swarm optimization algorithm.
  • fit_peak_pso(mz_extend=6, upsample_multiplier=5). Perform lineshape analysis on a peak using particle swarm optimization (PSO) fitting.
  • voigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform voigt lineshape analysis on a peak.
  • pseudovoigt(oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5). [Legacy] Perform pseudovoigt lineshape analysis on a peak.
  • lorentz(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform lorentz lineshape analysis on a peak.
  • gaussian(oversample_multiplier=1, delta_rp=0, mz_overlay=1). [Legacy] Perform gaussian lineshape analysis on a peak.
  • get_mz_domain(oversample_multiplier, mz_overlay). [Legacy] Resample/interpolate datapoints for lineshape analysis.
  • number_possible_assignments(). Return the number of possible molecular formula assignments for the peak.
  • molecular_formula_lowest_error(). Return the molecular formula with the smallest absolute mz error.
  • molecular_formula_highest_prob_score(). Return the molecular formula with the highest confidence score.
  • molecular_formula_earth_filter(lowest_error=True). Filter molecular formula using the 'Earth' filter.
  • molecular_formula_water_filter(lowest_error=True). Filter molecular formula using the 'Water' filter.
  • molecular_formula_air_filter(lowest_error=True). Filter molecular formula using the 'Air' filter.
  • cia_score_S_P_error(). Compound Identification Algorithm SP Error - Assignment Filter.
  • cia_score_N_S_P_error(). Compound Identification Algorithm NSP Error - Assignment Filter.
def calc_area(self):
145    def calc_area(self):
146        """Calculate the peak area using numpy's trapezoidal fit
147
148        uses provided mz_domain to accurately integrate areas independent of digital resolution
149
150        Returns
151        -------
152        float
153            peak area
154        """
155        if self.peak_right_index > self.peak_left_index:
156            yy = self._ms_parent.abundance_profile[
157                self.peak_left_index : self.peak_right_index
158            ]
159            xx = self._ms_parent.mz_exp_profile[
160                self.peak_left_index : self.peak_right_index
161            ]
162            # check if the axis is high to low m/z or not. if its MSFromFreq its high mz first, if its from Profile, its low mz first
163            if xx[0] > xx[-1]:
164                xx = flip(xx)
165                yy = flip(yy)
166            return float(trapz(yy, xx))
167
168        else:
169            warnings.warn(
170                "Peak Area Calculation for m/z {} has failed".format(self.mz_exp)
171            )
172            return nan

Calculate the peak area using numpy's trapezoidal fit

uses provided mz_domain to accurately integrate areas independent of digital resolution

Returns
  • float: peak area
def fit_peak(self, mz_extend=6, delta_rp=0, model='Gaussian'):
174    def fit_peak(self, mz_extend=6, delta_rp=0, model="Gaussian"):
175        """Lineshape analysis on a peak using lmfit module.
176
177        Model and fit peak lineshape by defined function - using lmfit module
178        Does not oversample/resample/interpolate data points
179        Better to go back to time domain and perform more zero filling - if possible.
180
181        Parameters
182        ----------
183        mz_extend : int
184            extra points left and right of peak definition to include in fitting
185        delta_rp : float
186            delta resolving power to add to resolving power
187        model : str
188            Type of lineshape model to use.
189            Models allowed: Gaussian, Lorentz, Voigt
190
191        Returns
192        -----
193        mz_domain : ndarray
194            x-axis domain for fit
195        fit_peak : lmfit object
196            fit results object from lmfit module
197
198        Notes
199        -----
200        Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module
201        mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit.
202        Takes about 10ms per peak
203        """
204        start_index = (
205            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
206        )
207        final_index = (
208            self.peak_right_index + mz_extend
209            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
210            else self.peak_right_index
211        )
212
213        # check if MSPeak contains the resolving power info
214        if self.resolving_power:
215            # full width half maximum distance
216            self.fwhm = self.mz_exp / (self.resolving_power + delta_rp)
217
218            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
219            abundance_domain = self._ms_parent.abundance_profile[
220                start_index:final_index
221            ]
222
223            if model == "Gaussian":
224                # stardard deviation
225                sigma = self.fwhm / (2 * sqrt(2 * log(2)))
226                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
227                model = models.GaussianModel()
228                params = model.make_params(
229                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
230                )
231
232            elif model == "Lorentz":
233                # stardard deviation
234                sigma = self.fwhm / 2
235                amplitude = sigma * pi * self.abundance
236                model = models.LorentzianModel()
237                params = model.make_params(
238                    center=self.mz_exp, amplitude=amplitude, sigma=sigma
239                )
240
241            elif model == "Voigt":
242                # stardard deviation
243                sigma = self.fwhm / 3.6013
244                amplitude = (sqrt(2 * pi) * sigma) * self.abundance
245                model = models.VoigtModel()
246                params = model.make_params(
247                    center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
248                )
249            else:
250                raise LookupError("model lineshape not known or defined")
251
252            # calc_abundance = model.eval(params=params, x=mz_domain) #Same as initial fit, returned in fit_peak object
253            fit_peak = model.fit(abundance_domain, params=params, x=mz_domain)
254            return mz_domain, fit_peak
255
256        else:
257            raise LookupError(
258                "resolving power is not defined, try to use set_max_resolving_power()"
259            )

Lineshape analysis on a peak using lmfit module.

Model and fit peak lineshape by defined function - using lmfit module Does not oversample/resample/interpolate data points Better to go back to time domain and perform more zero filling - if possible.

Parameters
  • mz_extend (int): extra points left and right of peak definition to include in fitting
  • delta_rp (float): delta resolving power to add to resolving power
  • model (str): Type of lineshape model to use. Models allowed: Gaussian, Lorentz, Voigt
Returns
  • mz_domain (ndarray): x-axis domain for fit
  • fit_peak (lmfit object): fit results object from lmfit module
Notes

Returns the calculated mz domain, initial defined abundance profile, and the fit peak results object from lmfit module mz_extend here extends the x-axis domain so that we have sufficient points either side of the apex to fit. Takes about 10ms per peak

def voigt_pso(self, w, r, yoff, width, loc, a):
261    def voigt_pso(self, w, r, yoff, width, loc, a):
262        """Voigt function for particle swarm optimisation (PSO) fitting
263
264        From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py.
265        Calculates a Voigt function over w based on the relevant properties of the distribution.
266
267        Parameters
268        ----------
269        w : ndarray
270            Array over which the Voigt function will be evaluated.
271        r : float
272            Ratio between the Guassian and Lorentzian functions.
273        yoff : float
274            Y-offset of the Voigt function.
275        width : float
276            The width of the Voigt function.
277        loc : float
278            Center of the Voigt function.
279        a : float
280            Area of the Voigt function.
281        Returns
282        -------
283        V : ndarray
284            Array defining the Voigt function over w.
285
286        References
287        ----------
288        1. https://github.com/pnnl/nmrfit
289
290        Notes
291        -----
292        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
293
294        """
295        # Lorentzian component
296        L = (2 / (pi * width)) * 1 / (1 + ((w - loc) / (0.5 * width)) ** 2)
297
298        # Gaussian component
299        G = (
300            (2 / width)
301            * sqrt(log(2) / pi)
302            * exp(-(((w - loc) / (width / (2 * sqrt(log(2))))) ** 2))
303        )
304
305        # Voigt body
306        V = (yoff + a) * (r * L + (1 - r) * G)
307
308        return V

Voigt function for particle swarm optimisation (PSO) fitting

From https://github.com/pnnl/nmrfit/blob/master/nmrfit/equations.py. Calculates a Voigt function over w based on the relevant properties of the distribution.

Parameters
  • w (ndarray): Array over which the Voigt function will be evaluated.
  • r (float): Ratio between the Guassian and Lorentzian functions.
  • yoff (float): Y-offset of the Voigt function.
  • width (float): The width of the Voigt function.
  • loc (float): Center of the Voigt function.
  • a (float): Area of the Voigt function.
Returns
  • V (ndarray): Array defining the Voigt function over w.
References
  1. https://github.com/pnnl/nmrfit
Notes

Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.

def objective_pso(self, x, w, u):
310    def objective_pso(self, x, w, u):
311        """Objective function for particle swarm optimisation (PSO) fitting
312
313        The objective function used to fit supplied data.  Evaluates sum of squared differences between the fit and the data.
314
315        Parameters
316        ----------
317        x : list of floats
318            Parameter vector.
319        w : ndarray
320            Array of frequency data.
321        u : ndarray
322            Array of data to be fit.
323
324        Returns
325        -------
326        rmse : float
327            Root mean square error between the data and fit.
328
329        References
330        ----------
331        1. https://github.com/pnnl/nmrfit
332
333        """
334        # global parameters
335        r, width, loc, a = x
336        yoff = 0
337
338        # calculate fit for V
339        V_fit = self.voigt_pso(w, r, yoff, width, loc, a)
340
341        # real component RMSE
342        rmse = sqrt(square((u - V_fit)).mean(axis=None))
343
344        # return the total RMSE
345        return rmse

Objective function for particle swarm optimisation (PSO) fitting

The objective function used to fit supplied data. Evaluates sum of squared differences between the fit and the data.

Parameters
  • x (list of floats): Parameter vector.
  • w (ndarray): Array of frequency data.
  • u (ndarray): Array of data to be fit.
Returns
  • rmse (float): Root mean square error between the data and fit.
References
  1. https://github.com/pnnl/nmrfit
def minimize_pso(self, lower, upper, w, u):
347    def minimize_pso(self, lower, upper, w, u):
348        """Minimization function for particle swarm optimisation (PSO) fitting
349
350        Minimizes the objective function using the particle swarm optimization algorithm.
351        Minimization function based on defined parameters
352
353
354        Parameters
355        ----------
356        lower : list of floats
357            Lower bounds for the parameters.
358        upper : list of floats
359            Upper bounds for the parameters.
360        w : ndarray
361            Array of frequency data.
362        u : ndarray
363            Array of data to be fit.
364
365        Notes
366        -----
367        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
368        Current parameters take ~2 seconds per peak.
369
370
371        References
372        ----------
373        1. https://github.com/pnnl/nmrfit
374
375        """
376        # TODO - allow support to pass swarmsize, maxiter, omega, phip, phig parameters.
377        # TODO - Refactor PSO fitting into its own class?
378
379        xopt, fopt = pyswarm.pso(
380            self.objective_pso,
381            lower,
382            upper,
383            args=(w, u),
384            swarmsize=1000,
385            maxiter=5000,
386            omega=-0.2134,
387            phip=-0.3344,
388            phig=2.3259,
389        )
390        return xopt, fopt

Minimization function for particle swarm optimisation (PSO) fitting

Minimizes the objective function using the particle swarm optimization algorithm. Minimization function based on defined parameters

Parameters
  • lower (list of floats): Lower bounds for the parameters.
  • upper (list of floats): Upper bounds for the parameters.
  • w (ndarray): Array of frequency data.
  • u (ndarray): Array of data to be fit.
Notes

Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise. Current parameters take ~2 seconds per peak.

References
  1. https://github.com/pnnl/nmrfit
def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5):
392    def fit_peak_pso(self, mz_extend: int = 6, upsample_multiplier: int = 5):
393        """Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting
394
395        Function to fit a Voigt peakshape using particle swarm optimisation (PSO).
396        Should return better results than lmfit, but much more computationally expensive
397
398        Parameters
399        ----------
400        mz_extend : int, optional
401            extra points left and right of peak definition to include in fitting. Defaults to 6.
402        upsample_multiplier : int, optional
403            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5.
404
405        Returns
406        -------
407        xopt : array
408            variables describing the voigt function.
409            G/L ratio, width (fwhm), apex (x-axis), area.
410            y-axis offset is fixed at 0
411        fopt : float
412            objective score (rmse)
413        psfit : array
414            recalculated y values based on function and optimised fit
415        psfit_hdp : tuple of arrays
416            0 - linspace x-axis upsampled grid
417            1 - recalculated y values based on function and upsampled x-axis grid
418            Does not change results, but aids in visualisation of the 'true' voigt lineshape
419
420        Notes
421        -----
422        Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.
423        """
424        # TODO - Add ability to pass pso args (i.e. swarm size, maxiter, omega, phig, etc)
425        # TODO: fix xopt. Magnitude mode data through CoreMS/Bruker starts at 0 but is noise centered well above 0.
426        # Thermo data is noise reduced by also noise subtracted, so starts at 0
427        # Absorption mode/phased data will have positive and negative components and may not be baseline corrected
428
429        start_index = (
430            self.peak_left_index - mz_extend if not self.peak_left_index == 0 else 0
431        )
432        final_index = (
433            self.peak_right_index + mz_extend
434            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
435            else self.peak_right_index
436        )
437
438        # check if MSPeak contains the resolving power info
439        if self.resolving_power:
440            # full width half maximum distance
441            self.fwhm = self.mz_exp / (self.resolving_power)
442
443            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
444            abundance_domain = self._ms_parent.abundance_profile[
445                start_index:final_index
446            ]
447            lower = [0, self.fwhm * 0.8, (self.mz_exp - 0.0005), 0]
448            upper = [
449                1,
450                self.fwhm * 1.2,
451                (self.mz_exp + 0.0005),
452                self.abundance / self.signal_to_noise,
453            ]
454            xopt, fopt = self.minimize_pso(lower, upper, mz_domain, abundance_domain)
455
456            psfit = self.voigt_pso(mz_domain, xopt[0], 0, xopt[1], xopt[2], xopt[3])
457            psfit_hdp_x = linspace(
458                min(mz_domain), max(mz_domain), num=len(mz_domain) * upsample_multiplier
459            )
460            psfit_hdp = self.voigt_pso(
461                psfit_hdp_x, xopt[0], 0, xopt[1], xopt[2], xopt[3]
462            )
463            return xopt, fopt, psfit, (psfit_hdp_x, psfit_hdp)
464        else:
465            raise LookupError(
466                "resolving power is not defined, try to use set_max_resolving_power()"
467            )

Lineshape analysis on a peak using particle swarm optimisation (PSO) fitting

Function to fit a Voigt peakshape using particle swarm optimisation (PSO). Should return better results than lmfit, but much more computationally expensive

Parameters
  • mz_extend (int, optional): extra points left and right of peak definition to include in fitting. Defaults to 6.
  • upsample_multiplier (int, optional): factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 5.
Returns
  • xopt (array): variables describing the voigt function. G/L ratio, width (fwhm), apex (x-axis), area. y-axis offset is fixed at 0
  • fopt (float): objective score (rmse)
  • psfit (array): recalculated y values based on function and optimised fit
  • psfit_hdp (tuple of arrays): 0 - linspace x-axis upsampled grid 1 - recalculated y values based on function and upsampled x-axis grid Does not change results, but aids in visualisation of the 'true' voigt lineshape
Notes

Particle swarm optimisation (PSO) fitting function can be significantly more computationally expensive than lmfit, with more parameters to optimise.

def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
469    def voigt(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
470        """[Legacy] Voigt lineshape analysis function
471        Legacy function for voigt lineshape analysis
472
473        Parameters
474        ----------
475        oversample_multiplier : int
476            factor to increase x-axis points by for simulation of fitted lineshape function
477        delta_rp : float
478            delta resolving power to add to resolving power
479        mz_overlay : int
480            extra points left and right of peak definition to include in fitting
481
482        Returns
483        -------
484        mz_domain : ndarray
485            x-axis domain for fit
486        calc_abundance : ndarray
487            calculated abundance profile based on voigt function
488        """
489
490        if self.resolving_power:
491            # full width half maximum distance
492            self.fwhm = self.mz_exp / (
493                self.resolving_power + delta_rp
494            )  # self.resolving_power)
495
496            # stardart deviation
497            sigma = self.fwhm / 3.6013
498
499            # half width baseline distance
500
501            # mz_domain = linspace(self.mz_exp - hw_base_distance,
502            #                     self.mz_exp + hw_base_distance, datapoint)
503            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
504
505            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
506
507            # TODO derive amplitude
508            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
509
510            model = models.VoigtModel()
511
512            params = model.make_params(
513                center=self.mz_exp, amplitude=amplitude, sigma=sigma, gamma=sigma
514            )
515
516            calc_abundance = model.eval(params=params, x=mz_domain)
517
518            return mz_domain, calc_abundance
519
520        else:
521            raise LookupError(
522                "resolving power is not defined, try to use set_max_resolving_power()"
523            )

[Legacy] Voigt lineshape analysis function Legacy function for voigt lineshape analysis

Parameters
  • oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
  • delta_rp (float): delta resolving power to add to resolving power
  • mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
  • mz_domain (ndarray): x-axis domain for fit
  • calc_abundance (ndarray): calculated abundance profile based on voigt function
def pseudovoigt( self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5):
525    def pseudovoigt(
526        self, oversample_multiplier=1, delta_rp=0, mz_overlay=1, fraction=0.5
527    ):
528        """[Legacy] pseudovoigt lineshape function
529
530        Legacy function for pseudovoigt lineshape analysis.
531        Note - Code may not be functional currently.
532
533        Parameters
534        ----------
535        oversample_multiplier : int, optional
536            factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1.
537        delta_rp : float, optional
538            delta resolving power to add to resolving power. Defaults to 0.
539        mz_overlay : int, optional
540            extra points left and right of peak definition to include in fitting. Defaults to 1.
541        fraction : float, optional
542            fraction of gaussian component in pseudovoigt function. Defaults to 0.5.
543
544        """
545        if self.resolving_power:
546            # full width half maximum distance
547            self.fwhm = self.mz_exp / (
548                self.resolving_power + delta_rp
549            )  # self.resolving_power)
550
551            # stardart deviation
552            sigma = self.fwhm / 2
553
554            # half width baseline distance
555
556            # mz_domain = linspace(self.mz_exp - hw_base_distance,
557            #                     self.mz_exp + hw_base_distance, datapoint)
558            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
559
560            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
561            model = models.PseudoVoigtModel()
562
563            # TODO derive amplitude
564            gamma = sigma
565
566            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
567            amplitude = (sqrt(pi / log(2)) * (pi * sigma * self.abundance)) / (
568                (pi * (1 - gamma)) + (sqrt(pi * log(2)) * gamma)
569            )
570
571            params = model.make_params(center=self.mz_exp, sigma=sigma)
572
573            calc_abundance = model.eval(params=params, x=mz_domain)
574
575            return mz_domain, calc_abundance
576
577        else:
578            raise LookupError(
579                "resolving power is not defined, try to use set_max_resolving_power()"
580            )

[Legacy] pseudovoigt lineshape function

Legacy function for pseudovoigt lineshape analysis. Note - Code may not be functional currently.

Parameters
  • oversample_multiplier (int, optional): factor to increase x-axis points by for simulation of fitted lineshape function. Defaults to 1.
  • delta_rp (float, optional): delta resolving power to add to resolving power. Defaults to 0.
  • mz_overlay (int, optional): extra points left and right of peak definition to include in fitting. Defaults to 1.
  • fraction (float, optional): fraction of gaussian component in pseudovoigt function. Defaults to 0.5.
def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
582    def lorentz(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
583        """[Legacy] Lorentz lineshape analysis function
584
585        Legacy function for lorentz lineshape analysis
586
587        Parameters
588        ----------
589        oversample_multiplier : int
590            factor to increase x-axis points by for simulation of fitted lineshape function
591        delta_rp : float
592            delta resolving power to add to resolving power
593        mz_overlay : int
594            extra points left and right of peak definition to include in fitting
595
596        Returns
597        -------
598        mz_domain : ndarray
599            x-axis domain for fit
600        calc_abundance : ndarray
601            calculated abundance profile based on lorentz function
602
603        """
604        if self.resolving_power:
605            # full width half maximum distance
606            self.fwhm = self.mz_exp / (
607                self.resolving_power + delta_rp
608            )  # self.resolving_power)
609
610            # stardart deviation
611            sigma = self.fwhm / 2
612
613            # half width baseline distance
614            hw_base_distance = 8 * sigma
615
616            # mz_domain = linspace(self.mz_exp - hw_base_distance,
617            #                     self.mz_exp + hw_base_distance, datapoint)
618
619            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
620            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
621            model = models.LorentzianModel()
622
623            amplitude = sigma * pi * self.abundance
624
625            params = model.make_params(
626                center=self.mz_exp, amplitude=amplitude, sigma=sigma
627            )
628
629            calc_abundance = model.eval(params=params, x=mz_domain)
630
631            return mz_domain, calc_abundance
632
633        else:
634            raise LookupError(
635                "resolving power is not defined, try to use set_max_resolving_power()"
636            )

[Legacy] Lorentz lineshape analysis function

Legacy function for lorentz lineshape analysis

Parameters
  • oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
  • delta_rp (float): delta resolving power to add to resolving power
  • mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
  • mz_domain (ndarray): x-axis domain for fit
  • calc_abundance (ndarray): calculated abundance profile based on lorentz function
def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
638    def gaussian(self, oversample_multiplier=1, delta_rp=0, mz_overlay=1):
639        """[Legacy] Gaussian lineshape analysis function
640        Legacy gaussian lineshape analysis function
641
642        Parameters
643        ----------
644        oversample_multiplier : int
645            factor to increase x-axis points by for simulation of fitted lineshape function
646        delta_rp : float
647            delta resolving power to add to resolving power
648        mz_overlay : int
649            extra points left and right of peak definition to include in fitting
650
651        Returns
652        -------
653        mz_domain : ndarray
654            x-axis domain for fit
655        calc_abundance : ndarray
656            calculated abundance profile based on gaussian function
657
658
659        """
660
661        # check if MSPeak contains the resolving power info
662        if self.resolving_power:
663            # full width half maximum distance
664            self.fwhm = self.mz_exp / (
665                self.resolving_power + delta_rp
666            )  # self.resolving_power)
667
668            # stardart deviation
669            sigma = self.fwhm / (2 * sqrt(2 * log(2)))
670
671            # half width baseline distance
672            # hw_base_distance = (3.2 * s)
673
674            # match_loz_factor = 3
675
676            # n_d = hw_base_distance * match_loz_factor
677
678            # mz_domain = linspace(
679            #    self.mz_exp - n_d, self.mz_exp + n_d, datapoint)
680
681            mz_domain = self.get_mz_domain(oversample_multiplier, mz_overlay)
682
683            # gaussian_pdf = lambda x0, x, s: (1/ math.sqrt(2*math.pi*math.pow(s,2))) * math.exp(-1 * math.pow(x-x0,2) / 2*math.pow(s,2) )
684
685            # calc_abundance = norm.pdf(mz_domain, self.mz_exp, s)
686
687            model = models.GaussianModel()
688
689            amplitude = (sqrt(2 * pi) * sigma) * self.abundance
690
691            params = model.make_params(
692                center=self.mz_exp, amplitude=amplitude, sigma=sigma
693            )
694
695            calc_abundance = model.eval(params=params, x=mz_domain)
696
697            return mz_domain, calc_abundance
698
699        else:
700            raise LookupError(
701                "resolving power is not defined, try to use set_max_resolving_power()"
702            )

[Legacy] Gaussian lineshape analysis function Legacy gaussian lineshape analysis function

Parameters
  • oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
  • delta_rp (float): delta resolving power to add to resolving power
  • mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
  • mz_domain (ndarray): x-axis domain for fit
  • calc_abundance (ndarray): calculated abundance profile based on gaussian function
def get_mz_domain(self, oversample_multiplier, mz_overlay):
704    def get_mz_domain(self, oversample_multiplier, mz_overlay):
705        """[Legacy] function to resample/interpolate datapoints for lineshape analysis
706
707        This code is used for the legacy line fitting functions and not recommended.
708        Legacy function to support expanding mz domain for legacy lineshape functions
709
710        Parameters
711        ----------
712        oversample_multiplier : int
713            factor to increase x-axis points by for simulation of fitted lineshape function
714        mz_overlay : int
715            extra points left and right of peak definition to include in fitting
716
717        Returns
718        -------
719        mz_domain : ndarray
720            x-axis domain for fit
721
722        """
723        start_index = (
724            self.peak_left_index - mz_overlay if not self.peak_left_index == 0 else 0
725        )
726        final_index = (
727            self.peak_right_index + mz_overlay
728            if not self.peak_right_index == len(self._ms_parent.mz_exp_profile)
729            else self.peak_right_index
730        )
731
732        if oversample_multiplier == 1:
733            mz_domain = self._ms_parent.mz_exp_profile[start_index:final_index]
734
735        else:
736            # we assume a linear correlation for m/z and datapoits
737            # which is only true if the m/z range in narrow (within 1 m/z unit)
738            # this is not true for a wide m/z range
739
740            indexes = range(start_index, final_index + 1)
741            mz = self._ms_parent.mz_exp_profile[indexes]
742            pol = poly1d(polyfit(indexes, mz, 1))
743            oversampled_indexes = linspace(
744                start_index,
745                final_index,
746                (final_index - start_index) * oversample_multiplier,
747            )
748            mz_domain = pol(oversampled_indexes)
749
750        return mz_domain

[Legacy] function to resample/interpolate datapoints for lineshape analysis

This code is used for the legacy line fitting functions and not recommended. Legacy function to support expanding mz domain for legacy lineshape functions

Parameters
  • oversample_multiplier (int): factor to increase x-axis points by for simulation of fitted lineshape function
  • mz_overlay (int): extra points left and right of peak definition to include in fitting
Returns
  • mz_domain (ndarray): x-axis domain for fit
number_possible_assignments
def molecular_formula_lowest_error(self):
758    def molecular_formula_lowest_error(self):
759        """Return the molecular formula with the smallest absolute mz error"""
760
761        return min(self.molecular_formulas, key=lambda m: abs(m.mz_error))

Return the molecular formula with the smallest absolute mz error

def molecular_formula_highest_prob_score(self):
763    def molecular_formula_highest_prob_score(self):
764        """Return the molecular formula with the highest confidence score score"""
765
766        return max(self.molecular_formulas, key=lambda m: abs(m.confidence_score))

Return the molecular formula with the highest confidence score score

def molecular_formula_earth_filter(self, lowest_error=True):
768    def molecular_formula_earth_filter(self, lowest_error=True):
769        """Filter molecular formula using the 'Earth' filter
770
771        This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments.
772        Earth Filter:
773            O > 0 AND N <= 3 AND P <= 2 AND 3P <= O
774
775        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter.
776        Otherwise, it will return all Earth-filter compliant formulas.
777
778        Parameters
779        ----------
780        lowest_error : bool, optional.
781            Return only the lowest error formula which also fits the Earth filter.
782            If False, return all Earth-filter compliant formulas. Default is True.
783
784        Returns
785        -------
786        list
787            List of molecular formula objects which fit the Earth filter
788
789        References
790        ----------
791        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
792            Anal. Chem. 2017, 89, 23, 12659–12665
793            doi: 10.1021/acs.analchem.7b03318
794        """
795
796        candidates = list(
797            filter(
798                lambda mf: mf.get("O") > 0
799                and mf.get("N") <= 3
800                and mf.get("P") <= 2
801                and (3 * mf.get("P")) <= mf.get("O"),
802                self.molecular_formulas,
803            )
804        )
805        if len(candidates) > 0:
806            if lowest_error:
807                return min(candidates, key=lambda m: abs(m.mz_error))
808            else:
809                return candidates
810        else:
811            return candidates

Filter molecular formula using the 'Earth' filter

This function applies the Formularity-esque 'Earth' filter to possible molecular formula assignments. Earth Filter: O > 0 AND N <= 3 AND P <= 2 AND 3P <= O

If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Earth filter. Otherwise, it will return all Earth-filter compliant formulas.

Parameters
  • lowest_error (bool, optional.): Return only the lowest error formula which also fits the Earth filter. If False, return all Earth-filter compliant formulas. Default is True.
Returns
  • list: List of molecular formula objects which fit the Earth filter
References
  1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
def molecular_formula_water_filter(self, lowest_error=True):
813    def molecular_formula_water_filter(self, lowest_error=True):
814        """Filter molecular formula using the 'Water' filter
815
816        This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments.
817        Water Filter:
818            O > 0 AND N <= 3 AND S <= 2 AND P <= 2
819
820        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter.
821        Otherwise, it will return all Water-filter compliant formulas.
822
823        Parameters
824        ----------
825        lowest_error : bool, optional
826            Return only the lowest error formula which also fits the Water filter.
827            If False, return all Water-filter compliant formulas. Defaults to 2
828
829        Returns
830        -------
831        list
832            List of molecular formula objects which fit the Water filter
833
834        References
835        ----------
836        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
837            Anal. Chem. 2017, 89, 23, 12659–12665
838            doi: 10.1021/acs.analchem.7b03318
839        """
840
841        candidates = list(
842            filter(
843                lambda mf: mf.get("O") > 0
844                and mf.get("N") <= 3
845                and mf.get("S") <= 2
846                and mf.get("P") <= 2,
847                self.molecular_formulas,
848            )
849        )
850        if len(candidates) > 0:
851            if lowest_error:
852                return min(candidates, key=lambda m: abs(m.mz_error))
853            else:
854                return candidates
855        else:
856            return candidates

Filter molecular formula using the 'Water' filter

This function applies the Formularity-esque 'Water' filter to possible molecular formula assignments. Water Filter: O > 0 AND N <= 3 AND S <= 2 AND P <= 2

If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Water filter. Otherwise, it will return all Water-filter compliant formulas.

Parameters
  • lowest_error (bool, optional): Return only the lowest error formula which also fits the Water filter. If False, return all Water-filter compliant formulas. Defaults to 2
Returns
  • list: List of molecular formula objects which fit the Water filter
References
  1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
def molecular_formula_air_filter(self, lowest_error=True):
858    def molecular_formula_air_filter(self, lowest_error=True):
859        """Filter molecular formula using the 'Air' filter
860
861        This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments.
862        Air Filter:
863            O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O
864
865        If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter.
866        Otherwise, it will return all Air-filter compliant formulas.
867
868        Parameters
869        ----------
870        lowest_error : bool, optional
871            Return only the lowest error formula which also fits the Air filter.
872            If False, return all Air-filter compliant formulas. Defaults to True.
873
874        Returns
875        -------
876        list
877            List of molecular formula objects which fit the Air filter
878
879        References
880        ----------
881        1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra"
882            Anal. Chem. 2017, 89, 23, 12659–12665
883            doi: 10.1021/acs.analchem.7b03318
884        """
885
886        candidates = list(
887            filter(
888                lambda mf: mf.get("O") > 0
889                and mf.get("N") <= 2
890                and mf.get("S") <= 1
891                and mf.get("P") == 0
892                and 3 * (mf.get("S") + mf.get("N")) <= mf.get("O"),
893                self.molecular_formulas,
894            )
895        )
896
897        if len(candidates) > 0:
898            if lowest_error:
899                return min(candidates, key=lambda m: abs(m.mz_error))
900            else:
901                return candidates
902        else:
903            return candidates

Filter molecular formula using the 'Air' filter

This function applies the Formularity-esque 'Air' filter to possible molecular formula assignments. Air Filter: O > 0 AND N <= 3 AND S <= 1 AND P = 0 AND 3(S+N) <= O

If the lowest_error method is also used, it will return the single formula annotation with the smallest absolute error which also fits the Air filter. Otherwise, it will return all Air-filter compliant formulas.

Parameters
  • lowest_error (bool, optional): Return only the lowest error formula which also fits the Air filter. If False, return all Air-filter compliant formulas. Defaults to True.
Returns
  • list: List of molecular formula objects which fit the Air filter
References
  1. Nikola Tolic et al., "Formularity: Software for Automated Formula Assignment of Natural and Other Organic Matter from Ultrahigh-Resolution Mass Spectra" Anal. Chem. 2017, 89, 23, 12659–12665 doi: 10.1021/acs.analchem.7b03318
def cia_score_S_P_error(self):
905    def cia_score_S_P_error(self):
906        """Compound Identification Algorithm SP Error - Assignment Filter
907
908        This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments.
909
910        It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset.
911
912        Returns
913        -------
914        MolecularFormula
915            A single molecular formula which fits the rules of the CIA SP Error filter
916
917
918        References
919        ----------
920        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
921            Anal. Chem. 2006, 78, 13, 4363–4373
922            doi: 10.1021/ac0600306
923        """
924        # case EFormulaScore.HAcap:
925
926        lowest_S_P_mf = min(
927            self.molecular_formulas, key=lambda mf: mf.get("S") + mf.get("P")
928        )
929        lowest_S_P_count = lowest_S_P_mf.get("S") + lowest_S_P_mf.get("P")
930
931        list_same_s_p = list(
932            filter(
933                lambda mf: mf.get("S") + mf.get("P") == lowest_S_P_count,
934                self.molecular_formulas,
935            )
936        )
937
938        # check if list is not empty
939        if list_same_s_p:
940            return min(list_same_s_p, key=lambda m: abs(m.mz_error))
941
942        else:
943            return lowest_S_P_mf

Compound Identification Algorithm SP Error - Assignment Filter

This function applies the Compound Identification Algorithm (CIA) SP Error filter to possible molecular formula assignments.

It takes the molecular formula with the lowest S+P count, and returns the formula with the lowest absolute error from this subset.

Returns
  • MolecularFormula: A single molecular formula which fits the rules of the CIA SP Error filter
References
  1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" Anal. Chem. 2006, 78, 13, 4363–4373 doi: 10.1021/ac0600306
def cia_score_N_S_P_error(self):
 945    def cia_score_N_S_P_error(self):
 946        """Compound Identification Algorithm NSP Error - Assignment Filter
 947
 948        This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments.
 949
 950        It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset.
 951
 952        Returns
 953        -------
 954        MolecularFormula
 955            A single molecular formula which fits the rules of the CIA NSP Error filter
 956
 957        References
 958        ----------
 959        1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter"
 960            Anal. Chem. 2006, 78, 13, 4363–4373
 961            doi: 10.1021/ac0600306
 962
 963        Raises
 964        -------
 965        Exception
 966            If no molecular formula are associated with mass spectrum peak.
 967        """
 968        # case EFormulaScore.HAcap:
 969        if self.molecular_formulas:
 970            lowest_N_S_P_mf = min(
 971                self.molecular_formulas,
 972                key=lambda mf: mf.get("N") + mf.get("S") + mf.get("P"),
 973            )
 974            lowest_N_S_P_count = (
 975                lowest_N_S_P_mf.get("N")
 976                + lowest_N_S_P_mf.get("S")
 977                + lowest_N_S_P_mf.get("P")
 978            )
 979
 980            list_same_N_S_P = list(
 981                filter(
 982                    lambda mf: mf.get("N") + mf.get("S") + mf.get("P")
 983                    == lowest_N_S_P_count,
 984                    self.molecular_formulas,
 985                )
 986            )
 987
 988            if list_same_N_S_P:
 989                SP_filtered_list = list(
 990                    filter(
 991                        lambda mf: (mf.get("S") <= 3) and (mf.get("P") <= 1),
 992                        list_same_N_S_P,
 993                    )
 994                )
 995
 996                if SP_filtered_list:
 997                    return min(SP_filtered_list, key=lambda m: abs(m.mz_error))
 998
 999                else:
1000                    return min(list_same_N_S_P, key=lambda m: abs(m.mz_error))
1001
1002            else:
1003                return lowest_N_S_P_mf
1004        else:
1005            raise Exception(
1006                "No molecular formula associated with the mass spectrum peak at m/z: %.6f"
1007                % self.mz_exp
1008            )

Compound Identification Algorithm NSP Error - Assignment Filter

This function applies the Compound Identification Algorithm (CIA) NSP Error filter to possible molecular formula assignments.

It takes the molecular formula with the lowest N+S+P count, and returns the formula with the lowest absolute error from this subset.

Returns
  • MolecularFormula: A single molecular formula which fits the rules of the CIA NSP Error filter
References
  1. Elizabeth B. Kujawinski and Mark D. Behn, "Automated Analysis of Electrospray Ionization Fourier Transform Ion Cyclotron Resonance Mass Spectra of Natural Organic Matter" Anal. Chem. 2006, 78, 13, 4363–4373 doi: 10.1021/ac0600306
Raises
  • Exception: If no molecular formula are associated with mass spectrum peak.