corems.mass_spectrum.calc.Calibration

Created on Wed May 13 02:16:09 2020

@author: Will Kew

  1# -*- coding: utf-8 -*-
  2"""
  3Created on Wed May 13 02:16:09 2020
  4
  5@author: Will Kew
  6"""
  7
  8# import modules
  9import csv
 10import warnings
 11from io import BytesIO
 12from pathlib import Path
 13
 14import numpy as np
 15import pandas as pd
 16from s3path import S3Path
 17
 18# import scipy modules for calibration
 19from scipy.optimize import minimize
 20
 21
 22class MzDomainCalibration:
 23    """MzDomainCalibration class for recalibrating mass spectra
 24
 25    Parameters
 26    ----------
 27    mass_spectrum : CoreMS MassSpectrum Object
 28        The mass spectrum to be calibrated.
 29    ref_masslist : str
 30        The path to a reference mass list.
 31    mzsegment : tuple of floats, optional
 32        The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time.
 33        Future work - allow multiple mzsegments to be passed.
 34
 35    Attributes
 36    ----------
 37    mass_spectrum : CoreMS MassSpectrum Object
 38        The mass spectrum to be calibrated.
 39    mzsegment : tuple of floats or None
 40        The mz range to recalibrate, or None.
 41    ref_mass_list_path : str or Path
 42        The path to the reference mass list.
 43
 44    Methods
 45    -------
 46    * run().
 47        Main function to run this class.
 48    * load_ref_mass_list().
 49        Load reference mass list (Bruker format).
 50    * gen_ref_mass_list_from_assigned(min_conf=0.7).
 51        Generate reference mass list from assigned masses.
 52    * find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5).
 53        Find calibration points in the mass spectrum based on the reference mass list.
 54    * robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1).
 55        Recalibration function.
 56    * recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False).
 57        Main recalibration function which uses a robust linear regression.
 58
 59
 60    """
 61
 62    def __init__(self, mass_spectrum, ref_masslist, mzsegment=None):
 63        self.mass_spectrum = mass_spectrum
 64        self.mzsegment = mzsegment
 65
 66        # define reference mass list - bruker .ref format
 67        self.ref_mass_list_path = ref_masslist
 68        if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0:
 69            warnings.warn(
 70                "Warning: calibrating spectra which have already been assigned may yield erroneous results"
 71            )
 72        self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp
 73        self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp
 74
 75        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
 76            print(
 77                "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found."
 78            )
 79
 80    def load_ref_mass_list(self):
 81        """Load reference mass list (Bruker format)
 82
 83        Loads in a reference mass list from a .ref file
 84        Note that some versions of Bruker's software produce .ref files with a different format.
 85        As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format.
 86        CoreMS includes an example .ref file with the correct format for reference.
 87
 88        Returns
 89        -------
 90        df_ref : Pandas DataFrame
 91            reference mass list object.
 92
 93        """
 94        refmasslist = (
 95            Path(self.ref_mass_list_path)
 96            if isinstance(self.ref_mass_list_path, str)
 97            else self.ref_mass_list_path
 98        )
 99
100        if not refmasslist.exists():
101            raise FileExistsError("File does not exist: %s" % refmasslist)
102
103        with refmasslist.open("r") as csvfile:
104            dialect = csv.Sniffer().sniff(csvfile.read(1024))
105            delimiter = dialect.delimiter
106
107        if isinstance(refmasslist, S3Path):
108            # data = self.file_location.open('rb').read()
109            data = BytesIO(refmasslist.open("rb").read())
110
111        else:
112            data = refmasslist
113
114        df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1)
115
116        df_ref = df_ref.rename(
117            {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1
118        )
119
120        df_ref.sort_values(by="m/z", ascending=True, inplace=True)
121        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
122            print(
123                "Reference mass list loaded - "
124                + str(len(df_ref))
125                + " calibration masses loaded."
126            )
127
128        return df_ref
129
130    def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7):
131        """Generate reference mass list from assigned masses
132
133        This function will generate a ref mass dataframe object from an assigned corems mass spec obj
134        using assigned masses above a certain minimum confidence threshold.
135
136        This function needs to be retested and check it is covered in the unit tests.
137
138        Parameters
139        ----------
140        min_conf : float, optional
141            minimum confidence score. The default is 0.7.
142
143        Returns
144        -------
145        df_ref : Pandas DataFrame
146            reference mass list - based on calculated masses.
147
148        """
149        # TODO this function needs to be retested and check it is covered in the unit tests
150        df = self.mass_spectrum.to_dataframe()
151        df = df[df["Confidence Score"] > min_conf]
152        df_ref = pd.DataFrame(columns=["m/z"])
153        df_ref["m/z"] = df["Calculated m/z"]
154        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
155            print(
156                "Reference mass list generated - "
157                + str(len(df_ref))
158                + " calibration masses."
159            )
160        return df_ref
161
162    def find_calibration_points(
163        self,
164        df_ref,
165        calib_ppm_error_threshold: tuple[float, float] = (-1, 1),
166        calib_snr_threshold: float = 5,
167        calibration_ref_match_method: str = "legacy",
168        calibration_ref_match_tolerance: float = 0.003,
169        calibration_ref_match_std_raw_error_limit: float = 1.5,
170    ):
171        """Function to find calibration points in the mass spectrum
172
173        Based on the reference mass list.
174
175        Parameters
176        ----------
177        df_ref : Pandas DataFrame
178            reference mass list for recalibration.
179        calib_ppm_error_threshold : tuple of floats, optional
180            ppm error for finding calibration masses in the spectrum. The default is -1,1.
181            Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6.
182                Some software does this the other way around and value signs must be inverted for that to work.
183        calib_snr_threshold : float, optional
184            snr threshold for finding calibration masses in the spectrum. The default is 5.
185
186        Returns
187        -------
188        cal_peaks_mz : list of floats
189            masses of measured ions to use in calibration routine
190        cal_refs_mz : list of floats
191            reference mz values of found calibration points.
192
193        """
194
195        # This approach is much more efficient and expedient than the original implementation.
196        peaks_mz = []
197        for x in self.mass_spectrum.mspeaks:
198            if x.signal_to_noise > calib_snr_threshold:
199                if self.mzsegment:
200                    if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
201                        peaks_mz.append(x.mz_exp)
202                else:
203                    peaks_mz.append(x.mz_exp)
204        peaks_mz = np.asarray(peaks_mz)
205
206        if calibration_ref_match_method == "legacy":
207            # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold
208            # Then it removes ambiguities - which means the calibration threshold hasto be very tight.
209            cal_peaks_mz = []
210            cal_refs_mz = []
211            for mzref in df_ref["m/z"]:
212                tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1]
213                for mzmeas in tmp_peaks_mz:
214                    delta_mass = ((mzmeas - mzref) / mzref) * 1e6
215                    if delta_mass < max(calib_ppm_error_threshold):
216                        if delta_mass > min(calib_ppm_error_threshold):
217                            cal_peaks_mz.append(mzmeas)
218                            cal_refs_mz.append(mzref)
219
220            # To remove entries with duplicated indices (reference masses matching multiple peaks)
221            tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float)
222            tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)]
223
224            cal_peaks_mz = list(tmpdf.values)
225            cal_refs_mz = list(tmpdf.index)
226        elif calibration_ref_match_method == "merged":
227            #warnings.warn("Using experimental new reference mass list merging")
228            # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between
229            # reference and measured masses. This is a quicker way to match, and seems to get more matches.
230            # It may not work as well when the data are far from correc initial mass
231            # e.g. if the correct peak is further from the reference than an incorrect peak.
232            meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz)
233            tolerance = calibration_ref_match_tolerance
234            merged_df = pd.merge_asof(
235                df_ref,
236                meas_df,
237                left_on="m/z",
238                right_on="meas_m/z",
239                tolerance=tolerance,
240                direction="nearest",
241            )
242            merged_df.dropna(how="any", inplace=True)
243            merged_df["Error_ppm"] = (
244                (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"]
245            ) * 1e6
246            median_raw_error = merged_df["Error_ppm"].median()
247            std_raw_error = merged_df["Error_ppm"].std()
248            if std_raw_error > calibration_ref_match_std_raw_error_limit:
249                std_raw_error = calibration_ref_match_std_raw_error_limit
250            self.mass_spectrum.calibration_raw_error_median = median_raw_error
251            self.mass_spectrum.calibration_raw_error_stdev = std_raw_error
252            merged_df = merged_df[
253                (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error))
254                & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error))
255            ]
256            # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))]
257            cal_peaks_mz = list(merged_df["meas_m/z"])
258            cal_refs_mz = list(merged_df["m/z"])
259        else:
260            raise ValueError(f"{calibration_ref_match_method} not allowed.")
261
262        if False:
263            min_calib_ppm_error = calib_ppm_error_threshold[0]
264            max_calib_ppm_error = calib_ppm_error_threshold[1]
265            df_raw = self.mass_spectrum.to_dataframe()
266
267            df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold]
268            # optionally further subset that based on minimum S/N, RP, Peak Height
269            # to ensure only valid points are utilized
270            # in this example, only a S/N threshold is implemented.
271            imzmeas = []
272            mzrefs = []
273
274            for mzref in df_ref["m/z"]:
275                # find all peaks within a defined ppm error threshold
276                tmpdf = df_raw[
277                    ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error
278                ]
279                # Error is relative to the theoretical, so the divisor should be divisor
280
281                tmpdf = tmpdf[
282                    ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error
283                ]
284
285                # only use the calibration point if only one peak is within the thresholds
286                # This may require some optimization of the threshold tolerances
287                if len(tmpdf) == 1:
288                    imzmeas.append(int(tmpdf.index.values))
289                    mzrefs.append(mzref)
290
291        # it is crucial the mass lists are in same order
292        # corems likes to do masses from high to low.
293        cal_refs_mz.sort(reverse=False)
294        cal_peaks_mz.sort(reverse=False)
295        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
296            print(
297                str(len(cal_peaks_mz))
298                + " calibration points matched within thresholds."
299            )
300        return cal_peaks_mz, cal_refs_mz
301
302    def robust_calib(
303        self,
304        param: list[float],
305        cal_peaks_mz: list[float],
306        cal_refs_mz: list[float],
307        order: int = 1,
308    ):
309        """Recalibration function
310
311        Computes the rms of m/z errors to minimize when calibrating.
312        This is adapted from from spike.
313
314        Parameters
315        ----------
316        param : list of floats
317            generated by minimize function from scipy optimize.
318        cal_peaks_mz : list of floats
319            masses of measured peaks to use in mass calibration.
320        cal_peaks_mz : list of floats
321            reference mz values of found calibration points.
322        order : int, optional
323            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
324
325        Returns
326        -------
327        rmserror : float
328            root mean square mass error for calibration points.
329
330        """
331        Aterm = param[0]
332        Bterm = param[1]
333        try:
334            Cterm = param[2]
335        except IndexError:
336            pass
337
338        # get the mspeaks from the mass spectrum object which were calibration points
339        # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas]
340        # get their calibrated mass values
341        # mspeakmzs = [x.mz_cal for x in mspeaks]
342        cal_peaks_mz = np.asarray(cal_peaks_mz)
343
344        # linearz
345        if order == 1:
346            ref_recal_points = (Aterm * cal_peaks_mz) + Bterm
347        # quadratic
348        elif order == 2:
349            ref_recal_points = (Aterm * (cal_peaks_mz)) + (
350                Bterm * np.power((cal_peaks_mz), 2) + Cterm
351            )
352
353        # sort both the calibration points (measured, recalibrated)
354        ref_recal_points.sort()
355        # and sort the calibration points (theoretical, predefined)
356        cal_refs_mz.sort()
357
358        # calculate the ppm error for each calibration point
359        error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6
360        # calculate the root mean square error - this is our target to minimize
361        rmserror = np.sqrt(np.mean(error**2))
362        return rmserror
363
364    def recalibrate_mass_spectrum(
365        self,
366        cal_peaks_mz: list[float],
367        cal_refs_mz: list[float],
368        order: int = 1,
369        diagnostic: bool = False,
370    ):
371        """Main recalibration function which uses a robust linear regression
372
373        This function performs the recalibration of the mass spectrum object.
374        It iteratively applies
375
376        Parameters
377        ----------
378        cal_peaks_mz : list of float
379            masses of measured peaks to use in mass calibration.
380        cal_refs_mz : list of float
381            reference mz values of found calibration points.
382        order : int, optional
383            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
384
385        Returns
386        -------
387        mass_spectrum : CoreMS mass spectrum object
388            Calibrated mass spectrum object
389
390
391        Notes
392        -----
393        This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method.
394
395        References
396        ----------
397        1.  Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A.
398            SPIKE a Processing Software dedicated to Fourier Spectroscopies
399            https://arxiv.org/abs/1608.06777 (2016)
400        2.  SPIKE - https://github.com/spike-project/spike
401
402        """
403        # initialise parameters for recalibration
404        # these are the 'Aterm, Bterm, Cterm'
405        # as spectra are already freq->mz calibrated, these terms are very small
406        # may be beneficial to formally separate them from the freq->mz terms
407        if order == 1:
408            Po = [1, 0]
409        elif order == 2:
410            Po = [1, 0, 0]
411
412        if len(cal_peaks_mz) >= 2:
413            if self.mzsegment:  # If only part of the spectrum is to be recalibrated
414                mz_exp_peaks = np.array(
415                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
416                )
417                # Split the array into two parts - one to recailbrate, one to keep unchanged.
418                mz_exp_peaks_tocal = mz_exp_peaks[
419                    (mz_exp_peaks >= min(self.mzsegment))
420                    & (mz_exp_peaks <= max(self.mzsegment))
421                ]
422                mz_exp_peaks_unchanged = mz_exp_peaks[
423                    ~(mz_exp_peaks >= min(self.mzsegment))
424                    | ~(mz_exp_peaks <= max(self.mzsegment))
425                ]
426                # TODO: - segmented calibration needs a way to better track the calibration args/values...
427                if not self.mass_spectrum.is_centroid:
428                    mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile)
429                    # Split the array into two parts - one to recailbrate, one to keep unchanged.
430                    mz_exp_profile_tocal = mz_exp_profile[
431                        (mz_exp_profile >= min(self.mzsegment))
432                        & (mz_exp_profile <= max(self.mzsegment))
433                    ]
434                    mz_exp_profile_unchanged = mz_exp_profile[
435                        ~(mz_exp_profile >= min(self.mzsegment))
436                        | ~(mz_exp_profile <= max(self.mzsegment))
437                    ]
438            else:  # if just recalibrating the whole spectrum
439                mz_exp_peaks_tocal = np.array(
440                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
441                )
442                if not self.mass_spectrum.is_centroid:
443                    mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile)
444
445            minimize_method = self.mass_spectrum.settings.calib_minimize_method
446            res = minimize(
447                self.robust_calib,
448                Po,
449                args=(cal_peaks_mz, cal_refs_mz, order),
450                method=minimize_method,
451            )
452            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
453                print(
454                    "minimize function completed with RMS error of: {:0.3f} ppm".format(
455                        res["fun"]
456                    )
457                )
458                print(
459                    "minimize function performed {:1d} fn evals and {:1d} iterations".format(
460                        res["nfev"], res["nit"]
461                    )
462                )
463            Pn = res.x
464
465            # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum])
466
467            if order == 1:
468                mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1]
469                if not self.mass_spectrum.is_centroid:
470                    mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1]
471
472            elif order == 2:
473                mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + (
474                    Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2]
475                )
476
477                if not self.mass_spectrum.is_centroid:
478                    mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + (
479                        Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2]
480                    )
481
482            if self.mzsegment:
483                # Recombine the mass domains
484                mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged])
485                mz_domain.sort()
486                if not self.mass_spectrum.is_centroid:
487                    mz_profile_calc = np.concatenate(
488                        [mz_profile_calc, mz_exp_profile_unchanged]
489                    )
490                    mz_profile_calc.sort()
491                # Sort them
492                if (
493                    mz_exp_peaks[0] > mz_exp_peaks[1]
494                ):  # If originally descending mass order
495                    mz_domain = mz_domain[::-1]
496                    if not self.mass_spectrum.is_centroid:
497                        mz_profile_calc = mz_profile_calc[::-1]
498
499            self.mass_spectrum.mz_cal = mz_domain
500            if not self.mass_spectrum.is_centroid:
501                self.mass_spectrum.mz_cal_profile = mz_profile_calc
502
503            self.mass_spectrum.calibration_order = order
504            self.mass_spectrum.calibration_RMS = float(res["fun"])
505            self.mass_spectrum.calibration_points = int(len(cal_refs_mz))
506            self.mass_spectrum.calibration_ref_mzs = cal_refs_mz
507            self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz
508
509            self.mass_spectrum.calibration_segment = self.mzsegment
510
511            if diagnostic:
512                return self.mass_spectrum, res
513            return self.mass_spectrum
514        else:
515            warnings.warn("Too few calibration points - aborting.")
516            return self.mass_spectrum
517
518    def run(self):
519        """Run the calibration routine
520
521        This function runs the calibration routine.
522
523        """
524        calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold
525        max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
526        min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
527        calib_pol_order = self.mass_spectrum.settings.calib_pol_order
528        calibration_ref_match_method = (
529            self.mass_spectrum.settings.calibration_ref_match_method
530        )
531        calibration_ref_match_tolerance = (
532            self.mass_spectrum.settings.calibration_ref_match_tolerance
533        )
534        calibration_ref_match_std_raw_error_limit = (
535            self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit
536        )
537
538        # load reference mass list
539        df_ref = self.load_ref_mass_list()
540
541        # find calibration points
542        cal_peaks_mz, cal_refs_mz = self.find_calibration_points(
543            df_ref,
544            calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error),
545            calib_snr_threshold=calib_ppm_error_threshold,
546            calibration_ref_match_method=calibration_ref_match_method,
547            calibration_ref_match_tolerance=calibration_ref_match_tolerance,
548            calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit,
549        )
550        if len(cal_peaks_mz) == 2:
551            self.mass_spectrum.settings.calib_pol_order = 1
552            calib_pol_order = 1
553            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
554                print("Only 2 calibration points found, forcing a linear recalibration")
555        elif len(cal_peaks_mz) < 2:
556            warnings.warn("Too few calibration points found, function will fail")
557        self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)
class MzDomainCalibration:
 23class MzDomainCalibration:
 24    """MzDomainCalibration class for recalibrating mass spectra
 25
 26    Parameters
 27    ----------
 28    mass_spectrum : CoreMS MassSpectrum Object
 29        The mass spectrum to be calibrated.
 30    ref_masslist : str
 31        The path to a reference mass list.
 32    mzsegment : tuple of floats, optional
 33        The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time.
 34        Future work - allow multiple mzsegments to be passed.
 35
 36    Attributes
 37    ----------
 38    mass_spectrum : CoreMS MassSpectrum Object
 39        The mass spectrum to be calibrated.
 40    mzsegment : tuple of floats or None
 41        The mz range to recalibrate, or None.
 42    ref_mass_list_path : str or Path
 43        The path to the reference mass list.
 44
 45    Methods
 46    -------
 47    * run().
 48        Main function to run this class.
 49    * load_ref_mass_list().
 50        Load reference mass list (Bruker format).
 51    * gen_ref_mass_list_from_assigned(min_conf=0.7).
 52        Generate reference mass list from assigned masses.
 53    * find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5).
 54        Find calibration points in the mass spectrum based on the reference mass list.
 55    * robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1).
 56        Recalibration function.
 57    * recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False).
 58        Main recalibration function which uses a robust linear regression.
 59
 60
 61    """
 62
 63    def __init__(self, mass_spectrum, ref_masslist, mzsegment=None):
 64        self.mass_spectrum = mass_spectrum
 65        self.mzsegment = mzsegment
 66
 67        # define reference mass list - bruker .ref format
 68        self.ref_mass_list_path = ref_masslist
 69        if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0:
 70            warnings.warn(
 71                "Warning: calibrating spectra which have already been assigned may yield erroneous results"
 72            )
 73        self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp
 74        self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp
 75
 76        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
 77            print(
 78                "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found."
 79            )
 80
 81    def load_ref_mass_list(self):
 82        """Load reference mass list (Bruker format)
 83
 84        Loads in a reference mass list from a .ref file
 85        Note that some versions of Bruker's software produce .ref files with a different format.
 86        As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format.
 87        CoreMS includes an example .ref file with the correct format for reference.
 88
 89        Returns
 90        -------
 91        df_ref : Pandas DataFrame
 92            reference mass list object.
 93
 94        """
 95        refmasslist = (
 96            Path(self.ref_mass_list_path)
 97            if isinstance(self.ref_mass_list_path, str)
 98            else self.ref_mass_list_path
 99        )
100
101        if not refmasslist.exists():
102            raise FileExistsError("File does not exist: %s" % refmasslist)
103
104        with refmasslist.open("r") as csvfile:
105            dialect = csv.Sniffer().sniff(csvfile.read(1024))
106            delimiter = dialect.delimiter
107
108        if isinstance(refmasslist, S3Path):
109            # data = self.file_location.open('rb').read()
110            data = BytesIO(refmasslist.open("rb").read())
111
112        else:
113            data = refmasslist
114
115        df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1)
116
117        df_ref = df_ref.rename(
118            {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1
119        )
120
121        df_ref.sort_values(by="m/z", ascending=True, inplace=True)
122        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
123            print(
124                "Reference mass list loaded - "
125                + str(len(df_ref))
126                + " calibration masses loaded."
127            )
128
129        return df_ref
130
131    def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7):
132        """Generate reference mass list from assigned masses
133
134        This function will generate a ref mass dataframe object from an assigned corems mass spec obj
135        using assigned masses above a certain minimum confidence threshold.
136
137        This function needs to be retested and check it is covered in the unit tests.
138
139        Parameters
140        ----------
141        min_conf : float, optional
142            minimum confidence score. The default is 0.7.
143
144        Returns
145        -------
146        df_ref : Pandas DataFrame
147            reference mass list - based on calculated masses.
148
149        """
150        # TODO this function needs to be retested and check it is covered in the unit tests
151        df = self.mass_spectrum.to_dataframe()
152        df = df[df["Confidence Score"] > min_conf]
153        df_ref = pd.DataFrame(columns=["m/z"])
154        df_ref["m/z"] = df["Calculated m/z"]
155        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
156            print(
157                "Reference mass list generated - "
158                + str(len(df_ref))
159                + " calibration masses."
160            )
161        return df_ref
162
163    def find_calibration_points(
164        self,
165        df_ref,
166        calib_ppm_error_threshold: tuple[float, float] = (-1, 1),
167        calib_snr_threshold: float = 5,
168        calibration_ref_match_method: str = "legacy",
169        calibration_ref_match_tolerance: float = 0.003,
170        calibration_ref_match_std_raw_error_limit: float = 1.5,
171    ):
172        """Function to find calibration points in the mass spectrum
173
174        Based on the reference mass list.
175
176        Parameters
177        ----------
178        df_ref : Pandas DataFrame
179            reference mass list for recalibration.
180        calib_ppm_error_threshold : tuple of floats, optional
181            ppm error for finding calibration masses in the spectrum. The default is -1,1.
182            Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6.
183                Some software does this the other way around and value signs must be inverted for that to work.
184        calib_snr_threshold : float, optional
185            snr threshold for finding calibration masses in the spectrum. The default is 5.
186
187        Returns
188        -------
189        cal_peaks_mz : list of floats
190            masses of measured ions to use in calibration routine
191        cal_refs_mz : list of floats
192            reference mz values of found calibration points.
193
194        """
195
196        # This approach is much more efficient and expedient than the original implementation.
197        peaks_mz = []
198        for x in self.mass_spectrum.mspeaks:
199            if x.signal_to_noise > calib_snr_threshold:
200                if self.mzsegment:
201                    if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
202                        peaks_mz.append(x.mz_exp)
203                else:
204                    peaks_mz.append(x.mz_exp)
205        peaks_mz = np.asarray(peaks_mz)
206
207        if calibration_ref_match_method == "legacy":
208            # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold
209            # Then it removes ambiguities - which means the calibration threshold hasto be very tight.
210            cal_peaks_mz = []
211            cal_refs_mz = []
212            for mzref in df_ref["m/z"]:
213                tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1]
214                for mzmeas in tmp_peaks_mz:
215                    delta_mass = ((mzmeas - mzref) / mzref) * 1e6
216                    if delta_mass < max(calib_ppm_error_threshold):
217                        if delta_mass > min(calib_ppm_error_threshold):
218                            cal_peaks_mz.append(mzmeas)
219                            cal_refs_mz.append(mzref)
220
221            # To remove entries with duplicated indices (reference masses matching multiple peaks)
222            tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float)
223            tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)]
224
225            cal_peaks_mz = list(tmpdf.values)
226            cal_refs_mz = list(tmpdf.index)
227        elif calibration_ref_match_method == "merged":
228            #warnings.warn("Using experimental new reference mass list merging")
229            # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between
230            # reference and measured masses. This is a quicker way to match, and seems to get more matches.
231            # It may not work as well when the data are far from correc initial mass
232            # e.g. if the correct peak is further from the reference than an incorrect peak.
233            meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz)
234            tolerance = calibration_ref_match_tolerance
235            merged_df = pd.merge_asof(
236                df_ref,
237                meas_df,
238                left_on="m/z",
239                right_on="meas_m/z",
240                tolerance=tolerance,
241                direction="nearest",
242            )
243            merged_df.dropna(how="any", inplace=True)
244            merged_df["Error_ppm"] = (
245                (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"]
246            ) * 1e6
247            median_raw_error = merged_df["Error_ppm"].median()
248            std_raw_error = merged_df["Error_ppm"].std()
249            if std_raw_error > calibration_ref_match_std_raw_error_limit:
250                std_raw_error = calibration_ref_match_std_raw_error_limit
251            self.mass_spectrum.calibration_raw_error_median = median_raw_error
252            self.mass_spectrum.calibration_raw_error_stdev = std_raw_error
253            merged_df = merged_df[
254                (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error))
255                & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error))
256            ]
257            # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))]
258            cal_peaks_mz = list(merged_df["meas_m/z"])
259            cal_refs_mz = list(merged_df["m/z"])
260        else:
261            raise ValueError(f"{calibration_ref_match_method} not allowed.")
262
263        if False:
264            min_calib_ppm_error = calib_ppm_error_threshold[0]
265            max_calib_ppm_error = calib_ppm_error_threshold[1]
266            df_raw = self.mass_spectrum.to_dataframe()
267
268            df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold]
269            # optionally further subset that based on minimum S/N, RP, Peak Height
270            # to ensure only valid points are utilized
271            # in this example, only a S/N threshold is implemented.
272            imzmeas = []
273            mzrefs = []
274
275            for mzref in df_ref["m/z"]:
276                # find all peaks within a defined ppm error threshold
277                tmpdf = df_raw[
278                    ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error
279                ]
280                # Error is relative to the theoretical, so the divisor should be divisor
281
282                tmpdf = tmpdf[
283                    ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error
284                ]
285
286                # only use the calibration point if only one peak is within the thresholds
287                # This may require some optimization of the threshold tolerances
288                if len(tmpdf) == 1:
289                    imzmeas.append(int(tmpdf.index.values))
290                    mzrefs.append(mzref)
291
292        # it is crucial the mass lists are in same order
293        # corems likes to do masses from high to low.
294        cal_refs_mz.sort(reverse=False)
295        cal_peaks_mz.sort(reverse=False)
296        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
297            print(
298                str(len(cal_peaks_mz))
299                + " calibration points matched within thresholds."
300            )
301        return cal_peaks_mz, cal_refs_mz
302
303    def robust_calib(
304        self,
305        param: list[float],
306        cal_peaks_mz: list[float],
307        cal_refs_mz: list[float],
308        order: int = 1,
309    ):
310        """Recalibration function
311
312        Computes the rms of m/z errors to minimize when calibrating.
313        This is adapted from from spike.
314
315        Parameters
316        ----------
317        param : list of floats
318            generated by minimize function from scipy optimize.
319        cal_peaks_mz : list of floats
320            masses of measured peaks to use in mass calibration.
321        cal_peaks_mz : list of floats
322            reference mz values of found calibration points.
323        order : int, optional
324            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
325
326        Returns
327        -------
328        rmserror : float
329            root mean square mass error for calibration points.
330
331        """
332        Aterm = param[0]
333        Bterm = param[1]
334        try:
335            Cterm = param[2]
336        except IndexError:
337            pass
338
339        # get the mspeaks from the mass spectrum object which were calibration points
340        # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas]
341        # get their calibrated mass values
342        # mspeakmzs = [x.mz_cal for x in mspeaks]
343        cal_peaks_mz = np.asarray(cal_peaks_mz)
344
345        # linearz
346        if order == 1:
347            ref_recal_points = (Aterm * cal_peaks_mz) + Bterm
348        # quadratic
349        elif order == 2:
350            ref_recal_points = (Aterm * (cal_peaks_mz)) + (
351                Bterm * np.power((cal_peaks_mz), 2) + Cterm
352            )
353
354        # sort both the calibration points (measured, recalibrated)
355        ref_recal_points.sort()
356        # and sort the calibration points (theoretical, predefined)
357        cal_refs_mz.sort()
358
359        # calculate the ppm error for each calibration point
360        error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6
361        # calculate the root mean square error - this is our target to minimize
362        rmserror = np.sqrt(np.mean(error**2))
363        return rmserror
364
365    def recalibrate_mass_spectrum(
366        self,
367        cal_peaks_mz: list[float],
368        cal_refs_mz: list[float],
369        order: int = 1,
370        diagnostic: bool = False,
371    ):
372        """Main recalibration function which uses a robust linear regression
373
374        This function performs the recalibration of the mass spectrum object.
375        It iteratively applies
376
377        Parameters
378        ----------
379        cal_peaks_mz : list of float
380            masses of measured peaks to use in mass calibration.
381        cal_refs_mz : list of float
382            reference mz values of found calibration points.
383        order : int, optional
384            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
385
386        Returns
387        -------
388        mass_spectrum : CoreMS mass spectrum object
389            Calibrated mass spectrum object
390
391
392        Notes
393        -----
394        This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method.
395
396        References
397        ----------
398        1.  Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A.
399            SPIKE a Processing Software dedicated to Fourier Spectroscopies
400            https://arxiv.org/abs/1608.06777 (2016)
401        2.  SPIKE - https://github.com/spike-project/spike
402
403        """
404        # initialise parameters for recalibration
405        # these are the 'Aterm, Bterm, Cterm'
406        # as spectra are already freq->mz calibrated, these terms are very small
407        # may be beneficial to formally separate them from the freq->mz terms
408        if order == 1:
409            Po = [1, 0]
410        elif order == 2:
411            Po = [1, 0, 0]
412
413        if len(cal_peaks_mz) >= 2:
414            if self.mzsegment:  # If only part of the spectrum is to be recalibrated
415                mz_exp_peaks = np.array(
416                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
417                )
418                # Split the array into two parts - one to recailbrate, one to keep unchanged.
419                mz_exp_peaks_tocal = mz_exp_peaks[
420                    (mz_exp_peaks >= min(self.mzsegment))
421                    & (mz_exp_peaks <= max(self.mzsegment))
422                ]
423                mz_exp_peaks_unchanged = mz_exp_peaks[
424                    ~(mz_exp_peaks >= min(self.mzsegment))
425                    | ~(mz_exp_peaks <= max(self.mzsegment))
426                ]
427                # TODO: - segmented calibration needs a way to better track the calibration args/values...
428                if not self.mass_spectrum.is_centroid:
429                    mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile)
430                    # Split the array into two parts - one to recailbrate, one to keep unchanged.
431                    mz_exp_profile_tocal = mz_exp_profile[
432                        (mz_exp_profile >= min(self.mzsegment))
433                        & (mz_exp_profile <= max(self.mzsegment))
434                    ]
435                    mz_exp_profile_unchanged = mz_exp_profile[
436                        ~(mz_exp_profile >= min(self.mzsegment))
437                        | ~(mz_exp_profile <= max(self.mzsegment))
438                    ]
439            else:  # if just recalibrating the whole spectrum
440                mz_exp_peaks_tocal = np.array(
441                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
442                )
443                if not self.mass_spectrum.is_centroid:
444                    mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile)
445
446            minimize_method = self.mass_spectrum.settings.calib_minimize_method
447            res = minimize(
448                self.robust_calib,
449                Po,
450                args=(cal_peaks_mz, cal_refs_mz, order),
451                method=minimize_method,
452            )
453            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
454                print(
455                    "minimize function completed with RMS error of: {:0.3f} ppm".format(
456                        res["fun"]
457                    )
458                )
459                print(
460                    "minimize function performed {:1d} fn evals and {:1d} iterations".format(
461                        res["nfev"], res["nit"]
462                    )
463                )
464            Pn = res.x
465
466            # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum])
467
468            if order == 1:
469                mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1]
470                if not self.mass_spectrum.is_centroid:
471                    mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1]
472
473            elif order == 2:
474                mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + (
475                    Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2]
476                )
477
478                if not self.mass_spectrum.is_centroid:
479                    mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + (
480                        Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2]
481                    )
482
483            if self.mzsegment:
484                # Recombine the mass domains
485                mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged])
486                mz_domain.sort()
487                if not self.mass_spectrum.is_centroid:
488                    mz_profile_calc = np.concatenate(
489                        [mz_profile_calc, mz_exp_profile_unchanged]
490                    )
491                    mz_profile_calc.sort()
492                # Sort them
493                if (
494                    mz_exp_peaks[0] > mz_exp_peaks[1]
495                ):  # If originally descending mass order
496                    mz_domain = mz_domain[::-1]
497                    if not self.mass_spectrum.is_centroid:
498                        mz_profile_calc = mz_profile_calc[::-1]
499
500            self.mass_spectrum.mz_cal = mz_domain
501            if not self.mass_spectrum.is_centroid:
502                self.mass_spectrum.mz_cal_profile = mz_profile_calc
503
504            self.mass_spectrum.calibration_order = order
505            self.mass_spectrum.calibration_RMS = float(res["fun"])
506            self.mass_spectrum.calibration_points = int(len(cal_refs_mz))
507            self.mass_spectrum.calibration_ref_mzs = cal_refs_mz
508            self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz
509
510            self.mass_spectrum.calibration_segment = self.mzsegment
511
512            if diagnostic:
513                return self.mass_spectrum, res
514            return self.mass_spectrum
515        else:
516            warnings.warn("Too few calibration points - aborting.")
517            return self.mass_spectrum
518
519    def run(self):
520        """Run the calibration routine
521
522        This function runs the calibration routine.
523
524        """
525        calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold
526        max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
527        min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
528        calib_pol_order = self.mass_spectrum.settings.calib_pol_order
529        calibration_ref_match_method = (
530            self.mass_spectrum.settings.calibration_ref_match_method
531        )
532        calibration_ref_match_tolerance = (
533            self.mass_spectrum.settings.calibration_ref_match_tolerance
534        )
535        calibration_ref_match_std_raw_error_limit = (
536            self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit
537        )
538
539        # load reference mass list
540        df_ref = self.load_ref_mass_list()
541
542        # find calibration points
543        cal_peaks_mz, cal_refs_mz = self.find_calibration_points(
544            df_ref,
545            calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error),
546            calib_snr_threshold=calib_ppm_error_threshold,
547            calibration_ref_match_method=calibration_ref_match_method,
548            calibration_ref_match_tolerance=calibration_ref_match_tolerance,
549            calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit,
550        )
551        if len(cal_peaks_mz) == 2:
552            self.mass_spectrum.settings.calib_pol_order = 1
553            calib_pol_order = 1
554            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
555                print("Only 2 calibration points found, forcing a linear recalibration")
556        elif len(cal_peaks_mz) < 2:
557            warnings.warn("Too few calibration points found, function will fail")
558        self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)

MzDomainCalibration class for recalibrating mass spectra

Parameters
  • mass_spectrum (CoreMS MassSpectrum Object): The mass spectrum to be calibrated.
  • ref_masslist (str): The path to a reference mass list.
  • mzsegment (tuple of floats, optional): The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time. Future work - allow multiple mzsegments to be passed.
Attributes
  • mass_spectrum (CoreMS MassSpectrum Object): The mass spectrum to be calibrated.
  • mzsegment (tuple of floats or None): The mz range to recalibrate, or None.
  • ref_mass_list_path (str or Path): The path to the reference mass list.
Methods
  • run(). Main function to run this class.
  • load_ref_mass_list(). Load reference mass list (Bruker format).
  • gen_ref_mass_list_from_assigned(min_conf=0.7). Generate reference mass list from assigned masses.
  • find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5). Find calibration points in the mass spectrum based on the reference mass list.
  • robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1). Recalibration function.
  • recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False). Main recalibration function which uses a robust linear regression.
MzDomainCalibration(mass_spectrum, ref_masslist, mzsegment=None)
63    def __init__(self, mass_spectrum, ref_masslist, mzsegment=None):
64        self.mass_spectrum = mass_spectrum
65        self.mzsegment = mzsegment
66
67        # define reference mass list - bruker .ref format
68        self.ref_mass_list_path = ref_masslist
69        if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0:
70            warnings.warn(
71                "Warning: calibrating spectra which have already been assigned may yield erroneous results"
72            )
73        self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp
74        self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp
75
76        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
77            print(
78                "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found."
79            )
mass_spectrum
mzsegment
ref_mass_list_path
def load_ref_mass_list(self):
 81    def load_ref_mass_list(self):
 82        """Load reference mass list (Bruker format)
 83
 84        Loads in a reference mass list from a .ref file
 85        Note that some versions of Bruker's software produce .ref files with a different format.
 86        As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format.
 87        CoreMS includes an example .ref file with the correct format for reference.
 88
 89        Returns
 90        -------
 91        df_ref : Pandas DataFrame
 92            reference mass list object.
 93
 94        """
 95        refmasslist = (
 96            Path(self.ref_mass_list_path)
 97            if isinstance(self.ref_mass_list_path, str)
 98            else self.ref_mass_list_path
 99        )
100
101        if not refmasslist.exists():
102            raise FileExistsError("File does not exist: %s" % refmasslist)
103
104        with refmasslist.open("r") as csvfile:
105            dialect = csv.Sniffer().sniff(csvfile.read(1024))
106            delimiter = dialect.delimiter
107
108        if isinstance(refmasslist, S3Path):
109            # data = self.file_location.open('rb').read()
110            data = BytesIO(refmasslist.open("rb").read())
111
112        else:
113            data = refmasslist
114
115        df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1)
116
117        df_ref = df_ref.rename(
118            {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1
119        )
120
121        df_ref.sort_values(by="m/z", ascending=True, inplace=True)
122        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
123            print(
124                "Reference mass list loaded - "
125                + str(len(df_ref))
126                + " calibration masses loaded."
127            )
128
129        return df_ref

Load reference mass list (Bruker format)

Loads in a reference mass list from a .ref file Note that some versions of Bruker's software produce .ref files with a different format. As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format. CoreMS includes an example .ref file with the correct format for reference.

Returns
  • df_ref (Pandas DataFrame): reference mass list object.
def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7):
131    def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7):
132        """Generate reference mass list from assigned masses
133
134        This function will generate a ref mass dataframe object from an assigned corems mass spec obj
135        using assigned masses above a certain minimum confidence threshold.
136
137        This function needs to be retested and check it is covered in the unit tests.
138
139        Parameters
140        ----------
141        min_conf : float, optional
142            minimum confidence score. The default is 0.7.
143
144        Returns
145        -------
146        df_ref : Pandas DataFrame
147            reference mass list - based on calculated masses.
148
149        """
150        # TODO this function needs to be retested and check it is covered in the unit tests
151        df = self.mass_spectrum.to_dataframe()
152        df = df[df["Confidence Score"] > min_conf]
153        df_ref = pd.DataFrame(columns=["m/z"])
154        df_ref["m/z"] = df["Calculated m/z"]
155        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
156            print(
157                "Reference mass list generated - "
158                + str(len(df_ref))
159                + " calibration masses."
160            )
161        return df_ref

Generate reference mass list from assigned masses

This function will generate a ref mass dataframe object from an assigned corems mass spec obj using assigned masses above a certain minimum confidence threshold.

This function needs to be retested and check it is covered in the unit tests.

Parameters
  • min_conf (float, optional): minimum confidence score. The default is 0.7.
Returns
  • df_ref (Pandas DataFrame): reference mass list - based on calculated masses.
def find_calibration_points( self, df_ref, calib_ppm_error_threshold: tuple[float, float] = (-1, 1), calib_snr_threshold: float = 5, calibration_ref_match_method: str = 'legacy', calibration_ref_match_tolerance: float = 0.003, calibration_ref_match_std_raw_error_limit: float = 1.5):
163    def find_calibration_points(
164        self,
165        df_ref,
166        calib_ppm_error_threshold: tuple[float, float] = (-1, 1),
167        calib_snr_threshold: float = 5,
168        calibration_ref_match_method: str = "legacy",
169        calibration_ref_match_tolerance: float = 0.003,
170        calibration_ref_match_std_raw_error_limit: float = 1.5,
171    ):
172        """Function to find calibration points in the mass spectrum
173
174        Based on the reference mass list.
175
176        Parameters
177        ----------
178        df_ref : Pandas DataFrame
179            reference mass list for recalibration.
180        calib_ppm_error_threshold : tuple of floats, optional
181            ppm error for finding calibration masses in the spectrum. The default is -1,1.
182            Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6.
183                Some software does this the other way around and value signs must be inverted for that to work.
184        calib_snr_threshold : float, optional
185            snr threshold for finding calibration masses in the spectrum. The default is 5.
186
187        Returns
188        -------
189        cal_peaks_mz : list of floats
190            masses of measured ions to use in calibration routine
191        cal_refs_mz : list of floats
192            reference mz values of found calibration points.
193
194        """
195
196        # This approach is much more efficient and expedient than the original implementation.
197        peaks_mz = []
198        for x in self.mass_spectrum.mspeaks:
199            if x.signal_to_noise > calib_snr_threshold:
200                if self.mzsegment:
201                    if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment):
202                        peaks_mz.append(x.mz_exp)
203                else:
204                    peaks_mz.append(x.mz_exp)
205        peaks_mz = np.asarray(peaks_mz)
206
207        if calibration_ref_match_method == "legacy":
208            # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold
209            # Then it removes ambiguities - which means the calibration threshold hasto be very tight.
210            cal_peaks_mz = []
211            cal_refs_mz = []
212            for mzref in df_ref["m/z"]:
213                tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1]
214                for mzmeas in tmp_peaks_mz:
215                    delta_mass = ((mzmeas - mzref) / mzref) * 1e6
216                    if delta_mass < max(calib_ppm_error_threshold):
217                        if delta_mass > min(calib_ppm_error_threshold):
218                            cal_peaks_mz.append(mzmeas)
219                            cal_refs_mz.append(mzref)
220
221            # To remove entries with duplicated indices (reference masses matching multiple peaks)
222            tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float)
223            tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)]
224
225            cal_peaks_mz = list(tmpdf.values)
226            cal_refs_mz = list(tmpdf.index)
227        elif calibration_ref_match_method == "merged":
228            #warnings.warn("Using experimental new reference mass list merging")
229            # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between
230            # reference and measured masses. This is a quicker way to match, and seems to get more matches.
231            # It may not work as well when the data are far from correc initial mass
232            # e.g. if the correct peak is further from the reference than an incorrect peak.
233            meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz)
234            tolerance = calibration_ref_match_tolerance
235            merged_df = pd.merge_asof(
236                df_ref,
237                meas_df,
238                left_on="m/z",
239                right_on="meas_m/z",
240                tolerance=tolerance,
241                direction="nearest",
242            )
243            merged_df.dropna(how="any", inplace=True)
244            merged_df["Error_ppm"] = (
245                (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"]
246            ) * 1e6
247            median_raw_error = merged_df["Error_ppm"].median()
248            std_raw_error = merged_df["Error_ppm"].std()
249            if std_raw_error > calibration_ref_match_std_raw_error_limit:
250                std_raw_error = calibration_ref_match_std_raw_error_limit
251            self.mass_spectrum.calibration_raw_error_median = median_raw_error
252            self.mass_spectrum.calibration_raw_error_stdev = std_raw_error
253            merged_df = merged_df[
254                (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error))
255                & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error))
256            ]
257            # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))]
258            cal_peaks_mz = list(merged_df["meas_m/z"])
259            cal_refs_mz = list(merged_df["m/z"])
260        else:
261            raise ValueError(f"{calibration_ref_match_method} not allowed.")
262
263        if False:
264            min_calib_ppm_error = calib_ppm_error_threshold[0]
265            max_calib_ppm_error = calib_ppm_error_threshold[1]
266            df_raw = self.mass_spectrum.to_dataframe()
267
268            df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold]
269            # optionally further subset that based on minimum S/N, RP, Peak Height
270            # to ensure only valid points are utilized
271            # in this example, only a S/N threshold is implemented.
272            imzmeas = []
273            mzrefs = []
274
275            for mzref in df_ref["m/z"]:
276                # find all peaks within a defined ppm error threshold
277                tmpdf = df_raw[
278                    ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error
279                ]
280                # Error is relative to the theoretical, so the divisor should be divisor
281
282                tmpdf = tmpdf[
283                    ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error
284                ]
285
286                # only use the calibration point if only one peak is within the thresholds
287                # This may require some optimization of the threshold tolerances
288                if len(tmpdf) == 1:
289                    imzmeas.append(int(tmpdf.index.values))
290                    mzrefs.append(mzref)
291
292        # it is crucial the mass lists are in same order
293        # corems likes to do masses from high to low.
294        cal_refs_mz.sort(reverse=False)
295        cal_peaks_mz.sort(reverse=False)
296        if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
297            print(
298                str(len(cal_peaks_mz))
299                + " calibration points matched within thresholds."
300            )
301        return cal_peaks_mz, cal_refs_mz

Function to find calibration points in the mass spectrum

Based on the reference mass list.

Parameters
  • df_ref (Pandas DataFrame): reference mass list for recalibration.
  • calib_ppm_error_threshold (tuple of floats, optional): ppm error for finding calibration masses in the spectrum. The default is -1,1. Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6. Some software does this the other way around and value signs must be inverted for that to work.
  • calib_snr_threshold (float, optional): snr threshold for finding calibration masses in the spectrum. The default is 5.
Returns
  • cal_peaks_mz (list of floats): masses of measured ions to use in calibration routine
  • cal_refs_mz (list of floats): reference mz values of found calibration points.
def robust_calib( self, param: list[float], cal_peaks_mz: list[float], cal_refs_mz: list[float], order: int = 1):
303    def robust_calib(
304        self,
305        param: list[float],
306        cal_peaks_mz: list[float],
307        cal_refs_mz: list[float],
308        order: int = 1,
309    ):
310        """Recalibration function
311
312        Computes the rms of m/z errors to minimize when calibrating.
313        This is adapted from from spike.
314
315        Parameters
316        ----------
317        param : list of floats
318            generated by minimize function from scipy optimize.
319        cal_peaks_mz : list of floats
320            masses of measured peaks to use in mass calibration.
321        cal_peaks_mz : list of floats
322            reference mz values of found calibration points.
323        order : int, optional
324            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
325
326        Returns
327        -------
328        rmserror : float
329            root mean square mass error for calibration points.
330
331        """
332        Aterm = param[0]
333        Bterm = param[1]
334        try:
335            Cterm = param[2]
336        except IndexError:
337            pass
338
339        # get the mspeaks from the mass spectrum object which were calibration points
340        # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas]
341        # get their calibrated mass values
342        # mspeakmzs = [x.mz_cal for x in mspeaks]
343        cal_peaks_mz = np.asarray(cal_peaks_mz)
344
345        # linearz
346        if order == 1:
347            ref_recal_points = (Aterm * cal_peaks_mz) + Bterm
348        # quadratic
349        elif order == 2:
350            ref_recal_points = (Aterm * (cal_peaks_mz)) + (
351                Bterm * np.power((cal_peaks_mz), 2) + Cterm
352            )
353
354        # sort both the calibration points (measured, recalibrated)
355        ref_recal_points.sort()
356        # and sort the calibration points (theoretical, predefined)
357        cal_refs_mz.sort()
358
359        # calculate the ppm error for each calibration point
360        error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6
361        # calculate the root mean square error - this is our target to minimize
362        rmserror = np.sqrt(np.mean(error**2))
363        return rmserror

Recalibration function

Computes the rms of m/z errors to minimize when calibrating. This is adapted from from spike.

Parameters
  • param (list of floats): generated by minimize function from scipy optimize.
  • cal_peaks_mz (list of floats): masses of measured peaks to use in mass calibration.
  • cal_peaks_mz (list of floats): reference mz values of found calibration points.
  • order (int, optional): order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
Returns
  • rmserror (float): root mean square mass error for calibration points.
def recalibrate_mass_spectrum( self, cal_peaks_mz: list[float], cal_refs_mz: list[float], order: int = 1, diagnostic: bool = False):
365    def recalibrate_mass_spectrum(
366        self,
367        cal_peaks_mz: list[float],
368        cal_refs_mz: list[float],
369        order: int = 1,
370        diagnostic: bool = False,
371    ):
372        """Main recalibration function which uses a robust linear regression
373
374        This function performs the recalibration of the mass spectrum object.
375        It iteratively applies
376
377        Parameters
378        ----------
379        cal_peaks_mz : list of float
380            masses of measured peaks to use in mass calibration.
381        cal_refs_mz : list of float
382            reference mz values of found calibration points.
383        order : int, optional
384            order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
385
386        Returns
387        -------
388        mass_spectrum : CoreMS mass spectrum object
389            Calibrated mass spectrum object
390
391
392        Notes
393        -----
394        This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method.
395
396        References
397        ----------
398        1.  Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A.
399            SPIKE a Processing Software dedicated to Fourier Spectroscopies
400            https://arxiv.org/abs/1608.06777 (2016)
401        2.  SPIKE - https://github.com/spike-project/spike
402
403        """
404        # initialise parameters for recalibration
405        # these are the 'Aterm, Bterm, Cterm'
406        # as spectra are already freq->mz calibrated, these terms are very small
407        # may be beneficial to formally separate them from the freq->mz terms
408        if order == 1:
409            Po = [1, 0]
410        elif order == 2:
411            Po = [1, 0, 0]
412
413        if len(cal_peaks_mz) >= 2:
414            if self.mzsegment:  # If only part of the spectrum is to be recalibrated
415                mz_exp_peaks = np.array(
416                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
417                )
418                # Split the array into two parts - one to recailbrate, one to keep unchanged.
419                mz_exp_peaks_tocal = mz_exp_peaks[
420                    (mz_exp_peaks >= min(self.mzsegment))
421                    & (mz_exp_peaks <= max(self.mzsegment))
422                ]
423                mz_exp_peaks_unchanged = mz_exp_peaks[
424                    ~(mz_exp_peaks >= min(self.mzsegment))
425                    | ~(mz_exp_peaks <= max(self.mzsegment))
426                ]
427                # TODO: - segmented calibration needs a way to better track the calibration args/values...
428                if not self.mass_spectrum.is_centroid:
429                    mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile)
430                    # Split the array into two parts - one to recailbrate, one to keep unchanged.
431                    mz_exp_profile_tocal = mz_exp_profile[
432                        (mz_exp_profile >= min(self.mzsegment))
433                        & (mz_exp_profile <= max(self.mzsegment))
434                    ]
435                    mz_exp_profile_unchanged = mz_exp_profile[
436                        ~(mz_exp_profile >= min(self.mzsegment))
437                        | ~(mz_exp_profile <= max(self.mzsegment))
438                    ]
439            else:  # if just recalibrating the whole spectrum
440                mz_exp_peaks_tocal = np.array(
441                    [mspeak.mz_exp for mspeak in self.mass_spectrum]
442                )
443                if not self.mass_spectrum.is_centroid:
444                    mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile)
445
446            minimize_method = self.mass_spectrum.settings.calib_minimize_method
447            res = minimize(
448                self.robust_calib,
449                Po,
450                args=(cal_peaks_mz, cal_refs_mz, order),
451                method=minimize_method,
452            )
453            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
454                print(
455                    "minimize function completed with RMS error of: {:0.3f} ppm".format(
456                        res["fun"]
457                    )
458                )
459                print(
460                    "minimize function performed {:1d} fn evals and {:1d} iterations".format(
461                        res["nfev"], res["nit"]
462                    )
463                )
464            Pn = res.x
465
466            # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum])
467
468            if order == 1:
469                mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1]
470                if not self.mass_spectrum.is_centroid:
471                    mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1]
472
473            elif order == 2:
474                mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + (
475                    Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2]
476                )
477
478                if not self.mass_spectrum.is_centroid:
479                    mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + (
480                        Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2]
481                    )
482
483            if self.mzsegment:
484                # Recombine the mass domains
485                mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged])
486                mz_domain.sort()
487                if not self.mass_spectrum.is_centroid:
488                    mz_profile_calc = np.concatenate(
489                        [mz_profile_calc, mz_exp_profile_unchanged]
490                    )
491                    mz_profile_calc.sort()
492                # Sort them
493                if (
494                    mz_exp_peaks[0] > mz_exp_peaks[1]
495                ):  # If originally descending mass order
496                    mz_domain = mz_domain[::-1]
497                    if not self.mass_spectrum.is_centroid:
498                        mz_profile_calc = mz_profile_calc[::-1]
499
500            self.mass_spectrum.mz_cal = mz_domain
501            if not self.mass_spectrum.is_centroid:
502                self.mass_spectrum.mz_cal_profile = mz_profile_calc
503
504            self.mass_spectrum.calibration_order = order
505            self.mass_spectrum.calibration_RMS = float(res["fun"])
506            self.mass_spectrum.calibration_points = int(len(cal_refs_mz))
507            self.mass_spectrum.calibration_ref_mzs = cal_refs_mz
508            self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz
509
510            self.mass_spectrum.calibration_segment = self.mzsegment
511
512            if diagnostic:
513                return self.mass_spectrum, res
514            return self.mass_spectrum
515        else:
516            warnings.warn("Too few calibration points - aborting.")
517            return self.mass_spectrum

Main recalibration function which uses a robust linear regression

This function performs the recalibration of the mass spectrum object. It iteratively applies

Parameters
  • cal_peaks_mz (list of float): masses of measured peaks to use in mass calibration.
  • cal_refs_mz (list of float): reference mz values of found calibration points.
  • order (int, optional): order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
Returns
  • mass_spectrum (CoreMS mass spectrum object): Calibrated mass spectrum object
Notes

This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method.

References
  1. Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A. SPIKE a Processing Software dedicated to Fourier Spectroscopies https://arxiv.org/abs/1608.06777 (2016)
  2. SPIKE - https://github.com/spike-project/spike
def run(self):
519    def run(self):
520        """Run the calibration routine
521
522        This function runs the calibration routine.
523
524        """
525        calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold
526        max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
527        min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
528        calib_pol_order = self.mass_spectrum.settings.calib_pol_order
529        calibration_ref_match_method = (
530            self.mass_spectrum.settings.calibration_ref_match_method
531        )
532        calibration_ref_match_tolerance = (
533            self.mass_spectrum.settings.calibration_ref_match_tolerance
534        )
535        calibration_ref_match_std_raw_error_limit = (
536            self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit
537        )
538
539        # load reference mass list
540        df_ref = self.load_ref_mass_list()
541
542        # find calibration points
543        cal_peaks_mz, cal_refs_mz = self.find_calibration_points(
544            df_ref,
545            calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error),
546            calib_snr_threshold=calib_ppm_error_threshold,
547            calibration_ref_match_method=calibration_ref_match_method,
548            calibration_ref_match_tolerance=calibration_ref_match_tolerance,
549            calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit,
550        )
551        if len(cal_peaks_mz) == 2:
552            self.mass_spectrum.settings.calib_pol_order = 1
553            calib_pol_order = 1
554            if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
555                print("Only 2 calibration points found, forcing a linear recalibration")
556        elif len(cal_peaks_mz) < 2:
557            warnings.warn("Too few calibration points found, function will fail")
558        self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)

Run the calibration routine

This function runs the calibration routine.