corems.mass_spectrum.calc.Calibration
Created on Wed May 13 02:16:09 2020
@author: Will Kew
1# -*- coding: utf-8 -*- 2""" 3Created on Wed May 13 02:16:09 2020 4 5@author: Will Kew 6""" 7 8# import modules 9import csv 10import warnings 11from io import BytesIO 12from pathlib import Path 13 14import numpy as np 15import pandas as pd 16from s3path import S3Path 17 18# import scipy modules for calibration 19from scipy.optimize import minimize 20 21 22class MzDomainCalibration: 23 """MzDomainCalibration class for recalibrating mass spectra 24 25 Parameters 26 ---------- 27 mass_spectrum : CoreMS MassSpectrum Object 28 The mass spectrum to be calibrated. 29 ref_masslist : str 30 The path to a reference mass list. 31 mzsegment : tuple of floats, optional 32 The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time. 33 Future work - allow multiple mzsegments to be passed. 34 35 Attributes 36 ---------- 37 mass_spectrum : CoreMS MassSpectrum Object 38 The mass spectrum to be calibrated. 39 mzsegment : tuple of floats or None 40 The mz range to recalibrate, or None. 41 ref_mass_list_path : str or Path 42 The path to the reference mass list. 43 44 Methods 45 ------- 46 * run(). 47 Main function to run this class. 48 * load_ref_mass_list(). 49 Load reference mass list (Bruker format). 50 * gen_ref_mass_list_from_assigned(min_conf=0.7). 51 Generate reference mass list from assigned masses. 52 * find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5). 53 Find calibration points in the mass spectrum based on the reference mass list. 54 * robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1). 55 Recalibration function. 56 * recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False). 57 Main recalibration function which uses a robust linear regression. 58 59 60 """ 61 62 def __init__(self, mass_spectrum, ref_masslist, mzsegment=None): 63 self.mass_spectrum = mass_spectrum 64 self.mzsegment = mzsegment 65 66 # define reference mass list - bruker .ref format 67 self.ref_mass_list_path = ref_masslist 68 if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0: 69 warnings.warn( 70 "Warning: calibrating spectra which have already been assigned may yield erroneous results" 71 ) 72 self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp 73 self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp 74 75 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 76 print( 77 "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found." 78 ) 79 80 def load_ref_mass_list(self): 81 """Load reference mass list (Bruker format) 82 83 Loads in a reference mass list from a .ref file 84 Note that some versions of Bruker's software produce .ref files with a different format. 85 As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format. 86 CoreMS includes an example .ref file with the correct format for reference. 87 88 Returns 89 ------- 90 df_ref : Pandas DataFrame 91 reference mass list object. 92 93 """ 94 refmasslist = ( 95 Path(self.ref_mass_list_path) 96 if isinstance(self.ref_mass_list_path, str) 97 else self.ref_mass_list_path 98 ) 99 100 if not refmasslist.exists(): 101 raise FileExistsError("File does not exist: %s" % refmasslist) 102 103 with refmasslist.open("r") as csvfile: 104 dialect = csv.Sniffer().sniff(csvfile.read(1024)) 105 delimiter = dialect.delimiter 106 107 if isinstance(refmasslist, S3Path): 108 # data = self.file_location.open('rb').read() 109 data = BytesIO(refmasslist.open("rb").read()) 110 111 else: 112 data = refmasslist 113 114 df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1) 115 116 df_ref = df_ref.rename( 117 {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1 118 ) 119 120 df_ref.sort_values(by="m/z", ascending=True, inplace=True) 121 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 122 print( 123 "Reference mass list loaded - " 124 + str(len(df_ref)) 125 + " calibration masses loaded." 126 ) 127 128 return df_ref 129 130 def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7): 131 """Generate reference mass list from assigned masses 132 133 This function will generate a ref mass dataframe object from an assigned corems mass spec obj 134 using assigned masses above a certain minimum confidence threshold. 135 136 This function needs to be retested and check it is covered in the unit tests. 137 138 Parameters 139 ---------- 140 min_conf : float, optional 141 minimum confidence score. The default is 0.7. 142 143 Returns 144 ------- 145 df_ref : Pandas DataFrame 146 reference mass list - based on calculated masses. 147 148 """ 149 # TODO this function needs to be retested and check it is covered in the unit tests 150 df = self.mass_spectrum.to_dataframe() 151 df = df[df["Confidence Score"] > min_conf] 152 df_ref = pd.DataFrame(columns=["m/z"]) 153 df_ref["m/z"] = df["Calculated m/z"] 154 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 155 print( 156 "Reference mass list generated - " 157 + str(len(df_ref)) 158 + " calibration masses." 159 ) 160 return df_ref 161 162 def find_calibration_points( 163 self, 164 df_ref, 165 calib_ppm_error_threshold: tuple[float, float] = (-1, 1), 166 calib_snr_threshold: float = 5, 167 calibration_ref_match_method: str = "legacy", 168 calibration_ref_match_tolerance: float = 0.003, 169 calibration_ref_match_std_raw_error_limit: float = 1.5, 170 ): 171 """Function to find calibration points in the mass spectrum 172 173 Based on the reference mass list. 174 175 Parameters 176 ---------- 177 df_ref : Pandas DataFrame 178 reference mass list for recalibration. 179 calib_ppm_error_threshold : tuple of floats, optional 180 ppm error for finding calibration masses in the spectrum. The default is -1,1. 181 Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6. 182 Some software does this the other way around and value signs must be inverted for that to work. 183 calib_snr_threshold : float, optional 184 snr threshold for finding calibration masses in the spectrum. The default is 5. 185 186 Returns 187 ------- 188 cal_peaks_mz : list of floats 189 masses of measured ions to use in calibration routine 190 cal_refs_mz : list of floats 191 reference mz values of found calibration points. 192 193 """ 194 195 # This approach is much more efficient and expedient than the original implementation. 196 peaks_mz = [] 197 for x in self.mass_spectrum.mspeaks: 198 if x.signal_to_noise > calib_snr_threshold: 199 if self.mzsegment: 200 if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment): 201 peaks_mz.append(x.mz_exp) 202 else: 203 peaks_mz.append(x.mz_exp) 204 peaks_mz = np.asarray(peaks_mz) 205 206 if calibration_ref_match_method == "legacy": 207 # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold 208 # Then it removes ambiguities - which means the calibration threshold hasto be very tight. 209 cal_peaks_mz = [] 210 cal_refs_mz = [] 211 for mzref in df_ref["m/z"]: 212 tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1] 213 for mzmeas in tmp_peaks_mz: 214 delta_mass = ((mzmeas - mzref) / mzref) * 1e6 215 if delta_mass < max(calib_ppm_error_threshold): 216 if delta_mass > min(calib_ppm_error_threshold): 217 cal_peaks_mz.append(mzmeas) 218 cal_refs_mz.append(mzref) 219 220 # To remove entries with duplicated indices (reference masses matching multiple peaks) 221 tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float) 222 tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)] 223 224 cal_peaks_mz = list(tmpdf.values) 225 cal_refs_mz = list(tmpdf.index) 226 elif calibration_ref_match_method == "merged": 227 #warnings.warn("Using experimental new reference mass list merging") 228 # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between 229 # reference and measured masses. This is a quicker way to match, and seems to get more matches. 230 # It may not work as well when the data are far from correc initial mass 231 # e.g. if the correct peak is further from the reference than an incorrect peak. 232 meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz) 233 tolerance = calibration_ref_match_tolerance 234 merged_df = pd.merge_asof( 235 df_ref, 236 meas_df, 237 left_on="m/z", 238 right_on="meas_m/z", 239 tolerance=tolerance, 240 direction="nearest", 241 ) 242 merged_df.dropna(how="any", inplace=True) 243 merged_df["Error_ppm"] = ( 244 (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"] 245 ) * 1e6 246 median_raw_error = merged_df["Error_ppm"].median() 247 std_raw_error = merged_df["Error_ppm"].std() 248 if std_raw_error > calibration_ref_match_std_raw_error_limit: 249 std_raw_error = calibration_ref_match_std_raw_error_limit 250 self.mass_spectrum.calibration_raw_error_median = median_raw_error 251 self.mass_spectrum.calibration_raw_error_stdev = std_raw_error 252 merged_df = merged_df[ 253 (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error)) 254 & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error)) 255 ] 256 # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))] 257 cal_peaks_mz = list(merged_df["meas_m/z"]) 258 cal_refs_mz = list(merged_df["m/z"]) 259 else: 260 raise ValueError(f"{calibration_ref_match_method} not allowed.") 261 262 if False: 263 min_calib_ppm_error = calib_ppm_error_threshold[0] 264 max_calib_ppm_error = calib_ppm_error_threshold[1] 265 df_raw = self.mass_spectrum.to_dataframe() 266 267 df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold] 268 # optionally further subset that based on minimum S/N, RP, Peak Height 269 # to ensure only valid points are utilized 270 # in this example, only a S/N threshold is implemented. 271 imzmeas = [] 272 mzrefs = [] 273 274 for mzref in df_ref["m/z"]: 275 # find all peaks within a defined ppm error threshold 276 tmpdf = df_raw[ 277 ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error 278 ] 279 # Error is relative to the theoretical, so the divisor should be divisor 280 281 tmpdf = tmpdf[ 282 ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error 283 ] 284 285 # only use the calibration point if only one peak is within the thresholds 286 # This may require some optimization of the threshold tolerances 287 if len(tmpdf) == 1: 288 imzmeas.append(int(tmpdf.index.values)) 289 mzrefs.append(mzref) 290 291 # it is crucial the mass lists are in same order 292 # corems likes to do masses from high to low. 293 cal_refs_mz.sort(reverse=False) 294 cal_peaks_mz.sort(reverse=False) 295 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 296 print( 297 str(len(cal_peaks_mz)) 298 + " calibration points matched within thresholds." 299 ) 300 return cal_peaks_mz, cal_refs_mz 301 302 def robust_calib( 303 self, 304 param: list[float], 305 cal_peaks_mz: list[float], 306 cal_refs_mz: list[float], 307 order: int = 1, 308 ): 309 """Recalibration function 310 311 Computes the rms of m/z errors to minimize when calibrating. 312 This is adapted from from spike. 313 314 Parameters 315 ---------- 316 param : list of floats 317 generated by minimize function from scipy optimize. 318 cal_peaks_mz : list of floats 319 masses of measured peaks to use in mass calibration. 320 cal_peaks_mz : list of floats 321 reference mz values of found calibration points. 322 order : int, optional 323 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 324 325 Returns 326 ------- 327 rmserror : float 328 root mean square mass error for calibration points. 329 330 """ 331 Aterm = param[0] 332 Bterm = param[1] 333 try: 334 Cterm = param[2] 335 except IndexError: 336 pass 337 338 # get the mspeaks from the mass spectrum object which were calibration points 339 # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas] 340 # get their calibrated mass values 341 # mspeakmzs = [x.mz_cal for x in mspeaks] 342 cal_peaks_mz = np.asarray(cal_peaks_mz) 343 344 # linearz 345 if order == 1: 346 ref_recal_points = (Aterm * cal_peaks_mz) + Bterm 347 # quadratic 348 elif order == 2: 349 ref_recal_points = (Aterm * (cal_peaks_mz)) + ( 350 Bterm * np.power((cal_peaks_mz), 2) + Cterm 351 ) 352 353 # sort both the calibration points (measured, recalibrated) 354 ref_recal_points.sort() 355 # and sort the calibration points (theoretical, predefined) 356 cal_refs_mz.sort() 357 358 # calculate the ppm error for each calibration point 359 error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6 360 # calculate the root mean square error - this is our target to minimize 361 rmserror = np.sqrt(np.mean(error**2)) 362 return rmserror 363 364 def recalibrate_mass_spectrum( 365 self, 366 cal_peaks_mz: list[float], 367 cal_refs_mz: list[float], 368 order: int = 1, 369 diagnostic: bool = False, 370 ): 371 """Main recalibration function which uses a robust linear regression 372 373 This function performs the recalibration of the mass spectrum object. 374 It iteratively applies 375 376 Parameters 377 ---------- 378 cal_peaks_mz : list of float 379 masses of measured peaks to use in mass calibration. 380 cal_refs_mz : list of float 381 reference mz values of found calibration points. 382 order : int, optional 383 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 384 385 Returns 386 ------- 387 mass_spectrum : CoreMS mass spectrum object 388 Calibrated mass spectrum object 389 390 391 Notes 392 ----- 393 This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method. 394 395 References 396 ---------- 397 1. Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A. 398 SPIKE a Processing Software dedicated to Fourier Spectroscopies 399 https://arxiv.org/abs/1608.06777 (2016) 400 2. SPIKE - https://github.com/spike-project/spike 401 402 """ 403 # initialise parameters for recalibration 404 # these are the 'Aterm, Bterm, Cterm' 405 # as spectra are already freq->mz calibrated, these terms are very small 406 # may be beneficial to formally separate them from the freq->mz terms 407 if order == 1: 408 Po = [1, 0] 409 elif order == 2: 410 Po = [1, 0, 0] 411 412 if len(cal_peaks_mz) >= 2: 413 if self.mzsegment: # If only part of the spectrum is to be recalibrated 414 mz_exp_peaks = np.array( 415 [mspeak.mz_exp for mspeak in self.mass_spectrum] 416 ) 417 # Split the array into two parts - one to recailbrate, one to keep unchanged. 418 mz_exp_peaks_tocal = mz_exp_peaks[ 419 (mz_exp_peaks >= min(self.mzsegment)) 420 & (mz_exp_peaks <= max(self.mzsegment)) 421 ] 422 mz_exp_peaks_unchanged = mz_exp_peaks[ 423 ~(mz_exp_peaks >= min(self.mzsegment)) 424 | ~(mz_exp_peaks <= max(self.mzsegment)) 425 ] 426 # TODO: - segmented calibration needs a way to better track the calibration args/values... 427 if not self.mass_spectrum.is_centroid: 428 mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile) 429 # Split the array into two parts - one to recailbrate, one to keep unchanged. 430 mz_exp_profile_tocal = mz_exp_profile[ 431 (mz_exp_profile >= min(self.mzsegment)) 432 & (mz_exp_profile <= max(self.mzsegment)) 433 ] 434 mz_exp_profile_unchanged = mz_exp_profile[ 435 ~(mz_exp_profile >= min(self.mzsegment)) 436 | ~(mz_exp_profile <= max(self.mzsegment)) 437 ] 438 else: # if just recalibrating the whole spectrum 439 mz_exp_peaks_tocal = np.array( 440 [mspeak.mz_exp for mspeak in self.mass_spectrum] 441 ) 442 if not self.mass_spectrum.is_centroid: 443 mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile) 444 445 minimize_method = self.mass_spectrum.settings.calib_minimize_method 446 res = minimize( 447 self.robust_calib, 448 Po, 449 args=(cal_peaks_mz, cal_refs_mz, order), 450 method=minimize_method, 451 ) 452 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 453 print( 454 "minimize function completed with RMS error of: {:0.3f} ppm".format( 455 res["fun"] 456 ) 457 ) 458 print( 459 "minimize function performed {:1d} fn evals and {:1d} iterations".format( 460 res["nfev"], res["nit"] 461 ) 462 ) 463 Pn = res.x 464 465 # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum]) 466 467 if order == 1: 468 mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1] 469 if not self.mass_spectrum.is_centroid: 470 mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1] 471 472 elif order == 2: 473 mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + ( 474 Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2] 475 ) 476 477 if not self.mass_spectrum.is_centroid: 478 mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + ( 479 Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2] 480 ) 481 482 if self.mzsegment: 483 # Recombine the mass domains 484 mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged]) 485 mz_domain.sort() 486 if not self.mass_spectrum.is_centroid: 487 mz_profile_calc = np.concatenate( 488 [mz_profile_calc, mz_exp_profile_unchanged] 489 ) 490 mz_profile_calc.sort() 491 # Sort them 492 if ( 493 mz_exp_peaks[0] > mz_exp_peaks[1] 494 ): # If originally descending mass order 495 mz_domain = mz_domain[::-1] 496 if not self.mass_spectrum.is_centroid: 497 mz_profile_calc = mz_profile_calc[::-1] 498 499 self.mass_spectrum.mz_cal = mz_domain 500 if not self.mass_spectrum.is_centroid: 501 self.mass_spectrum.mz_cal_profile = mz_profile_calc 502 503 self.mass_spectrum.calibration_order = order 504 self.mass_spectrum.calibration_RMS = float(res["fun"]) 505 self.mass_spectrum.calibration_points = int(len(cal_refs_mz)) 506 self.mass_spectrum.calibration_ref_mzs = cal_refs_mz 507 self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz 508 509 self.mass_spectrum.calibration_segment = self.mzsegment 510 511 if diagnostic: 512 return self.mass_spectrum, res 513 return self.mass_spectrum 514 else: 515 warnings.warn("Too few calibration points - aborting.") 516 return self.mass_spectrum 517 518 def run(self): 519 """Run the calibration routine 520 521 This function runs the calibration routine. 522 523 """ 524 calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold 525 max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error 526 min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error 527 calib_pol_order = self.mass_spectrum.settings.calib_pol_order 528 calibration_ref_match_method = ( 529 self.mass_spectrum.settings.calibration_ref_match_method 530 ) 531 calibration_ref_match_tolerance = ( 532 self.mass_spectrum.settings.calibration_ref_match_tolerance 533 ) 534 calibration_ref_match_std_raw_error_limit = ( 535 self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit 536 ) 537 538 # load reference mass list 539 df_ref = self.load_ref_mass_list() 540 541 # find calibration points 542 cal_peaks_mz, cal_refs_mz = self.find_calibration_points( 543 df_ref, 544 calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error), 545 calib_snr_threshold=calib_ppm_error_threshold, 546 calibration_ref_match_method=calibration_ref_match_method, 547 calibration_ref_match_tolerance=calibration_ref_match_tolerance, 548 calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit, 549 ) 550 if len(cal_peaks_mz) == 2: 551 self.mass_spectrum.settings.calib_pol_order = 1 552 calib_pol_order = 1 553 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 554 print("Only 2 calibration points found, forcing a linear recalibration") 555 elif len(cal_peaks_mz) < 2: 556 warnings.warn("Too few calibration points found, function will fail") 557 self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)
23class MzDomainCalibration: 24 """MzDomainCalibration class for recalibrating mass spectra 25 26 Parameters 27 ---------- 28 mass_spectrum : CoreMS MassSpectrum Object 29 The mass spectrum to be calibrated. 30 ref_masslist : str 31 The path to a reference mass list. 32 mzsegment : tuple of floats, optional 33 The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time. 34 Future work - allow multiple mzsegments to be passed. 35 36 Attributes 37 ---------- 38 mass_spectrum : CoreMS MassSpectrum Object 39 The mass spectrum to be calibrated. 40 mzsegment : tuple of floats or None 41 The mz range to recalibrate, or None. 42 ref_mass_list_path : str or Path 43 The path to the reference mass list. 44 45 Methods 46 ------- 47 * run(). 48 Main function to run this class. 49 * load_ref_mass_list(). 50 Load reference mass list (Bruker format). 51 * gen_ref_mass_list_from_assigned(min_conf=0.7). 52 Generate reference mass list from assigned masses. 53 * find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5). 54 Find calibration points in the mass spectrum based on the reference mass list. 55 * robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1). 56 Recalibration function. 57 * recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False). 58 Main recalibration function which uses a robust linear regression. 59 60 61 """ 62 63 def __init__(self, mass_spectrum, ref_masslist, mzsegment=None): 64 self.mass_spectrum = mass_spectrum 65 self.mzsegment = mzsegment 66 67 # define reference mass list - bruker .ref format 68 self.ref_mass_list_path = ref_masslist 69 if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0: 70 warnings.warn( 71 "Warning: calibrating spectra which have already been assigned may yield erroneous results" 72 ) 73 self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp 74 self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp 75 76 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 77 print( 78 "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found." 79 ) 80 81 def load_ref_mass_list(self): 82 """Load reference mass list (Bruker format) 83 84 Loads in a reference mass list from a .ref file 85 Note that some versions of Bruker's software produce .ref files with a different format. 86 As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format. 87 CoreMS includes an example .ref file with the correct format for reference. 88 89 Returns 90 ------- 91 df_ref : Pandas DataFrame 92 reference mass list object. 93 94 """ 95 refmasslist = ( 96 Path(self.ref_mass_list_path) 97 if isinstance(self.ref_mass_list_path, str) 98 else self.ref_mass_list_path 99 ) 100 101 if not refmasslist.exists(): 102 raise FileExistsError("File does not exist: %s" % refmasslist) 103 104 with refmasslist.open("r") as csvfile: 105 dialect = csv.Sniffer().sniff(csvfile.read(1024)) 106 delimiter = dialect.delimiter 107 108 if isinstance(refmasslist, S3Path): 109 # data = self.file_location.open('rb').read() 110 data = BytesIO(refmasslist.open("rb").read()) 111 112 else: 113 data = refmasslist 114 115 df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1) 116 117 df_ref = df_ref.rename( 118 {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1 119 ) 120 121 df_ref.sort_values(by="m/z", ascending=True, inplace=True) 122 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 123 print( 124 "Reference mass list loaded - " 125 + str(len(df_ref)) 126 + " calibration masses loaded." 127 ) 128 129 return df_ref 130 131 def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7): 132 """Generate reference mass list from assigned masses 133 134 This function will generate a ref mass dataframe object from an assigned corems mass spec obj 135 using assigned masses above a certain minimum confidence threshold. 136 137 This function needs to be retested and check it is covered in the unit tests. 138 139 Parameters 140 ---------- 141 min_conf : float, optional 142 minimum confidence score. The default is 0.7. 143 144 Returns 145 ------- 146 df_ref : Pandas DataFrame 147 reference mass list - based on calculated masses. 148 149 """ 150 # TODO this function needs to be retested and check it is covered in the unit tests 151 df = self.mass_spectrum.to_dataframe() 152 df = df[df["Confidence Score"] > min_conf] 153 df_ref = pd.DataFrame(columns=["m/z"]) 154 df_ref["m/z"] = df["Calculated m/z"] 155 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 156 print( 157 "Reference mass list generated - " 158 + str(len(df_ref)) 159 + " calibration masses." 160 ) 161 return df_ref 162 163 def find_calibration_points( 164 self, 165 df_ref, 166 calib_ppm_error_threshold: tuple[float, float] = (-1, 1), 167 calib_snr_threshold: float = 5, 168 calibration_ref_match_method: str = "legacy", 169 calibration_ref_match_tolerance: float = 0.003, 170 calibration_ref_match_std_raw_error_limit: float = 1.5, 171 ): 172 """Function to find calibration points in the mass spectrum 173 174 Based on the reference mass list. 175 176 Parameters 177 ---------- 178 df_ref : Pandas DataFrame 179 reference mass list for recalibration. 180 calib_ppm_error_threshold : tuple of floats, optional 181 ppm error for finding calibration masses in the spectrum. The default is -1,1. 182 Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6. 183 Some software does this the other way around and value signs must be inverted for that to work. 184 calib_snr_threshold : float, optional 185 snr threshold for finding calibration masses in the spectrum. The default is 5. 186 187 Returns 188 ------- 189 cal_peaks_mz : list of floats 190 masses of measured ions to use in calibration routine 191 cal_refs_mz : list of floats 192 reference mz values of found calibration points. 193 194 """ 195 196 # This approach is much more efficient and expedient than the original implementation. 197 peaks_mz = [] 198 for x in self.mass_spectrum.mspeaks: 199 if x.signal_to_noise > calib_snr_threshold: 200 if self.mzsegment: 201 if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment): 202 peaks_mz.append(x.mz_exp) 203 else: 204 peaks_mz.append(x.mz_exp) 205 peaks_mz = np.asarray(peaks_mz) 206 207 if calibration_ref_match_method == "legacy": 208 # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold 209 # Then it removes ambiguities - which means the calibration threshold hasto be very tight. 210 cal_peaks_mz = [] 211 cal_refs_mz = [] 212 for mzref in df_ref["m/z"]: 213 tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1] 214 for mzmeas in tmp_peaks_mz: 215 delta_mass = ((mzmeas - mzref) / mzref) * 1e6 216 if delta_mass < max(calib_ppm_error_threshold): 217 if delta_mass > min(calib_ppm_error_threshold): 218 cal_peaks_mz.append(mzmeas) 219 cal_refs_mz.append(mzref) 220 221 # To remove entries with duplicated indices (reference masses matching multiple peaks) 222 tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float) 223 tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)] 224 225 cal_peaks_mz = list(tmpdf.values) 226 cal_refs_mz = list(tmpdf.index) 227 elif calibration_ref_match_method == "merged": 228 #warnings.warn("Using experimental new reference mass list merging") 229 # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between 230 # reference and measured masses. This is a quicker way to match, and seems to get more matches. 231 # It may not work as well when the data are far from correc initial mass 232 # e.g. if the correct peak is further from the reference than an incorrect peak. 233 meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz) 234 tolerance = calibration_ref_match_tolerance 235 merged_df = pd.merge_asof( 236 df_ref, 237 meas_df, 238 left_on="m/z", 239 right_on="meas_m/z", 240 tolerance=tolerance, 241 direction="nearest", 242 ) 243 merged_df.dropna(how="any", inplace=True) 244 merged_df["Error_ppm"] = ( 245 (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"] 246 ) * 1e6 247 median_raw_error = merged_df["Error_ppm"].median() 248 std_raw_error = merged_df["Error_ppm"].std() 249 if std_raw_error > calibration_ref_match_std_raw_error_limit: 250 std_raw_error = calibration_ref_match_std_raw_error_limit 251 self.mass_spectrum.calibration_raw_error_median = median_raw_error 252 self.mass_spectrum.calibration_raw_error_stdev = std_raw_error 253 merged_df = merged_df[ 254 (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error)) 255 & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error)) 256 ] 257 # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))] 258 cal_peaks_mz = list(merged_df["meas_m/z"]) 259 cal_refs_mz = list(merged_df["m/z"]) 260 else: 261 raise ValueError(f"{calibration_ref_match_method} not allowed.") 262 263 if False: 264 min_calib_ppm_error = calib_ppm_error_threshold[0] 265 max_calib_ppm_error = calib_ppm_error_threshold[1] 266 df_raw = self.mass_spectrum.to_dataframe() 267 268 df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold] 269 # optionally further subset that based on minimum S/N, RP, Peak Height 270 # to ensure only valid points are utilized 271 # in this example, only a S/N threshold is implemented. 272 imzmeas = [] 273 mzrefs = [] 274 275 for mzref in df_ref["m/z"]: 276 # find all peaks within a defined ppm error threshold 277 tmpdf = df_raw[ 278 ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error 279 ] 280 # Error is relative to the theoretical, so the divisor should be divisor 281 282 tmpdf = tmpdf[ 283 ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error 284 ] 285 286 # only use the calibration point if only one peak is within the thresholds 287 # This may require some optimization of the threshold tolerances 288 if len(tmpdf) == 1: 289 imzmeas.append(int(tmpdf.index.values)) 290 mzrefs.append(mzref) 291 292 # it is crucial the mass lists are in same order 293 # corems likes to do masses from high to low. 294 cal_refs_mz.sort(reverse=False) 295 cal_peaks_mz.sort(reverse=False) 296 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 297 print( 298 str(len(cal_peaks_mz)) 299 + " calibration points matched within thresholds." 300 ) 301 return cal_peaks_mz, cal_refs_mz 302 303 def robust_calib( 304 self, 305 param: list[float], 306 cal_peaks_mz: list[float], 307 cal_refs_mz: list[float], 308 order: int = 1, 309 ): 310 """Recalibration function 311 312 Computes the rms of m/z errors to minimize when calibrating. 313 This is adapted from from spike. 314 315 Parameters 316 ---------- 317 param : list of floats 318 generated by minimize function from scipy optimize. 319 cal_peaks_mz : list of floats 320 masses of measured peaks to use in mass calibration. 321 cal_peaks_mz : list of floats 322 reference mz values of found calibration points. 323 order : int, optional 324 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 325 326 Returns 327 ------- 328 rmserror : float 329 root mean square mass error for calibration points. 330 331 """ 332 Aterm = param[0] 333 Bterm = param[1] 334 try: 335 Cterm = param[2] 336 except IndexError: 337 pass 338 339 # get the mspeaks from the mass spectrum object which were calibration points 340 # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas] 341 # get their calibrated mass values 342 # mspeakmzs = [x.mz_cal for x in mspeaks] 343 cal_peaks_mz = np.asarray(cal_peaks_mz) 344 345 # linearz 346 if order == 1: 347 ref_recal_points = (Aterm * cal_peaks_mz) + Bterm 348 # quadratic 349 elif order == 2: 350 ref_recal_points = (Aterm * (cal_peaks_mz)) + ( 351 Bterm * np.power((cal_peaks_mz), 2) + Cterm 352 ) 353 354 # sort both the calibration points (measured, recalibrated) 355 ref_recal_points.sort() 356 # and sort the calibration points (theoretical, predefined) 357 cal_refs_mz.sort() 358 359 # calculate the ppm error for each calibration point 360 error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6 361 # calculate the root mean square error - this is our target to minimize 362 rmserror = np.sqrt(np.mean(error**2)) 363 return rmserror 364 365 def recalibrate_mass_spectrum( 366 self, 367 cal_peaks_mz: list[float], 368 cal_refs_mz: list[float], 369 order: int = 1, 370 diagnostic: bool = False, 371 ): 372 """Main recalibration function which uses a robust linear regression 373 374 This function performs the recalibration of the mass spectrum object. 375 It iteratively applies 376 377 Parameters 378 ---------- 379 cal_peaks_mz : list of float 380 masses of measured peaks to use in mass calibration. 381 cal_refs_mz : list of float 382 reference mz values of found calibration points. 383 order : int, optional 384 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 385 386 Returns 387 ------- 388 mass_spectrum : CoreMS mass spectrum object 389 Calibrated mass spectrum object 390 391 392 Notes 393 ----- 394 This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method. 395 396 References 397 ---------- 398 1. Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A. 399 SPIKE a Processing Software dedicated to Fourier Spectroscopies 400 https://arxiv.org/abs/1608.06777 (2016) 401 2. SPIKE - https://github.com/spike-project/spike 402 403 """ 404 # initialise parameters for recalibration 405 # these are the 'Aterm, Bterm, Cterm' 406 # as spectra are already freq->mz calibrated, these terms are very small 407 # may be beneficial to formally separate them from the freq->mz terms 408 if order == 1: 409 Po = [1, 0] 410 elif order == 2: 411 Po = [1, 0, 0] 412 413 if len(cal_peaks_mz) >= 2: 414 if self.mzsegment: # If only part of the spectrum is to be recalibrated 415 mz_exp_peaks = np.array( 416 [mspeak.mz_exp for mspeak in self.mass_spectrum] 417 ) 418 # Split the array into two parts - one to recailbrate, one to keep unchanged. 419 mz_exp_peaks_tocal = mz_exp_peaks[ 420 (mz_exp_peaks >= min(self.mzsegment)) 421 & (mz_exp_peaks <= max(self.mzsegment)) 422 ] 423 mz_exp_peaks_unchanged = mz_exp_peaks[ 424 ~(mz_exp_peaks >= min(self.mzsegment)) 425 | ~(mz_exp_peaks <= max(self.mzsegment)) 426 ] 427 # TODO: - segmented calibration needs a way to better track the calibration args/values... 428 if not self.mass_spectrum.is_centroid: 429 mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile) 430 # Split the array into two parts - one to recailbrate, one to keep unchanged. 431 mz_exp_profile_tocal = mz_exp_profile[ 432 (mz_exp_profile >= min(self.mzsegment)) 433 & (mz_exp_profile <= max(self.mzsegment)) 434 ] 435 mz_exp_profile_unchanged = mz_exp_profile[ 436 ~(mz_exp_profile >= min(self.mzsegment)) 437 | ~(mz_exp_profile <= max(self.mzsegment)) 438 ] 439 else: # if just recalibrating the whole spectrum 440 mz_exp_peaks_tocal = np.array( 441 [mspeak.mz_exp for mspeak in self.mass_spectrum] 442 ) 443 if not self.mass_spectrum.is_centroid: 444 mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile) 445 446 minimize_method = self.mass_spectrum.settings.calib_minimize_method 447 res = minimize( 448 self.robust_calib, 449 Po, 450 args=(cal_peaks_mz, cal_refs_mz, order), 451 method=minimize_method, 452 ) 453 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 454 print( 455 "minimize function completed with RMS error of: {:0.3f} ppm".format( 456 res["fun"] 457 ) 458 ) 459 print( 460 "minimize function performed {:1d} fn evals and {:1d} iterations".format( 461 res["nfev"], res["nit"] 462 ) 463 ) 464 Pn = res.x 465 466 # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum]) 467 468 if order == 1: 469 mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1] 470 if not self.mass_spectrum.is_centroid: 471 mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1] 472 473 elif order == 2: 474 mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + ( 475 Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2] 476 ) 477 478 if not self.mass_spectrum.is_centroid: 479 mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + ( 480 Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2] 481 ) 482 483 if self.mzsegment: 484 # Recombine the mass domains 485 mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged]) 486 mz_domain.sort() 487 if not self.mass_spectrum.is_centroid: 488 mz_profile_calc = np.concatenate( 489 [mz_profile_calc, mz_exp_profile_unchanged] 490 ) 491 mz_profile_calc.sort() 492 # Sort them 493 if ( 494 mz_exp_peaks[0] > mz_exp_peaks[1] 495 ): # If originally descending mass order 496 mz_domain = mz_domain[::-1] 497 if not self.mass_spectrum.is_centroid: 498 mz_profile_calc = mz_profile_calc[::-1] 499 500 self.mass_spectrum.mz_cal = mz_domain 501 if not self.mass_spectrum.is_centroid: 502 self.mass_spectrum.mz_cal_profile = mz_profile_calc 503 504 self.mass_spectrum.calibration_order = order 505 self.mass_spectrum.calibration_RMS = float(res["fun"]) 506 self.mass_spectrum.calibration_points = int(len(cal_refs_mz)) 507 self.mass_spectrum.calibration_ref_mzs = cal_refs_mz 508 self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz 509 510 self.mass_spectrum.calibration_segment = self.mzsegment 511 512 if diagnostic: 513 return self.mass_spectrum, res 514 return self.mass_spectrum 515 else: 516 warnings.warn("Too few calibration points - aborting.") 517 return self.mass_spectrum 518 519 def run(self): 520 """Run the calibration routine 521 522 This function runs the calibration routine. 523 524 """ 525 calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold 526 max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error 527 min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error 528 calib_pol_order = self.mass_spectrum.settings.calib_pol_order 529 calibration_ref_match_method = ( 530 self.mass_spectrum.settings.calibration_ref_match_method 531 ) 532 calibration_ref_match_tolerance = ( 533 self.mass_spectrum.settings.calibration_ref_match_tolerance 534 ) 535 calibration_ref_match_std_raw_error_limit = ( 536 self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit 537 ) 538 539 # load reference mass list 540 df_ref = self.load_ref_mass_list() 541 542 # find calibration points 543 cal_peaks_mz, cal_refs_mz = self.find_calibration_points( 544 df_ref, 545 calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error), 546 calib_snr_threshold=calib_ppm_error_threshold, 547 calibration_ref_match_method=calibration_ref_match_method, 548 calibration_ref_match_tolerance=calibration_ref_match_tolerance, 549 calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit, 550 ) 551 if len(cal_peaks_mz) == 2: 552 self.mass_spectrum.settings.calib_pol_order = 1 553 calib_pol_order = 1 554 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 555 print("Only 2 calibration points found, forcing a linear recalibration") 556 elif len(cal_peaks_mz) < 2: 557 warnings.warn("Too few calibration points found, function will fail") 558 self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)
MzDomainCalibration class for recalibrating mass spectra
Parameters
- mass_spectrum (CoreMS MassSpectrum Object): The mass spectrum to be calibrated.
- ref_masslist (str): The path to a reference mass list.
- mzsegment (tuple of floats, optional): The mz range to recalibrate, or None. Used for calibration of specific parts of the mz domain at a time. Future work - allow multiple mzsegments to be passed.
Attributes
- mass_spectrum (CoreMS MassSpectrum Object): The mass spectrum to be calibrated.
- mzsegment (tuple of floats or None): The mz range to recalibrate, or None.
- ref_mass_list_path (str or Path): The path to the reference mass list.
Methods
- run(). Main function to run this class.
- load_ref_mass_list(). Load reference mass list (Bruker format).
- gen_ref_mass_list_from_assigned(min_conf=0.7). Generate reference mass list from assigned masses.
- find_calibration_points(df_ref, calib_ppm_error_threshold=(-1, 1), calib_snr_threshold=5). Find calibration points in the mass spectrum based on the reference mass list.
- robust_calib(param, cal_peaks_mz, cal_refs_mz, order=1). Recalibration function.
- recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=1, diagnostic=False). Main recalibration function which uses a robust linear regression.
63 def __init__(self, mass_spectrum, ref_masslist, mzsegment=None): 64 self.mass_spectrum = mass_spectrum 65 self.mzsegment = mzsegment 66 67 # define reference mass list - bruker .ref format 68 self.ref_mass_list_path = ref_masslist 69 if self.mass_spectrum.percentile_assigned(mute_output=True)[0] != 0: 70 warnings.warn( 71 "Warning: calibrating spectra which have already been assigned may yield erroneous results" 72 ) 73 self.mass_spectrum.mz_cal = self.mass_spectrum.mz_exp 74 self.mass_spectrum.mz_cal_profile = self.mass_spectrum._mz_exp 75 76 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 77 print( 78 "MS Obj loaded - " + str(len(mass_spectrum.mspeaks)) + " peaks found." 79 )
81 def load_ref_mass_list(self): 82 """Load reference mass list (Bruker format) 83 84 Loads in a reference mass list from a .ref file 85 Note that some versions of Bruker's software produce .ref files with a different format. 86 As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format. 87 CoreMS includes an example .ref file with the correct format for reference. 88 89 Returns 90 ------- 91 df_ref : Pandas DataFrame 92 reference mass list object. 93 94 """ 95 refmasslist = ( 96 Path(self.ref_mass_list_path) 97 if isinstance(self.ref_mass_list_path, str) 98 else self.ref_mass_list_path 99 ) 100 101 if not refmasslist.exists(): 102 raise FileExistsError("File does not exist: %s" % refmasslist) 103 104 with refmasslist.open("r") as csvfile: 105 dialect = csv.Sniffer().sniff(csvfile.read(1024)) 106 delimiter = dialect.delimiter 107 108 if isinstance(refmasslist, S3Path): 109 # data = self.file_location.open('rb').read() 110 data = BytesIO(refmasslist.open("rb").read()) 111 112 else: 113 data = refmasslist 114 115 df_ref = pd.read_csv(data, sep=delimiter, header=None, skiprows=1) 116 117 df_ref = df_ref.rename( 118 {0: "Formula", 1: "m/z", 2: "Charge", 3: "Form2"}, axis=1 119 ) 120 121 df_ref.sort_values(by="m/z", ascending=True, inplace=True) 122 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 123 print( 124 "Reference mass list loaded - " 125 + str(len(df_ref)) 126 + " calibration masses loaded." 127 ) 128 129 return df_ref
Load reference mass list (Bruker format)
Loads in a reference mass list from a .ref file Note that some versions of Bruker's software produce .ref files with a different format. As such, users may need to manually edit the .ref file in a text editor to ensure it is in the correct format. CoreMS includes an example .ref file with the correct format for reference.
Returns
- df_ref (Pandas DataFrame): reference mass list object.
131 def gen_ref_mass_list_from_assigned(self, min_conf: float = 0.7): 132 """Generate reference mass list from assigned masses 133 134 This function will generate a ref mass dataframe object from an assigned corems mass spec obj 135 using assigned masses above a certain minimum confidence threshold. 136 137 This function needs to be retested and check it is covered in the unit tests. 138 139 Parameters 140 ---------- 141 min_conf : float, optional 142 minimum confidence score. The default is 0.7. 143 144 Returns 145 ------- 146 df_ref : Pandas DataFrame 147 reference mass list - based on calculated masses. 148 149 """ 150 # TODO this function needs to be retested and check it is covered in the unit tests 151 df = self.mass_spectrum.to_dataframe() 152 df = df[df["Confidence Score"] > min_conf] 153 df_ref = pd.DataFrame(columns=["m/z"]) 154 df_ref["m/z"] = df["Calculated m/z"] 155 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 156 print( 157 "Reference mass list generated - " 158 + str(len(df_ref)) 159 + " calibration masses." 160 ) 161 return df_ref
Generate reference mass list from assigned masses
This function will generate a ref mass dataframe object from an assigned corems mass spec obj using assigned masses above a certain minimum confidence threshold.
This function needs to be retested and check it is covered in the unit tests.
Parameters
- min_conf (float, optional): minimum confidence score. The default is 0.7.
Returns
- df_ref (Pandas DataFrame): reference mass list - based on calculated masses.
163 def find_calibration_points( 164 self, 165 df_ref, 166 calib_ppm_error_threshold: tuple[float, float] = (-1, 1), 167 calib_snr_threshold: float = 5, 168 calibration_ref_match_method: str = "legacy", 169 calibration_ref_match_tolerance: float = 0.003, 170 calibration_ref_match_std_raw_error_limit: float = 1.5, 171 ): 172 """Function to find calibration points in the mass spectrum 173 174 Based on the reference mass list. 175 176 Parameters 177 ---------- 178 df_ref : Pandas DataFrame 179 reference mass list for recalibration. 180 calib_ppm_error_threshold : tuple of floats, optional 181 ppm error for finding calibration masses in the spectrum. The default is -1,1. 182 Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6. 183 Some software does this the other way around and value signs must be inverted for that to work. 184 calib_snr_threshold : float, optional 185 snr threshold for finding calibration masses in the spectrum. The default is 5. 186 187 Returns 188 ------- 189 cal_peaks_mz : list of floats 190 masses of measured ions to use in calibration routine 191 cal_refs_mz : list of floats 192 reference mz values of found calibration points. 193 194 """ 195 196 # This approach is much more efficient and expedient than the original implementation. 197 peaks_mz = [] 198 for x in self.mass_spectrum.mspeaks: 199 if x.signal_to_noise > calib_snr_threshold: 200 if self.mzsegment: 201 if min(self.mzsegment) <= x.mz_exp <= max(self.mzsegment): 202 peaks_mz.append(x.mz_exp) 203 else: 204 peaks_mz.append(x.mz_exp) 205 peaks_mz = np.asarray(peaks_mz) 206 207 if calibration_ref_match_method == "legacy": 208 # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold 209 # Then it removes ambiguities - which means the calibration threshold hasto be very tight. 210 cal_peaks_mz = [] 211 cal_refs_mz = [] 212 for mzref in df_ref["m/z"]: 213 tmp_peaks_mz = peaks_mz[abs(peaks_mz - mzref) < 1] 214 for mzmeas in tmp_peaks_mz: 215 delta_mass = ((mzmeas - mzref) / mzref) * 1e6 216 if delta_mass < max(calib_ppm_error_threshold): 217 if delta_mass > min(calib_ppm_error_threshold): 218 cal_peaks_mz.append(mzmeas) 219 cal_refs_mz.append(mzref) 220 221 # To remove entries with duplicated indices (reference masses matching multiple peaks) 222 tmpdf = pd.Series(index=cal_refs_mz, data=cal_peaks_mz, dtype=float) 223 tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)] 224 225 cal_peaks_mz = list(tmpdf.values) 226 cal_refs_mz = list(tmpdf.index) 227 elif calibration_ref_match_method == "merged": 228 #warnings.warn("Using experimental new reference mass list merging") 229 # This is a new approach (August 2024) which uses Pandas 'merged_asof' to find the peaks closest in m/z between 230 # reference and measured masses. This is a quicker way to match, and seems to get more matches. 231 # It may not work as well when the data are far from correc initial mass 232 # e.g. if the correct peak is further from the reference than an incorrect peak. 233 meas_df = pd.DataFrame(columns=["meas_m/z"], data=peaks_mz) 234 tolerance = calibration_ref_match_tolerance 235 merged_df = pd.merge_asof( 236 df_ref, 237 meas_df, 238 left_on="m/z", 239 right_on="meas_m/z", 240 tolerance=tolerance, 241 direction="nearest", 242 ) 243 merged_df.dropna(how="any", inplace=True) 244 merged_df["Error_ppm"] = ( 245 (merged_df["meas_m/z"] - merged_df["m/z"]) / merged_df["m/z"] 246 ) * 1e6 247 median_raw_error = merged_df["Error_ppm"].median() 248 std_raw_error = merged_df["Error_ppm"].std() 249 if std_raw_error > calibration_ref_match_std_raw_error_limit: 250 std_raw_error = calibration_ref_match_std_raw_error_limit 251 self.mass_spectrum.calibration_raw_error_median = median_raw_error 252 self.mass_spectrum.calibration_raw_error_stdev = std_raw_error 253 merged_df = merged_df[ 254 (merged_df["Error_ppm"] > (median_raw_error - 1.5 * std_raw_error)) 255 & (merged_df["Error_ppm"] < (median_raw_error + 1.5 * std_raw_error)) 256 ] 257 # merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))] 258 cal_peaks_mz = list(merged_df["meas_m/z"]) 259 cal_refs_mz = list(merged_df["m/z"]) 260 else: 261 raise ValueError(f"{calibration_ref_match_method} not allowed.") 262 263 if False: 264 min_calib_ppm_error = calib_ppm_error_threshold[0] 265 max_calib_ppm_error = calib_ppm_error_threshold[1] 266 df_raw = self.mass_spectrum.to_dataframe() 267 268 df_raw = df_raw[df_raw["S/N"] > calib_snr_threshold] 269 # optionally further subset that based on minimum S/N, RP, Peak Height 270 # to ensure only valid points are utilized 271 # in this example, only a S/N threshold is implemented. 272 imzmeas = [] 273 mzrefs = [] 274 275 for mzref in df_ref["m/z"]: 276 # find all peaks within a defined ppm error threshold 277 tmpdf = df_raw[ 278 ((df_raw["m/z"] - mzref) / mzref) * 1e6 < max_calib_ppm_error 279 ] 280 # Error is relative to the theoretical, so the divisor should be divisor 281 282 tmpdf = tmpdf[ 283 ((tmpdf["m/z"] - mzref) / mzref) * 1e6 > min_calib_ppm_error 284 ] 285 286 # only use the calibration point if only one peak is within the thresholds 287 # This may require some optimization of the threshold tolerances 288 if len(tmpdf) == 1: 289 imzmeas.append(int(tmpdf.index.values)) 290 mzrefs.append(mzref) 291 292 # it is crucial the mass lists are in same order 293 # corems likes to do masses from high to low. 294 cal_refs_mz.sort(reverse=False) 295 cal_peaks_mz.sort(reverse=False) 296 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 297 print( 298 str(len(cal_peaks_mz)) 299 + " calibration points matched within thresholds." 300 ) 301 return cal_peaks_mz, cal_refs_mz
Function to find calibration points in the mass spectrum
Based on the reference mass list.
Parameters
- df_ref (Pandas DataFrame): reference mass list for recalibration.
- calib_ppm_error_threshold (tuple of floats, optional): ppm error for finding calibration masses in the spectrum. The default is -1,1. Note: This is based on the calculation of ppm = ((mz_measure - mz_theoretical)/mz_theoretical)*1e6. Some software does this the other way around and value signs must be inverted for that to work.
- calib_snr_threshold (float, optional): snr threshold for finding calibration masses in the spectrum. The default is 5.
Returns
- cal_peaks_mz (list of floats): masses of measured ions to use in calibration routine
- cal_refs_mz (list of floats): reference mz values of found calibration points.
303 def robust_calib( 304 self, 305 param: list[float], 306 cal_peaks_mz: list[float], 307 cal_refs_mz: list[float], 308 order: int = 1, 309 ): 310 """Recalibration function 311 312 Computes the rms of m/z errors to minimize when calibrating. 313 This is adapted from from spike. 314 315 Parameters 316 ---------- 317 param : list of floats 318 generated by minimize function from scipy optimize. 319 cal_peaks_mz : list of floats 320 masses of measured peaks to use in mass calibration. 321 cal_peaks_mz : list of floats 322 reference mz values of found calibration points. 323 order : int, optional 324 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 325 326 Returns 327 ------- 328 rmserror : float 329 root mean square mass error for calibration points. 330 331 """ 332 Aterm = param[0] 333 Bterm = param[1] 334 try: 335 Cterm = param[2] 336 except IndexError: 337 pass 338 339 # get the mspeaks from the mass spectrum object which were calibration points 340 # mspeaks = [self.mass_spectrum.mspeaks[x] for x in imzmeas] 341 # get their calibrated mass values 342 # mspeakmzs = [x.mz_cal for x in mspeaks] 343 cal_peaks_mz = np.asarray(cal_peaks_mz) 344 345 # linearz 346 if order == 1: 347 ref_recal_points = (Aterm * cal_peaks_mz) + Bterm 348 # quadratic 349 elif order == 2: 350 ref_recal_points = (Aterm * (cal_peaks_mz)) + ( 351 Bterm * np.power((cal_peaks_mz), 2) + Cterm 352 ) 353 354 # sort both the calibration points (measured, recalibrated) 355 ref_recal_points.sort() 356 # and sort the calibration points (theoretical, predefined) 357 cal_refs_mz.sort() 358 359 # calculate the ppm error for each calibration point 360 error = ((ref_recal_points - cal_refs_mz) / cal_refs_mz) * 1e6 361 # calculate the root mean square error - this is our target to minimize 362 rmserror = np.sqrt(np.mean(error**2)) 363 return rmserror
Recalibration function
Computes the rms of m/z errors to minimize when calibrating. This is adapted from from spike.
Parameters
- param (list of floats): generated by minimize function from scipy optimize.
- cal_peaks_mz (list of floats): masses of measured peaks to use in mass calibration.
- cal_peaks_mz (list of floats): reference mz values of found calibration points.
- order (int, optional): order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
Returns
- rmserror (float): root mean square mass error for calibration points.
365 def recalibrate_mass_spectrum( 366 self, 367 cal_peaks_mz: list[float], 368 cal_refs_mz: list[float], 369 order: int = 1, 370 diagnostic: bool = False, 371 ): 372 """Main recalibration function which uses a robust linear regression 373 374 This function performs the recalibration of the mass spectrum object. 375 It iteratively applies 376 377 Parameters 378 ---------- 379 cal_peaks_mz : list of float 380 masses of measured peaks to use in mass calibration. 381 cal_refs_mz : list of float 382 reference mz values of found calibration points. 383 order : int, optional 384 order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1. 385 386 Returns 387 ------- 388 mass_spectrum : CoreMS mass spectrum object 389 Calibrated mass spectrum object 390 391 392 Notes 393 ----- 394 This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method. 395 396 References 397 ---------- 398 1. Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A. 399 SPIKE a Processing Software dedicated to Fourier Spectroscopies 400 https://arxiv.org/abs/1608.06777 (2016) 401 2. SPIKE - https://github.com/spike-project/spike 402 403 """ 404 # initialise parameters for recalibration 405 # these are the 'Aterm, Bterm, Cterm' 406 # as spectra are already freq->mz calibrated, these terms are very small 407 # may be beneficial to formally separate them from the freq->mz terms 408 if order == 1: 409 Po = [1, 0] 410 elif order == 2: 411 Po = [1, 0, 0] 412 413 if len(cal_peaks_mz) >= 2: 414 if self.mzsegment: # If only part of the spectrum is to be recalibrated 415 mz_exp_peaks = np.array( 416 [mspeak.mz_exp for mspeak in self.mass_spectrum] 417 ) 418 # Split the array into two parts - one to recailbrate, one to keep unchanged. 419 mz_exp_peaks_tocal = mz_exp_peaks[ 420 (mz_exp_peaks >= min(self.mzsegment)) 421 & (mz_exp_peaks <= max(self.mzsegment)) 422 ] 423 mz_exp_peaks_unchanged = mz_exp_peaks[ 424 ~(mz_exp_peaks >= min(self.mzsegment)) 425 | ~(mz_exp_peaks <= max(self.mzsegment)) 426 ] 427 # TODO: - segmented calibration needs a way to better track the calibration args/values... 428 if not self.mass_spectrum.is_centroid: 429 mz_exp_profile = np.array(self.mass_spectrum.mz_exp_profile) 430 # Split the array into two parts - one to recailbrate, one to keep unchanged. 431 mz_exp_profile_tocal = mz_exp_profile[ 432 (mz_exp_profile >= min(self.mzsegment)) 433 & (mz_exp_profile <= max(self.mzsegment)) 434 ] 435 mz_exp_profile_unchanged = mz_exp_profile[ 436 ~(mz_exp_profile >= min(self.mzsegment)) 437 | ~(mz_exp_profile <= max(self.mzsegment)) 438 ] 439 else: # if just recalibrating the whole spectrum 440 mz_exp_peaks_tocal = np.array( 441 [mspeak.mz_exp for mspeak in self.mass_spectrum] 442 ) 443 if not self.mass_spectrum.is_centroid: 444 mz_exp_profile_tocal = np.array(self.mass_spectrum.mz_exp_profile) 445 446 minimize_method = self.mass_spectrum.settings.calib_minimize_method 447 res = minimize( 448 self.robust_calib, 449 Po, 450 args=(cal_peaks_mz, cal_refs_mz, order), 451 method=minimize_method, 452 ) 453 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 454 print( 455 "minimize function completed with RMS error of: {:0.3f} ppm".format( 456 res["fun"] 457 ) 458 ) 459 print( 460 "minimize function performed {:1d} fn evals and {:1d} iterations".format( 461 res["nfev"], res["nit"] 462 ) 463 ) 464 Pn = res.x 465 466 # mz_exp_ms = np.array([mspeak.mz_exp for mspeak in self.mass_spectrum]) 467 468 if order == 1: 469 mz_domain = (Pn[0] * mz_exp_peaks_tocal) + Pn[1] 470 if not self.mass_spectrum.is_centroid: 471 mz_profile_calc = (Pn[0] * mz_exp_profile_tocal) + Pn[1] 472 473 elif order == 2: 474 mz_domain = (Pn[0] * (mz_exp_peaks_tocal)) + ( 475 Pn[1] * np.power((mz_exp_peaks_tocal), 2) + Pn[2] 476 ) 477 478 if not self.mass_spectrum.is_centroid: 479 mz_profile_calc = (Pn[0] * (mz_exp_profile_tocal)) + ( 480 Pn[1] * np.power((mz_exp_profile_tocal), 2) + Pn[2] 481 ) 482 483 if self.mzsegment: 484 # Recombine the mass domains 485 mz_domain = np.concatenate([mz_domain, mz_exp_peaks_unchanged]) 486 mz_domain.sort() 487 if not self.mass_spectrum.is_centroid: 488 mz_profile_calc = np.concatenate( 489 [mz_profile_calc, mz_exp_profile_unchanged] 490 ) 491 mz_profile_calc.sort() 492 # Sort them 493 if ( 494 mz_exp_peaks[0] > mz_exp_peaks[1] 495 ): # If originally descending mass order 496 mz_domain = mz_domain[::-1] 497 if not self.mass_spectrum.is_centroid: 498 mz_profile_calc = mz_profile_calc[::-1] 499 500 self.mass_spectrum.mz_cal = mz_domain 501 if not self.mass_spectrum.is_centroid: 502 self.mass_spectrum.mz_cal_profile = mz_profile_calc 503 504 self.mass_spectrum.calibration_order = order 505 self.mass_spectrum.calibration_RMS = float(res["fun"]) 506 self.mass_spectrum.calibration_points = int(len(cal_refs_mz)) 507 self.mass_spectrum.calibration_ref_mzs = cal_refs_mz 508 self.mass_spectrum.calibration_meas_mzs = cal_peaks_mz 509 510 self.mass_spectrum.calibration_segment = self.mzsegment 511 512 if diagnostic: 513 return self.mass_spectrum, res 514 return self.mass_spectrum 515 else: 516 warnings.warn("Too few calibration points - aborting.") 517 return self.mass_spectrum
Main recalibration function which uses a robust linear regression
This function performs the recalibration of the mass spectrum object. It iteratively applies
Parameters
- cal_peaks_mz (list of float): masses of measured peaks to use in mass calibration.
- cal_refs_mz (list of float): reference mz values of found calibration points.
- order (int, optional): order of the recalibration function. 1 = linear, 2 = quadratic. The default is 1.
Returns
- mass_spectrum (CoreMS mass spectrum object): Calibrated mass spectrum object
Notes
This function is adapted, in part, from the SPIKE project [1,2] and is based on the robust linear regression method.
References
- Chiron L., Coutouly M-A., Starck J-P., Rolando C., Delsuc M-A. SPIKE a Processing Software dedicated to Fourier Spectroscopies https://arxiv.org/abs/1608.06777 (2016)
- SPIKE - https://github.com/spike-project/spike
519 def run(self): 520 """Run the calibration routine 521 522 This function runs the calibration routine. 523 524 """ 525 calib_ppm_error_threshold = self.mass_spectrum.settings.calib_sn_threshold 526 max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error 527 min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error 528 calib_pol_order = self.mass_spectrum.settings.calib_pol_order 529 calibration_ref_match_method = ( 530 self.mass_spectrum.settings.calibration_ref_match_method 531 ) 532 calibration_ref_match_tolerance = ( 533 self.mass_spectrum.settings.calibration_ref_match_tolerance 534 ) 535 calibration_ref_match_std_raw_error_limit = ( 536 self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit 537 ) 538 539 # load reference mass list 540 df_ref = self.load_ref_mass_list() 541 542 # find calibration points 543 cal_peaks_mz, cal_refs_mz = self.find_calibration_points( 544 df_ref, 545 calib_ppm_error_threshold=(min_calib_ppm_error, max_calib_ppm_error), 546 calib_snr_threshold=calib_ppm_error_threshold, 547 calibration_ref_match_method=calibration_ref_match_method, 548 calibration_ref_match_tolerance=calibration_ref_match_tolerance, 549 calibration_ref_match_std_raw_error_limit=calibration_ref_match_std_raw_error_limit, 550 ) 551 if len(cal_peaks_mz) == 2: 552 self.mass_spectrum.settings.calib_pol_order = 1 553 calib_pol_order = 1 554 if self.mass_spectrum.parameters.mass_spectrum.verbose_processing: 555 print("Only 2 calibration points found, forcing a linear recalibration") 556 elif len(cal_peaks_mz) < 2: 557 warnings.warn("Too few calibration points found, function will fail") 558 self.recalibrate_mass_spectrum(cal_peaks_mz, cal_refs_mz, order=calib_pol_order)
Run the calibration routine
This function runs the calibration routine.