corems.molecular_id.calc.SpectralSimilarity
__author__ = "Yuri E. Corilo"
__date__ = "Jun 09, 2021"

from numpy.fft import rfft
from scipy.stats import pearsonr, spearmanr, kendalltau
from numpy import (
    power,
    dot,
    absolute,
    sqrt,
)
from numpy import sum as np_sum
from numpy.linalg import norm
from pandas import DataFrame
import numpy as np

# Maps a metric function name to its display name.
# ``SpectralSimilarity.extra_distances`` iterates over the keys and calls the
# function of the same name in ``corems.molecular_id.calc.math_distance`` when
# it exists. Commented-out entries are metrics that are currently disabled.
methods_name = {
    # "entropy_distance": "Entropy Distance",
    # "weighted_entropy_distance": "Dynamic weighted entropy Distance",
    "chebyshev_distance": "Chebyshev Distance",
    "squared_euclidean_distance": "Squared Euclidean Distance",
    "fidelity_similarity": "Fidelity Similarity",
    "matusita_distance": "Matusita Distance",
    "squared_chord_distance": "Squared-chord Distance",
    # "bhattacharya_1_distance": "Bhattacharya 1 Distance",
    # "bhattacharya_2_distance": "Bhattacharya 2 Distance",
    "harmonic_mean_similarity": "Harmonic mean Distance",
    "Pearson_chi_squared_distance": "Pearson Chi Squared Distance",
    "Neyman_chi_squared_distance": "Neyman Chi Squared Distance",
    "probabilistic_symmetric_chi_squared_distance": "Probabilistic symmetric X2 Distance",
    "topsoe_distance": "Topsoe Distance",
    "chernoff_distance": "Chernoff Distance",
    "ruzicka_distance": "Ruzicka Distance",
    "roberts_distance": "Roberts Distance",
    # "intersection_distance": "Intersection Distance",
    "motyka_distance": "Motyka Distance",
    "canberra_distance": "Canberra Distance",
    "canberra_metric": "Canberra Metric",
    "kulczynski_1_distance": "Kulczynski 1 Distance",
    # "baroni_urbani_buser_distance": "Baroni-Urbani-Buser Distance",
    # "penrose_size_distance": "Penrose size Distance",
    # "mean_character_distance": "Mean character Distance",
    "lorentzian_distance": "Lorentzian Distance",
    # "penrose_shape_distance": "Penrose shape Distance",
    "clark_distance": "Clark Distance",
    "hellinger_distance": "Hellinger Distance",
    "whittaker_index_of_association_distance": "Whittaker index of association Distance",
    # "similarity_index_distance": "Similarity Index Distance",
    # "improved_similarity_distance": "Improved Similarity",
    # "absolute_value_distance": "Absolute Value Distance",
    "spectral_contrast_angle_distance": "Spectral Contrast Angle",
    "wave_hedges_distance": "Wave Hedges Distance",
    "dice_similarity": "Dice Similarity",
    "inner_product_distance": "Inner Product Distance",
    "divergence_distance": "Divergence Distance",
    "jensen_difference_distance": "Jensen Differences Distance",
    "kumar_johnson_distance": "Kumar Johnson Distance",
    "avg_l_distance": "Avg (L1, L8) Distance",
    "vicis_wave_hadges_distance": "Vicis Wave Hadges Distance",
    "vicis_symmetric_chi_squared_1_distance": "Vicis-Symmetric X2 1 Distance",
    "vicis_symmetric_chi_squared_2_distance": "Vicis-Symmetric X2 2 Distance",
    "vicis_symmetric_chi_squared_3_distance": "Vicis-Symmetric X2 3 Distance",
    "max_symmetric_chi_squared_distance": "Max Symmetric Chi Squared Distance",
    "min_symmetric_chi_squared_distance": "Min Symmetric Chi Squared Distance",
    # "ms_for_id_v1": "MSforID Distance version 1",
    # "ms_for_id": "MSforID Distance",
    "additive_sym_chi_sq": "Additive Symmetric Chi Squared",
    "bhattacharya_distance": "Battacharya Distance",
    "generalized_ochiai_index": "Generalized Ochiai Index",
    "gower_distance": "Gower Distance",
    "impr_sqrt_cosine_sim": "Improved Square Root Cosine Similarity",
    "intersection_sim": "Intersection Similarity",
    "j_divergence": "J Divergence",
    "jensen_shannon_index": "Jensen Shannon Index",
    "k_divergence": "K Divergence",
    "VW6": "VW6",
    "VW5": "VW5",
    "VW4": "VW4",
    "VW3": "VW3",
    "VW2": "VW2",
    "VW1": "VW1",
    "taneja_divergence": "Taneja Divergence",
    "symmetric_chi_squared_distance": "Symmetric Chi Squared Distance",
    "squared_chi_squared_distance": "Squared Chi Squared Distance",
    "square_root_cosine_correlation": "Square Root Cosine Correlation",
    "sorensen_distance": "Sorensen Distance",
    "Minokowski_3": "Minokowski 3 Distance",
    "Minokowski_4": "Minokowski 4 Distance",
    "kumarjohnson_divergence": "Kumar Johnson Divergence",
    "kumarhassebrook_similarity": "Kumar Hassebrook Similarity",
    "kullbackleibler_divergence": "Kullback Leibler Divergence",
    "soergel_distance": "Soergel Distance",
}

# Two-element lists keyed by metric name.
# NOTE(review): presumably the [min, max] range of each metric for normalized
# spectra; nothing in this module reads this table -- confirm against callers.
methods_scale = {
    "entropy": [0, np.log(4)],
    "weighted_entropy": [0, np.log(4)],
    "absolute_value": [0, 2],
    "avg_l": [0, 1.5],
    "bhattacharya_1": [0, np.arccos(0) ** 2],
    "bhattacharya_2": [0, np.inf],
    "canberra": [0, np.inf],
    "clark": [0, np.inf],
    "divergence": [0, np.inf],
    "euclidean": [0, np.sqrt(2)],
    "hellinger": [0, np.inf],
    "improved_similarity": [0, np.inf],
    "lorentzian": [0, np.inf],
    "manhattan": [0, 2],
    "matusita": [0, np.sqrt(2)],
    "mean_character": [0, 2],
    "motyka": [-0.5, 0],
    "ms_for_id": [-np.inf, 0],
    "ms_for_id_v1": [0, np.inf],
    "pearson_correlation": [-1, 1],
    "penrose_shape": [0, np.sqrt(2)],
    "penrose_size": [0, np.inf],
    "probabilistic_symmetric_chi_squared": [0, 1],
    "similarity_index": [0, np.inf],
    "squared_chord": [0, 2],
    "squared_euclidean": [0, 2],
    "symmetric_chi_squared": [0, 0.5 * np.sqrt(2)],
    "topsoe": [0, np.sqrt(2)],
    "vicis_symmetric_chi_squared_3": [0, 2],
    "wave_hedges": [0, np.inf],
    "whittaker_index_of_association": [0, np.inf],
}


class SpectralSimilarity:
    """Class containing methods for calculating spectral similarity between two mass spectra.

    Parameters
    ----------
    ms_mz_abun_dict : dict
        Dictionary of mass to abundance values for the experimental mass spectrum.
    ref_obj : dict
        Reference mass spectrum; the keys "mz" and "abundance" are read and
        must hold equally ordered sequences of mass and abundance values.
    norm_func : function
        Function used to normalize the abundance values (each vector is
        divided by ``norm_func(vector)``). Pass None to skip normalization.
        Default is sum.

    Attributes
    ----------
    normalize_func : function
        Function to normalize the abundance values.
    ms_mz_abun_dict : dict
        Dictionary of mass to abundance values for the experimental mass spectrum.
    ref_obj : dict
        Reference mass spectrum as passed to the constructor.
    exp_abun : list
        List of abundance values for the experimental mass spectrum.
    exp_mz : list
        List of mass values for the experimental mass spectrum.
    ref_mz : list
        List of mass values for the reference mass spectrum.
    ref_abun : list
        List of abundance values for the reference mass spectrum.
    ref_mz_abun_dict : dict
        Dictionary of mass to abundance values for the reference mass spectrum.
    df : DataFrame
        Two-row DataFrame (row 0: experimental, row 1: reference) with one
        column per mass value. Mass values missing from one spectrum are
        kept as NaN; use `nan_fill` to obtain filled abundance vectors.
    zero_filled_u_l : tuple
        Tuple containing the experimental and reference abundance vectors
        after filling missing values with 1e-10 and normalizing.
    common_mz_values : list
        Sorted list of mass values with non-zero abundance in both spectra.
    n_x_y : int
        Number of common mass values between the experimental and reference mass spectra.

    Methods
    -------
    * nan_fill(df, fill_with=0).
        Fill missing mass values with a given value.
    * normalize(x, y, norm_func=sum).
        Normalize the abundance values.
    * weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10).
        Calculate the weighted cosine correlation between the experimental and reference mass spectra.
    * cosine_correlation().
        Calculate the cosine correlation between the experimental and reference mass spectra.
    * stein_scott().
        Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
    * pearson_correlation().
        Calculate the Pearson correlation between the experimental and reference mass spectra.
    * spearman_correlation().
        Calculate the Spearman correlation between the experimental and reference mass spectra.
    * kendall_tau().
        Calculate the Kendall's tau correlation between the experimental and reference mass spectra.
    * dft_correlation().
        Calculate the DFT correlation between the experimental and reference mass spectra.
    * dwt_correlation().
        Calculate the DWT correlation between the experimental and reference mass spectra.
    * euclidean_distance().
        Calculate the Euclidean distance between the experimental and reference mass spectra.
    * manhattan_distance().
        Calculate the Manhattan distance between the experimental and reference mass spectra.
    * jaccard_distance().
        Calculate the Jaccard distance between the experimental and reference mass spectra.
    * extra_distances().
        Calculate every metric listed in `methods_name` that math_distance implements.
    """

    def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum):
        self.normalize_func = norm_func
        self.ms_mz_abun_dict = ms_mz_abun_dict
        self.ref_obj = ref_obj

        self.exp_abun = list(self.ms_mz_abun_dict.values())
        self.exp_mz = list(self.ms_mz_abun_dict.keys())

        self.ref_mz = self.ref_obj.get("mz")
        self.ref_abun = self.ref_obj.get("abundance")

        self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun))

        # Parse to a DataFrame: the union of mass values becomes the columns,
        # and a mass missing from one spectrum shows up as NaN in its row.
        self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict])

        # Default vectors use a tiny non-zero fill value (1e-10) rather than 0.
        x, y = self.nan_fill(self.df, fill_with=1e-10)
        self.zero_filled_u_l = self.normalize(x, y, norm_func=self.normalize_func)

        # Mass values with non-zero abundance in each spectrum.
        exp_mz_filtered = {k for k in self.exp_mz if self.ms_mz_abun_dict[k] != 0}
        ref_mz_filtered = {k for k in self.ref_mz if self.ref_mz_abun_dict[k] != 0}

        # Sorted mass values present (non-zero) in both spectra, and their count.
        self.common_mz_values = sorted(exp_mz_filtered & ref_mz_filtered)
        self.n_x_y = len(self.common_mz_values)

    def nan_fill(self, df, fill_with=0):
        """Fill missing mass values with a given value.

        The input DataFrame is not modified, so the same frame can be filled
        again later with a different value (e.g. 1e-10 for the default
        vectors and 0 for the DFT/DWT correlations).

        Parameters
        ----------
        df : DataFrame
            DataFrame containing the experimental (row 0) and reference
            (row 1) mass spectrum data.
        fill_with : float
            Value to fill missing mass values with.
            Default is 0

        Returns
        -------
        x : numpy.ndarray
            Abundance values for the experimental mass spectrum.
        y : numpy.ndarray
            Abundance values for the reference mass spectrum.
        """
        # Deliberately not in place: an in-place fill would make every later
        # call with a different fill value a silent no-op.
        filled = df.fillna(fill_with).T

        return filled[0].values, filled[1].values

    def normalize(self, x, y, norm_func=sum):
        """Normalize the abundance values.

        Parameters
        ----------
        x : numpy.ndarray
            Abundance values for the experimental mass spectrum.
        y : numpy.ndarray
            Abundance values for the reference mass spectrum.
        norm_func : function
            Function to normalize the abundance values; each vector is
            divided by the value it returns for that vector.
            Default is sum. If None, the inputs are returned unchanged.

        Returns
        -------
        u_l : tuple
            Tuple containing the normalized experimental and reference
            abundance vectors.
        """
        if norm_func is None:
            return (x, y)

        return (x / norm_func(x), y / norm_func(y))

    def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
        """Calculate the weighted cosine correlation between the experimental and reference mass spectra.

        Each peak is weighted as ``abundance**a * mz**b`` before the cosine
        of the angle between the two weighted vectors is computed.

        Parameters
        ----------
        a : float
            Weighting factor (exponent) for the abundance values.
            Default is 0.5
        b : float
            Weighting factor (exponent) for the mass values.
            Default is 1.3
        nanfill : float
            Value to fill missing mass values with.
            Default is 1e-10

        Returns
        -------
        correlation : float
            Weighted cosine correlation between the experimental and reference mass spectra.
        """
        # weight the experimental data and map it back to the individual mz
        xc = power(self.exp_abun, a) * power(self.exp_mz, b)
        weighted_exp_dict = dict(zip(self.ms_mz_abun_dict.keys(), xc))

        # weight the reference data the same way
        ref_mz = self.ref_obj.get("mz")
        yc = power(self.ref_obj.get("abundance"), a) * power(ref_mz, b)
        weighted_ref_dict = dict(zip(ref_mz, yc))

        # parse to dataframe, easier to fill the gaps and transpose
        df = DataFrame([weighted_exp_dict, weighted_ref_dict])

        # weights of mz missing from one spectrum are set to `nanfill`
        x, y = self.nan_fill(df, fill_with=nanfill)

        return dot(x, y) / (norm(x) * norm(y))

    def cosine_correlation(self):
        """Calculate the cosine correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Cosine correlation between the experimental and reference mass spectra.
        """
        x, y = self.zero_filled_u_l[0], self.zero_filled_u_l[1]

        return dot(x, y) / (norm(x) * norm(y))

    def stein_scott(self):
        """Calculate the Stein-Scott similarity between the experimental and reference mass spectra.

        The score combines a weighted cosine correlation with the ratio of
        consecutive peak pairs over the common mass values. Returns (0, 0)
        when the spectra share no mass value.

        Returns
        -------
        s_ss_x_y : float
            Stein-Scott similarity using a weighted cosine correlation with
            a=0.5 and b=3.
        s_ss_x_y_nist : float
            Stein-Scott similarity using a weighted cosine correlation with
            a=0.5 and b=1.3.
        """
        # TODO check this code
        if self.n_x_y == 0:
            return 0, 0

        # count number of non-zero abundance/peak intensity values
        n_x = sum(abun != 0 for abun in self.exp_abun)

        s_r_x_y = 0

        # exponents for abundance (a) and mass (b) in the ratio term;
        # with b = 0 the mass factor is always 1
        a, b = 1, 0

        # walk over consecutive pairs of common mass values
        for previous_value, current_value in zip(
            self.common_mz_values, self.common_mz_values[1:]
        ):
            y_i = self.ref_mz_abun_dict[current_value]
            y_i_minus1 = self.ref_mz_abun_dict[previous_value]

            lc_current = power(y_i, a) * power(current_value, b)
            lc_previous = power(y_i_minus1, a) * power(previous_value, b)

            x_i = self.ms_mz_abun_dict[current_value]
            x_i_minus1 = self.ms_mz_abun_dict[previous_value]

            uc_current = power(x_i, a) * power(current_value, b)
            uc_previous = power(x_i_minus1, a) * power(previous_value, b)

            temp_computation = (lc_current / lc_previous) * (uc_previous / uc_current)

            # invert ratios above 1 so every term is at most 1
            exponent = 1 if temp_computation <= 1 else -1

            s_r_x_y = s_r_x_y + power(temp_computation, exponent)

        # finish the calculation of S_R(X,Y)
        s_r_x_y = s_r_x_y / self.n_x_y

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0)
        s_ss_x_y = ((n_x * s_wc_x_y) + (self.n_x_y * s_r_x_y)) / (n_x + self.n_x_y)

        s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0)
        s_ss_x_y_nist = ((n_x * s_wc_x_y_nist) + (self.n_x_y * s_r_x_y)) / (
            n_x + self.n_x_y
        )

        return s_ss_x_y, s_ss_x_y_nist

    def pearson_correlation(self):
        """Calculate the Pearson correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Pearson correlation between the experimental and reference mass spectra.
        """
        result = pearsonr(self.zero_filled_u_l[0], self.zero_filled_u_l[1])

        # first element is the coefficient, the second one is the p-value
        return result[0]

    def spearman_correlation(self):
        """Calculate the Spearman correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Spearman correlation between the experimental and reference mass spectra.
        """
        # TODO - Check axis
        result = spearmanr(self.zero_filled_u_l[0], self.zero_filled_u_l[1], axis=0)

        return result[0]

    def kendall_tau(self):
        """Calculate the Kendall's tau correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Kendall's tau correlation between the experimental and reference mass spectra.
        """
        result = kendalltau(self.zero_filled_u_l[0], self.zero_filled_u_l[1])

        return result[0]

    def dft_correlation(self):
        """Calculate the DFT correlation between the experimental and reference mass spectra.

        The cosine similarity of the real parts of the Fourier transforms is
        combined with the weighted cosine correlation, weighted by the number
        of common peaks and the number of non-zero experimental peaks.
        Returns 0 when the spectra share no mass value.

        Returns
        -------
        correlation : float
            DFT correlation between the experimental and reference mass spectra.
        """
        if self.n_x_y == 0:
            return 0

        # count number of non-zero abundance/peak intensity values
        n_x = sum(abun != 0 for abun in self.exp_abun)

        # zero-filled (not 1e-10-filled) vectors are used for the transform
        x, y = self.nan_fill(self.df, fill_with=0)
        x, y = self.normalize(x, y, norm_func=self.normalize_func)

        # get the Fourier transform of x and y
        x_dft = rfft(x).real
        y_dft = rfft(y).real

        s_dft_xy = dot(x_dft, y_dft) / (norm(x_dft) * norm(y_dft))

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

        return (n_x * s_wc_x_y + self.n_x_y * s_dft_xy) / (n_x + self.n_x_y)

    def dwt_correlation(self):
        """Calculate the DWT correlation between the experimental and reference mass spectra.

        Returns 0 when the spectra share no mass value.

        Returns
        -------
        correlation : float
            DWT correlation between the experimental and reference mass spectra.

        Notes
        -----
        This function requires the PyWavelets library to be installed.
        This is not a default requirement as this function is not widely used.
        """
        from pywt import dwt

        if self.n_x_y == 0:
            return 0

        # count number of non-zero abundance/peak intensity values
        n_x = sum(abun != 0 for abun in self.exp_abun)

        # zero-filled (not 1e-10-filled) vectors are used for the transform
        x, y = self.nan_fill(self.df, fill_with=0)
        x, y = self.normalize(x, y, norm_func=self.normalize_func)

        # wavelet transform of x and y ("db2" wavelet); only the detail
        # coefficients (second element returned by dwt) are used
        x_dwtD = dwt(x, "db2")[1]
        y_dwtD = dwt(y, "db2")[1]

        s_dwt_xy = dot(x_dwtD, y_dwtD) / (norm(x_dwtD) * norm(y_dwtD))

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

        return (n_x * s_wc_x_y + self.n_x_y * s_dwt_xy) / (n_x + self.n_x_y)

    def euclidean_distance(self):
        """Calculate the Euclidean distance between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Euclidean distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return sqrt(np_sum(power(qlist - rlist, 2)))

    def manhattan_distance(self):
        """Calculate the Manhattan distance between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Manhattan distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return np_sum(absolute(qlist - rlist))

    def jaccard_distance(self):
        """Calculate the Jaccard distance between the experimental and reference mass spectra.

        Computed on the abundance vectors as
        ``sum((q - r)**2) / (sum(q**2) + sum(r**2) - sum(q * r))``.

        Returns
        -------
        correlation : float
            Jaccard distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return np_sum(power(qlist - rlist, 2)) / (
            np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist)
        )

    def extra_distances(self):
        """Function to calculate distances using additional metrics defined in math_distance.py

        Currently, calculates all distances listed in `methods_name` for
        which math_distance provides a function of the same name.

        Returns
        -------
        dict_res : dict
            Dictionary mapping each metric name to the distance between the
            experimental and reference mass spectra.
        """
        from corems.molecular_id.calc import math_distance

        dict_res = {}

        for method in methods_name:
            # metrics without an implementation in math_distance are skipped
            if not hasattr(math_distance, method):
                continue

            metric_func = getattr(math_distance, method)

            if method == "canberra_metric":
                # this metric is fed zero-filled vectors instead of the
                # default 1e-10-filled ones
                x, y = self.nan_fill(self.df, fill_with=0)
                qlist, rlist = self.normalize(x, y, norm_func=self.normalize_func)
            else:
                qlist = self.zero_filled_u_l[0]
                rlist = self.zero_filled_u_l[1]

            dict_res[method] = metric_func(qlist, rlist)

        return dict_res
131class SpectralSimilarity: 132 """Class containing methods for calculating spectral similarity between two mass spectra. 133 134 Parameters 135 ---------- 136 ms_mz_abun_dict : dict 137 Dictionary of mass to abundance values for the experimental mass spectrum. 138 ref_obj : dict 139 Dictionary of mass to abundance values for the reference mass spectrum. 140 norm_func : function 141 Function to normalize the abundance values. 142 143 Attributes 144 ---------- 145 normalize_func : function 146 Function to normalize the abundance values. 147 ms_mz_abun_dict : dict 148 Dictionary of mass to abundance values for the experimental mass spectrum. 149 ref_obj : dict 150 Dictionary of mass to abundance values for the reference mass spectrum. 151 exp_abun : list 152 List of abundance values for the experimental mass spectrum. 153 exp_mz : list 154 List of mass values for the experimental mass spectrum. 155 ref_mz : list 156 List of mass values for the reference mass spectrum. 157 ref_abun : list 158 List of abundance values for the reference mass spectrum. 159 ref_mz_abun_dict : dict 160 Dictionary of mass to abundance values for the reference mass spectrum. 161 df : DataFrame 162 DataFrame containing the experimental and reference mass spectrum data. 163 zero_filled_u_l : tuple 164 Tuple containing the experimental and reference mass spectrum data after zero filling and normalization. 165 common_mz_values : list 166 List of common mass values between the experimental and reference mass spectra. 167 n_x_y : int 168 Number of common mass values between the experimental and reference mass spectra. 169 170 Methods 171 ------- 172 * nan_fill(df, fill_with=0). 173 Fill missing mass values with a given value. 174 * normalize(x, y, norm_func=sum). 175 Normalize the abundance values. 176 * weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10). 177 Calculate the weighted cosine correlation between the experimental and reference mass spectra. 178 * cosine_correlation(). 
179 Calculate the cosine correlation between the experimental and reference mass spectra. 180 * stein_scott(). 181 Calculate the Stein-Scott similarity between the experimental and reference mass spectra. 182 * pearson_correlation(). 183 Calculate the Pearson correlation between the experimental and reference mass spectra. 184 * spearman_correlation(). 185 Calculate the Spearman correlation between the experimental and reference mass spectra. 186 187 188 """ 189 190 def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum): 191 self.normalize_func = norm_func 192 self.ms_mz_abun_dict = ms_mz_abun_dict 193 self.ref_obj = ref_obj 194 195 self.exp_abun = list(self.ms_mz_abun_dict.values()) 196 self.exp_mz = list(self.ms_mz_abun_dict.keys()) 197 198 self.ref_mz = self.ref_obj.get("mz") 199 self.ref_abun = self.ref_obj.get("abundance") 200 201 self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun)) 202 203 # parse to dataframe, easier to zerofill and tranpose 204 self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict]) 205 206 # fill missing mz with abundance 0 207 x, y = self.nan_fill(self.df, fill_with=1e-10) 208 209 self.zero_filled_u_l = self.normalize(x, y, norm_func=self.normalize_func) 210 211 # filter out the mass values that have zero intensities in self.exp_abun 212 exp_mz_filtered = set([k for k in self.exp_mz if self.ms_mz_abun_dict[k] != 0]) 213 214 # filter out the mass values that have zero intensities in self.ref_mz 215 ref_mz_filtered = set([k for k in self.ref_mz if self.ref_mz_abun_dict[k] != 0]) 216 217 # find the intersection/common mass values of both ref and exp, and sort them 218 self.common_mz_values = sorted( 219 list(exp_mz_filtered.intersection(ref_mz_filtered)) 220 ) 221 222 # find the number of common mass values (after filtering 0s) 223 self.n_x_y = len(self.common_mz_values) 224 # print(self.n_x_y) 225 226 def nan_fill(self, df, fill_with=0): 227 """Fill missing mass values with a given value. 
228 229 Parameters 230 ---------- 231 df : DataFrame 232 DataFrame containing the experimental and reference mass spectrum data. 233 fill_with : float 234 Value to fill missing mass values with. 235 236 Returns 237 ------- 238 x : list 239 List of abundance values for the experimental mass spectrum. 240 y : list 241 List of abundance values for the reference mass spectrum.""" 242 df.fillna(fill_with, inplace=True) 243 244 return df.T[0].values, df.T[1].values 245 246 def normalize(self, x, y, norm_func=sum): 247 """Normalize the abundance values. 248 249 Parameters 250 ---------- 251 x : list 252 List of abundance values for the experimental mass spectrum. 253 y : list 254 List of abundance values for the reference mass spectrum. 255 norm_func : function 256 Function to normalize the abundance values. 257 Default is sum 258 259 Returns 260 ------- 261 u_l : tuple 262 Tuple containing the experimental and reference mass spectrum data after zero filling and normalization. 263 """ 264 if norm_func is not None: 265 u_l = (x / norm_func(x), y / norm_func(y)) 266 return u_l 267 else: 268 return (x, y) 269 270 def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10): 271 """Calculate the weighted cosine correlation between the experimental and reference mass spectra. 272 273 Parameters 274 ---------- 275 a : float 276 Weighting factor for the abundance values. 277 Default is 0.5 278 b : float 279 Weighting factor for the mass values. 280 Default is 1.3 281 nanfill : float 282 Value to fill missing mass values with. 283 Default is 1e-10 284 285 Returns 286 ------- 287 correlation : float 288 Weighted cosine correlation between the experimental and reference mass spectra. 
289 """ 290 # create dict['mz'] = abundance, for experimental data 291 # ms_mz_abun_dict = mass_spec.mz_abun_dict 292 # weight exp data 293 294 xc = power(self.exp_abun, a) * power(self.exp_mz, b) 295 296 # track back to individual mz 297 weighted_exp_dict = dict(zip(self.ms_mz_abun_dict.keys(), xc)) 298 299 # weight ref data 300 yc = power(self.ref_obj.get("abundance"), a) * power(self.ref_obj.get("mz"), b) 301 302 ref_mz_abun_dict = dict(zip(self.ref_obj.get("mz"), yc)) 303 304 # parse to dataframe, easier to zerofill and tranpose 305 df = DataFrame([weighted_exp_dict, ref_mz_abun_dict]) 306 307 # fill missing mz with weight {abun**a}{m/z**b} to 0 308 x, y = self.nan_fill(df, fill_with=nanfill) 309 310 # correlation = (1 - cosine(x, y)) 311 312 correlation = dot(x, y) / (norm(x) * norm(y)) 313 314 return correlation 315 316 def cosine_correlation(self): 317 """Calculate the cosine correlation between the experimental and reference mass spectra. 318 319 Returns 320 ------- 321 correlation : float 322 Cosine correlation between the experimental and reference mass spectra. 323 324 """ 325 # calculate cosine correlation, 326 x = self.zero_filled_u_l[0] 327 y = self.zero_filled_u_l[1] 328 329 # correlation = (1 - cosine(x, y)) 330 331 correlation = dot(x, y) / (norm(x) * norm(y)) 332 333 return correlation 334 335 def stein_scott(self): 336 """Calculate the Stein-Scott similarity between the experimental and reference mass spectra. 337 338 Returns 339 ------- 340 s_ss_x_y : float 341 Stein-Scott similarity between the experimental and reference mass spectra. 342 s_ss_x_y_nist : float 343 Stein-Scott similarity between the experimental and reference mass spectra. 
344 """ 345 # TODO check this code 346 if self.n_x_y == 0: 347 return 0, 0 348 349 # count number of non-zero abundance/peak intensity values 350 n_x = sum(a != 0 for a in self.exp_abun) 351 352 s_r_x_y = 0 353 354 a, b = 1, 0 355 356 for i in range(1, self.n_x_y): 357 current_value = self.common_mz_values[i] 358 previous_value = self.common_mz_values[i - 1] 359 360 y_i = self.ref_mz_abun_dict[current_value] 361 y_i_minus1 = self.ref_mz_abun_dict[previous_value] 362 363 lc_current = power(y_i, a) * power(current_value, b) 364 lc_previous = power(y_i_minus1, a) * power(previous_value, b) 365 366 x_i = self.ms_mz_abun_dict[current_value] 367 x_i_minus1 = self.ms_mz_abun_dict[previous_value] 368 369 uc_current = power(x_i, a) * power(current_value, b) 370 uc_previous = power(x_i_minus1, a) * power(previous_value, b) 371 372 T1 = lc_current / lc_previous 373 374 T2 = uc_previous / uc_current 375 376 temp_computation = T1 * T2 377 378 n = 0 379 if temp_computation <= 1: 380 n = 1 381 else: 382 n = -1 383 384 s_r_x_y = s_r_x_y + power(temp_computation, n) 385 386 # finish the calculation of S_R(X,Y) 387 388 s_r_x_y = s_r_x_y / self.n_x_y 389 # using the existing weighted_cosine_correlation function to get S_WC(X,Y) 390 s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0) 391 392 s_ss_x_y = ((n_x * s_wc_x_y) + (self.n_x_y * s_r_x_y)) / (n_x + self.n_x_y) 393 394 s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0) 395 396 s_ss_x_y_nist = ((n_x * s_wc_x_y_nist) + (self.n_x_y * s_r_x_y)) / ( 397 n_x + self.n_x_y 398 ) 399 # final step 400 401 return s_ss_x_y, s_ss_x_y_nist 402 403 def pearson_correlation( 404 self, 405 ): 406 """Calculate the Pearson correlation between the experimental and reference mass spectra. 407 408 Returns 409 ------- 410 correlation : float 411 Pearson correlation between the experimental and reference mass spectra. 
412 """ 413 correlation = pearsonr(self.zero_filled_u_l[0], self.zero_filled_u_l[1]) 414 415 return correlation[0] 416 417 def spearman_correlation(self): 418 """Calculate the Spearman correlation between the experimental and reference mass spectra. 419 420 Returns 421 ------- 422 coorelation : float 423 Spearman correlation between the experimental and reference mass spectra. 424 """ 425 # calculate Spearman correlation 426 # ## TODO - Check axis 427 correlation = spearmanr( 428 self.zero_filled_u_l[0], self.zero_filled_u_l[1], axis=0 429 ) 430 431 return correlation[0] 432 433 def kendall_tau(self): 434 """Calculate the Kendall's tau correlation between the experimental and reference mass spectra. 435 436 Returns 437 ------- 438 correlation : float 439 Kendall's tau correlation between the experimental and reference mass spectra.""" 440 # create dict['mz'] = abundance, for experimental data 441 # self.ms_mz_abun_dict = mass_spec.mz_abun_dict 442 443 # create dict['mz'] = abundance, for experimental data 444 445 # calculate Kendall's tau 446 correlation = kendalltau(self.zero_filled_u_l[0], self.zero_filled_u_l[1]) 447 448 return correlation[0] 449 450 def dft_correlation(self): 451 """Calculate the DFT correlation between the experimental and reference mass spectra. 452 453 Returns 454 ------- 455 correlation : float 456 DFT correlation between the experimental and reference mass spectra. 
457 """ 458 if self.n_x_y == 0: 459 return 0 460 461 # count number of non-zero abundance/peak intensity values 462 n_x = sum(a != 0 for a in self.exp_abun) 463 464 x, y = self.nan_fill(self.df, fill_with=0) 465 466 x, y = self.normalize(x, y, norm_func=self.normalize_func) 467 468 # get the Fourier transform of x and y 469 x_dft = rfft(x).real 470 y_dft = rfft(y).real 471 472 s_dft_xy = dot(x_dft, y_dft) / (norm(x_dft) * norm(y_dft)) 473 474 # using the existing weighted_cosine_correlation function to get S_WC(X,Y) 475 s_wc_x_y = self.weighted_cosine_correlation(nanfill=0) 476 477 # final step 478 s_dft = (n_x * s_wc_x_y + self.n_x_y * s_dft_xy) / (n_x + self.n_x_y) 479 480 return s_dft 481 482 def dwt_correlation(self): 483 """Calculate the DWT correlation between the experimental and reference mass spectra. 484 485 Returns 486 ------- 487 correlation : float 488 DWT correlation between the experimental and reference mass spectra. 489 490 Notes 491 ----- 492 This function requires the PyWavelets library to be installed. 493 This is not a default requirement as this function is not widely used. 494 """ 495 496 from pywt import dwt 497 498 if self.n_x_y == 0: 499 return 0 500 501 # count number of non-zero abundance/peak intensity values 502 n_x = sum(a != 0 for a in self.exp_abun) 503 504 # calculate cosine correlation, 505 x, y = self.nan_fill(self.df, fill_with=0) 506 507 x, y = self.normalize(x, y, norm_func=self.normalize_func) 508 509 # Make x and y into an array 510 x_a = list(x) 511 y_a = list(y) 512 513 # get the wavelet transform of x and y (Daubechies with a filter length of 4. Asymmetric. 
pywavelets function) 514 # Will only use the detail dwt (dwtDd 515 x_dwtD = dwt(x_a, "db2")[1] 516 y_dwtD = dwt(y_a, "db2")[1] 517 518 s_dwt_xy = dot(x_dwtD, y_dwtD) / (norm(x_dwtD) * norm(y_dwtD)) 519 520 # using the existing weighted_cosine_correlation function to get S_WC(X,Y) 521 s_wc_x_y = self.weighted_cosine_correlation(nanfill=0) 522 523 # final step 524 s_dwt = (n_x * s_wc_x_y + self.n_x_y * s_dwt_xy) / (n_x + self.n_x_y) 525 526 return s_dwt 527 528 def euclidean_distance(self): 529 """Calculate the Euclidean distance between the experimental and reference mass spectra. 530 531 Returns 532 ------- 533 correlation : float 534 Euclidean distance between the experimental and reference mass spectra. 535 """ 536 # correlation = euclidean_distance_manual(self.zero_filled_u_l[0], self.zero_filled_u_l[1]) 537 qlist = self.zero_filled_u_l[0] 538 rlist = self.zero_filled_u_l[1] 539 540 correlation = sqrt(np_sum(power(qlist - rlist, 2))) 541 542 return correlation 543 544 def manhattan_distance(self): 545 """Calculate the Manhattan distance between the experimental and reference mass spectra. 546 547 Returns 548 ------- 549 correlation : float 550 Manhattan distance between the experimental and reference mass spectra. 551 """ 552 qlist = self.zero_filled_u_l[0] 553 rlist = self.zero_filled_u_l[1] 554 555 return np_sum(absolute(qlist - rlist)) 556 557 def jaccard_distance(self): 558 """Calculate the Jaccard distance between the experimental and reference mass spectra. 559 560 Returns 561 ------- 562 correlation : float 563 Jaccard distance between the experimental and reference mass spectra. 
564 """ 565 566 def jaccard_similarity(list1, list2): 567 intersection = len(list(set(list1).intersection(list2))) 568 union = (len(list1) + len(list2)) - intersection 569 return float(intersection) / union 570 571 qlist = self.zero_filled_u_l[0] 572 rlist = self.zero_filled_u_l[1] 573 574 return np_sum(power(qlist - rlist, 2)) / ( 575 np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist) 576 ) 577 # correlation = jaccard_similarity(self.zero_filled_u_l[0], self.zero_filled_u_l[1]) 578 # @return correlation 579 580 def extra_distances(self): 581 """Function to calculate distances using additional metrics defined in math_distance.py 582 583 Currently, calculates all distances. 584 585 Returns 586 ------- 587 dict_res : dict 588 Dictionary containing the distances between the experimental and reference mass spectra. 589 590 """ 591 from corems.molecular_id.calc import math_distance 592 593 # qlist = self.zero_filled_u_l[2] 594 # rlist = self.zero_filled_u_l[3] 595 596 dict_res = {} 597 598 for method in methods_name: 599 # function_name = method + "_distance" 600 function_name = method 601 if hasattr(math_distance, function_name): 602 f = getattr(math_distance, function_name) 603 604 if function_name == "canberra_metric": 605 x, y = self.nan_fill(self.df, fill_with=0) 606 607 qlist, rlist = self.normalize(x, y, norm_func=self.normalize_func) 608 # print("qlist:") 609 # print(qlist) 610 # print("rlist:") 611 # print(rlist) 612 613 else: 614 qlist = self.zero_filled_u_l[0] 615 rlist = self.zero_filled_u_l[1] 616 617 dist = f(qlist, rlist) 618 # if method == "Minokowski_3": 619 # print("qlist:") 620 # print(qlist) 621 # print("rlist") 622 # print(rlist) 623 # exit() 624 # if dist == np.nan or dis == np.inf: 625 # print(self.exp_abun) 626 # print(self.exp_mz) 627 # print(function_name) 628 # print(len(self.exp_abun)) 629 # print(len(self.exp_mz)) 630 # print(self.zero_filled_u_l[1]) 631 dict_res[method] = dist 632 633 return dict_res
Class containing methods for calculating spectral similarity between two mass spectra.
Parameters
- ms_mz_abun_dict (dict): Dictionary of mass to abundance values for the experimental mass spectrum.
- ref_obj (dict): Reference mass spectrum, given as a dictionary whose "mz" and "abundance" entries hold the mass values and the matching abundance values.
- norm_func (function): Function to normalize the abundance values.
Attributes
- normalize_func (function): Function to normalize the abundance values.
- ms_mz_abun_dict (dict): Dictionary of mass to abundance values for the experimental mass spectrum.
- ref_obj (dict): Reference mass spectrum, given as a dictionary whose "mz" and "abundance" entries hold the mass values and the matching abundance values.
- exp_abun (list): List of abundance values for the experimental mass spectrum.
- exp_mz (list): List of mass values for the experimental mass spectrum.
- ref_mz (list): List of mass values for the reference mass spectrum.
- ref_abun (list): List of abundance values for the reference mass spectrum.
- ref_mz_abun_dict (dict): Dictionary of mass to abundance values for the reference mass spectrum.
- df (DataFrame): DataFrame containing the experimental and reference mass spectrum data.
- zero_filled_u_l (tuple): Tuple containing the experimental and reference abundance arrays after filling missing mass values with 1e-10 and normalization.
- common_mz_values (list): List of common mass values between the experimental and reference mass spectra.
- n_x_y (int): Number of common mass values between the experimental and reference mass spectra.
Methods
- nan_fill(df, fill_with=0). Fill missing mass values with a given value.
- normalize(x, y, norm_func=sum). Normalize the abundance values.
- weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10). Calculate the weighted cosine correlation between the experimental and reference mass spectra.
- cosine_correlation(). Calculate the cosine correlation between the experimental and reference mass spectra.
- stein_scott(). Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
- pearson_correlation(). Calculate the Pearson correlation between the experimental and reference mass spectra.
- spearman_correlation(). Calculate the Spearman correlation between the experimental and reference mass spectra.
def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum):
    """Store both spectra and pre-compute the data shared by the metrics.

    Parameters
    ----------
    ms_mz_abun_dict : dict
        Mass to abundance mapping of the experimental mass spectrum.
    ref_obj : dict
        Reference mass spectrum; its "mz" and "abundance" entries are read
        and paired element by element.
    norm_func : function
        Function used to normalize the abundance values.
        Default is sum
    """
    self.normalize_func = norm_func
    self.ms_mz_abun_dict = ms_mz_abun_dict
    self.ref_obj = ref_obj

    # experimental spectrum as parallel lists
    self.exp_mz = list(self.ms_mz_abun_dict.keys())
    self.exp_abun = list(self.ms_mz_abun_dict.values())

    # reference spectrum as parallel sequences plus a mz -> abundance lookup
    self.ref_mz = self.ref_obj.get("mz")
    self.ref_abun = self.ref_obj.get("abundance")
    self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun))

    # one row per spectrum, one column per mz; mz values present in only
    # one spectrum show up as NaN in the other row
    self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict])

    # replace the missing abundances by a tiny value, then normalize
    filled_exp, filled_ref = self.nan_fill(self.df, fill_with=1e-10)
    self.zero_filled_u_l = self.normalize(
        filled_exp, filled_ref, norm_func=self.normalize_func
    )

    # mz values carrying a non-zero abundance in each spectrum
    nonzero_exp_mz = {mz for mz in self.exp_mz if self.ms_mz_abun_dict[mz] != 0}
    nonzero_ref_mz = {mz for mz in self.ref_mz if self.ref_mz_abun_dict[mz] != 0}

    # sorted mz values shared by both spectra, and how many there are
    self.common_mz_values = sorted(nonzero_exp_mz & nonzero_ref_mz)
    self.n_x_y = len(self.common_mz_values)
def nan_fill(self, df, fill_with=0):
    """Fill missing mass values with a given value.

    The filling is done on a copy, so ``df`` itself is left untouched.
    This matters because the same frame is filled several times with
    different values; an in-place fill would make every later call with
    another ``fill_with`` a silent no-op.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing the experimental (row 0) and reference (row 1)
        mass spectrum data, one column per mass value.
    fill_with : float
        Value to fill missing mass values with.
        Default is 0

    Returns
    -------
    x : numpy.ndarray
        Abundance values for the experimental mass spectrum.
    y : numpy.ndarray
        Abundance values for the reference mass spectrum.
    """
    # fillna without inplace returns a new frame; transpose once so each
    # spectrum becomes a column that can be selected by its row label
    filled = df.fillna(fill_with).T

    return filled[0].values, filled[1].values
Fill missing mass values with a given value.
Parameters
- df (DataFrame): DataFrame containing the experimental and reference mass spectrum data.
- fill_with (float): Value to fill missing mass values with.
Returns
- x (list): List of abundance values for the experimental mass spectrum.
- y (list): List of abundance values for the reference mass spectrum.
def normalize(self, x, y, norm_func=sum):
    """Scale each abundance vector by the value ``norm_func`` gives for it.

    Parameters
    ----------
    x : array-like
        Abundance values for the experimental mass spectrum.
    y : array-like
        Abundance values for the reference mass spectrum.
    norm_func : function
        Function applied to each vector to obtain its divisor.
        Default is sum. Pass None to skip normalization.

    Returns
    -------
    u_l : tuple
        ``(x / norm_func(x), y / norm_func(y))``, or ``(x, y)`` unchanged
        when ``norm_func`` is None.
    """
    # no normalization requested: hand the inputs straight back
    if norm_func is None:
        return (x, y)

    scaled_x = x / norm_func(x)
    scaled_y = y / norm_func(y)
    return (scaled_x, scaled_y)
Normalize the abundance values.
Parameters
- x (list): List of abundance values for the experimental mass spectrum.
- y (list): List of abundance values for the reference mass spectrum.
- norm_func (function): Function to normalize the abundance values. Default is sum
Returns
- u_l (tuple): Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
    """Cosine correlation of the spectra after weighting each peak.

    Every peak is replaced by ``abundance**a * mz**b`` before the two
    spectra are aligned on their mass values and compared.

    Parameters
    ----------
    a : float
        Exponent applied to the abundance values.
        Default is 0.5
    b : float
        Exponent applied to the mass values.
        Default is 1.3
    nanfill : float
        Weighted value used for masses missing from one of the spectra.
        Default is 1e-10

    Returns
    -------
    correlation : float
        Weighted cosine correlation between the experimental and reference mass spectra.
    """
    ref_mz = self.ref_obj.get("mz")
    ref_abundance = self.ref_obj.get("abundance")

    # weight both spectra
    exp_weights = power(self.exp_abun, a) * power(self.exp_mz, b)
    ref_weights = power(ref_abundance, a) * power(ref_mz, b)

    # map the weights back onto their mz values
    exp_weight_by_mz = dict(zip(self.ms_mz_abun_dict.keys(), exp_weights))
    ref_weight_by_mz = dict(zip(ref_mz, ref_weights))

    # a two-row frame aligns the spectra on mz; gaps are filled with nanfill
    aligned = DataFrame([exp_weight_by_mz, ref_weight_by_mz])
    x, y = self.nan_fill(aligned, fill_with=nanfill)

    return dot(x, y) / (norm(x) * norm(y))
Calculate the weighted cosine correlation between the experimental and reference mass spectra.
Parameters
- a (float): Weighting factor for the abundance values. Default is 0.5
- b (float): Weighting factor for the mass values. Default is 1.3
- nanfill (float): Value to fill missing mass values with. Default is 1e-10
Returns
- correlation (float): Weighted cosine correlation between the experimental and reference mass spectra.
def cosine_correlation(self):
    """Cosine of the angle between the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Cosine correlation between the experimental and reference mass spectra.
    """
    exp_vector = self.zero_filled_u_l[0]
    ref_vector = self.zero_filled_u_l[1]

    # dot product divided by the product of the vector lengths
    length_product = norm(exp_vector) * norm(ref_vector)
    return dot(exp_vector, ref_vector) / length_product
Calculate the cosine correlation between the experimental and reference mass spectra.
Returns
- correlation (float): Cosine correlation between the experimental and reference mass spectra.
def stein_scott(self):
    """Calculate the Stein-Scott similarity between the experimental and reference mass spectra.

    The score blends a weighted cosine correlation with a term built from
    the abundance ratios of neighbouring common peaks. Two variants are
    returned; they differ only in the mass exponent handed to
    ``weighted_cosine_correlation``.

    Returns
    -------
    s_ss_x_y : float
        Similarity using the weighted cosine term with a=0.5, b=3.
    s_ss_x_y_nist : float
        Similarity using the weighted cosine term with a=0.5, b=1.3.
        Both values are 0 when the spectra share no peaks.
    """
    # TODO check this code
    n_common = self.n_x_y
    if n_common == 0:
        return 0, 0

    # number of experimental peaks with a non-zero abundance
    n_x = sum(abundance != 0 for abundance in self.exp_abun)

    # exponents used to weight the peaks inside the ratio term
    abun_exp, mz_exp = 1, 0

    ratio_sum = 0
    shared_mz = self.common_mz_values
    for idx in range(1, n_common):
        mz_now = shared_mz[idx]
        mz_before = shared_mz[idx - 1]

        # weighted reference peaks
        ref_now = power(self.ref_mz_abun_dict[mz_now], abun_exp) * power(
            mz_now, mz_exp
        )
        ref_before = power(self.ref_mz_abun_dict[mz_before], abun_exp) * power(
            mz_before, mz_exp
        )

        # weighted experimental peaks
        exp_now = power(self.ms_mz_abun_dict[mz_now], abun_exp) * power(
            mz_now, mz_exp
        )
        exp_before = power(self.ms_mz_abun_dict[mz_before], abun_exp) * power(
            mz_before, mz_exp
        )

        ratio = (ref_now / ref_before) * (exp_before / exp_now)

        # invert ratios above 1 so each contribution is at most 1
        exponent = 1 if ratio <= 1 else -1
        ratio_sum = ratio_sum + power(ratio, exponent)

    # S_R(X,Y)
    s_r_x_y = ratio_sum / n_common

    total_weight = n_x + n_common

    # S_WC(X,Y) from the existing weighted cosine correlation
    s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0)
    s_ss_x_y = ((n_x * s_wc_x_y) + (n_common * s_r_x_y)) / total_weight

    s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0)
    s_ss_x_y_nist = ((n_x * s_wc_x_y_nist) + (n_common * s_r_x_y)) / total_weight

    return s_ss_x_y, s_ss_x_y_nist
Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
Returns
- s_ss_x_y (float): Stein-Scott similarity between the experimental and reference mass spectra.
- s_ss_x_y_nist (float): Stein-Scott similarity between the experimental and reference mass spectra, computed with the weighted cosine term using a=0.5 and b=1.3 (s_ss_x_y uses a=0.5 and b=3).
def pearson_correlation(self):
    """Pearson correlation of the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Pearson correlation between the experimental and reference mass spectra.
    """
    exp_values = self.zero_filled_u_l[0]
    ref_values = self.zero_filled_u_l[1]

    # pearsonr returns the coefficient first; the p-value is not used
    result = pearsonr(exp_values, ref_values)
    return result[0]
Calculate the Pearson correlation between the experimental and reference mass spectra.
Returns
- correlation (float): Pearson correlation between the experimental and reference mass spectra.
def spearman_correlation(self):
    """Spearman rank correlation of the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Spearman correlation between the experimental and reference mass spectra.
    """
    exp_values = self.zero_filled_u_l[0]
    ref_values = self.zero_filled_u_l[1]

    # ## TODO - Check axis
    # spearmanr returns the coefficient first; the p-value is not used
    result = spearmanr(exp_values, ref_values, axis=0)
    return result[0]
Calculate the Spearman correlation between the experimental and reference mass spectra.
Returns
- correlation (float): Spearman correlation between the experimental and reference mass spectra.
def kendall_tau(self):
    """Kendall's tau of the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Kendall's tau correlation between the experimental and reference mass spectra.
    """
    exp_values = self.zero_filled_u_l[0]
    ref_values = self.zero_filled_u_l[1]

    # kendalltau returns the coefficient first; the p-value is not used
    result = kendalltau(exp_values, ref_values)
    return result[0]
Calculate the Kendall's tau correlation between the experimental and reference mass spectra.
Returns
- correlation (float): Kendall's tau correlation between the experimental and reference mass spectra.
def dft_correlation(self):
    """Calculate the DFT correlation between the experimental and reference mass spectra.

    The cosine correlation of the real parts of the spectra's Fourier
    transforms is blended with the weighted cosine correlation.

    Returns
    -------
    correlation : float
        DFT correlation between the experimental and reference mass spectra.
        0 when the spectra share no peaks.
    """
    if self.n_x_y == 0:
        return 0

    # number of experimental peaks with a non-zero abundance
    n_x = sum(abundance != 0 for abundance in self.exp_abun)

    filled_exp, filled_ref = self.nan_fill(self.df, fill_with=0)
    exp_norm, ref_norm = self.normalize(
        filled_exp, filled_ref, norm_func=self.normalize_func
    )

    # real part of the Fourier transform of each spectrum
    exp_dft = rfft(exp_norm).real
    ref_dft = rfft(ref_norm).real

    s_dft_xy = dot(exp_dft, ref_dft) / (norm(exp_dft) * norm(ref_dft))

    # S_WC(X,Y) from the existing weighted cosine correlation
    s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

    # weighted average of both terms
    return (n_x * s_wc_x_y + self.n_x_y * s_dft_xy) / (n_x + self.n_x_y)
Calculate the DFT correlation between the experimental and reference mass spectra.
Returns
- correlation (float): DFT correlation between the experimental and reference mass spectra.
def dwt_correlation(self):
    """Calculate the DWT correlation between the experimental and reference mass spectra.

    The cosine correlation of the detail coefficients of a single-level
    "db2" wavelet transform is blended with the weighted cosine correlation.

    Returns
    -------
    correlation : float
        DWT correlation between the experimental and reference mass spectra.
        0 when the spectra share no peaks.

    Notes
    -----
    This function requires the PyWavelets library to be installed.
    This is not a default requirement as this function is not widely used.
    """
    from pywt import dwt

    if self.n_x_y == 0:
        return 0

    # number of experimental peaks with a non-zero abundance
    n_x = sum(abundance != 0 for abundance in self.exp_abun)

    filled_exp, filled_ref = self.nan_fill(self.df, fill_with=0)
    exp_norm, ref_norm = self.normalize(
        filled_exp, filled_ref, norm_func=self.normalize_func
    )

    # dwt gives (approximation, detail); only the detail part is used
    _, exp_detail = dwt(list(exp_norm), "db2")
    _, ref_detail = dwt(list(ref_norm), "db2")

    s_dwt_xy = dot(exp_detail, ref_detail) / (norm(exp_detail) * norm(ref_detail))

    # S_WC(X,Y) from the existing weighted cosine correlation
    s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

    # weighted average of both terms
    return (n_x * s_wc_x_y + self.n_x_y * s_dwt_xy) / (n_x + self.n_x_y)
Calculate the DWT correlation between the experimental and reference mass spectra.
Returns
- correlation (float): DWT correlation between the experimental and reference mass spectra.
Notes
This function requires the PyWavelets library to be installed. This is not a default requirement as this function is not widely used.
def euclidean_distance(self):
    """Euclidean distance between the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Euclidean distance between the experimental and reference mass spectra.
    """
    difference = self.zero_filled_u_l[0] - self.zero_filled_u_l[1]

    # square root of the summed squared differences
    squared_total = np_sum(power(difference, 2))
    return sqrt(squared_total)
Calculate the Euclidean distance between the experimental and reference mass spectra.
Returns
- correlation (float): Euclidean distance between the experimental and reference mass spectra.
def manhattan_distance(self):
    """Manhattan distance between the two filled, normalized spectra.

    Returns
    -------
    correlation : float
        Manhattan distance between the experimental and reference mass spectra.
    """
    difference = self.zero_filled_u_l[0] - self.zero_filled_u_l[1]

    # sum of the absolute element-wise differences
    return np_sum(absolute(difference))
Calculate the Manhattan distance between the experimental and reference mass spectra.
Returns
- correlation (float): Manhattan distance between the experimental and reference mass spectra.
def jaccard_distance(self):
    """Calculate the Jaccard distance between the experimental and reference mass spectra.

    Uses the real-valued form
    ``sum((q - r)**2) / (sum(q**2) + sum(r**2) - sum(q * r))``
    on the filled, normalized spectra.

    Returns
    -------
    correlation : float
        Jaccard distance between the experimental and reference mass spectra.
    """
    qlist = self.zero_filled_u_l[0]
    rlist = self.zero_filled_u_l[1]

    squared_difference = np_sum(power(qlist - rlist, 2))
    denominator = (
        np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist)
    )

    return squared_difference / denominator
Calculate the Jaccard distance between the experimental and reference mass spectra.
Returns
- correlation (float): Jaccard distance between the experimental and reference mass spectra.
def extra_distances(self):
    """Calculate distances using the additional metrics defined in math_distance.py

    Every name in ``methods_name`` that exists as an attribute of
    ``math_distance`` is called with the two spectra; names without a
    matching attribute are skipped.

    Returns
    -------
    dict_res : dict
        Dictionary mapping each method name to the distance between the
        experimental and reference mass spectra.
    """
    from corems.molecular_id.calc import math_distance

    dict_res = {}

    for method in methods_name:
        if not hasattr(math_distance, method):
            continue

        distance_func = getattr(math_distance, method)

        if method == "canberra_metric":
            # this metric gets the spectra re-filled with 0 and normalized
            # again instead of the arrays prepared at construction time
            x, y = self.nan_fill(self.df, fill_with=0)
            qlist, rlist = self.normalize(x, y, norm_func=self.normalize_func)
        else:
            qlist = self.zero_filled_u_l[0]
            rlist = self.zero_filled_u_l[1]

        dict_res[method] = distance_func(qlist, rlist)

    return dict_res
Function to calculate distances using additional metrics defined in math_distance.py
Currently, calculates all distances.
Returns
- dict_res (dict): Dictionary containing the distances between the experimental and reference mass spectra.