corems.molecular_id.calc.SpectralSimilarity

  1__author__ = "Yuri E. Corilo"
  2__date__ = "Jun 09, 2021"
  3
  4from numpy.fft import rfft
  5from scipy.stats import pearsonr, spearmanr, kendalltau
  6from numpy import (
  7    power,
  8    dot,
  9    absolute,
 10    sqrt,
 11)
 12from numpy import sum as np_sum
 13from numpy.linalg import norm
 14from pandas import DataFrame
 15import numpy as np
 16
 17methods_name = {
 18    # "entropy_distance": "Entropy Distance",
 19    # "weighted_entropy_distance": "Dynamic weighted entropy Distance",
 20    "chebyshev_distance": "Chebyshev Distance",
 21    "squared_euclidean_distance": "Squared Euclidean Distance",
 22    "fidelity_similarity": "Fidelity Similarity",
 23    "matusita_distance": "Matusita Distance",
 24    "squared_chord_distance": "Squared-chord Distance",
 25    # "bhattacharya_1_distance": "Bhattacharya 1 Distance",
 26    # "bhattacharya_2_distance": "Bhattacharya 2 Distance",
 27    "harmonic_mean_similarity": "Harmonic mean Distance",
 28    "Pearson_chi_squared_distance": "Pearson Chi Squared Distance",
 29    "Neyman_chi_squared_distance": "Neyman Chi Squared Distance",
 30    "probabilistic_symmetric_chi_squared_distance": "Probabilistic symmetric X2 Distance",
 31    "topsoe_distance": "Topsoe Distance",
 32    "chernoff_distance": "Chernoff Distance",
 33    "ruzicka_distance": "Ruzicka Distance",
 34    "roberts_distance": "Roberts Distance",
 35    # "intersection_distance": "Intersection Distance",
 36    "motyka_distance": "Motyka Distance",
 37    "canberra_distance": "Canberra Distance",
 38    "canberra_metric": "Canberra Metric",
 39    "kulczynski_1_distance": "Kulczynski 1 Distance",
 40    # "baroni_urbani_buser_distance": "Baroni-Urbani-Buser Distance",
 41    # "penrose_size_distance": "Penrose size Distance",
 42    # "mean_character_distance": "Mean character Distance",
 43    "lorentzian_distance": "Lorentzian Distance",
 44    # "penrose_shape_distance": "Penrose shape Distance",
 45    "clark_distance": "Clark Distance",
 46    "hellinger_distance": "Hellinger Distance",
 47    "whittaker_index_of_association_distance": "Whittaker index of association Distance",
 48    # "similarity_index_distance": "Similarity Index Distance",
 49    # "improved_similarity_distance": "Improved Similarity",
 50    # "absolute_value_distance": "Absolute Value Distance",
 51    "spectral_contrast_angle_distance": "Spectral Contrast Angle",
 52    "wave_hedges_distance": "Wave Hedges Distance",
 53    "dice_similarity": "Dice Similarity",
 54    "inner_product_distance": "Inner Product Distance",
 55    "divergence_distance": "Divergence Distance",
 56    "jensen_difference_distance": "Jensen Differences Distance",
 57    "kumar_johnson_distance": "Kumar Johnson Distance",
 58    "avg_l_distance": "Avg (L1, L8) Distance",
 59    "vicis_wave_hadges_distance": "Vicis Wave Hadges Distance",
 60    "vicis_symmetric_chi_squared_1_distance": "Vicis-Symmetric X2 1 Distance",
 61    "vicis_symmetric_chi_squared_2_distance": "Vicis-Symmetric X2 2 Distance",
 62    "vicis_symmetric_chi_squared_3_distance": "Vicis-Symmetric X2 3 Distance",
 63    "max_symmetric_chi_squared_distance": "Max Symmetric Chi Squared Distance",
 64    "min_symmetric_chi_squared_distance": "Min Symmetric Chi Squared Distance",
 65    # "ms_for_id_v1": "MSforID Distance version 1",
 66    # "ms_for_id": "MSforID Distance",
 67    "additive_sym_chi_sq": "Additive Symmetric Chi Squared",
 68    "bhattacharya_distance": "Battacharya Distance",
 69    "generalized_ochiai_index": "Generalized Ochiai Index",
 70    "gower_distance": "Gower Distance",
 71    "impr_sqrt_cosine_sim": "Improved Square Root Cosine Similarity",
 72    "intersection_sim": "Intersection Similarity",
 73    "j_divergence": "J Divergence",
 74    "jensen_shannon_index": "Jensen Shannon Index",
 75    "k_divergence": "K Divergence",
 76    "VW6": "VW6",
 77    "VW5": "VW5",
 78    "VW4": "VW4",
 79    "VW3": "VW3",
 80    "VW2": "VW2",
 81    "VW1": "VW1",
 82    "taneja_divergence": "Taneja Divergence",
 83    "symmetric_chi_squared_distance": "Symmetric Chi Squared Distance",
 84    "squared_chi_squared_distance": "Squared Chi Squared Distance",
 85    "square_root_cosine_correlation": "Square Root Cosine Correlation",
 86    "sorensen_distance": "Sorensen Distance",
 87    "Minokowski_3": "Minokowski 3 Distance",
 88    "Minokowski_4": "Minokowski 4 Distance",
 89    "kumarjohnson_divergence": "Kumar Johnson Divergence",
 90    "kumarhassebrook_similarity": "Kumar Hassebrook Similarity",
 91    "kullbackleibler_divergence": "Kullback Leibler Divergence",
 92    "soergel_distance": "Soergel Distance",
 93}
 94
 95methods_scale = {
 96    "entropy": [0, np.log(4)],
 97    "weighted_entropy": [0, np.log(4)],
 98    "absolute_value": [0, 2],
 99    "avg_l": [0, 1.5],
100    "bhattacharya_1": [0, np.arccos(0) ** 2],
101    "bhattacharya_2": [0, np.inf],
102    "canberra": [0, np.inf],
103    "clark": [0, np.inf],
104    "divergence": [0, np.inf],
105    "euclidean": [0, np.sqrt(2)],
106    "hellinger": [0, np.inf],
107    "improved_similarity": [0, np.inf],
108    "lorentzian": [0, np.inf],
109    "manhattan": [0, 2],
110    "matusita": [0, np.sqrt(2)],
111    "mean_character": [0, 2],
112    "motyka": [-0.5, 0],
113    "ms_for_id": [-np.inf, 0],
114    "ms_for_id_v1": [0, np.inf],
115    "pearson_correlation": [-1, 1],
116    "penrose_shape": [0, np.sqrt(2)],
117    "penrose_size": [0, np.inf],
118    "probabilistic_symmetric_chi_squared": [0, 1],
119    "similarity_index": [0, np.inf],
120    "squared_chord": [0, 2],
121    "squared_euclidean": [0, 2],
122    "symmetric_chi_squared": [0, 0.5 * np.sqrt(2)],
123    "topsoe": [0, np.sqrt(2)],
124    "vicis_symmetric_chi_squared_3": [0, 2],
125    "wave_hedges": [0, np.inf],
126    "whittaker_index_of_association": [0, np.inf],
127}
128
129
class SpectralSimilarity:
    """Similarity and distance scores between two mass spectra.

    Parameters
    ----------
    ms_mz_abun_dict : dict
        Mapping of m/z to abundance for the experimental mass spectrum.
    ref_obj : dict
        Reference mass spectrum. Must provide the keys ``"mz"`` and
        ``"abundance"`` (read through ``ref_obj.get``), holding parallel
        sequences of m/z and abundance values.
    norm_func : function
        Function used to normalize the abundance values. Default is the
        builtin ``sum``. ``None`` disables normalization.

    Attributes
    ----------
    normalize_func : function
        Function used to normalize the abundance values.
    ms_mz_abun_dict : dict
        Mapping of m/z to abundance for the experimental mass spectrum.
    ref_obj : dict
        Reference mass spectrum as passed to the constructor.
    exp_abun : list
        Abundance values of the experimental mass spectrum.
    exp_mz : list
        m/z values of the experimental mass spectrum.
    ref_mz : list
        m/z values of the reference mass spectrum.
    ref_abun : list
        Abundance values of the reference mass spectrum.
    ref_mz_abun_dict : dict
        Mapping of m/z to abundance for the reference mass spectrum.
    df : DataFrame
        Two-row frame (row 0: experimental, row 1: reference) with one column
        per m/z value. m/z values missing from one spectrum are NaN; the
        frame is never modified by the methods of this class.
    zero_filled_u_l : tuple
        Experimental and reference abundance arrays after filling missing
        m/z values with 1e-10 and normalizing.
    common_mz_values : list
        Sorted m/z values that have a non-zero abundance in both spectra.
    n_x_y : int
        Number of entries in ``common_mz_values``.

    Methods
    -------
    * nan_fill(df, fill_with=0).
        Fill missing mass values with a given value.
    * normalize(x, y, norm_func=sum).
        Normalize the abundance values.
    * weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10).
        Weighted cosine correlation between the two spectra.
    * cosine_correlation().
        Cosine correlation between the two spectra.
    * stein_scott().
        Stein-Scott similarity between the two spectra.
    * pearson_correlation().
        Pearson correlation between the two spectra.
    * spearman_correlation().
        Spearman correlation between the two spectra.
    * kendall_tau().
        Kendall's tau correlation between the two spectra.
    * dft_correlation().
        DFT based composite similarity between the two spectra.
    * dwt_correlation().
        DWT based composite similarity between the two spectra.
    * euclidean_distance().
        Euclidean distance between the two spectra.
    * manhattan_distance().
        Manhattan distance between the two spectra.
    * jaccard_distance().
        Jaccard distance between the two spectra.
    * extra_distances().
        All metrics listed in ``methods_name`` that ``math_distance`` provides.
    """

    def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum):
        self.normalize_func = norm_func
        self.ms_mz_abun_dict = ms_mz_abun_dict
        self.ref_obj = ref_obj

        self.exp_abun = list(self.ms_mz_abun_dict.values())
        self.exp_mz = list(self.ms_mz_abun_dict.keys())

        self.ref_mz = self.ref_obj.get("mz")
        self.ref_abun = self.ref_obj.get("abundance")

        self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun))

        # parse to dataframe, easier to zero-fill and transpose
        self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict])

        # Missing m/z values get a tiny non-zero abundance (1e-10) here.
        # ``self.df`` itself keeps its NaNs so that other methods can still
        # request a different fill value later on.
        x, y = self.nan_fill(self.df, fill_with=1e-10)

        self.zero_filled_u_l = self.normalize(x, y, norm_func=self.normalize_func)

        # m/z values with a non-zero abundance in the experimental spectrum
        exp_mz_filtered = {k for k in self.exp_mz if self.ms_mz_abun_dict[k] != 0}

        # m/z values with a non-zero abundance in the reference spectrum
        ref_mz_filtered = {k for k in self.ref_mz if self.ref_mz_abun_dict[k] != 0}

        # sorted intersection of both spectra
        self.common_mz_values = sorted(exp_mz_filtered & ref_mz_filtered)

        # number of common mass values (after filtering 0s)
        self.n_x_y = len(self.common_mz_values)

    @staticmethod
    def _cosine(x, y):
        """Return the cosine of the angle between the vectors x and y."""
        return dot(x, y) / (norm(x) * norm(y))

    def _composite_score(self, s_wc, s_other):
        """Blend a weighted cosine score with a second score.

        The weighted cosine score is weighted by the number of non-zero
        experimental abundances and the second score by ``n_x_y``.
        """
        # count number of non-zero abundance/peak intensity values
        n_x = sum(a != 0 for a in self.exp_abun)

        return (n_x * s_wc + self.n_x_y * s_other) / (n_x + self.n_x_y)

    def _zero_filled_normalized(self):
        """Return both spectra with missing m/z set to 0, then normalized."""
        x, y = self.nan_fill(self.df, fill_with=0)

        return self.normalize(x, y, norm_func=self.normalize_func)

    def nan_fill(self, df, fill_with=0):
        """Fill missing mass values with a given value.

        The input frame is left untouched. Filling in place would make the
        first fill value stick and turn every later call with a different
        ``fill_with`` on the same frame into a no-op.

        Parameters
        ----------
        df : DataFrame
            DataFrame containing the experimental (row 0) and reference
            (row 1) mass spectrum data.
        fill_with : float
            Value to fill missing mass values with.

        Returns
        -------
        x : numpy.ndarray
            Abundance values for the experimental mass spectrum.
        y : numpy.ndarray
            Abundance values for the reference mass spectrum.
        """
        filled = df.fillna(fill_with).T

        return filled[0].values, filled[1].values

    def normalize(self, x, y, norm_func=sum):
        """Normalize the abundance values.

        Parameters
        ----------
        x : numpy.ndarray
            Abundance values for the experimental mass spectrum.
        y : numpy.ndarray
            Abundance values for the reference mass spectrum.
        norm_func : function
            Function whose result each array is divided by.
            Default is sum. If None, the inputs are returned unchanged.

        Returns
        -------
        u_l : tuple
            Tuple with the (normalized) experimental and reference arrays.
        """
        if norm_func is None:
            return (x, y)

        return (x / norm_func(x), y / norm_func(y))

    def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
        """Calculate the weighted cosine correlation between the experimental and reference mass spectra.

        Every peak is weighted as ``abundance**a * mz**b`` before the cosine
        is computed.

        Parameters
        ----------
        a : float
            Weighting factor for the abundance values.
            Default is 0.5
        b : float
            Weighting factor for the mass values.
            Default is 1.3
        nanfill : float
            Value to fill missing mass values with.
            Default is 1e-10

        Returns
        -------
        correlation : float
            Weighted cosine correlation between the experimental and reference mass spectra.
        """
        # weight exp data and track it back to the individual mz
        xc = power(self.exp_abun, a) * power(self.exp_mz, b)
        weighted_exp_dict = dict(zip(self.ms_mz_abun_dict.keys(), xc))

        # weight ref data and track it back to the individual mz
        yc = power(self.ref_obj.get("abundance"), a) * power(self.ref_obj.get("mz"), b)
        weighted_ref_dict = dict(zip(self.ref_obj.get("mz"), yc))

        # parse to dataframe, easier to zero-fill and transpose
        df = DataFrame([weighted_exp_dict, weighted_ref_dict])

        # missing mz get the weight ``nanfill``
        x, y = self.nan_fill(df, fill_with=nanfill)

        return self._cosine(x, y)

    def cosine_correlation(self):
        """Calculate the cosine correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Cosine correlation between the experimental and reference mass spectra.
        """
        return self._cosine(self.zero_filled_u_l[0], self.zero_filled_u_l[1])

    def stein_scott(self):
        """Calculate the Stein-Scott similarity between the experimental and reference mass spectra.

        Returns
        -------
        s_ss_x_y : float
            Composite score using a weighted cosine with a=0.5, b=3.
        s_ss_x_y_nist : float
            Composite score using a weighted cosine with a=0.5, b=1.3.

        Notes
        -----
        Returns ``(0, 0)`` when the spectra share no non-zero peak.
        """
        # TODO check this code
        if self.n_x_y == 0:
            return 0, 0

        s_r_x_y = 0

        # exponents for abundance (a) and m/z (b) in the ratio term
        a, b = 1, 0

        # walk over neighbouring pairs of common m/z values
        for previous_value, current_value in zip(
            self.common_mz_values, self.common_mz_values[1:]
        ):
            y_i = self.ref_mz_abun_dict[current_value]
            y_i_minus1 = self.ref_mz_abun_dict[previous_value]

            lc_current = power(y_i, a) * power(current_value, b)
            lc_previous = power(y_i_minus1, a) * power(previous_value, b)

            x_i = self.ms_mz_abun_dict[current_value]
            x_i_minus1 = self.ms_mz_abun_dict[previous_value]

            uc_current = power(x_i, a) * power(current_value, b)
            uc_previous = power(x_i_minus1, a) * power(previous_value, b)

            temp_computation = (lc_current / lc_previous) * (uc_previous / uc_current)

            # ratios above 1 are inverted before being accumulated
            n = 1 if temp_computation <= 1 else -1

            s_r_x_y = s_r_x_y + power(temp_computation, n)

        # finish the calculation of S_R(X,Y)
        s_r_x_y = s_r_x_y / self.n_x_y

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0)
        s_ss_x_y = self._composite_score(s_wc_x_y, s_r_x_y)

        s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0)
        s_ss_x_y_nist = self._composite_score(s_wc_x_y_nist, s_r_x_y)

        return s_ss_x_y, s_ss_x_y_nist

    def pearson_correlation(
        self,
    ):
        """Calculate the Pearson correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Pearson correlation between the experimental and reference mass spectra.
        """
        correlation = pearsonr(self.zero_filled_u_l[0], self.zero_filled_u_l[1])

        return correlation[0]

    def spearman_correlation(self):
        """Calculate the Spearman correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Spearman correlation between the experimental and reference mass spectra.
        """
        # ## TODO - Check axis
        correlation = spearmanr(
            self.zero_filled_u_l[0], self.zero_filled_u_l[1], axis=0
        )

        return correlation[0]

    def kendall_tau(self):
        """Calculate the Kendall's tau correlation between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Kendall's tau correlation between the experimental and reference mass spectra.
        """
        correlation = kendalltau(self.zero_filled_u_l[0], self.zero_filled_u_l[1])

        return correlation[0]

    def dft_correlation(self):
        """Calculate the DFT correlation between the experimental and reference mass spectra.

        The cosine between the real parts of the Fourier transforms of both
        zero-filled, normalized spectra is blended with the weighted cosine
        correlation.

        Returns
        -------
        correlation : float
            DFT correlation between the experimental and reference mass spectra.
            0 when the spectra share no non-zero peak.
        """
        if self.n_x_y == 0:
            return 0

        x, y = self._zero_filled_normalized()

        # real part of the Fourier transform of x and y
        x_dft = rfft(x).real
        y_dft = rfft(y).real

        s_dft_xy = self._cosine(x_dft, y_dft)

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

        return self._composite_score(s_wc_x_y, s_dft_xy)

    def dwt_correlation(self):
        """Calculate the DWT correlation between the experimental and reference mass spectra.

        The cosine between the detail coefficients of the wavelet transforms
        of both zero-filled, normalized spectra is blended with the weighted
        cosine correlation.

        Returns
        -------
        correlation : float
            DWT correlation between the experimental and reference mass spectra.
            0 when the spectra share no non-zero peak.

        Notes
        -----
        This function requires the PyWavelets library to be installed.
            This is not a default requirement as this function is not widely used.
        """

        from pywt import dwt

        if self.n_x_y == 0:
            return 0

        x, y = self._zero_filled_normalized()

        # wavelet transform of x and y ("db2": Daubechies with a filter
        # length of 4); only the detail coefficients (index 1) are used
        x_dwtD = dwt(list(x), "db2")[1]
        y_dwtD = dwt(list(y), "db2")[1]

        s_dwt_xy = self._cosine(x_dwtD, y_dwtD)

        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)

        return self._composite_score(s_wc_x_y, s_dwt_xy)

    def euclidean_distance(self):
        """Calculate the Euclidean distance between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Euclidean distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return sqrt(np_sum(power(qlist - rlist, 2)))

    def manhattan_distance(self):
        """Calculate the Manhattan distance between the experimental and reference mass spectra.

        Returns
        -------
        correlation : float
            Manhattan distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return np_sum(absolute(qlist - rlist))

    def jaccard_distance(self):
        """Calculate the Jaccard distance between the experimental and reference mass spectra.

        Computed on the abundance vectors as
        ``sum((q - r)**2) / (sum(q**2) + sum(r**2) - sum(q * r))``.

        Returns
        -------
        correlation : float
            Jaccard distance between the experimental and reference mass spectra.
        """
        qlist = self.zero_filled_u_l[0]
        rlist = self.zero_filled_u_l[1]

        return np_sum(power(qlist - rlist, 2)) / (
            np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist)
        )

    def extra_distances(self):
        """Function to calculate distances using additional metrics defined in math_distance.py

        Currently, calculates all distances. Names from ``methods_name`` that
        have no matching function in ``math_distance`` are skipped.

        Returns
        -------
        dict_res : dict
            Dictionary containing the distances between the experimental and reference mass spectra.
        """
        from corems.molecular_id.calc import math_distance

        dict_res = {}

        for method in methods_name:
            if not hasattr(math_distance, method):
                continue

            f = getattr(math_distance, method)

            if method == "canberra_metric":
                # this metric is fed spectra whose missing m/z are exactly 0
                qlist, rlist = self._zero_filled_normalized()
            else:
                qlist = self.zero_filled_u_l[0]
                rlist = self.zero_filled_u_l[1]

            dict_res[method] = f(qlist, rlist)

        return dict_res
methods_name = {'chebyshev_distance': 'Chebyshev Distance', 'squared_euclidean_distance': 'Squared Euclidean Distance', 'fidelity_similarity': 'Fidelity Similarity', 'matusita_distance': 'Matusita Distance', 'squared_chord_distance': 'Squared-chord Distance', 'harmonic_mean_similarity': 'Harmonic mean Distance', 'Pearson_chi_squared_distance': 'Pearson Chi Squared Distance', 'Neyman_chi_squared_distance': 'Neyman Chi Squared Distance', 'probabilistic_symmetric_chi_squared_distance': 'Probabilistic symmetric X2 Distance', 'topsoe_distance': 'Topsoe Distance', 'chernoff_distance': 'Chernoff Distance', 'ruzicka_distance': 'Ruzicka Distance', 'roberts_distance': 'Roberts Distance', 'motyka_distance': 'Motyka Distance', 'canberra_distance': 'Canberra Distance', 'canberra_metric': 'Canberra Metric', 'kulczynski_1_distance': 'Kulczynski 1 Distance', 'lorentzian_distance': 'Lorentzian Distance', 'clark_distance': 'Clark Distance', 'hellinger_distance': 'Hellinger Distance', 'whittaker_index_of_association_distance': 'Whittaker index of association Distance', 'spectral_contrast_angle_distance': 'Spectral Contrast Angle', 'wave_hedges_distance': 'Wave Hedges Distance', 'dice_similarity': 'Dice Similarity', 'inner_product_distance': 'Inner Product Distance', 'divergence_distance': 'Divergence Distance', 'jensen_difference_distance': 'Jensen Differences Distance', 'kumar_johnson_distance': 'Kumar Johnson Distance', 'avg_l_distance': 'Avg (L1, L8) Distance', 'vicis_wave_hadges_distance': 'Vicis Wave Hadges Distance', 'vicis_symmetric_chi_squared_1_distance': 'Vicis-Symmetric X2 1 Distance', 'vicis_symmetric_chi_squared_2_distance': 'Vicis-Symmetric X2 2 Distance', 'vicis_symmetric_chi_squared_3_distance': 'Vicis-Symmetric X2 3 Distance', 'max_symmetric_chi_squared_distance': 'Max Symmetric Chi Squared Distance', 'min_symmetric_chi_squared_distance': 'Min Symmetric Chi Squared Distance', 'additive_sym_chi_sq': 'Additive Symmetric Chi Squared', 'bhattacharya_distance': 
'Battacharya Distance', 'generalized_ochiai_index': 'Generalized Ochiai Index', 'gower_distance': 'Gower Distance', 'impr_sqrt_cosine_sim': 'Improved Square Root Cosine Similarity', 'intersection_sim': 'Intersection Similarity', 'j_divergence': 'J Divergence', 'jensen_shannon_index': 'Jensen Shannon Index', 'k_divergence': 'K Divergence', 'VW6': 'VW6', 'VW5': 'VW5', 'VW4': 'VW4', 'VW3': 'VW3', 'VW2': 'VW2', 'VW1': 'VW1', 'taneja_divergence': 'Taneja Divergence', 'symmetric_chi_squared_distance': 'Symmetric Chi Squared Distance', 'squared_chi_squared_distance': 'Squared Chi Squared Distance', 'square_root_cosine_correlation': 'Square Root Cosine Correlation', 'sorensen_distance': 'Sorensen Distance', 'Minokowski_3': 'Minokowski 3 Distance', 'Minokowski_4': 'Minokowski 4 Distance', 'kumarjohnson_divergence': 'Kumar Johnson Divergence', 'kumarhassebrook_similarity': 'Kumar Hassebrook Similarity', 'kullbackleibler_divergence': 'Kullback Leibler Divergence', 'soergel_distance': 'Soergel Distance'}
methods_scale = {'entropy': [0, 1.3862943611198906], 'weighted_entropy': [0, 1.3862943611198906], 'absolute_value': [0, 2], 'avg_l': [0, 1.5], 'bhattacharya_1': [0, 2.4674011002723395], 'bhattacharya_2': [0, inf], 'canberra': [0, inf], 'clark': [0, inf], 'divergence': [0, inf], 'euclidean': [0, 1.4142135623730951], 'hellinger': [0, inf], 'improved_similarity': [0, inf], 'lorentzian': [0, inf], 'manhattan': [0, 2], 'matusita': [0, 1.4142135623730951], 'mean_character': [0, 2], 'motyka': [-0.5, 0], 'ms_for_id': [-inf, 0], 'ms_for_id_v1': [0, inf], 'pearson_correlation': [-1, 1], 'penrose_shape': [0, 1.4142135623730951], 'penrose_size': [0, inf], 'probabilistic_symmetric_chi_squared': [0, 1], 'similarity_index': [0, inf], 'squared_chord': [0, 2], 'squared_euclidean': [0, 2], 'symmetric_chi_squared': [0, 0.7071067811865476], 'topsoe': [0, 1.4142135623730951], 'vicis_symmetric_chi_squared_3': [0, 2], 'wave_hedges': [0, inf], 'whittaker_index_of_association': [0, inf]}
class SpectralSimilarity:
131class SpectralSimilarity:
132    """Class containing methods for calculating spectral similarity between two mass spectra.
133
134    Parameters
135    ----------
136    ms_mz_abun_dict : dict
137        Dictionary of mass to abundance values for the experimental mass spectrum.
138    ref_obj : dict
139        Dictionary with "mz" and "abundance" entries holding the mass and abundance values of the reference mass spectrum.
140    norm_func : function
141        Function to normalize the abundance values.
142
143    Attributes
144    ----------
145    normalize_func : function
146        Function to normalize the abundance values.
147    ms_mz_abun_dict : dict
148        Dictionary of mass to abundance values for the experimental mass spectrum.
149    ref_obj : dict
150        Dictionary with "mz" and "abundance" entries holding the mass and abundance values of the reference mass spectrum.
151    exp_abun : list
152        List of abundance values for the experimental mass spectrum.
153    exp_mz : list
154        List of mass values for the experimental mass spectrum.
155    ref_mz : list
156        List of mass values for the reference mass spectrum.
157    ref_abun : list
158        List of abundance values for the reference mass spectrum.
159    ref_mz_abun_dict : dict
160        Dictionary of mass to abundance values for the reference mass spectrum.
161    df : DataFrame
162        DataFrame containing the experimental and reference mass spectrum data.
163    zero_filled_u_l : tuple
164        Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
165    common_mz_values : list
166        List of common mass values between the experimental and reference mass spectra.
167    n_x_y : int
168        Number of common mass values between the experimental and reference mass spectra.
169
170    Methods
171    -------
172    * nan_fill(df, fill_with=0).
173        Fill missing mass values with a given value.
174    * normalize(x, y, norm_func=sum).
175        Normalize the abundance values.
176    * weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10).
177        Calculate the weighted cosine correlation between the experimental and reference mass spectra.
178    * cosine_correlation().
179        Calculate the cosine correlation between the experimental and reference mass spectra.
180    * stein_scott().
181        Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
182    * pearson_correlation().
183        Calculate the Pearson correlation between the experimental and reference mass spectra.
184    * spearman_correlation().
185        Calculate the Spearman correlation between the experimental and reference mass spectra.
186
187
188    """
189
190    def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum):
191        self.normalize_func = norm_func
192        self.ms_mz_abun_dict = ms_mz_abun_dict
193        self.ref_obj = ref_obj
194
195        self.exp_abun = list(self.ms_mz_abun_dict.values())
196        self.exp_mz = list(self.ms_mz_abun_dict.keys())
197
198        self.ref_mz = self.ref_obj.get("mz")
199        self.ref_abun = self.ref_obj.get("abundance")
200
201        self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun))
202
203        # parse to dataframe, easier to zerofill and tranpose
204        self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict])
205
206        # fill missing mz with abundance 0
207        x, y = self.nan_fill(self.df, fill_with=1e-10)
208
209        self.zero_filled_u_l = self.normalize(x, y, norm_func=self.normalize_func)
210
211        # filter out the mass values that have zero intensities in self.exp_abun
212        exp_mz_filtered = set([k for k in self.exp_mz if self.ms_mz_abun_dict[k] != 0])
213
214        # filter out the mass values that have zero intensities in self.ref_mz
215        ref_mz_filtered = set([k for k in self.ref_mz if self.ref_mz_abun_dict[k] != 0])
216
217        # find the intersection/common mass values of both ref and exp, and sort them
218        self.common_mz_values = sorted(
219            list(exp_mz_filtered.intersection(ref_mz_filtered))
220        )
221
222        # find the number of common mass values (after filtering 0s)
223        self.n_x_y = len(self.common_mz_values)
224        # print(self.n_x_y)
225
226    def nan_fill(self, df, fill_with=0):
227        """Fill missing mass values with a given value.
228
229        Parameters
230        ----------
231        df : DataFrame
232            DataFrame containing the experimental and reference mass spectrum data.
233        fill_with : float
234            Value to fill missing mass values with.
235
236        Returns
237        -------
238        x : list
239            List of abundance values for the experimental mass spectrum.
240        y : list
241            List of abundance values for the reference mass spectrum."""
242        df.fillna(fill_with, inplace=True)
243
244        return df.T[0].values, df.T[1].values
245
246    def normalize(self, x, y, norm_func=sum):
247        """Normalize the abundance values.
248
249        Parameters
250        ----------
251        x : list
252            List of abundance values for the experimental mass spectrum.
253        y : list
254            List of abundance values for the reference mass spectrum.
255        norm_func : function
256            Function to normalize the abundance values.
257            Default is sum
258
259        Returns
260        -------
261        u_l : tuple
262            Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
263        """
264        if norm_func is not None:
265            u_l = (x / norm_func(x), y / norm_func(y))
266            return u_l
267        else:
268            return (x, y)
269
270    def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
271        """Calculate the weighted cosine correlation between the experimental and reference mass spectra.
272
273        Parameters
274        ----------
275        a : float
276            Weighting factor for the abundance values.
277            Default is 0.5
278        b : float
279            Weighting factor for the mass values.
280            Default is 1.3
281        nanfill : float
282            Value to fill missing mass values with.
283            Default is 1e-10
284
285        Returns
286        -------
287        correlation : float
288            Weighted cosine correlation between the experimental and reference mass spectra.
289        """
290        # create dict['mz'] = abundance, for experimental data
291        # ms_mz_abun_dict = mass_spec.mz_abun_dict
292        # weight exp data
293
294        xc = power(self.exp_abun, a) * power(self.exp_mz, b)
295
296        # track back to individual mz
297        weighted_exp_dict = dict(zip(self.ms_mz_abun_dict.keys(), xc))
298
299        # weight ref data
300        yc = power(self.ref_obj.get("abundance"), a) * power(self.ref_obj.get("mz"), b)
301
302        ref_mz_abun_dict = dict(zip(self.ref_obj.get("mz"), yc))
303
304        # parse to dataframe, easier to zerofill and tranpose
305        df = DataFrame([weighted_exp_dict, ref_mz_abun_dict])
306
307        # fill missing mz with weight {abun**a}{m/z**b} to 0
308        x, y = self.nan_fill(df, fill_with=nanfill)
309
310        # correlation = (1 - cosine(x, y))
311
312        correlation = dot(x, y) / (norm(x) * norm(y))
313
314        return correlation
315
316    def cosine_correlation(self):
317        """Calculate the cosine correlation between the experimental and reference mass spectra.
318
319        Returns
320        -------
321        correlation : float
322            Cosine correlation between the experimental and reference mass spectra.
323
324        """
325        # calculate cosine correlation,
326        x = self.zero_filled_u_l[0]
327        y = self.zero_filled_u_l[1]
328
329        # correlation = (1 - cosine(x, y))
330
331        correlation = dot(x, y) / (norm(x) * norm(y))
332
333        return correlation
334
335    def stein_scott(self):
336        """Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
337
338        Returns
339        -------
340        s_ss_x_y : float
341            Stein-Scott similarity between the experimental and reference mass spectra.
342        s_ss_x_y_nist : float
343            Stein-Scott similarity between the experimental and reference mass spectra.
344        """
345        # TODO check this code
346        if self.n_x_y == 0:
347            return 0, 0
348
349        # count number of non-zero abundance/peak intensity values
350        n_x = sum(a != 0 for a in self.exp_abun)
351
352        s_r_x_y = 0
353
354        a, b = 1, 0
355
356        for i in range(1, self.n_x_y):
357            current_value = self.common_mz_values[i]
358            previous_value = self.common_mz_values[i - 1]
359
360            y_i = self.ref_mz_abun_dict[current_value]
361            y_i_minus1 = self.ref_mz_abun_dict[previous_value]
362
363            lc_current = power(y_i, a) * power(current_value, b)
364            lc_previous = power(y_i_minus1, a) * power(previous_value, b)
365
366            x_i = self.ms_mz_abun_dict[current_value]
367            x_i_minus1 = self.ms_mz_abun_dict[previous_value]
368
369            uc_current = power(x_i, a) * power(current_value, b)
370            uc_previous = power(x_i_minus1, a) * power(previous_value, b)
371
372            T1 = lc_current / lc_previous
373
374            T2 = uc_previous / uc_current
375
376            temp_computation = T1 * T2
377
378            n = 0
379            if temp_computation <= 1:
380                n = 1
381            else:
382                n = -1
383
384            s_r_x_y = s_r_x_y + power(temp_computation, n)
385
386        # finish the calculation of S_R(X,Y)
387
388        s_r_x_y = s_r_x_y / self.n_x_y
389        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
390        s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0)
391
392        s_ss_x_y = ((n_x * s_wc_x_y) + (self.n_x_y * s_r_x_y)) / (n_x + self.n_x_y)
393
394        s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0)
395
396        s_ss_x_y_nist = ((n_x * s_wc_x_y_nist) + (self.n_x_y * s_r_x_y)) / (
397            n_x + self.n_x_y
398        )
399        # final step
400
401        return s_ss_x_y, s_ss_x_y_nist
402
403    def pearson_correlation(
404        self,
405    ):
406        """Calculate the Pearson correlation between the experimental and reference mass spectra.
407
408        Returns
409        -------
410        correlation : float
411            Pearson correlation between the experimental and reference mass spectra.
412        """
413        correlation = pearsonr(self.zero_filled_u_l[0], self.zero_filled_u_l[1])
414
415        return correlation[0]
416
417    def spearman_correlation(self):
418        """Calculate the Spearman correlation between the experimental and reference mass spectra.
419
420        Returns
421        -------
422        coorelation : float
423            Spearman correlation between the experimental and reference mass spectra.
424        """
425        # calculate Spearman correlation
426        # ## TODO - Check axis
427        correlation = spearmanr(
428            self.zero_filled_u_l[0], self.zero_filled_u_l[1], axis=0
429        )
430
431        return correlation[0]
432
433    def kendall_tau(self):
434        """Calculate the Kendall's tau correlation between the experimental and reference mass spectra.
435
436        Returns
437        -------
438        correlation : float
439            Kendall's tau correlation between the experimental and reference mass spectra."""
440        # create dict['mz'] = abundance, for experimental data
441        # self.ms_mz_abun_dict = mass_spec.mz_abun_dict
442
443        # create dict['mz'] = abundance, for experimental data
444
445        # calculate Kendall's tau
446        correlation = kendalltau(self.zero_filled_u_l[0], self.zero_filled_u_l[1])
447
448        return correlation[0]
449
450    def dft_correlation(self):
451        """Calculate the DFT correlation between the experimental and reference mass spectra.
452
453        Returns
454        -------
455        correlation : float
456            DFT correlation between the experimental and reference mass spectra.
457        """
458        if self.n_x_y == 0:
459            return 0
460
461        # count number of non-zero abundance/peak intensity values
462        n_x = sum(a != 0 for a in self.exp_abun)
463
464        x, y = self.nan_fill(self.df, fill_with=0)
465
466        x, y = self.normalize(x, y, norm_func=self.normalize_func)
467
468        # get the Fourier transform of x and y
469        x_dft = rfft(x).real
470        y_dft = rfft(y).real
471
472        s_dft_xy = dot(x_dft, y_dft) / (norm(x_dft) * norm(y_dft))
473
474        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
475        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)
476
477        # final step
478        s_dft = (n_x * s_wc_x_y + self.n_x_y * s_dft_xy) / (n_x + self.n_x_y)
479
480        return s_dft
481
482    def dwt_correlation(self):
483        """Calculate the DWT correlation between the experimental and reference mass spectra.
484
485        Returns
486        -------
487        correlation : float
488            DWT correlation between the experimental and reference mass spectra.
489
490        Notes
491        -----
492        This function requires the PyWavelets library to be installed.
493            This is not a default requirement as this function is not widely used.
494        """
495
496        from pywt import dwt
497
498        if self.n_x_y == 0:
499            return 0
500
501        # count number of non-zero abundance/peak intensity values
502        n_x = sum(a != 0 for a in self.exp_abun)
503
504        # calculate cosine correlation,
505        x, y = self.nan_fill(self.df, fill_with=0)
506
507        x, y = self.normalize(x, y, norm_func=self.normalize_func)
508
509        # Make x and y into an array
510        x_a = list(x)
511        y_a = list(y)
512
513        # get the wavelet transform of x and y (Daubechies with a filter length of 4. Asymmetric. pywavelets function)
514        # Will only use the detail dwt (dwtDd
515        x_dwtD = dwt(x_a, "db2")[1]
516        y_dwtD = dwt(y_a, "db2")[1]
517
518        s_dwt_xy = dot(x_dwtD, y_dwtD) / (norm(x_dwtD) * norm(y_dwtD))
519
520        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
521        s_wc_x_y = self.weighted_cosine_correlation(nanfill=0)
522
523        # final step
524        s_dwt = (n_x * s_wc_x_y + self.n_x_y * s_dwt_xy) / (n_x + self.n_x_y)
525
526        return s_dwt
527
528    def euclidean_distance(self):
529        """Calculate the Euclidean distance between the experimental and reference mass spectra.
530
531        Returns
532        -------
533        correlation : float
534            Euclidean distance between the experimental and reference mass spectra.
535        """
536        # correlation = euclidean_distance_manual(self.zero_filled_u_l[0], self.zero_filled_u_l[1])
537        qlist = self.zero_filled_u_l[0]
538        rlist = self.zero_filled_u_l[1]
539
540        correlation = sqrt(np_sum(power(qlist - rlist, 2)))
541
542        return correlation
543
544    def manhattan_distance(self):
545        """Calculate the Manhattan distance between the experimental and reference mass spectra.
546
547        Returns
548        -------
549        correlation : float
550            Manhattan distance between the experimental and reference mass spectra.
551        """
552        qlist = self.zero_filled_u_l[0]
553        rlist = self.zero_filled_u_l[1]
554
555        return np_sum(absolute(qlist - rlist))
556
557    def jaccard_distance(self):
558        """Calculate the Jaccard distance between the experimental and reference mass spectra.
559
560        Returns
561        -------
562        correlation : float
563            Jaccard distance between the experimental and reference mass spectra.
564        """
565
566        def jaccard_similarity(list1, list2):
567            intersection = len(list(set(list1).intersection(list2)))
568            union = (len(list1) + len(list2)) - intersection
569            return float(intersection) / union
570
571        qlist = self.zero_filled_u_l[0]
572        rlist = self.zero_filled_u_l[1]
573
574        return np_sum(power(qlist - rlist, 2)) / (
575            np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist)
576        )
577        # correlation = jaccard_similarity(self.zero_filled_u_l[0], self.zero_filled_u_l[1])
578        # @return correlation
579
580    def extra_distances(self):
581        """Function to calculate distances using additional metrics defined in math_distance.py
582
583        Currently, calculates all distances.
584
585        Returns
586        -------
587        dict_res : dict
588            Dictionary containing the distances between the experimental and reference mass spectra.
589
590        """
591        from corems.molecular_id.calc import math_distance
592
593        # qlist = self.zero_filled_u_l[2]
594        # rlist = self.zero_filled_u_l[3]
595
596        dict_res = {}
597
598        for method in methods_name:
599            # function_name = method + "_distance"
600            function_name = method
601            if hasattr(math_distance, function_name):
602                f = getattr(math_distance, function_name)
603
604                if function_name == "canberra_metric":
605                    x, y = self.nan_fill(self.df, fill_with=0)
606
607                    qlist, rlist = self.normalize(x, y, norm_func=self.normalize_func)
608                    # print("qlist:")
609                    # print(qlist)
610                    # print("rlist:")
611                    # print(rlist)
612
613                else:
614                    qlist = self.zero_filled_u_l[0]
615                    rlist = self.zero_filled_u_l[1]
616
617                dist = f(qlist, rlist)
618                # if method == "Minokowski_3":
619                #    print("qlist:")
620                #    print(qlist)
621                #    print("rlist")
622                #    print(rlist)
623                #    exit()
624                # if dist == np.nan or dis == np.inf:
625                # print(self.exp_abun)
626                # print(self.exp_mz)
627                # print(function_name)
628                # print(len(self.exp_abun))
629                # print(len(self.exp_mz))
630                # print(self.zero_filled_u_l[1])
631                dict_res[method] = dist
632
633        return dict_res

Class containing methods for calculating spectral similarity between two mass spectra.

Parameters
  • ms_mz_abun_dict (dict): Dictionary of mass to abundance values for the experimental mass spectrum.
  • ref_obj (dict): Dictionary with "mz" and "abundance" entries holding the mass and abundance values of the reference mass spectrum.
  • norm_func (function): Function to normalize the abundance values.
Attributes
  • normalize_func (function): Function to normalize the abundance values.
  • ms_mz_abun_dict (dict): Dictionary of mass to abundance values for the experimental mass spectrum.
  • ref_obj (dict): Dictionary with "mz" and "abundance" entries holding the mass and abundance values of the reference mass spectrum.
  • exp_abun (list): List of abundance values for the experimental mass spectrum.
  • exp_mz (list): List of mass values for the experimental mass spectrum.
  • ref_mz (list): List of mass values for the reference mass spectrum.
  • ref_abun (list): List of abundance values for the reference mass spectrum.
  • ref_mz_abun_dict (dict): Dictionary of mass to abundance values for the reference mass spectrum.
  • df (DataFrame): DataFrame containing the experimental and reference mass spectrum data.
  • zero_filled_u_l (tuple): Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
  • common_mz_values (list): List of common mass values between the experimental and reference mass spectra.
  • n_x_y (int): Number of common mass values between the experimental and reference mass spectra.
Methods
  • nan_fill(df, fill_with=0). Fill missing mass values with a given value.
  • normalize(x, y, norm_func=sum). Normalize the abundance values.
  • weighted_cosine_correlation(a=0.5, b=1.3, nanfill=1e-10). Calculate the weighted cosine correlation between the experimental and reference mass spectra.
  • cosine_correlation(). Calculate the cosine correlation between the experimental and reference mass spectra.
  • stein_scott(). Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
  • pearson_correlation(). Calculate the Pearson correlation between the experimental and reference mass spectra.
  • spearman_correlation(). Calculate the Spearman correlation between the experimental and reference mass spectra.
SpectralSimilarity(ms_mz_abun_dict, ref_obj, norm_func=<built-in function sum>)
190    def __init__(self, ms_mz_abun_dict, ref_obj, norm_func=sum):
191        self.normalize_func = norm_func
192        self.ms_mz_abun_dict = ms_mz_abun_dict
193        self.ref_obj = ref_obj
194
195        self.exp_abun = list(self.ms_mz_abun_dict.values())
196        self.exp_mz = list(self.ms_mz_abun_dict.keys())
197
198        self.ref_mz = self.ref_obj.get("mz")
199        self.ref_abun = self.ref_obj.get("abundance")
200
201        self.ref_mz_abun_dict = dict(zip(self.ref_mz, self.ref_abun))
202
203        # parse to dataframe, easier to zerofill and tranpose
204        self.df = DataFrame([self.ms_mz_abun_dict, self.ref_mz_abun_dict])
205
206        # fill missing mz with abundance 0
207        x, y = self.nan_fill(self.df, fill_with=1e-10)
208
209        self.zero_filled_u_l = self.normalize(x, y, norm_func=self.normalize_func)
210
211        # filter out the mass values that have zero intensities in self.exp_abun
212        exp_mz_filtered = set([k for k in self.exp_mz if self.ms_mz_abun_dict[k] != 0])
213
214        # filter out the mass values that have zero intensities in self.ref_mz
215        ref_mz_filtered = set([k for k in self.ref_mz if self.ref_mz_abun_dict[k] != 0])
216
217        # find the intersection/common mass values of both ref and exp, and sort them
218        self.common_mz_values = sorted(
219            list(exp_mz_filtered.intersection(ref_mz_filtered))
220        )
221
222        # find the number of common mass values (after filtering 0s)
223        self.n_x_y = len(self.common_mz_values)
224        # print(self.n_x_y)
normalize_func
ms_mz_abun_dict
ref_obj
exp_abun
exp_mz
ref_mz
ref_abun
ref_mz_abun_dict
df
zero_filled_u_l
common_mz_values
n_x_y
def nan_fill(self, df, fill_with=0):
226    def nan_fill(self, df, fill_with=0):
227        """Fill missing mass values with a given value.
228
229        Parameters
230        ----------
231        df : DataFrame
232            DataFrame containing the experimental and reference mass spectrum data.
233        fill_with : float
234            Value to fill missing mass values with.
235
236        Returns
237        -------
238        x : list
239            List of abundance values for the experimental mass spectrum.
240        y : list
241            List of abundance values for the reference mass spectrum."""
242        df.fillna(fill_with, inplace=True)
243
244        return df.T[0].values, df.T[1].values

Fill missing mass values with a given value.

Parameters
  • df (DataFrame): DataFrame containing the experimental and reference mass spectrum data.
  • fill_with (float): Value to fill missing mass values with.
Returns
  • x (list): List of abundance values for the experimental mass spectrum.
  • y (list): List of abundance values for the reference mass spectrum.
def normalize(self, x, y, norm_func=<built-in function sum>):
246    def normalize(self, x, y, norm_func=sum):
247        """Normalize the abundance values.
248
249        Parameters
250        ----------
251        x : list
252            List of abundance values for the experimental mass spectrum.
253        y : list
254            List of abundance values for the reference mass spectrum.
255        norm_func : function
256            Function to normalize the abundance values.
257            Default is sum
258
259        Returns
260        -------
261        u_l : tuple
262            Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
263        """
264        if norm_func is not None:
265            u_l = (x / norm_func(x), y / norm_func(y))
266            return u_l
267        else:
268            return (x, y)

Normalize the abundance values.

Parameters
  • x (list): List of abundance values for the experimental mass spectrum.
  • y (list): List of abundance values for the reference mass spectrum.
  • norm_func (function): Function to normalize the abundance values. Default is sum
Returns
  • u_l (tuple): Tuple containing the experimental and reference mass spectrum data after zero filling and normalization.
def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
270    def weighted_cosine_correlation(self, a=0.5, b=1.3, nanfill=1e-10):
271        """Calculate the weighted cosine correlation between the experimental and reference mass spectra.
272
273        Parameters
274        ----------
275        a : float
276            Weighting factor for the abundance values.
277            Default is 0.5
278        b : float
279            Weighting factor for the mass values.
280            Default is 1.3
281        nanfill : float
282            Value to fill missing mass values with.
283            Default is 1e-10
284
285        Returns
286        -------
287        correlation : float
288            Weighted cosine correlation between the experimental and reference mass spectra.
289        """
290        # create dict['mz'] = abundance, for experimental data
291        # ms_mz_abun_dict = mass_spec.mz_abun_dict
292        # weight exp data
293
294        xc = power(self.exp_abun, a) * power(self.exp_mz, b)
295
296        # track back to individual mz
297        weighted_exp_dict = dict(zip(self.ms_mz_abun_dict.keys(), xc))
298
299        # weight ref data
300        yc = power(self.ref_obj.get("abundance"), a) * power(self.ref_obj.get("mz"), b)
301
302        ref_mz_abun_dict = dict(zip(self.ref_obj.get("mz"), yc))
303
304        # parse to dataframe, easier to zerofill and tranpose
305        df = DataFrame([weighted_exp_dict, ref_mz_abun_dict])
306
307        # fill missing mz with weight {abun**a}{m/z**b} to 0
308        x, y = self.nan_fill(df, fill_with=nanfill)
309
310        # correlation = (1 - cosine(x, y))
311
312        correlation = dot(x, y) / (norm(x) * norm(y))
313
314        return correlation

Calculate the weighted cosine correlation between the experimental and reference mass spectra.

Parameters
  • a (float): Weighting factor for the abundance values. Default is 0.5
  • b (float): Weighting factor for the mass values. Default is 1.3
  • nanfill (float): Value to fill missing mass values with. Default is 1e-10
Returns
  • correlation (float): Weighted cosine correlation between the experimental and reference mass spectra.
def cosine_correlation(self):
316    def cosine_correlation(self):
317        """Calculate the cosine correlation between the experimental and reference mass spectra.
318
319        Returns
320        -------
321        correlation : float
322            Cosine correlation between the experimental and reference mass spectra.
323
324        """
325        # calculate cosine correlation,
326        x = self.zero_filled_u_l[0]
327        y = self.zero_filled_u_l[1]
328
329        # correlation = (1 - cosine(x, y))
330
331        correlation = dot(x, y) / (norm(x) * norm(y))
332
333        return correlation

Calculate the cosine correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): Cosine correlation between the experimental and reference mass spectra.
def stein_scott(self):
335    def stein_scott(self):
336        """Calculate the Stein-Scott similarity between the experimental and reference mass spectra.
337
338        Returns
339        -------
340        s_ss_x_y : float
341            Stein-Scott similarity between the experimental and reference mass spectra.
342        s_ss_x_y_nist : float
343            Stein-Scott similarity between the experimental and reference mass spectra.
344        """
345        # TODO check this code
346        if self.n_x_y == 0:
347            return 0, 0
348
349        # count number of non-zero abundance/peak intensity values
350        n_x = sum(a != 0 for a in self.exp_abun)
351
352        s_r_x_y = 0
353
354        a, b = 1, 0
355
356        for i in range(1, self.n_x_y):
357            current_value = self.common_mz_values[i]
358            previous_value = self.common_mz_values[i - 1]
359
360            y_i = self.ref_mz_abun_dict[current_value]
361            y_i_minus1 = self.ref_mz_abun_dict[previous_value]
362
363            lc_current = power(y_i, a) * power(current_value, b)
364            lc_previous = power(y_i_minus1, a) * power(previous_value, b)
365
366            x_i = self.ms_mz_abun_dict[current_value]
367            x_i_minus1 = self.ms_mz_abun_dict[previous_value]
368
369            uc_current = power(x_i, a) * power(current_value, b)
370            uc_previous = power(x_i_minus1, a) * power(previous_value, b)
371
372            T1 = lc_current / lc_previous
373
374            T2 = uc_previous / uc_current
375
376            temp_computation = T1 * T2
377
378            n = 0
379            if temp_computation <= 1:
380                n = 1
381            else:
382                n = -1
383
384            s_r_x_y = s_r_x_y + power(temp_computation, n)
385
386        # finish the calculation of S_R(X,Y)
387
388        s_r_x_y = s_r_x_y / self.n_x_y
389        # using the existing weighted_cosine_correlation function to get S_WC(X,Y)
390        s_wc_x_y = self.weighted_cosine_correlation(a=0.5, b=3, nanfill=0)
391
392        s_ss_x_y = ((n_x * s_wc_x_y) + (self.n_x_y * s_r_x_y)) / (n_x + self.n_x_y)
393
394        s_wc_x_y_nist = self.weighted_cosine_correlation(a=0.5, b=1.3, nanfill=0)
395
396        s_ss_x_y_nist = ((n_x * s_wc_x_y_nist) + (self.n_x_y * s_r_x_y)) / (
397            n_x + self.n_x_y
398        )
399        # final step
400
401        return s_ss_x_y, s_ss_x_y_nist

Calculate the Stein-Scott similarity between the experimental and reference mass spectra.

Returns
  • s_ss_x_y (float): Stein-Scott similarity between the experimental and reference mass spectra.
  • s_ss_x_y_nist (float): Stein-Scott similarity between the experimental and reference mass spectra.
def pearson_correlation(self):
403    def pearson_correlation(
404        self,
405    ):
406        """Calculate the Pearson correlation between the experimental and reference mass spectra.
407
408        Returns
409        -------
410        correlation : float
411            Pearson correlation between the experimental and reference mass spectra.
412        """
413        correlation = pearsonr(self.zero_filled_u_l[0], self.zero_filled_u_l[1])
414
415        return correlation[0]

Calculate the Pearson correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): Pearson correlation between the experimental and reference mass spectra.
def spearman_correlation(self):
def spearman_correlation(self):
    """Calculate the Spearman correlation between the experimental and reference mass spectra.

    The coefficient is computed with ``scipy.stats.spearmanr`` on the first
    two entries of ``self.zero_filled_u_l``.

    Returns
    -------
    correlation : float
        Spearman correlation between the experimental and reference mass spectra.
    """
    first_vector = self.zero_filled_u_l[0]
    second_vector = self.zero_filled_u_l[1]

    # TODO: confirm that axis=0 is the intended axis for these inputs.
    result = spearmanr(first_vector, second_vector, axis=0)

    # spearmanr returns (statistic, p-value); only the statistic is reported.
    return result[0]

Calculate the Spearman correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): Spearman correlation between the experimental and reference mass spectra.
def kendall_tau(self):
def kendall_tau(self):
    """Calculate the Kendall's tau correlation between the experimental and reference mass spectra.

    The coefficient is computed with ``scipy.stats.kendalltau`` on the first
    two entries of ``self.zero_filled_u_l``.

    Returns
    -------
    correlation : float
        Kendall's tau correlation between the experimental and reference mass spectra.
    """
    first_vector = self.zero_filled_u_l[0]
    second_vector = self.zero_filled_u_l[1]

    # kendalltau returns (statistic, p-value); only the statistic is reported.
    return kendalltau(first_vector, second_vector)[0]

Calculate the Kendall's tau correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): Kendall's tau correlation between the experimental and reference mass spectra.
def dft_correlation(self):
def dft_correlation(self):
    """Calculate the DFT correlation between the experimental and reference mass spectra.

    The score blends two terms: the weighted cosine correlation, weighted by
    the number of non-zero values in ``self.exp_abun``, and the cosine of the
    real parts of the two real-input FFTs, weighted by ``self.n_x_y``.

    Returns
    -------
    correlation : float
        DFT correlation between the experimental and reference mass spectra.
        ``0`` is returned when ``self.n_x_y`` is zero.
    """
    shared_count = self.n_x_y
    if shared_count == 0:
        return 0

    # number of non-zero abundance/peak intensity values
    nonzero_count = sum(value != 0 for value in self.exp_abun)

    filled_x, filled_y = self.nan_fill(self.df, fill_with=0)
    scaled_x, scaled_y = self.normalize(
        filled_x, filled_y, norm_func=self.normalize_func
    )

    # only the real part of each transform enters the cosine below
    spectrum_x = rfft(scaled_x).real
    spectrum_y = rfft(scaled_y).real

    magnitude = norm(spectrum_x) * norm(spectrum_y)
    cosine_dft = dot(spectrum_x, spectrum_y) / magnitude

    # reuse weighted_cosine_correlation for the S_WC(X,Y) term
    weighted_cosine = self.weighted_cosine_correlation(nanfill=0)

    # weighted average of the two similarity terms
    blended = nonzero_count * weighted_cosine + shared_count * cosine_dft
    return blended / (nonzero_count + shared_count)

Calculate the DFT correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): DFT correlation between the experimental and reference mass spectra.
def dwt_correlation(self):
def dwt_correlation(self):
    """Calculate the DWT correlation between the experimental and reference mass spectra.

    The score blends two terms: the weighted cosine correlation, weighted by
    the number of non-zero values in ``self.exp_abun``, and the cosine of the
    "db2" detail coefficients of both spectra, weighted by ``self.n_x_y``.

    Returns
    -------
    correlation : float
        DWT correlation between the experimental and reference mass spectra.
        ``0`` is returned when ``self.n_x_y`` is zero.

    Notes
    -----
    This function requires the PyWavelets library to be installed.
        This is not a default requirement as this function is not widely used.
    """
    # imported lazily because PyWavelets is an optional dependency
    from pywt import dwt

    shared_count = self.n_x_y
    if shared_count == 0:
        return 0

    # number of non-zero abundance/peak intensity values
    nonzero_count = sum(value != 0 for value in self.exp_abun)

    filled_x, filled_y = self.nan_fill(self.df, fill_with=0)
    scaled_x, scaled_y = self.normalize(
        filled_x, filled_y, norm_func=self.normalize_func
    )

    # dwt returns (approximation, detail); only the detail coefficients are used
    detail_x = dwt(list(scaled_x), "db2")[1]
    detail_y = dwt(list(scaled_y), "db2")[1]

    magnitude = norm(detail_x) * norm(detail_y)
    cosine_dwt = dot(detail_x, detail_y) / magnitude

    # reuse weighted_cosine_correlation for the S_WC(X,Y) term
    weighted_cosine = self.weighted_cosine_correlation(nanfill=0)

    # weighted average of the two similarity terms
    blended = nonzero_count * weighted_cosine + shared_count * cosine_dwt
    return blended / (nonzero_count + shared_count)

Calculate the DWT correlation between the experimental and reference mass spectra.

Returns
  • correlation (float): DWT correlation between the experimental and reference mass spectra.
Notes

This function requires the PyWavelets library to be installed. This is not a default requirement as this function is not widely used.

def euclidean_distance(self):
def euclidean_distance(self):
    """Calculate the Euclidean distance between the experimental and reference mass spectra.

    Computed as the square root of the summed squared element-wise
    differences of the first two entries of ``self.zero_filled_u_l``.

    Returns
    -------
    correlation : float
        Euclidean distance between the experimental and reference mass spectra.
    """
    query = self.zero_filled_u_l[0]
    reference = self.zero_filled_u_l[1]

    squared_residuals = power(query - reference, 2)
    return sqrt(np_sum(squared_residuals))

Calculate the Euclidean distance between the experimental and reference mass spectra.

Returns
  • correlation (float): Euclidean distance between the experimental and reference mass spectra.
def manhattan_distance(self):
def manhattan_distance(self):
    """Calculate the Manhattan distance between the experimental and reference mass spectra.

    Computed as the sum of absolute element-wise differences of the first
    two entries of ``self.zero_filled_u_l``.

    Returns
    -------
    correlation : float
        Manhattan distance between the experimental and reference mass spectra.
    """
    query = self.zero_filled_u_l[0]
    reference = self.zero_filled_u_l[1]

    residuals = query - reference
    return np_sum(absolute(residuals))

Calculate the Manhattan distance between the experimental and reference mass spectra.

Returns
  • correlation (float): Manhattan distance between the experimental and reference mass spectra.
def jaccard_distance(self):
def jaccard_distance(self):
    """Calculate the Jaccard distance between the experimental and reference mass spectra.

    This is the continuous (vector) form of the distance, not a set-based
    one. With ``q`` and ``r`` the first two entries of
    ``self.zero_filled_u_l`` it evaluates

        sum((q - r)**2) / (sum(q**2) + sum(r**2) - sum(q * r))

    which equals one minus the Tanimoto similarity of the two vectors.

    Returns
    -------
    correlation : float
        Jaccard distance between the experimental and reference mass spectra.
        The denominator is zero when both vectors are all zeros; that case
        is not special-cased here.
    """
    qlist = self.zero_filled_u_l[0]
    rlist = self.zero_filled_u_l[1]

    squared_difference = np_sum(power(qlist - rlist, 2))
    denominator = (
        np_sum(power(qlist, 2)) + np_sum(power(rlist, 2)) - np_sum(qlist * rlist)
    )

    return squared_difference / denominator

Calculate the Jaccard distance between the experimental and reference mass spectra.

Returns
  • correlation (float): Jaccard distance between the experimental and reference mass spectra.
def extra_distances(self):
def extra_distances(self):
    """Calculate distances using the additional metrics defined in math_distance.py.

    Every key of the module-level ``methods_name`` mapping that names an
    attribute of ``math_distance`` is evaluated; keys with no matching
    attribute are skipped.

    Returns
    -------
    dict_res : dict
        Dictionary containing the distances between the experimental and reference mass spectra,
        keyed by method name.

    """
    from corems.molecular_id.calc import math_distance

    results = {}

    for method in methods_name:
        if not hasattr(math_distance, method):
            continue

        distance_func = getattr(math_distance, method)

        if method == "canberra_metric":
            # this metric alone is fed the NaN-filled, normalized vectors
            filled_x, filled_y = self.nan_fill(self.df, fill_with=0)
            qlist, rlist = self.normalize(
                filled_x, filled_y, norm_func=self.normalize_func
            )
        else:
            qlist = self.zero_filled_u_l[0]
            rlist = self.zero_filled_u_l[1]

        results[method] = distance_func(qlist, rlist)

    return results

Function to calculate distances using additional metrics defined in math_distance.py

Currently, calculates all distances.

Returns
  • dict_res (dict): Dictionary containing the distances between the experimental and reference mass spectra.