corems.mass_spectra.calc.GC_Deconvolution

from matplotlib import pyplot as plt
import numpy as np

from corems.encapsulation.constant import Labels
from corems.encapsulation.factory.parameters import default_parameters
from corems.mass_spectrum.factory.MassSpectrumClasses import MassSpecCentroidLowRes
from corems.chroma_peak.factory.chroma_peak_classes import GCPeakDeconvolved
from corems.mass_spectra.calc import SignalProcessing as sp


class MassDeconvolution:
    def run_deconvolution(self, plot_res=False):
        eic_dict = self.ion_extracted_chroma(self._ms)

        peaks_entity_data = self.find_peaks_entity(eic_dict)

        """ select model peaks, create MassSpectrum and GCPeak objects, and store the results in the GC class gcpeaks attribute"""
        self.deconvolution(peaks_entity_data, plot_res)

    def centroid_detector(self, tic, rt):
        """This function has been replaced by sp.peak_picking_first_derivative
        and is no longer used.
        """
        noise_std = self.chromatogram_settings.std_noise_threshold

        method = self.chromatogram_settings.noise_threshold_method

        """ peak picking"""
        min_height = self.chromatogram_settings.peak_height_min_percent
        min_datapoints = self.chromatogram_settings.min_peak_datapoints

        """ baseline detection"""
        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
        max_height = self.chromatogram_settings.peak_height_max_percent

        peak_indexes_generator = sp.peak_detector_generator(
            tic,
            noise_std,
            method,
            rt,
            max_height,
            min_height,
            max_prominence,
            min_datapoints,
        )

        return peak_indexes_generator

    def ion_extracted_chroma(self, mass_spectra_obj):
        eic_dict = {}

        for scan_number, ms_obj in mass_spectra_obj.items():
            mz_list = ms_obj.mz_exp
            abundance_list = ms_obj.abundance
            # build one extracted ion chromatogram (EIC) per m/z value
            for index, mz in enumerate(mz_list):
                # dict keyed by m/z, storing [list of abundances, list of retention times]
                if mz not in eic_dict.keys():
                    eic_dict[mz] = [[abundance_list[index]], [ms_obj.retention_time]]

                else:
                    eic_dict[mz][0].append(ms_obj.abundance[index])
                    eic_dict[mz][1].append(ms_obj.retention_time)

        return eic_dict

    def hc(self, X, Y, max_rt_distance=0.025):
        from scipy.cluster.hierarchy import dendrogram, linkage
        from scipy.cluster.hierarchy import fcluster
        # from matplotlib import pyplot as plt

        Z = linkage(np.reshape(X, (len(X), 1)), method="ward")
        # Z = linkage(X, method  = "ward")

        max_d = max_rt_distance
        distance_clusters = fcluster(Z, max_d, criterion="distance")
        # print("distance")
        # print(distance_clusters)

        # inconsistency_cluster = fcluster(Z, 2, depth=2)
        # max_cluster = fcluster(Z, 2, criterion='maxclust')

        grouped_rt = {}

        for index_obj, group in enumerate(distance_clusters):
            if group not in grouped_rt.keys():
                grouped_rt[group] = [X[index_obj]]
            else:
                grouped_rt[group].append(X[index_obj])

        # print(distance_clusters, grouped_rt)
        return grouped_rt

        # plt.figure(figsize=(10, 8))
        # plt.scatter(X, Y, c=distance_clusters, cmap='prism')  # plot points with cluster dependent colors
        # plt.show()
        # labelList = range(int(min(X)), int(max(X)))

        # plt.figure(figsize=(10, 7))
        # dendrogram(Z,
        #            orientation='top',
        #            distance_sort='descending',
        #            show_leaf_counts=True)
        # plt.show()
        # print(Z)

    def find_peaks_entity(self, eic_dict):
        """Combine EICs with matching retention time apexes."""
        max_prominence = self.chromatogram_settings.peak_max_prominence_percent

        max_height = self.chromatogram_settings.peak_height_max_percent

        signal_threshold = self.chromatogram_settings.eic_signal_threshold

        min_peak_datapoints = self.chromatogram_settings.min_peak_datapoints

        peak_derivative_threshold = self.chromatogram_settings.peak_derivative_threshold

        correct_baseline = False
        peaks_entity_data = {}

        max_eic = 0
        for mz, eic_scan_index_rt in eic_dict.items():
            ind_max_eic = max(eic_scan_index_rt[0])
            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic

        for mz, eic_scan_index_rt in eic_dict.items():
            eic = eic_scan_index_rt[0]
            rt_list = eic_scan_index_rt[1]

            if len(eic) >= min_peak_datapoints:
                smooth_eic = self.smooth_tic(eic)

                include_indexes = sp.peak_picking_first_derivative(
                    rt_list,
                    smooth_eic,
                    max_height,
                    max_prominence,
                    max_eic,
                    min_peak_datapoints,
                    peak_derivative_threshold,
                    signal_threshold=signal_threshold,
                    correct_baseline=correct_baseline,
                )

                for initial_scan, apex_scan, final_scan in include_indexes:
                    rt_corrected_therm = self.quadratic_interpolation(
                        rt_list, smooth_eic, apex_scan
                    )

                    ref_apex_rt = round(rt_list[apex_scan] + rt_corrected_therm, 4)

                    apex_rt = rt_list[apex_scan]
                    # apex_abundance = smooth_eic[apex_scan]

                    # maximum_tic = apex_abundance if apex_abundance > maximum_tic else maximum_tic

                    for scan_index in range(initial_scan, final_scan):
                        peak_rt = rt_list[scan_index]
                        peak_abundance = smooth_eic[scan_index]

                        if peak_abundance > 0:
                            dict_data = {
                                peak_rt: {
                                    "mz": [mz],
                                    "abundance": [peak_abundance],
                                    "scan_number": [scan_index],
                                },
                                "ref_apex_rt": ref_apex_rt,
                            }

                            if apex_rt not in peaks_entity_data.keys():
                                peaks_entity_data[apex_rt] = dict_data

                            else:
                                if peak_rt not in peaks_entity_data[apex_rt].keys():
                                    peaks_entity_data[apex_rt][peak_rt] = dict_data.get(
                                        peak_rt
                                    )

                                else:
                                    existing_data = peaks_entity_data[apex_rt].get(
                                        peak_rt
                                    )

                                    existing_data["mz"].append(mz)
                                    existing_data["abundance"].append(peak_abundance)
                                    existing_data["scan_number"].append(scan_index)

        return peaks_entity_data

    def mass_spec_factory(self, rt, datadict):
        # tic = sum(datadict.get('abundance'))

        scan_index = datadict["scan_number"][0]

        mz_list, abundance_list = zip(
            *sorted(zip(datadict["mz"], datadict["abundance"]))
        )

        data_dict = {Labels.mz: mz_list, Labels.abundance: abundance_list}

        d_params = default_parameters(self._ms[scan_index]._filename)

        d_params["rt"] = rt

        d_params["scan_number"] = scan_index

        d_params["label"] = Labels.gcms_centroid

        d_params["polarity"] = self._ms[scan_index].polarity

        d_params["analyzer"] = self._ms[scan_index].analyzer

        d_params["instrument_label"] = self._ms[scan_index].instrument_label

        d_params["filename_path"] = self._ms[scan_index].instrument_label

        ms = MassSpecCentroidLowRes(data_dict, d_params)

        return ms

    def smooth_signal(self, signal):
        implemented_smooth_method = self.chromatogram_settings.implemented_smooth_method

        pol_order = self.chromatogram_settings.savgol_pol_order

        window_len = self.chromatogram_settings.smooth_window

        window = self.chromatogram_settings.smooth_method

        return sp.smooth_signal(
            signal, window_len, window, pol_order, implemented_smooth_method
        )

    def add_gcpeak(
        self,
        new_apex_index,
        start_rt,
        final_rt,
        peak_rt,
        smoothed_tic,
        datadict,
        plot_res,
    ):
        if start_rt <= peak_rt[new_apex_index[1]] <= final_rt:
            rt_list = peak_rt[new_apex_index[0] : new_apex_index[2]]
            tic_list = smoothed_tic[new_apex_index[0] : new_apex_index[2]]

            apex_rt = peak_rt[new_apex_index[1]]
            apex_i = rt_list.index(apex_rt)

            """workaround for peak picking missing some local minima"""
            if apex_rt not in self.processed_appexes:
                self.processed_appexes.append(apex_rt)

                mass_spectra = (
                    self.mass_spec_factory(rt, datadict.get(rt)) for rt in rt_list
                )

                gc_peak = GCPeakDeconvolved(
                    self, mass_spectra, apex_i, rt_list, tic_list
                )

                gc_peak.calc_area(tic_list, 1)

                self.gcpeaks.append(gc_peak)

                if plot_res:
                    plt.plot(gc_peak.rt_list, gc_peak.tic_list)
                    plt.plot(
                        gc_peak.retention_time,
                        gc_peak.tic,
                        c="black",
                        marker="^",
                        linewidth=0,
                    )

    def deconvolution(self, peaks_entity_data, plot_res):
        # plot_res = True
        domain = self.retention_time
        signal = self._processed_tic
        max_height = self.chromatogram_settings.peak_height_max_percent
        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
        min_peak_datapoints = self.chromatogram_settings.min_peak_datapoints
        signal_threshold = self.chromatogram_settings.peak_height_min_percent
        max_rt_distance = self.chromatogram_settings.max_rt_distance
        peak_derivative_threshold = self.chromatogram_settings.peak_derivative_threshold

        max_signal = max(signal)
        correct_baseline = False

        include_indexes = sp.peak_picking_first_derivative(
            domain,
            signal,
            max_height,
            max_prominence,
            max_signal,
            min_peak_datapoints,
            peak_derivative_threshold,
            signal_threshold=signal_threshold,
            correct_baseline=correct_baseline,
            plot_res=False,
        )

        """ each deconvolution window is defined by a TIC peak region"""
        all_apexes_rt = np.array(list(peaks_entity_data.keys()))

        """workaround for peak picking missing some local minima"""
        self.processed_appexes = []

        for indexes_tuple in include_indexes:
            start_rt = self.retention_time[indexes_tuple[0]]
            # apex_rt = self.retention_time[indexes_tuple[1]]
            final_rt = self.retention_time[indexes_tuple[2]]

            """ find all features within the TIC peak window"""
            peak_features_indexes = np.where(
                (all_apexes_rt > start_rt) & (all_apexes_rt < final_rt)
            )[0]
            peak_features_rts = all_apexes_rt[peak_features_indexes]

            # print(start_rt, apex_rt, final_rt )

            filtered_features_rt = []
            filtered_features_abundance = []

            for each_apex_rt in peak_features_rts:
                apex_data = peaks_entity_data.get(each_apex_rt).get(each_apex_rt)

                peak_features_tic = sum(
                    peaks_entity_data.get(each_apex_rt)
                    .get(each_apex_rt)
                    .get("abundance")
                )

                norm_smooth_tic = (peak_features_tic / max_signal) * 100

                """ TODO:
                    Improve peak filtering

                    Calculate peak sharpness here and filter on it (Amax - An / n)?
                    Peak fit and calculate peak Gaussian similarity?
                    Currently using a flat % TIC relative abundance threshold and a minimum number of m/z values per mass spectrum
                """
                if norm_smooth_tic > signal_threshold and len(apex_data["mz"]) > 1:
                    # print(len(apex_data['mz']))
                    filtered_features_rt.append(each_apex_rt)
                    filtered_features_abundance.append(peak_features_tic)

            if len(filtered_features_rt) > 1:
                """ more than one peak feature identified inside a TIC peak """
                # plt.plot(self.retention_time[indexes_tuple[0]:indexes_tuple[2]], signal[indexes_tuple[0]:indexes_tuple[2]], c='black')

                # print(filtered_features_rt)
                grouped_rt = self.hc(
                    filtered_features_rt,
                    filtered_features_abundance,
                    max_rt_distance=max_rt_distance,
                )
                # print(grouped_rt)

                for group, apex_rt_list in grouped_rt.items():
                    """ each group is a peak feature defined by the hierarchical clustering algorithm """
                    group_datadict = {}
                    group_datadict["ref_apex_rt"] = []

                    for each_group_apex_rt in apex_rt_list:
                        datadict = peaks_entity_data.get(each_group_apex_rt)

                        for rt, each_datadict in datadict.items():
                            if rt == "ref_apex_rt":
                                group_datadict["ref_apex_rt"].append(each_datadict)

                            else:
                                if rt in group_datadict.keys():
                                    mz_list = each_datadict.get("mz")
                                    abundance_list = each_datadict.get("abundance")

                                    each_mz_abun = dict(zip(mz_list, abundance_list))

                                    for index_mz, mz in enumerate(
                                        group_datadict[rt].get("mz")
                                    ):
                                        if mz in each_mz_abun.keys():
                                            each_mz_abun[mz] = (
                                                each_mz_abun[mz]
                                                + group_datadict[rt].get("abundance")[
                                                    index_mz
                                                ]
                                            )

                                        else:
                                            each_mz_abun[mz] = group_datadict[rt].get(
                                                "abundance"
                                            )[index_mz]

                                    group_datadict[rt] = {
                                        "mz": list(each_mz_abun.keys()),
                                        "abundance": list(each_mz_abun.values()),
                                        "scan_number": each_datadict.get("scan_number"),
                                    }

                                else:
                                    group_datadict[rt] = each_datadict

                    peak_rt = []
                    peak_tic = []

                    # print(group_datadict.get('ref_apex_rt'))
                    for rt, each_datadict in group_datadict.items():
                        if rt != "ref_apex_rt":
                            peak_rt.append(rt)
                            peak_tic.append(sum(each_datadict["abundance"]))

                    peak_rt, peak_tic = zip(*sorted(zip(peak_rt, peak_tic)))

                    smoothed_tic = self.smooth_signal(peak_tic)

                    include_indexes = sp.peak_picking_first_derivative(
                        peak_rt,
                        smoothed_tic,
                        max_height,
                        max_prominence,
                        max_signal,
                        min_peak_datapoints,
                        peak_derivative_threshold,
                        signal_threshold=signal_threshold,
                        correct_baseline=False,
                        plot_res=False,
                    )

                    include_indexes = list(include_indexes)

                    if include_indexes:
                        if len(include_indexes) > 1:
                            """ after summing there is more than one apex;
                                keep each one only if it falls inside the deconvolution window, otherwise ignore it
                            """

                            for new_apex_index in include_indexes:
                                # pass
                                self.add_gcpeak(
                                    new_apex_index,
                                    start_rt,
                                    final_rt,
                                    peak_rt,
                                    smoothed_tic,
                                    group_datadict,
                                    plot_res,
                                )

                        else:
                            """ after summing there is one apex;
                                save it
                            """
                            new_apex_index = include_indexes[0]
                            # print(include_indexes, group, apex_rt_list)
                            self.add_gcpeak(
                                new_apex_index,
                                start_rt,
                                final_rt,
                                peak_rt,
                                smoothed_tic,
                                group_datadict,
                                plot_res,
                            )

            elif len(filtered_features_rt) == 1:
                """ only one peak feature inside deconvolution window """

                each_apex_rt = filtered_features_rt[0]

                datadict = peaks_entity_data.get(each_apex_rt)

                peak_rt = []
                peak_tic = []

                for rt, each_datadict in datadict.items():
                    if rt != "ref_apex_rt":
                        peak_rt.append(rt)
                        peak_tic.append(sum(each_datadict["abundance"]))

                peak_rt, peak_tic = zip(*sorted(zip(peak_rt, peak_tic)))

                smoothed_tic = self.smooth_signal(peak_tic)

                include_indexes = sp.peak_picking_first_derivative(
                    peak_rt,
                    smoothed_tic,
                    max_height,
                    max_prominence,
                    max_signal,
                    min_peak_datapoints,
                    peak_derivative_threshold,
                    signal_threshold=signal_threshold,
                    correct_baseline=False,
                    plot_res=False,
                )
                include_indexes = list(include_indexes)

                if include_indexes:
                    """ after summing there may be more than one apex;
                        keep each one only if it falls inside the deconvolution window, otherwise ignore it"""
                    if len(include_indexes) > 1:
                        for new_apex_index in include_indexes:
                            # pass
                            self.add_gcpeak(
                                new_apex_index,
                                start_rt,
                                final_rt,
                                peak_rt,
                                smoothed_tic,
                                datadict,
                                plot_res,
                            )

                    else:
                        """ after summing there is one apex;
                            save it
                            include_indexes = (start, apex, final)"""

                        new_apex_index = include_indexes[0]

                        self.add_gcpeak(
                            new_apex_index,
                            start_rt,
                            final_rt,
                            peak_rt,
                            smoothed_tic,
                            datadict,
                            plot_res,
                        )

            else:
                # print('no data after filter')
                pass
        if plot_res:
            plt.plot(self.retention_time, self._processed_tic, c="black")
            plt.show()

    def quadratic_interpolation(self, rt_list, tic_list, apex_index):
        rt_list = np.array(rt_list)
        tic_list = np.array(tic_list)
        three_highest_i = [i for i in range(apex_index - 1, apex_index + 2)]

        z = np.poly1d(
            np.polyfit(rt_list[three_highest_i], tic_list[three_highest_i], 2)
        )
        a = z[2]
        b = z[1]

        corrected_apex_rt = -b / (2 * a)
        initial_rt = rt_list[apex_index]

        return initial_rt - corrected_apex_rt
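
Example usage. A minimal sketch, assuming a GC-MS object obtained from one of the corems.mass_spectra.input readers; MassDeconvolution is a mixin, so the calls run on the class that inherits it and provides the attributes used above (self._ms, self._processed_tic, self.retention_time, self.chromatogram_settings, self.gcpeaks). The reader placeholder and the process_chromatogram() step are assumptions, not a fixed API.

    # hypothetical setup: obtain a GC-MS object from a CoreMS reader (placeholder call)
    gcms = some_gcms_reader.get_gcms_obj()

    # assumed preprocessing step that builds the processed TIC before deconvolution
    gcms.process_chromatogram()

    # EICs -> per-m/z peak features -> clustered, deconvolved GC peaks
    gcms.run_deconvolution(plot_res=False)

    # GCPeakDeconvolved objects are appended to gcpeaks by add_gcpeak
    for gc_peak in gcms.gcpeaks:
        print(gc_peak.retention_time, gc_peak.tic)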
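
For reference, find_peaks_entity returns a nested dictionary keyed by apex retention time; each entry maps the retention times inside the peak window to the m/z, abundance, and scan values collected there, plus the interpolated reference apex. A sketch of the shape, with illustrative numbers only:

    peaks_entity_data = {
        10.012: {                      # apex retention time of an EIC feature
            10.008: {"mz": [55.0, 69.1], "abundance": [1200.0, 800.5], "scan_number": [301, 301]},
            10.012: {"mz": [55.0, 69.1], "abundance": [2500.0, 1900.2], "scan_number": [302, 302]},
            10.016: {"mz": [55.0], "abundance": [1100.0], "scan_number": [303]},
            "ref_apex_rt": 10.0118,    # apex refined by quadratic_interpolation
        },
    }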
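
The hc helper groups feature apex retention times by Ward hierarchical clustering and cuts the tree at max_rt_distance. A standalone sketch of the same scipy calls, with made-up retention times:

    import numpy as np
    from scipy.cluster.hierarchy import fcluster, linkage

    # illustrative apex retention times (minutes): three near 10.0, two near 12.5
    apex_rts = [10.001, 10.004, 10.010, 12.500, 12.503]

    Z = linkage(np.reshape(apex_rts, (len(apex_rts), 1)), method="ward")
    labels = fcluster(Z, 0.025, criterion="distance")

    grouped_rt = {}
    for index, group in enumerate(labels):
        grouped_rt.setdefault(group, []).append(apex_rts[index])

    # two groups: the three ~10.0 apexes together, the two ~12.5 apexes together
    print(grouped_rt)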
def find_peaks_entity(self, eic_dict):
108    def find_peaks_entity(self, eic_dict):
109        """combine eic with mathing rt apexes"""
110        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
111
112        max_height = self.chromatogram_settings.peak_height_max_percent
113
114        signal_threshold = self.chromatogram_settings.eic_signal_threshold
115
116        min_peak_datapoints = self.chromatogram_settings.min_peak_datapoints
117
118        peak_derivative_threshold = self.chromatogram_settings.peak_derivative_threshold
119
120        correct_baseline = False
121        peaks_entity_data = {}
122
123        max_eic = 0
124        for mz, eic_scan_index_rt in eic_dict.items():
125            ind_max_eic = max(eic_scan_index_rt[0])
126            max_eic = ind_max_eic if ind_max_eic > max_eic else max_eic
127
128        for mz, eic_scan_index_rt in eic_dict.items():
129            eic = eic_scan_index_rt[0]
130            rt_list = eic_scan_index_rt[1]
131
132            if len(eic) >= min_peak_datapoints:
133                smooth_eic = self.smooth_tic(eic)
134
135                include_indexes = sp.peak_picking_first_derivative(
136                    rt_list,
137                    smooth_eic,
138                    max_height,
139                    max_prominence,
140                    max_eic,
141                    min_peak_datapoints,
142                    peak_derivative_threshold,
143                    signal_threshold=signal_threshold,
144                    correct_baseline=correct_baseline,
145                )
146
147                for initial_scan, apex_scan, final_scan in include_indexes:
148                    rt_corrected_therm = self.quadratic_interpolation(
149                        rt_list, smooth_eic, apex_scan
150                    )
151
152                    ref_apex_rt = round(rt_list[apex_scan] + rt_corrected_therm, 4)
153
154                    apex_rt = rt_list[apex_scan]
155                    # apex_abundance = smooth_eic[apex_scan]
156
157                    # maximum_tic = apex_abundance if apex_abundance > maximum_tic else maximum_tic
158
159                    for scan_index in range(initial_scan, final_scan):
160                        peak_rt = rt_list[scan_index]
161                        peak_abundance = smooth_eic[scan_index]
162
163                        if peak_abundance > 0:
164                            dict_data = {
165                                peak_rt: {
166                                    "mz": [mz],
167                                    "abundance": [peak_abundance],
168                                    "scan_number": [scan_index],
169                                },
170                                "ref_apex_rt": ref_apex_rt,
171                            }
172
173                            if apex_rt not in peaks_entity_data.keys():
174                                peaks_entity_data[apex_rt] = dict_data
175
176                            else:
177                                if peak_rt not in peaks_entity_data[apex_rt].keys():
178                                    peaks_entity_data[apex_rt][peak_rt] = dict_data.get(
179                                        peak_rt
180                                    )
181
182                                else:
183                                    existing_data = peaks_entity_data[apex_rt].get(
184                                        peak_rt
185                                    )
186
187                                    existing_data["mz"].append(mz)
188                                    existing_data["abundance"].append(peak_abundance)
189                                    existing_data["scan_number"].append(scan_index)
190
191        return peaks_entity_data

combine eic with mathing rt apexes

def mass_spec_factory(self, rt, datadict):
193    def mass_spec_factory(self, rt, datadict):
194        # tic = sum(datadict.get('abundance'))
195
196        scan_index = datadict["scan_number"][0]
197
198        mz_list, abundance_list = zip(
199            *sorted(zip(datadict["mz"], datadict["abundance"]))
200        )
201
202        data_dict = {Labels.mz: mz_list, Labels.abundance: abundance_list}
203
204        d_params = default_parameters(self._ms[scan_index]._filename)
205
206        d_params["rt"] = rt
207
208        d_params["scan_number"] = scan_index
209
210        d_params["label"] = Labels.gcms_centroid
211
212        d_params["polarity"] = self._ms[scan_index].polarity
213
214        d_params["analyzer"] = self._ms[scan_index].analyzer
215
216        d_params["instrument_label"] = self._ms[scan_index].instrument_label
217
218        d_params["filename_path"] = self._ms[scan_index].instrument_label
219
220        ms = MassSpecCentroidLowRes(data_dict, d_params)
221
222        return ms
def smooth_signal(self, signal):
224    def smooth_signal(self, signal):
225        implemented_smooth_method = self.chromatogram_settings.implemented_smooth_method
226
227        pol_order = self.chromatogram_settings.savgol_pol_order
228
229        window_len = self.chromatogram_settings.smooth_window
230
231        window = self.chromatogram_settings.smooth_method
232
233        return sp.smooth_signal(
234            signal, window_len, window, pol_order, implemented_smooth_method
235        )
def add_gcpeak( self, new_apex_index, start_rt, final_rt, peak_rt, smoothed_tic, datadict, plot_res):
237    def add_gcpeak(
238        self,
239        new_apex_index,
240        start_rt,
241        final_rt,
242        peak_rt,
243        smoothed_tic,
244        datadict,
245        plot_res,
246    ):
247        if start_rt <= peak_rt[new_apex_index[1]] <= final_rt:
248            rt_list = peak_rt[new_apex_index[0] : new_apex_index[2]]
249            tic_list = smoothed_tic[new_apex_index[0] : new_apex_index[2]]
250
251            apex_rt = peak_rt[new_apex_index[1]]
252            apex_i = rt_list.index(apex_rt)
253
254            """workaround for peak picking missing some local minimas"""
255            if apex_rt not in self.processed_appexes:
256                self.processed_appexes.append(apex_rt)
257
258                mass_spectra = (
259                    self.mass_spec_factory(rt, datadict.get(rt)) for rt in rt_list
260                )
261
262                gc_peak = GCPeakDeconvolved(
263                    self, mass_spectra, apex_i, rt_list, tic_list
264                )
265
266                gc_peak.calc_area(tic_list, 1)
267
268                self.gcpeaks.append(gc_peak)
269
270                if plot_res:
271                    plt.plot(gc_peak.rt_list, gc_peak.tic_list)
272                    plt.plot(
273                        gc_peak.retention_time,
274                        gc_peak.tic,
275                        c="black",
276                        marker="^",
277                        linewidth=0,
278                    )
279
280    def deconvolution(self, peaks_entity_data, plot_res):
281        # plot_res = True
282        domain = self.retention_time
283        signal = self._processed_tic
284        max_height = self.chromatogram_settings.peak_height_max_percent
285        max_prominence = self.chromatogram_settings.peak_max_prominence_percent
286        min_peak_datapoints = self.chromatogram_settings.min_peak_datapoints
287        signal_threshold = self.chromatogram_settings.peak_height_min_percent
288        max_rt_distance = self.chromatogram_settings.max_rt_distance
289        peak_derivative_threshold = self.chromatogram_settings.peak_derivative_threshold
290
291        max_signal = max(signal)
292        correct_baseline = False
293
294        include_indexes = sp.peak_picking_first_derivative(
295            domain,
296            signal,
297            max_height,
298            max_prominence,
299            max_signal,
300            min_peak_datapoints,
301            peak_derivative_threshold,
302            signal_threshold=signal_threshold,
303            correct_baseline=correct_baseline,
304            plot_res=False,
305        )
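           # Each entry in include_indexes is a (start, apex, end) index tuple over
           # the processed TIC; its RT span defines one deconvolution window below.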
306
307        """ deconvolution window is defined by the TIC peak region"""
308        all_apexes_rt = np.array(list(peaks_entity_data.keys()))
309
310        """workaround for peak picking missing some local minima"""
311        self.processed_appexes = []
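           # Apex RTs already converted into GC peaks; add_gcpeak checks this list so
           # an apex shared by overlapping windows is only stored once.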
312
313        for indexes_tuple in include_indexes:
314            start_rt = self.retention_time[indexes_tuple[0]]
315            # apex_rt = self.retention_time[indexes_tuple[1]]
316            final_rt = self.retention_time[indexes_tuple[2]]
317
318            """ find all features within TIC peak window"""
319            peak_features_indexes = np.where(
320                (all_apexes_rt > start_rt) & (all_apexes_rt < final_rt)
321            )[0]
322            peak_features_rts = all_apexes_rt[peak_features_indexes]
323
324            # print(start_rt, apex_rt, final_rt )
325
326            filtered_features_rt = []
327            filtered_features_abundance = []
328
329            for each_apex_rt in peak_features_rts:
330                apex_data = peaks_entity_data.get(each_apex_rt).get(each_apex_rt)
331
332                peak_features_tic = sum(
333                    peaks_entity_data.get(each_apex_rt)
334                    .get(each_apex_rt)
335                    .get("abundance")
336                )
337
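                   # Express the feature's summed abundance as a percentage of the
                   # maximum of the processed TIC so it can be compared against
                   # peak_height_min_percent.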
338                norm_smooth_tic = (peak_features_tic / max_signal) * 100
339
340                """ TODO:
341                    Improve peak filtering.
342
343                    Calculate peak sharpness here and filter on it (Amax - An / n)?
344                    Fit the peak and calculate a Gaussian similarity score?
345                    Currently using a flat % TIC relative-abundance threshold and a minimum m/z count per spectrum (more than one, per the check below).
346                """
347                if norm_smooth_tic > signal_threshold and len(apex_data["mz"]) > 1:
348                    # print(len(apex_data['mz']))
349                    filtered_features_rt.append(each_apex_rt)
350                    filtered_features_abundance.append(peak_features_tic)
351
352            if len(filtered_features_rt) > 1:
353                """ more than one peak feature identified inside a TIC peak  """
354                # plt.plot(self.retention_time[indexes_tuple[0]:indexes_tuple[2]], signal[indexes_tuple[0]:indexes_tuple[2]], c='black')
355
356                # print(filtered_features_rt)
357                grouped_rt = self.hc(
358                    filtered_features_rt,
359                    filtered_features_abundance,
360                    max_rt_distance=max_rt_distance,
361                )
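                   # grouped_rt maps a cluster label to the apex RTs that fall within
                   # max_rt_distance of each other (hierarchical clustering on RT, see
                   # self.hc).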
362                # print(grouped_rt)
363
364                for group, apex_rt_list in grouped_rt.items():
365                    """ each group is a peak feature defined by the hierarchical clustering algorithm
366
367                    """
368                    group_datadict = {}
369                    group_datadict["ref_apex_rt"] = []
370
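                       # Merge every feature of this cluster into a single
                       # RT -> {mz, abundance, scan_number} map; abundances of m/z
                       # values shared at the same RT are summed.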
371                    for each_group_apex_rt in apex_rt_list:
372                        datadict = peaks_entity_data.get(each_group_apex_rt)
373
374                        for rt, each_datadict in datadict.items():
375                            if rt == "ref_apex_rt":
376                                group_datadict["ref_apex_rt"].append(each_datadict)
377
378                            else:
379                                if rt in group_datadict.keys():
380                                    mz_list = each_datadict.get("mz")
381                                    abundance_list = each_datadict.get("abundance")
382
383                                    each_mz_abun = dict(zip(mz_list, abundance_list))
384
385                                    for index_mz, mz in enumerate(
386                                        group_datadict[rt].get("mz")
387                                    ):
388                                        if mz in each_mz_abun.keys():
389                                            each_mz_abun[mz] = (
390                                                each_mz_abun[mz]
391                                                + group_datadict[rt].get("abundance")[
392                                                    index_mz
393                                                ]
394                                            )
395
396                                        else:
397                                            each_mz_abun[mz] = group_datadict[rt].get(
398                                                "abundance"
399                                            )[index_mz]
400
401                                    group_datadict[rt] = {
402                                        "mz": list(each_mz_abun.keys()),
403                                        "abundance": list(each_mz_abun.values()),
404                                        "scan_number": each_datadict.get("scan_number"),
405                                    }
406
407                                else:
408                                    group_datadict[rt] = each_datadict
409
410                    peak_rt = []
411                    peak_tic = []
412
413                    # print(group_datadict.get('ref_apex_rt'))
414                    for rt, each_datadict in group_datadict.items():
415                        if rt != "ref_apex_rt":
416                            peak_rt.append(rt)
417                            peak_tic.append(sum(each_datadict["abundance"]))
418
419                    peak_rt, peak_tic = zip(*sorted(zip(peak_rt, peak_tic)))
420
421                    smoothed_tic = self.smooth_signal(peak_tic)
422
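                       # Re-pick peaks on the summed, smoothed per-group TIC. Rebinding
                       # the name include_indexes below does not disturb the outer
                       # `for indexes_tuple in include_indexes` loop, which already
                       # holds its own reference to the original sequence.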
423                    include_indexes = sp.peak_picking_first_derivative(
424                        peak_rt,
425                        smoothed_tic,
426                        max_height,
427                        max_prominence,
428                        max_signal,
429                        min_peak_datapoints,
430                        peak_derivative_threshold,
431                        signal_threshold=signal_threshold,
432                        correct_baseline=False,
433                        plot_res=False,
434                    )
435
436                    include_indexes = list(include_indexes)
437
438                    if include_indexes:
439                        if len(include_indexes) > 1:
440                            """ after summing there is more than one apex:
441                                keep each apex only if it falls inside the deconvolution window
442                            """
443
444                            for new_apex_index in include_indexes:
445                                # pass
446                                self.add_gcpeak(
447                                    new_apex_index,
448                                    start_rt,
449                                    final_rt,
450                                    peak_rt,
451                                    smoothed_tic,
452                                    group_datadict,
453                                    plot_res,
454                                )
455
456                        else:
457                            """ after summing there is one apex:
458                                save it
459                            """
460                            new_apex_index = include_indexes[0]
461                            # print(include_indexes, group, apex_rt_list)
462                            self.add_gcpeak(
463                                new_apex_index,
464                                start_rt,
465                                final_rt,
466                                peak_rt,
467                                smoothed_tic,
468                                group_datadict,
469                                plot_res,
470                            )
471
472            elif len(filtered_features_rt) == 1:
473                """ only one peak feature inside deconvolution window """
474
475                each_apex_rt = filtered_features_rt[0]
476
477                datadict = peaks_entity_data.get(each_apex_rt)
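                   # Rebuild the single feature's TIC profile from its stored
                   # abundances, smooth it, and re-pick the apex before saving.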
478
479                peak_rt = []
480                peak_tic = []
481
482                for rt, each_datadict in datadict.items():
483                    if rt != "ref_apex_rt":
484                        peak_rt.append(rt)
485                        peak_tic.append(sum(each_datadict["abundance"]))
486
487                peak_rt, peak_tic = zip(*sorted(zip(peak_rt, peak_tic)))
488
489                smoothed_tic = self.smooth_signal(peak_tic)
490
491                include_indexes = sp.peak_picking_first_derivative(
492                    peak_rt,
493                    smoothed_tic,
494                    max_height,
495                    max_prominence,
496                    max_signal,
497                    min_peak_datapoints,
498                    peak_derivative_threshold,
499                    signal_threshold=signal_threshold,
500                    correct_baseline=False,
501                    plot_res=False,
502                )
503                include_indexes = list(include_indexes)
504
505                if include_indexes:
506                    """ after summing there may be more than one apex:
507                        keep each apex only if it falls inside the deconvolution window """
508                    if len(include_indexes) > 1:
509                        for new_apex_index in include_indexes:
510                            # pass
511                            self.add_gcpeak(
512                                new_apex_index,
513                                start_rt,
514                                final_rt,
515                                peak_rt,
516                                smoothed_tic,
517                                datadict,
518                                plot_res,
519                            )
520
521                    else:
522                        """ after summing there is one apex:
523                            save it
524                            each include_indexes entry = (start, apex, final) """
525
526                        new_apex_index = include_indexes[0]
527
528                        self.add_gcpeak(
529                            new_apex_index,
530                            start_rt,
531                            final_rt,
532                            peak_rt,
533                            smoothed_tic,
534                            datadict,
535                            plot_res,
536                        )
537
538            else:
539                # print('no data after filter')
540                pass
541        if plot_res:
542            plt.plot(self.retention_time, self._processed_tic, c="black")
543            plt.show()
544
545    def quadratic_interpolation(self, rt_list, tic_list, apex_index):
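           # Fit a parabola y = a*x**2 + b*x + c through the apex and its two
           # neighbours; the vertex lies at x = -b / (2 * a). The value returned is
           # the offset between the sampled apex RT and the interpolated vertex RT
           # (initial_rt - corrected_apex_rt), not the corrected RT itself.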
546        rt_list = np.array(rt_list)
547        tic_list = np.array(tic_list)
548        three_highest_i = list(range(apex_index - 1, apex_index + 2))
549
550        z = np.poly1d(
551            np.polyfit(rt_list[three_highest_i], tic_list[three_highest_i], 2)
552        )
553        a = z[2]
554        b = z[1]
555
556        corrected_apex_rt = -b / (2 * a)
557        initial_rt = rt_list[apex_index]
558
559        return initial_rt - corrected_apex_rt